In [1]:
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Read the four homework CSVs: the full question/reply table followed by the
# train, dev and public-test annotation files.
data_total, data_train, data_dev, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data_train/si630w22-hw3-data.csv',
        'data_train/si630w22-hw3-train.csv',
        'data_train/si630w22-hw3-dev.csv',
        'data_train/si630w22-hw3-test.public.csv',
    )
)

In [3]:
# Preview the first four rows of the question/reply table.
data_total.head(4)

Unnamed: 0,question_id,question_text,reply_id,reply_text,rlen
0,t3_n27vu3,What's something nice you like to do just to b...,gwhrhmf,Give compliments. It’s extremely easy to do an...,205
1,t3_n2az7m,So what is the best headphones for people who ...,gwiatps,I prefer Raycon Performance Ear Buds. They are...,178
2,t3_n2dzr9,How do you go on knowing a loved one only has ...,gwit1wj,Make it as memorable as the rest of your time ...,278
3,t3_n2iy9q,You’ve been dropped to the year 1800 with all ...,gwjhw8i,They're gonna burn me at the stake for being a...,52


In [4]:
# Collapse the individual annotations into one mean rating per id.
# as_index=False keeps 'id' as an ordinary column, so the result is a two-column
# frame ('id', 'rating') ready to be merged with the text table.
data1 = data_train.groupby('id', as_index=False)["rating"].mean()
data2 = data_dev.groupby('id', as_index=False)["rating"].mean()

In [5]:
# Attach the question/reply text to every averaged rating. A left join keeps
# one output row per rated id even when data_total has no matching question_id
# (the text columns are NaN in that case).
df1 = data1.merge(data_total, how='left',
                  left_on='id', right_on='question_id')
df2 = data2.merge(data_total, how='left',
                  left_on='id', right_on='question_id')

df1.head()

Unnamed: 0,id,rating,question_id,question_text,reply_id,reply_text,rlen
0,t3_n2714y,4.75,t3_n2714y,"Is there someone you turned down in the past, ...",gwhmmsp,Idk if this counts but my when I was younger m...,391
1,t3_n27873,3.75,t3_n27873,"What is, in your opinion, the saddest villain ...",gwhn3bt,My man Dr Heinz Doofenschmirts was born withou...,154
2,t3_n27b1e,3.5,t3_n27b1e,ELI5: How do we still not know how eels reprod...,gwho8nq,"For a long time, it wasn't known how eels mate...",207
3,t3_n27qop,4.0,t3_n27qop,ELI5: Why can’t freshwater fish live in saltwa...,gwht547,A living cell is designed to work at specific ...,1145
4,t3_n27vu3,4.4,t3_n27vu3,What's something nice you like to do just to b...,gwhrhmf,Give compliments. It’s extremely easy to do an...,205


## Generate train and dev sets for the regression problem

In [6]:
# Data processing for regression: the target is the mean rating itself,
# exported under the column name 'rate'.
#
# The rename is applied to a copy rather than with inplace=True, so df1/df2 keep
# their 'rating' column. The classification cell further down reads
# df1['rating'] / df2['rating'] and would raise KeyError if the column had been
# renamed away here; it also keeps this cell idempotent on re-run.
reg_columns = ['question_text', 'reply_text', 'rating']

for merged, out_csv in ((df1, 'data_train_reg.csv'), (df2, 'data_dev_reg.csv')):
    (merged[reg_columns]
     .rename(columns={"rating": "rate"})
     .to_csv(out_csv, encoding='utf8', index=False))

In [None]:
# Used only when the task is framed as classification instead of regression.
def data_proc(x):
    """Turn a mean rating into a zero-based integer class label.

    The rating is rounded to the nearest integer with the built-in ``round``
    (ties go to the even neighbour, e.g. 2.5 -> 2 and 3.5 -> 4) and then
    shifted down by one. Presumably ratings lie in 1..5, which would give
    labels 0,1,2,3,4 -- confirm against the annotation scale.

    Returns the label as ``np.int32``.
    """
    nearest = round(x)
    return np.int32(nearest - 1)

# Build the classification labels on a copy of each merged frame and export them.
#
# The regression cell above may already have renamed 'rating' to 'rate' in place,
# in which case df1['rating'] no longer exists and the original lookup raised
# KeyError. Both column names hold the same mean rating at that point, so read
# whichever one is present. df1/df2 themselves are left untouched (assign returns
# a new frame), which keeps this cell safe to re-run: the source column is never
# overwritten with already-converted labels.
for merged, out_csv in ((df1, 'data_train_clas.csv'), (df2, 'data_dev_clas.csv')):
    mean_col = 'rating' if 'rating' in merged.columns else 'rate'
    labelled = merged.assign(rate=merged[mean_col].map(data_proc))
    labelled[['question_text', 'reply_text', 'rate']].to_csv(out_csv, encoding='utf8', index=False)

## Data processing for Problem 13

In [7]:
# Create one leave-one-group-out training set per annotator group: for group g
# the label of each id is the mean rating over all annotations NOT made by g.
group_lst = data_train['group'].unique().tolist()
file_path = 'P13_evaluation/'

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before the first write (no-op when it is already there).
os.makedirs(file_path, exist_ok=True)

for g in group_lst:
    # Remove every annotation that belongs to group g.
    new_train = data_train[data_train['group'] != g]
    # The mean of the remaining ratings becomes the new label.
    new_train_mean = new_train.groupby('id')["rating"].mean().reset_index()
    # Match each id with its question and reply text; rename on the merged copy
    # instead of mutating in place.
    new_train_merge = pd.merge(
        left=new_train_mean, right=data_total, how='left',
        left_on='id', right_on='question_id',
    ).rename(columns={"rating": "rate"})
    new_train_merge[['question_text', 'reply_text', 'rate']].to_csv(
        ''.join([file_path, g, '_train', '.csv']), encoding='utf8', index=False)
    
    

In [8]:
# Create three dev sets for every annotator group g (files <g>_dev_a/b/c.csv).

def _export_dev_split(mean_by_id, group, suffix):
    """Join mean ratings with their question/reply text and write them to CSV.

    mean_by_id -- frame with an 'id' column and a 'rating' column.
    group, suffix -- concatenated into the output path
                     file_path + group + suffix + '.csv'.

    After the left join on question_id, 'rating' is renamed to 'rate' and every
    row containing any missing value is dropped before the three exported
    columns are selected.
    """
    with_text = mean_by_id.merge(data_total, how='left',
                                 left_on='id', right_on='question_id')
    labelled = with_text.rename(columns={"rating": "rate"}).dropna()
    labelled[['question_text', 'reply_text', 'rate']].to_csv(
        file_path + group + suffix + '.csv', encoding='utf8', index=False)


for g in tqdm(group_lst):
    # Mean rating per id from annotators outside g, and from annotators inside g.
    others_mean = data_dev[data_dev['group'] != g].groupby('id')["rating"].mean().reset_index()
    own_mean = data_dev[data_dev['group'] == g].groupby('id')["rating"].mean().reset_index()

    # (a) ratings from annotators who are not in g.
    # NOTE(review): this keeps every id with at least one non-g rating, including
    # ids that g also rated -- confirm whether (a) is meant to exclude those.
    _export_dev_split(others_mean, g, '_dev_a')

    # (b) ids rated by group g, labelled with g's own annotations.
    _export_dev_split(own_mean, g, '_dev_b')

    # (c) ids rated by group g, but labelled with the other groups' annotations.
    # Left join from g's ids: an id nobody else rated gets a NaN rating here and
    # is removed by the dropna inside the export helper.
    others_on_own_ids = pd.merge(left=own_mean[['id']], right=others_mean, how='left',
                                 left_on='id', right_on='id')
    _export_dev_split(others_on_own_ids, g, '_dev_c')

  0%|          | 0/24 [00:00<?, ?it/s]

## Generate the test set for Problem 12

In [12]:
# One row per distinct test id, joined with its question and reply text.
test_ids = data_test[['id']].drop_duplicates()
data_use = test_ids.merge(data_total, how='left',
                          left_on='id', right_on='question_id')

# Only the id and the two text columns are exported.
data_use[['id', 'question_text', 'reply_text']].to_csv('data_test.csv', encoding='utf8', index=False)