In [1]:
import os
# Switch the working directory two levels up before the project import below.
# NOTE(review): presumably this is the repository root, so that './data/' and the
# `preprocessing` package resolve -- confirm. The path is relative, so re-running
# this cell in the same kernel climbs two MORE levels; restart the kernel first.
os.chdir('../../')

import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

from preprocessing.triplets import generate_triplet_dataset


In [2]:
# Notebook configuration: every path used by the cells below is built from these.
DATA_FOLDER = './data/'
TRAIN_VAL_CSV = 'train_val.csv'  # labelled question pairs that get split into train/valid
SEED = 11  # random_state of the stratified split

# Sub-folders are derived from DATA_FOLDER so that relocating the data root
# only requires changing one constant (the resulting values are unchanged).
DEBIASED_FOLDER = DATA_FOLDER + 'quora_qp_leakage/'  # train/dev/test .tsv files + weights.npy
DEBIASED_RESULT_FOLDER = DATA_FOLDER + 'debiased/'  # destination of the weighted train/valid CSVs


### Split train_val.csv into train.csv and valid.csv with stratification (fixed seed)

In [3]:
# Load the full labelled set; missing cells become empty strings.
full_df = pd.read_csv(DATA_FOLDER + TRAIN_VAL_CSV, index_col='id').fillna('')

# Split on positional row numbers (not on the 'id' index) so the result can be
# fed straight into .iloc; the label vector doubles as the stratification key.
row_positions = np.arange(len(full_df))
labels = full_df['is_duplicate'].to_numpy(dtype=np.int32)
X_train, X_val, y_train, y_val = train_test_split(
    row_positions,
    labels,
    test_size=0.15,
    stratify=labels,
    random_state=SEED,
)
train_df = full_df.iloc[X_train]
valid_df = full_df.iloc[X_val]

# Persist both partitions; the index is written under the 'id' header.
for split_df, split_file in ((train_df, 'train.csv'), (valid_df, 'valid.csv')):
    split_df.to_csv(DATA_FOLDER + split_file, index_label='id')


### Load the debiased version of the dataset and its per-example weights

In [3]:
# The .tsv files have no header row; the fourth column (position 3) is the pair id.
names = ['label', 'question1', 'question2', 'id']

# Read the three splits in train / dev / test order with identical parser settings.
train_debiased_df, valid_debiased_df, test_debiased_df = (
    pd.read_csv(DEBIASED_FOLDER + split_file, sep='\t', header=None, index_col=3, names=names)
    for split_file in ('train.tsv', 'dev.tsv', 'test.tsv')
)

# weights.npy is attached positionally to the concatenated frame.
# NOTE(review): this assumes the array is ordered train, dev, test -- confirm.
weights = np.load(DEBIASED_FOLDER + 'weights.npy')
all_debiased_df = pd.concat(
    [train_debiased_df, valid_debiased_df, test_debiased_df]
).assign(weights=weights)

In [4]:
# Show the combined frame to eyeball the columns and the attached weights
# (the rendering below is truncated to the leading and trailing rows).
all_debiased_df

Unnamed: 0_level_0,label,question1,question2,weights
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11877,1,What is your review of Hidden Figures -LRB- 20...,What are your impressions of Hidden Figures -L...,1.472674
221489,0,"Currently , all Supreme Court Justices come fr...",What 's your type of jungle -LRB- concrete or ...,0.786593
92126,1,Why is saltwater taffy candy imported in Austr...,Why is salt water taffy candy unknown in Japan ?,0.512830
289278,0,"Who has the better stable of characters , DC o...",Which is better - DC or Marvel ?,0.640092
328558,1,What is difference between CAT and GMAT in ter...,What is the difference between the CAT and GMA...,0.539173
...,...,...,...,...
119469,0,How do I learn Java internals ?,What is the best way to learn Java ?,0.635925
399384,1,`` Maybe a bit of levity in this campaign . Do...,`` Is Trump saying '' `` bigly '' '' or '' `` ...,1.455941
77490,0,What should a ceo earn ?,What is CEO ?,0.786286
282388,1,Where can I get affordable party photo booth s...,Where can I find affordable photo booth servic...,0.560713


### Read the train/valid splits and attach the debiasing weights

In [15]:
# Reload the training split and attach each pair's debiasing weight, matched on `id`.
train_df = pd.read_csv(DATA_FOLDER + 'train.csv', index_col='id')
train_df = train_df.join(all_debiased_df['weights'], on='id')
# Pairs that have no entry in the debiased set get a weight of 1.0.
# Fill ONLY the weights column: read_csv parses empty fields as NaN, so a blanket
# fillna(1.0) would also turn empty question text into the number 1.0.
train_df = train_df.fillna({'weights': 1.0})

In [16]:
# Rows of the training frame whose weight is still missing.
# NOTE: the weights column was already filled with 1.0 when train_df was built,
# so this selection is expected to be empty.
train_missing_weight = train_df['weights'].isna()
train_df[train_missing_weight]

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,weights
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [17]:
# Reload the validation split and attach each pair's debiasing weight, matched on `id`.
# Unlike the training frame, missing weights are left as NaN here.
valid_df = (
    pd.read_csv(DATA_FOLDER + 'valid.csv', index_col='id')
    .join(all_debiased_df['weights'], on='id')
)

In [18]:
# Rows of the validation frame that found no matching weight in the debiased set.
valid_missing_weight = valid_df['weights'].isna()
valid_df[valid_missing_weight]

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate,weights
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [20]:
# Write the weighted splits to the result folder.
# to_csv returns None when given a path, so its result must not be assigned back:
# doing so replaced both frames with None and made this cell fail on a re-run.
os.makedirs(DEBIASED_RESULT_FOLDER, exist_ok=True)  # to_csv does not create missing folders
train_df.to_csv(DEBIASED_RESULT_FOLDER + 'train.csv')
valid_df.to_csv(DEBIASED_RESULT_FOLDER + 'valid.csv')
