In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy

import sys
sys.path.append('..')

from utils.load_data import GLOVE_PATH, MNLI_PATH, LABEL_TO_INT

In [76]:
# Labels that identify a usable example; rows with any other gold label are dropped.
GOLD_LABELS = {'contradiction', 'entailment', 'neutral'}

# Read the JSON-lines file and keep only rows whose gold label is in GOLD_LABELS.
# The explicit .copy() makes `df` an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = pd.read_json(MNLI_PATH, lines=True)
df = df[df['gold_label'].isin(GOLD_LABELS)].copy()

In [None]:
# Load the spaCy pipeline registered under the name "en".
SPACY_MODEL = "en"
nlp = spacy.load(SPACY_MODEL)

In [None]:
def _parse_lowercased(text):
    """Lower-case `text` and run it through the spaCy pipeline `nlp`."""
    return nlp(text.lower())


# Parse both sentence columns into their matching token columns.
for source_col, target_col in (("sentence1", "tokens1"), ("sentence2", "tokens2")):
    df[target_col] = df[source_col].apply(_parse_lowercased)

In [None]:
# Words that occur in sentence1 but not in sentence2, one set per row.
# The comparison is done on token *text*: spaCy Token objects that belong to
# two different Doc objects never compare equal, so a set difference on the
# Token objects themselves would keep every token of sentence1.
df["token_difference"] = [
    {tk.text for tk in tks1} - {tk.text for tk in tks2}
    for tks1, tks2 in zip(df['tokens1'], df['tokens2'])
]

In [None]:
# Row-wise similarity between the two parsed sentences, via the `similarity`
# method of each tokens1 entry.
doc_pairs = zip(df['tokens1'], df['tokens2'])
df["similarity"] = [left.similarity(right) for left, right in doc_pairs]

# Size of each row's token-difference collection.
difference_sizes = df["token_difference"].str.len()
df["difference"] = difference_sizes

In [None]:
# Per-label summary: print the mean similarity and mean token-difference size,
# then plot the similarity distribution.
# Labels are visited in sorted order so the printed lines and figures appear in
# the same order on every run (iterating the set directly gives an arbitrary
# order), and each figure is titled with its label so the plots can be told apart.
for label in sorted(GOLD_LABELS):
    label_rows = df[df["gold_label"] == label]  # filter once, reuse for both columns
    similarity = label_rows["similarity"]
    difference = label_rows["difference"]
    print(label, np.mean(similarity), np.mean(difference))
    sns.distplot(similarity)
    plt.title(label)
    plt.show()

In [None]:
# Flatten each row's token-difference collection into one space-separated string.
df["sentence"] = [
    ' '.join(str(tk) for tk in remaining)
    for remaining in df["token_difference"]
]

In [None]:
# Shuffle the rows once with a fixed seed so the 60/20/20 train/validation/test
# split (and therefore the exported CSV files) is identical on every run.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)

# Positional cut points; plain .iloc slicing avoids routing a DataFrame
# through np.split.
train_end = int(.6 * len(shuffled))
validate_end = int(.8 * len(shuffled))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

train.to_csv("../utils/train_bow.csv")
validate.to_csv("../utils/val_bow.csv")
test.to_csv("../utils/test_bow.csv")