# Clean XNLI

In [1]:
import pandas as pd

# Read the raw XNLI dev split (tab-separated) and keep only the rows whose
# "language" column is "en".
xnli_raw_path = "../datasets/raw/xnli.dev.tsv"
df_xnli = pd.read_csv(xnli_raw_path, sep="\t")
df_xnli = df_xnli.loc[df_xnli["language"].eq("en")]

In [2]:
# Keep only the label and the sentence pair, and add the character length of
# each sentence as helper columns (a later cell filters on these lengths).
#
# .assign() returns a brand-new frame, so the new columns are never written
# into a slice of the filtered frame (which is what triggers pandas'
# SettingWithCopyWarning). .str.len() is the vectorised form of apply(len);
# it yields NaN for a missing sentence instead of raising TypeError.
xnli_pairs = df_xnli[["gold_label", "sentence1", "sentence2"]]
df_xnli = xnli_pairs.assign(
    sentence1_len=xnli_pairs["sentence1"].str.len(),
    sentence2_len=xnli_pairs["sentence2"].str.len(),
)

In [3]:
# Sanity check: print the per-column counts of non-missing values, then show
# the first five rows as the cell's rich output.
column_counts = df_xnli.count()
print(column_counts)
df_xnli.head(n=5)

gold_label       2490
sentence1        2490
sentence2        2490
sentence1_len    2490
sentence2_len    2490
dtype: int64


Unnamed: 0,gold_label,sentence1,sentence2,sentence1_len,sentence2_len
9960,neutral,"And he said, Mama, I'm home.",He called his mom as soon as the school bus dr...,28,60
9961,contradiction,"And he said, Mama, I'm home.",He didn't say a word.,28,21
9962,entailment,"And he said, Mama, I'm home.",He told his mom he had gotten home.,28,35
9963,neutral,I didn't know what I was going for or anything...,I have never been to Washington so when I was ...,101,97
9964,contradiction,I didn't know what I was going for or anything...,I knew exactly what I needed to do as I marche...,101,62


In [4]:
# Length filter: keep a pair only when BOTH sentences are strictly longer than
# 20 characters and strictly shorter than 2000 characters.
sentence1_in_range = df_xnli["sentence1_len"].gt(20) & df_xnli["sentence1_len"].lt(2000)
sentence2_in_range = df_xnli["sentence2_len"].gt(20) & df_xnli["sentence2_len"].lt(2000)
first_filter_xnli = sentence1_in_range & sentence2_in_range

df_xnli = df_xnli.loc[first_filter_xnli]

In [5]:
# Split the filtered frame into one subset per gold label.
xnli_labels = df_xnli["gold_label"]
df_xnli_neutral = df_xnli.loc[xnli_labels.eq("neutral")]
df_xnli_contradiction = df_xnli.loc[xnli_labels.eq("contradiction")]
df_xnli_entailment = df_xnli.loc[xnli_labels.eq("entailment")]

In [6]:
# Draw the same number of rows from every label subset so the classes end up
# balanced; the fixed seed makes the draw repeatable between runs.
xnli_sample_size = 200
xnli_sample_seed = 42

df_xnli_neutral_sample = df_xnli_neutral.sample(
    n=xnli_sample_size, random_state=xnli_sample_seed
)
df_xnli_contradiction_sample = df_xnli_contradiction.sample(
    n=xnli_sample_size, random_state=xnli_sample_seed
)
df_xnli_entailment_sample = df_xnli_entailment.sample(
    n=xnli_sample_size, random_state=xnli_sample_seed
)

In [7]:
# Stack the three per-label samples in the order neutral, contradiction,
# entailment. ignore_index=True discards the original row labels and numbers
# the result 0..n-1.
df_xnli_neutral_contradiction = pd.concat(
    [df_xnli_neutral_sample, df_xnli_contradiction_sample],
    ignore_index=True,
)
# NOTE: the name below is spelled "contadiction"; the following cell reads it
# under exactly this spelling, so any rename must touch both cells.
df_xnli_neutral_contadiction_entailment = pd.concat(
    [df_xnli_neutral_contradiction, df_xnli_entailment_sample],
    ignore_index=True,
)

In [8]:
# The length columns were only needed for filtering; remove them so the final
# frame holds just the label and the two sentences.
length_helper_columns = ['sentence1_len', 'sentence2_len']
df_xnli = df_xnli_neutral_contadiction_entailment.drop(length_helper_columns, axis="columns")

In [9]:
# Preview the first five rows of the cleaned frame before it is written out.
df_xnli.head(n=5)

Unnamed: 0,gold_label,sentence1,sentence2
0,neutral,You may also take advantage of our special 2-y...,It costs $30 only if you join in the next two ...
1,neutral,"Therefore, they persistently create a nonstati...",These hypothetical worlds are used to forecast...
2,neutral,"Anyhow, so I finished came home at 6:30 today ...",I spent most of the day dealing with a complic...
3,neutral,The law redeems not the individual but the com...,The law will redeem America.
4,neutral,Your resolve delivered me from a horrible dang...,"She did not want him to help her, though was g..."


In [10]:
# Persist the cleaned, balanced sample; index=False keeps the positional row
# index out of the CSV.
xnli_cleaned_path = "../datasets/cleaned/cleaned_xnli.csv"
df_xnli.to_csv(xnli_cleaned_path, index=False)

end