# Building a multi-lingual model locally (Mac)

## Data Preparation

In [1]:
import pandas as pd
import numpy as np
import os
import re
# Root folder under which the train/dev/test files are written.
WORKING_DIR = "ml-training/SECRECY-EN/"
# Language suffix used for the train/dev files and the in-language test file.
TARGET_LANG = "en"
# Suffix for the other-language test file: "en" when the target is "es",
# otherwise "es".
TEST_LANG = "es" if TARGET_LANG != "es" else "en"

In [2]:
# load training data
# Paths are built from WORKING_DIR so the inputs are read from the same root
# folder the generated files are written under; changing WORKING_DIR in the
# config cell now moves both consistently.
en_train = pd.read_csv(WORKING_DIR + "en_secrecy_train_audited_smart_downsampled.csv")
en_test = pd.read_csv(WORKING_DIR + "en_secrecy_test_audited.csv")
es_test = pd.read_csv(WORKING_DIR + "es_secrecy_test.csv")

In [3]:
# NO DOWNSAMPLING
# The training set has already been downsampled (hard negatives kept) and the
# test sets are used at full size, so every negative row is selected as-is.
# To downsample again, chain .sample(frac=..., replace=False) onto a selection
# (previously frac=0.1 for train and frac=0.05 for the two test sets).
# The negative label is 0 in en_train / es_test and "no" in en_test.
en_train_neg = en_train.loc[en_train["label"].eq(0)]
en_test_neg = en_test.loc[en_test["label"].eq("no")]
es_test_neg = es_test.loc[es_test["label"].eq(0)]

In [4]:
# Rebuild each frame as: selected negatives first, followed by every row whose
# label differs from that frame's negative value.
en_train = pd.concat([en_train_neg, en_train.loc[en_train["label"].ne(0)]])
en_test = pd.concat([en_test_neg, en_test.loc[en_test["label"].ne("no")]])
es_test = pd.concat([es_test_neg, es_test.loc[es_test["label"].ne(0)]])

In [5]:
# Normalise the English training text and keep only usable rows.
# Rows with missing text are dropped first (they cannot be normalised), and
# .copy() gives an independent frame so the column assignments below never
# write into a slice of the loaded CSV (avoids SettingWithCopyWarning).
en_train = en_train[["label", "text"]].dropna(subset=["text"]).copy()
en_train["text"] = (
    en_train["text"]
    .str.lower()
    # Strip punctuation. regex=True must be explicit: pandas >= 2.0 treats the
    # pattern as a literal string by default, which would remove nothing.
    .str.replace(r"[^\w\s]", "", regex=True)
    # Collapse every whitespace run (newlines included) into a single space.
    .str.replace(r"\s+", " ", regex=True)
)
en_train["label"] = en_train["label"].astype(str)
# Keep texts of 2 to 50 words (inclusive); whitespace-only texts have zero
# words and are removed by the same filter.
word_counts = en_train["text"].str.split().str.len()
en_train = en_train[word_counts.between(2, 50)]
set(en_train.label.tolist())

{'0.0', '1.0'}

In [6]:
# optional if labels need to be fixed
# Map the stringified float label '0.0' to "0" and everything else to "1".
# "0" is also accepted as a negative so that re-running this cell leaves
# already-fixed labels unchanged instead of turning every label into "1".
en_train["label"] = en_train["label"].isin(["0.0", "0"]).map({True: "0", False: "1"})

In [7]:
# create seed language splits
# Shuffle with a fixed seed so the 80/20 train/dev split is identical on every
# run (Restart & Run All reproduces the same files).
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
en_train_shuffled = en_train.sample(frac=1, random_state=SPLIT_SEED)
n_train_rows = int(TRAIN_FRACTION * len(en_train_shuffled))
# Positional slicing replaces np.split, which on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
train = en_train_shuffled.iloc[:n_train_rows]
validate = en_train_shuffled.iloc[n_train_rows:]

In [8]:
# Normalise the English test text and keep only usable rows.
# Rows with missing text are dropped first (they cannot be normalised), and
# .copy() gives an independent frame so the column assignments below never
# write into a slice of the loaded CSV (avoids SettingWithCopyWarning).
en_test = en_test[["text", "label"]].dropna(subset=["text"]).copy()
en_test["text"] = (
    en_test["text"]
    .str.lower()
    # Strip punctuation. regex=True must be explicit: pandas >= 2.0 treats the
    # pattern as a literal string by default, which would remove nothing.
    .str.replace(r"[^\w\s]", "", regex=True)
    # Collapse every whitespace run (newlines included) into a single space.
    .str.replace(r"\s+", " ", regex=True)
)
en_test["label"] = en_test["label"].astype(str)
# Keep texts of 2 to 50 words (inclusive); whitespace-only texts have zero
# words and are removed by the same filter.
word_counts = en_test["text"].str.split().str.len()
en_test = en_test[word_counts.between(2, 50)]
set(en_test.label.tolist())

{'no', 'secrecy'}

In [9]:
# Map the negative label "no" to "0" and everything else to "1".
# "0" is also accepted as a negative so that re-running this cell leaves
# already-converted labels unchanged instead of turning every label into "1".
en_test["label"] = en_test["label"].isin(["no", "0"]).map({True: "0", False: "1"})

In [10]:
# Normalise the Spanish test text and keep only usable rows.
# Rows with missing text are dropped first (they cannot be normalised), and
# .copy() gives an independent frame so the column assignments below never
# write into a slice of the loaded CSV (avoids SettingWithCopyWarning).
es_test = es_test[["text", "label"]].dropna(subset=["text"]).copy()
es_test["text"] = (
    es_test["text"]
    .str.lower()
    # Strip punctuation. regex=True must be explicit: pandas >= 2.0 treats the
    # pattern as a literal string by default, which would remove nothing.
    # \w is Unicode-aware for str patterns, so accented letters are kept.
    .str.replace(r"[^\w\s]", "", regex=True)
    # Collapse every whitespace run (newlines included) into a single space.
    .str.replace(r"\s+", " ", regex=True)
)
es_test["label"] = es_test["label"].astype(str)
# Keep texts of 2 to 50 words (inclusive); whitespace-only texts have zero
# words and are removed by the same filter.
word_counts = es_test["text"].str.split().str.len()
es_test = es_test[word_counts.between(2, 50)]
set(es_test.label.tolist())

{'0', '1'}

In [11]:
# Sanity check: display the distinct label values of the three frames
# (train, English test, Spanish test) side by side.
tuple(set(frame.label.tolist()) for frame in (en_train, en_test, es_test))

({'0', '1'}, {'0', '1'}, {'0', '1'})

In [12]:
DATA_DIR = "data_hard_negatives/"

# Create the output folder if needed so the writes below do not fail on a
# fresh checkout.
output_dir = WORKING_DIR + DATA_DIR
os.makedirs(output_dir, exist_ok=True)

# (file stem, language suffix, frame) for every split that is written out.
# NOTE(review): en_test / es_test are tied to TARGET_LANG / TEST_LANG by
# position, which is only correct while TARGET_LANG == "en".
splits_to_write = [
    ("train", TARGET_LANG, train),
    ("dev", TARGET_LANG, validate),
    ("test", TARGET_LANG, en_test),
    ("test", TEST_LANG, es_test),
]

# Each split produces two parallel files with one example per line:
# <stem>.txt.<lang> holds the texts and <stem>.lbl.<lang> the labels.
for stem, lang, frame in splits_to_write:
    for kind, column in (("txt", "text"), ("lbl", "label")):
        path = output_dir + stem + "." + kind + "." + lang
        # Explicit UTF-8 so non-ASCII characters (e.g. Spanish accents) are
        # written identically regardless of the platform's default encoding.
        with open(path, "w", encoding="utf-8") as fl:
            fl.write("\n".join(frame[column].tolist()))