In [77]:
import os
# Restrict CUDA to the device with index 1. This is set before `torch` is
# imported (next cell) so the restriction is in place when CUDA is first queried.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [78]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
dev = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print(device)

cuda


In [79]:
# %load par3/par3_align/similarity/sim_models.py
import torch
import torch.nn as nn
from torch.nn.modules.distance import CosineSimilarity
import numpy as np

class ParaModel(nn.Module):
    """Base class for paraphrase-similarity encoders.

    Supplies padding/masking helpers and a cosine-similarity scorer.
    ``scoring_function`` calls ``self.encode``, which is not defined in this
    class and therefore has to be provided by a subclass.
    """

    def __init__(self, args, vocab):
        """Keep the configuration and vocabulary on the module.

        Args:
            args: configuration object; this class only reads ``args.gpu``
                (a value ``>= 0`` makes the helpers call ``.cuda()``).
            vocab: vocabulary object, stored unchanged as ``self.vocab``.
        """
        super().__init__()

        self.args = args
        self.vocab = vocab
        self.gpu = args.gpu
        self.cosine = CosineSimilarity()

    def compute_mask(self, lengths):
        """Build a float padding mask from a 1-D tensor of lengths.

        Returns a tensor with one row per entry of ``lengths`` and
        ``max(lengths)`` columns; element ``[r, c]`` is 1.0 when
        ``c < lengths[r]`` and 0.0 otherwise. The mask is moved with
        ``.cuda()`` when ``self.gpu >= 0``.
        """
        lengths = lengths.cpu()
        longest = torch.max(lengths)
        # One row of positions, broadcast against a column of lengths.
        positions = torch.arange(0, longest).long()[None, :]
        mask = (positions < lengths[:, None]).float()
        if self.gpu >= 0:
            mask = mask.cuda()
        return mask

    def torchify_batch(self, batch):
        """Pad a batch of examples into index, length and mask tensors.

        Args:
            batch: sequence of objects exposing an ``embeddings`` list of
                integer indices.

        Returns:
            ``(idxs, lengths, masks)``: zero-padded long indices, long
            lengths, and the float mask produced by ``compute_mask``.
        """
        seq_lens = [len(ex.embeddings) for ex in batch]
        longest = max(seq_lens, default=0)

        np_sents = np.zeros((len(batch), longest), dtype='int32')
        np_lens = np.array(seq_lens, dtype='int32')
        for row, ex in enumerate(batch):
            np_sents[row, :seq_lens[row]] = ex.embeddings

        idxs = torch.from_numpy(np_sents).long()
        # The float round-trip is kept exactly as the original code had it.
        lengths = torch.from_numpy(np_lens).float().long()
        masks = self.compute_mask(torch.from_numpy(np_lens).long())

        if self.gpu >= 0:
            idxs, lengths, masks = idxs.cuda(), lengths.cuda(), masks.cuda()

        return idxs, lengths, masks

    def scoring_function(self, g_idxs1, g_mask1, g_lengths1, g_idxs2, g_mask2, g_lengths2):
        """Encode both sides with ``self.encode`` and return their cosine similarity."""
        left = self.encode(g_idxs1, g_mask1, g_lengths1)
        right = self.encode(g_idxs2, g_mask2, g_lengths2)
        return self.cosine(left, right)

class WordAveraging(ParaModel):
    """Encoder that represents a sequence by the mean of its word embeddings."""

    def __init__(self, args, vocab):
        """Create the embedding table (``len(vocab)`` rows, ``args.dim`` columns).

        The whole module is moved with ``.cuda()`` when ``args.gpu >= 0``.
        """
        super().__init__(args, vocab)

        self.vocab = vocab
        self.embedding = nn.Embedding(len(self.vocab), self.args.dim)

        if args.gpu >= 0:
            self.cuda()

    def encode(self, idxs, mask, lengths):
        """Return the masked sum of embeddings divided by ``lengths``.

        ``mask`` zeroes out padded positions before the sum over dimension 1;
        ``lengths`` is cast to float for the division.
        """
        masked = self.embedding(idxs) * mask[:, :, None]
        return masked.sum(dim=1) / lengths[:, None].float()

In [80]:
# %load par3/par3_align/similarity/sim_utils.py
import io
import numpy as np
import torch

def get_wordmap(textfile):
    """Load a text embedding file into a word->row mapping and a matrix.

    Each data line is ``<word> <v1> <v2> ...``. The word is everything before
    the first space; the remaining whitespace-separated tokens are parsed as
    floats. If the first line has exactly two whitespace-separated tokens it
    is treated as a header and discarded.

    Args:
        textfile: path of a UTF-8 encoded embedding file.

    Returns:
        ``(words, We)`` where ``words`` maps each word to its row index and
        ``We`` is ``np.array`` of the parsed vectors (an empty array for an
        empty file). A word that appears more than once keeps the index of
        its last occurrence, but every occurrence still contributes a row.
    """
    words = {}
    We = []
    # The context manager closes the handle, which the original never did.
    with io.open(textfile, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Guard against an empty file before peeking at the header line.
    if lines and len(lines[0].split()) == 2:
        lines.pop(0)

    for line in lines:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline at end of file).
            continue
        pieces = line.split(' ', 1)
        word = pieces[0]
        # split() with no argument tolerates repeated or trailing whitespace.
        vec = [float(tok) for tok in pieces[1].split()]
        words[word] = len(We)
        We.append(vec)
    return words, np.array(We)

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """Split the indices ``0..n-1`` into consecutive minibatches.

    Args:
        n: number of items.
        minibatch_size: size of every full minibatch.
        shuffle: when true, shuffle the indices with ``np.random.shuffle``
            before splitting.

    Returns:
        A ``zip`` of ``(batch_number, int32 index array)`` pairs. Leftover
        indices that do not fill a whole minibatch form a final, shorter one.
    """
    order = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(order)

    full_batches = n // minibatch_size
    chunks = [
        order[k * minibatch_size:(k + 1) * minibatch_size]
        for k in range(full_batches)
    ]

    consumed = len(chunks) * minibatch_size
    if consumed != n:
        # Whatever is left over becomes one last minibatch.
        chunks.append(order[consumed:])

    return zip(range(len(chunks)), chunks)

def max_pool(x, lengths, gpu):
    """Element-wise maximum over the first ``lengths[i]`` steps of each ``x[i]``.

    Args:
        x: tensor with at least three dimensions; the output has
            ``x.size(0)`` rows and ``x.size(2)`` columns.
        lengths: per-row number of leading steps to pool over.
        gpu: the output buffer is moved with ``.cuda()`` when ``gpu >= 0``.

    Returns:
        A float32 tensor holding one pooled vector per row of ``x``.
    """
    pooled = torch.zeros(x.size(0), x.size(2), dtype=torch.float32)
    if gpu >= 0:
        pooled = pooled.cuda()
    for row in range(len(lengths)):
        valid_steps = x[row][:lengths[row]]
        pooled[row] = torch.max(valid_steps, 0)[0]
    return pooled

def mean_pool(x, lengths, gpu):
    """Mean over the first ``lengths[i]`` steps of each ``x[i]``.

    Args:
        x: tensor with at least three dimensions; the output has
            ``x.size(0)`` rows and ``x.size(2)`` columns.
        lengths: per-row number of leading steps to average.
        gpu: the output buffer is moved with ``.cuda()`` when ``gpu >= 0``.

    Returns:
        A float32 tensor holding one averaged vector per row of ``x``.
    """
    pooled = torch.zeros(x.size(0), x.size(2), dtype=torch.float32)
    if gpu >= 0:
        pooled = pooled.cuda()
    for row in range(len(lengths)):
        valid_steps = x[row][:lengths[row]]
        pooled[row] = torch.mean(valid_steps, 0)
    return pooled

def lookup(words, w):
    """Return ``words[w.lower()]``, or ``None`` when the lower-cased key is absent."""
    key = w.lower()
    return words[key] if key in words else None

class Example(object):
    """A stripped, lower-cased sentence plus the vocabulary indices of its tokens."""

    def __init__(self, sentence):
        """Normalise ``sentence`` and start with no indices or representation."""
        self.sentence = sentence.strip().lower()
        self.embeddings = []
        self.representation = None

    def populate_embeddings(self, words):
        """Append the vocabulary index of every known token to ``self.embeddings``.

        Tokens are the whitespace-separated pieces of the sentence. If nothing
        was appended, the index stored under ``'UUUNKKK'`` is used instead.
        """
        for token in self.sentence.lower().split():
            idx = lookup(words, token)
            # NOTE(review): this truthiness test skips unknown tokens (None) but
            # also a token whose index is 0. Confirm that is intended before
            # switching to an `is not None` check.
            if idx:
                self.embeddings.append(idx)
        if not self.embeddings:
            self.embeddings.append(words['UUUNKKK'])

In [81]:
# %load par3/par3_align/similarity/test_sim.py
import torch
from nltk.tokenize import TreebankWordTokenizer
import sentencepiece as spm

# Tokenizer applied to raw sentences before SentencePiece encoding.
tok = TreebankWordTokenizer()

# Load the checkpoint dictionary. NOTE: torch.load unpickles the file, so only
# load checkpoints from a trusted source. `model` is first the checkpoint dict
# and is rebound to the actual module a few lines below.
model = torch.load('/home/kkatsy/par3/par3_align/similarity/sim/sim.pt')
state_dict = model['state_dict']
vocab_words = model['vocab_words']
args = model['args']
# NOTE(review): this comment used to read "turn off gpu", but nothing here
# modifies args.gpu; WordAveraging calls .cuda() whenever args.gpu >= 0.
model = WordAveraging(args, vocab_words)
model.load_state_dict(state_dict, strict=True)
# SentencePiece model used to split tokenized text into sub-word pieces.
sp = spm.SentencePieceProcessor()
sp.Load('/home/kkatsy/par3/par3_align/similarity/sim/sim.sp.30k.model')
# Switch the module to evaluation mode.
model.eval()

def make_example(sentence, model):
    """Turn a raw sentence into an ``Example`` with populated indices.

    The sentence is lower-cased, tokenized with the module-level ``tok``,
    split into pieces with the module-level ``sp``, and looked up in
    ``model.vocab``.
    """
    tokens = tok.tokenize(sentence.lower())
    pieces = sp.EncodeAsPieces(" ".join(tokens))
    example = Example(" ".join(pieces))
    example.populate_embeddings(model.vocab)
    return example

def find_similarity(s1, s2):
    """Score sentence pairs ``(s1[k], s2[k])`` with the module-level ``model``.

    Both inputs are sliced with the same batch window, so they are expected
    to be aligned and of equal length.

    Args:
        s1: iterable of sentences (first element of each pair).
        s2: iterable of sentences (second element of each pair).

    Returns:
        A list of Python floats, one similarity score per pair. Two empty
        inputs yield an empty list instead of failing inside the batching
        helpers (``torch.max`` cannot reduce an empty length tensor).
    """
    BATCH_SIZE = 512
    with torch.no_grad():
        examples1 = [make_example(x, model) for x in s1]
        examples2 = [make_example(x, model) for x in s2]
        if not examples1 and not examples2:
            # No pairs to score.
            return []

        wx1, wl1, wm1 = model.torchify_batch(examples1)
        wx2, wl2, wm2 = model.torchify_batch(examples2)

        all_scores = []
        for i in range(0, len(wx1), BATCH_SIZE):
            window = slice(i, i + BATCH_SIZE)
            scores = model.scoring_function(wx1[window], wm1[window], wl1[window],
                                            wx2[window], wm2[window], wl2[window])
            all_scores.extend(scores.tolist())
        return all_scores

def find_similarity_matrix(s1, s2):
    """Return the cosine-similarity matrix between all of ``s1`` and all of ``s2``.

    Every sentence is encoded with the module-level ``model`` in batches; the
    result has one row per sentence of ``s1`` and one column per sentence of
    ``s2``, computed as the dot products divided by the product of the norms.
    """
    BATCH_SIZE = 2000

    def _encode_in_batches(idxs, masks, lengths):
        # Encode BATCH_SIZE rows at a time and stitch the pieces together.
        pieces = [
            model.encode(idxs=idxs[lo:lo + BATCH_SIZE],
                         mask=masks[lo:lo + BATCH_SIZE],
                         lengths=lengths[lo:lo + BATCH_SIZE])
            for lo in range(0, len(idxs), BATCH_SIZE)
        ]
        return torch.cat(pieces)

    with torch.no_grad():
        left = [make_example(x, model) for x in s1]
        right = [make_example(x, model) for x in s2]
        wx1, wl1, wm1 = model.torchify_batch(left)
        wx2, wl2, wm2 = model.torchify_batch(right)

        vecs1 = _encode_in_batches(wx1, wm1, wl1)
        vecs2 = _encode_in_batches(wx2, wm2, wl2)

        dot_product = vecs1 @ vecs2.t()
        # Outer product of the row norms gives the per-cell denominator.
        vecs1_norm = torch.norm(vecs1, dim=1, keepdim=True)
        vecs2_norm = torch.norm(vecs2, dim=1, keepdim=True)
        norm_product = vecs1_norm @ vecs2_norm.t()
    return dot_product / norm_product

def encode_text(s1):
    """Encode every sentence of ``s1`` with the module-level ``model`` in one batch."""
    with torch.no_grad():
        examples = [make_example(text, model) for text in s1]
        idxs, lengths, mask = model.torchify_batch(examples)
        return model.encode(idxs=idxs, mask=mask, lengths=lengths)


In [82]:
# NOTE(review): this cell repeats the model-loading statements of the previous
# cell verbatim; running it rebuilds `tok`, `model` and `sp` from the same files.
tok = TreebankWordTokenizer()

# torch.load unpickles the checkpoint; only load files from a trusted source.
model = torch.load('/home/kkatsy/par3/par3_align/similarity/sim/sim.pt')
state_dict = model['state_dict']
vocab_words = model['vocab_words']
args = model['args']
# NOTE(review): this comment used to read "turn off gpu", but nothing here
# modifies args.gpu; WordAveraging calls .cuda() whenever args.gpu >= 0.
model = WordAveraging(args, vocab_words)
model.load_state_dict(state_dict, strict=True)
sp = spm.SentencePieceProcessor()
sp.Load('/home/kkatsy/par3/par3_align/similarity/sim/sim.sp.30k.model')
# As the last expression of the cell, this also displays the module's repr.
model.eval()

WordAveraging(
  (cosine): CosineSimilarity()
  (embedding): Embedding(65733, 300)
)

In [83]:
def get_score(refs, cands, metric='sim'):
    """Return pairwise similarity scores for ``refs[k]`` vs ``cands[k]``.

    Delegates to ``find_similarity``. The ``metric`` argument is accepted but
    not used: the result is the same whatever value is passed.
    """
    return find_similarity(refs,cands)

In [84]:
import pickle

# Load the aligned translations and their source paragraphs from the working
# directory. NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('aligned_paragraph_dataset.pickle', 'rb') as fp:
  aligned_paragraph_dataset = pickle.load(fp)

with open('source_paragraph_dataset.pickle', 'rb') as fp:
  source_paragraph_dataset = pickle.load(fp)

In [85]:
import itertools
from statistics import mean
from operator import itemgetter

def get_best_alignments(par_list, source_par_list, top_k_percent, num_k, drop_top, metric, min_len, max_len, align_scale):
    """Score aligned paragraph sets by mean pairwise similarity and pick a top slice.

    A paragraph set is scored only if it passes every filter:
      * its shortest entry has length >= ``min_len`` and its longest <= ``max_len``;
      * its entries are not all identical;
      * the longest entry is at most ``align_scale`` times the source length and
        the source is at most ``align_scale`` times the shortest entry.
    The score is the mean of ``get_score`` over all unordered pairs in the set.

    Args:
        par_list: sequence of paragraph sets (each a sequence of texts).
        source_par_list: source paragraph for each index of ``par_list``.
        top_k_percent: fraction of the scored sets eligible for selection.
        num_k: upper bound on the number of selected sets.
        drop_top: fraction of the best-scoring sets to skip before selecting.
        metric: forwarded to ``get_score``.
        min_len, max_len: length bounds (``len()`` of each entry).
        align_scale: maximum allowed length ratio between entries and source.

    Returns:
        ``(i2score, top_k_scores, keep_index_list)``: all ``(index, score)``
        pairs sorted by descending score, the selected slice of that list, and
        the list of visited indices.
    """
    keep_index_list = []
    i2score = {}
    for i, par_set in enumerate(par_list):
        # NOTE(review): every index is recorded here, including ones that fail
        # the filters below. Kept as-is; confirm whether only kept indices
        # were intended.
        keep_index_list.append(i)

        if not par_set:
            # max()/min() would raise on an empty set; there is nothing to score.
            continue

        max_par_len = len(max(par_set, key=len))
        min_par_len = len(min(par_set, key=len))
        source_len = len(source_par_list[i])

        within_bounds = min_par_len >= min_len and max_par_len <= max_len
        has_variation = not all(x == par_set[0] for x in par_set)
        length_aligned = (max_par_len <= align_scale * source_len
                          and min_par_len * align_scale >= source_len)
        if not (within_bounds and has_variation and length_aligned):
            continue

        pairs = list(itertools.combinations(par_set, 2))
        refs = [first for first, _ in pairs]
        cands = [second for _, second in pairs]

        i2score[i] = mean(get_score(refs, cands, metric))

    # Sort once and reuse the ranking for both return values.
    ranked = sorted(i2score.items(), key=itemgetter(1), reverse=True)
    num_pars = len(ranked)
    start = int(num_pars * drop_top)
    # Take at most num_k sets, and never more than the top_k_percent share.
    take = min(int(top_k_percent * num_pars), num_k)
    top_k_scores = ranked[start:start + take]

    return ranked, top_k_scores, keep_index_list

In [86]:
# Filter and selection settings passed to get_best_alignments for training books.
# Lengths are compared against len() of each paragraph — presumably character
# counts, since the entries are strings (TODO confirm).
min_paragraph_len = 20             # shortest entry of a set must be at least this long
max_paragraph_len = 1000000000000  # effectively no upper bound on length
top_k_percent = 0.9                # fraction of scored sets eligible for selection
num_k = 50000                      # cap on the number of selected sets per book
drop_top = 0.02                    # fraction of best-scoring sets skipped before selecting
align_scale = 3                    # max length ratio between translations and source

In [87]:
# Holdout
# NotesFromUnderground - Katz, PV, Garnett, Hogarth
# PoorFolk - McDuff, Hogarth, Garnett
# TheIdiot - Garnett, McDuff, PV
# CrimeAndPunishment - Katz, McDuff, PV, Garnett
holdout_books = ['TheIdiot', 'NotesFromUnderground']
ignore_books = []
translator_to_pars = {}
translator_to_pars_holdout = {}

# Walk every book in alphabetical order; holdout books get the loose filter
# settings, all other non-ignored books get the configured ones.
for book in sorted(aligned_paragraph_dataset):
    book_alignments = aligned_paragraph_dataset[book]
    book_par_list = [list(book_alignments[p].values()) for p in range(len(book_alignments))]
    source_par_list = source_paragraph_dataset[book]
    is_holdout = book in holdout_books

    if is_holdout:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, 1.0, 5000, 0, 'sim', min_paragraph_len, max_paragraph_len, 100)
    elif book not in ignore_books:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, top_k_percent, num_k, drop_top, 'sim', min_paragraph_len, max_paragraph_len, align_scale)
    else:
        top_k = []

    # Selected paragraphs of this book all land in the same split.
    bucket = translator_to_pars_holdout if is_holdout else translator_to_pars

    for i, sim in top_k:
        par_source = source_paragraph_dataset[book][i]

        for translator, t in aligned_paragraph_dataset[book][i].items():
            # Undo escaped apostrophes in the translation text.
            t = t.replace('\\\'', '\'')

            if translator not in translator_to_pars:
                translator_to_pars[translator] = []
                translator_to_pars_holdout[translator] = []

            bucket[translator].append(
                {'source':par_source, 'translation': t, 'idx': i, 'book': book, 'sim': sim, 'translator': translator}
            )

In [88]:
# Balance the training classes: keep, for every translator, only the
# highest-similarity paragraphs, as many as the smallest class has. The minimum
# is computed over all translators instead of assuming a particular one
# ('Hogarth') is the smallest.
min_len = min(len(pars) for pars in translator_to_pars.values())
print(min_len)
for t in translator_to_pars.keys():
    keep = sorted(translator_to_pars[t], key=lambda d: d['sim'], reverse=True)[:min_len]
    translator_to_pars[t] = keep

3547


In [89]:
import random
from random import sample

# Balance the holdout classes by random down-sampling to the smallest class.
# A dedicated, seeded generator keeps the selection reproducible across runs
# without touching the global random state.
HOLDOUT_SAMPLE_SEED = 0
holdout_rng = random.Random(HOLDOUT_SAMPLE_SEED)

# Use the true minimum rather than assuming 'Hogarth' is the smallest class;
# sampling more items than a class holds would raise ValueError.
min_len_h = min(len(pars) for pars in translator_to_pars_holdout.values())
print(min_len_h)
for t in translator_to_pars_holdout.keys():
    keep = holdout_rng.sample(translator_to_pars_holdout[t], min_len_h)
    translator_to_pars_holdout[t] = keep

470


In [90]:
# Preview the five highest-similarity training records kept for translator 'PV'.
newlist = sorted(translator_to_pars['PV'], key=lambda d: d['sim'], reverse=True) 
newlist[0:5]

[{'source': 'Из лицея молодой человек в первые два года приезжал на вакацию. Во время поездки в Петербург Варвары Петровны и Степана Трофимовича он присутствовал иногда на литературных вечерах, бывавших у мамаши, слушал и наблюдал. Говорил мало и всё по-прежнему был тих и застенчив. К Степану Трофимовичу относился с прежним нежным вниманием, но уже как-то сдержаннее: о высоких предметах и о воспоминаниях прошлого видимо удалялся с ним заговаривать. Кончив курс, он, по желанию мамаши, поступил в военную службу и вскоре был зачислен в один из самых видных гвардейских кавалерийских полков. Показаться мамаше в мундире он не приехал и редко стал писать из Петербурга. Денег Варвара Петровна посылала ему не жалея, несмотря на то что после реформы доход с ее имений упал до того, что в первое время она и половины прежнего дохода не получала. У ней, впрочем, накоплен был долгою экономией некоторый, не совсем маленький капитал. Ее очень интересовали успехи сына в высшем петербургском обществе. Чт

In [92]:
# Report per-translator counts and the resulting train/holdout proportions.
# Totals are derived from the actual number of classes in each split rather
# than a hard-coded 5.
abs_total = 0
print('\nAll')
for k in translator_to_pars_holdout.keys():
    both = len(translator_to_pars_holdout[k]) + len(translator_to_pars[k])
    print(k, both)
    abs_total += both
print('Total', abs_total)

print('\nTrain')
for k in translator_to_pars.keys():
    print(k, len(translator_to_pars[k]))
# Size of the smallest class; the default mirrors the original sentinel.
min_class = min((len(pars) for pars in translator_to_pars.values()), default=100000000000)
train_total = len(translator_to_pars) * min_class

print('\nHoldout')
for k in translator_to_pars_holdout.keys():
    print(k, len(translator_to_pars_holdout[k]))
min_class_h = min((len(pars) for pars in translator_to_pars_holdout.values()), default=100000000000)
holdout_total = len(translator_to_pars_holdout) * min_class_h

print('Train total: ', train_total)
print('Val/Test total: ', holdout_total)
print()
print('train % = ', train_total/(holdout_total+train_total))
print('holdout % = ', holdout_total/(holdout_total+train_total))



All
PV 4017
Garnett 4017
Katz 4017
McDuff 4017
Hogarth 4017
Total 20085

Train
PV 3547
Garnett 3547
Katz 3547
McDuff 3547
Hogarth 3547

Holdout
PV 470
Garnett 470
Katz 470
McDuff 470
Hogarth 470
Train total:  17735
Val/Test total:  2350

train % =  0.8829972616380384
holdout % =  0.11700273836196166


In [93]:
from sklearn.preprocessing import LabelEncoder

# Fit an integer label encoder on the translator names, then print the codes
# assigned to the five translators as a sanity check.
le = LabelEncoder()
le.fit(list(translator_to_pars.keys()))
print(le.transform(["Garnett", "McDuff", "PV", "Katz", "Hogarth"]))

[0 3 4 2 1]


In [94]:
def _records_for(translator_map):
    """Flatten ``{translator: [paragraph dicts]}`` into a list of row dicts.

    Each row carries the encoded translator label (via the fitted ``le``) and
    the source and translation joined as ``'<src> <SEP> <tgt>'``.
    """
    records = []
    for tr, pars in translator_map.items():
        # One label per translator; computed once outside the inner loop.
        label = le.transform([tr])[0]
        for d in pars:
            src, tgt = d['source'], d['translation']
            concat = src + ' <SEP> ' + tgt
            records.append({'idx': d['idx'], 'book':d['book'], 'labels': label, 'concat': concat,  'translator': d['translator'], 'sim': d['sim'], 'src': src, 'tgt': tgt})
    return records


# The same flattening is applied to both splits.
data_list = _records_for(translator_to_pars)
data_list_holdout = _records_for(translator_to_pars_holdout)
        

In [95]:
import pandas as pd

# Turn the record lists into DataFrames (one row per translated paragraph).
df = pd.DataFrame(data_list)
df_holdout = pd.DataFrame(data_list_holdout)
# Holdout features: every column except 'labels', which is split separately.
df_holdout_X = df_holdout[['idx','book', 'concat', 'translator', 'sim', 'src', 'tgt']]

df.head()

Unnamed: 0,idx,book,labels,concat,translator,sim,src,tgt
0,91,Demons,4,Из лицея молодой человек в первые два года при...,PV,0.976574,Из лицея молодой человек в первые два года при...,For the first two years the young man came hom...
1,1263,Demons,4,"– Вещь короткая; даже, если хотите, по-настоящ...",PV,0.976535,"– Вещь короткая; даже, если хотите, по-настоящ...","""It's a short matter; in fact, if you like, it..."
2,1228,Demons,4,– Да кто? Кто велел вам сюда приходить? – допр...,PV,0.976279,– Да кто? Кто велел вам сюда приходить? – допр...,"""But, who? Who told you to come here?"" Varvara..."
3,289,Demons,4,Так называемое у нас имение Степана Трофимович...,PV,0.976099,Так называемое у нас имение Степана Трофимович...,"Stepan Trofimovich's estate, as we used to cal..."
4,528,Demons,4,"– Шатов? Это брат Дарьи Павловны… <SEP> ""Shato...",PV,0.97594,– Шатов? Это брат Дарьи Павловны…,"""Shatov? He is Darya Pavlovna's brother..."""


In [96]:
# Sanity check: (rows, columns) of the train and holdout frames.
print(df.shape)
print(df_holdout.shape)

(17735, 8)
(2350, 8)


In [97]:
from sklearn.model_selection import train_test_split

# Fixed seed so the val/test partition of the holdout set is reproducible.
SPLIT_SEED = 0

# Split the holdout set in half, stratified by label. train_test_split returns
# (first part, second part) for each input; here the first part is used as the
# test set and the second as the validation set.
test_texts, val_texts, test_labels, val_labels = train_test_split(
    df_holdout_X, df_holdout['labels'],
    stratify = df_holdout['labels'], shuffle=True, test_size=0.5,
    random_state=SPLIT_SEED
)

aligned_train_df = df
test_df = pd.concat([test_texts, test_labels], axis=1)
val_df = pd.concat([val_texts, val_labels], axis=1)
print('train size: ', aligned_train_df.shape)
print('val size: ', val_df.shape)
print('test size: ', test_df.shape)

train size:  (17735, 8)
val size:  (1175, 8)
test size:  (1175, 8)


In [98]:
# SAVE ALIGNED TRAIN
# NOTE(review): the output locations below are absolute, user-specific paths;
# they must exist on the machine running this cell.
aligned_train_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/aligned_train_df.pickle")  

# SAVE HOLDOUT VAL
val_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/experiment_val_df.pickle")  

# SAVE HOLDOUT TEST
test_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/experiment_test_df.pickle")  

## Generate Random Train

In [99]:
# Settings for the random-train variant: same minimum length as before, but the
# similarity-based selection is loosened (nothing dropped from the top, all
# scored sets eligible, and a much larger allowed length ratio).
min_paragraph_len = 20             # shortest entry of a set must be at least this long
max_paragraph_len = 1000000000000  # effectively no upper bound on length
top_k_percent = 1                  # every scored set is eligible
num_k = 5000                       # cap on the number of selected sets per book
drop_top = 0.00                    # do not skip any of the best-scoring sets
align_scale = 100                  # max length ratio between translations and source

In [100]:
holdout_books = ['TheIdiot', 'NotesFromUnderground']
ignore_books = []
translator_to_pars = {}
translator_to_pars_holdout = {}

# Rebuild both splits from scratch with the settings configured above,
# visiting every book in alphabetical order.
for book in sorted(aligned_paragraph_dataset):
    book_alignments = aligned_paragraph_dataset[book]
    book_par_list = [list(book_alignments[p].values()) for p in range(len(book_alignments))]
    source_par_list = source_paragraph_dataset[book]
    is_holdout = book in holdout_books

    if is_holdout:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, 1.0, 5000, 0, 'sim', min_paragraph_len, max_paragraph_len, align_scale)
    elif book not in ignore_books:
        i2score, top_k, keep_idx = get_best_alignments(book_par_list, source_par_list, top_k_percent, num_k, drop_top, 'sim', min_paragraph_len, max_paragraph_len, align_scale)
    else:
        top_k = []

    # Selected paragraphs of this book all land in the same split.
    bucket = translator_to_pars_holdout if is_holdout else translator_to_pars

    for i, sim in top_k:
        par_source = source_paragraph_dataset[book][i]

        for translator, t in aligned_paragraph_dataset[book][i].items():
            # Undo escaped apostrophes in the translation text.
            t = t.replace('\\\'', '\'')

            if translator not in translator_to_pars:
                translator_to_pars[translator] = []
                translator_to_pars_holdout[translator] = []

            bucket[translator].append(
                {'source':par_source, 'translation': t, 'idx': i, 'book': book, 'sim': sim, 'translator': translator}
            )

In [101]:
# Total number of records in each split before any down-sampling, summed over
# the translators present in the training dictionary.
train_entire_dataset = sum(len(translator_to_pars[name]) for name in translator_to_pars)
holdout_entire_dataset = sum(len(translator_to_pars_holdout[name]) for name in translator_to_pars)

In [102]:
import random

# Seeded generator so the random baseline can be regenerated exactly.
RANDOM_TRAIN_SEED = 0
sampling_rng = random.Random(RANDOM_TRAIN_SEED)

# Draw, per translator, as many random paragraphs as the aligned training set
# holds per class. The class count comes from the data instead of a literal 5.
# sample() raises ValueError if a class holds fewer than min_len items.
min_len = aligned_train_df.shape[0] // len(translator_to_pars)
print(min_len)
for t in translator_to_pars.keys():
    keep = sampling_rng.sample(translator_to_pars[t], min_len)
    translator_to_pars[t] = keep

# Down-sample the holdout classes to the smallest one, computed over all
# translators rather than assuming 'Hogarth' is the smallest.
min_len_h = min(len(pars) for pars in translator_to_pars_holdout.values())
print(min_len_h)
for t in translator_to_pars_holdout.keys():
    keep = sampling_rng.sample(translator_to_pars_holdout[t], min_len_h)
    translator_to_pars_holdout[t] = keep

3547
470


In [103]:
# Report per-translator counts, the train/holdout proportions, and how much of
# the unsampled data each split retains. Totals use the actual number of
# classes in each split rather than a hard-coded 5.
abs_total = 0
print('\nTrain + Holdout')
for k in translator_to_pars_holdout.keys():
    both = len(translator_to_pars_holdout[k]) + len(translator_to_pars[k])
    print(k, both)
    abs_total += both
print('Total', abs_total)

print('\nTrain')
for k in translator_to_pars.keys():
    print(k, len(translator_to_pars[k]))
# Size of the smallest class; the default mirrors the original sentinel.
min_class = min((len(pars) for pars in translator_to_pars.values()), default=100000000000)
train_total = len(translator_to_pars) * min_class

print('\nHoldout')
for k in translator_to_pars_holdout.keys():
    print(k, len(translator_to_pars_holdout[k]))
min_class_h = min((len(pars) for pars in translator_to_pars_holdout.values()), default=100000000000)
holdout_total = len(translator_to_pars_holdout) * min_class_h

print('Train total: ', train_total)
print('Val/Test total: ', holdout_total)
print()
print('train % = ', train_total/(holdout_total+train_total))
print('holdout % = ', holdout_total/(holdout_total+train_total))
print()
print('entire dataset % = ', (holdout_total+train_total)/(train_entire_dataset + holdout_entire_dataset))
print('entire train % = ', (train_total)/(train_entire_dataset))
print('holdout train % = ', (holdout_total)/(holdout_entire_dataset))


Train + Holdout
PV 4017
Garnett 4017
Katz 4017
McDuff 4017
Hogarth 4017
Total 20085

Train
PV 3547
Garnett 3547
Katz 3547
McDuff 3547
Hogarth 3547

Holdout
PV 470
Garnett 470
Katz 470
McDuff 470
Hogarth 470
Train total:  17735
Val/Test total:  2350

train % =  0.8829972616380384
holdout % =  0.11700273836196166

entire dataset % =  0.2643945975831293
entire train % =  0.28937149197232737
holdout train % =  0.160103556342826


In [104]:
# Flatten the sampled training paragraphs into one row dict per translation.
# The unused running counter from the original loop has been removed.
data_list = []
for tr, pars in translator_to_pars.items():
    # One encoded label per translator, shared by all of its rows.
    label = le.transform([tr])[0]
    for d in pars:
        src, tgt = d['source'], d['translation']
        concat = src + ' <SEP> ' + tgt
        data_list.append({'idx': d['idx'], 'book':d['book'], 'labels': label, 'concat': concat,  'translator': d['translator'], 'sim': d['sim'], 'src': src, 'tgt': tgt})

In [105]:
# Build the random-train DataFrame. Note that `df` is rebound here, so it no
# longer refers to the aligned training frame created earlier.
df = pd.DataFrame(data_list)
random_train_df = df
random_train_df.shape

(17735, 8)

In [106]:
# SAVE RANDOM TRAIN (absolute, user-specific output path)
random_train_df.to_pickle("/home/kkatsy/litMT/experiment_dataset/random_train_df.pickle")  