## Imports

In [1]:
import importlib, re, os, math
import pandas as pd
from datetime import datetime

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/06_Documents/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

In [3]:
def time(start, stop, message):
    print(f'---- TIME {datetime.now()}: Computation time {message}: {stop-start}')

### Load documents

In [4]:
def delete_duplicate_beginnings(data):
    tokenized_data = [re.findall(r"\w+|[^\w\s]", sen.lower(), re.UNICODE) for sen in data]
    tokenized_data = [tokens[1:] if (len(tokens) > 1) and (tokens[0] == tokens[1]) else tokens for tokens in tokenized_data]
    return [' '.join(tokens) for tokens in tokenized_data] 

In [5]:
def get_queries(dataset):
    start = datetime.now()
    german_ids, queries = list(), list()
    with open(f'{paths.data_path}wikiclir_v1/{dataset}.queries') as f:
        lines = f.readlines()
        for line in lines:
            split = line.split('\t')
            german_ids.append(split[0])
            queries.append(split[1])
    time(start, datetime.now(), f'loading queries on {dataset} data')
    return pd.DataFrame({'german_id': german_ids, 'query': delete_duplicate_beginnings(queries)})


In [6]:
def get_docs(dataset):
    start = datetime.now()
    english_ids, documents = list(), list()
    with open(f'{paths.data_path}wikiclir_v1/{dataset}.docs') as f:
        lines = f.readlines()
        for line in lines:
            split = line.split('\t')
            english_ids.append(split[0])
            documents.append(split[1])
    time(start, datetime.now(), f'loading documents on {dataset} data')
    return pd.DataFrame({'english_id': english_ids, 'document': delete_duplicate_beginnings(documents)})


In [7]:
def get_qrels(dataset):
    start = datetime.now()
    rel_german_ids, rel_english_ids, relevance_levels = list(), list(), list()
    with open(f'{paths.data_path}wikiclir_v1/{dataset}.qrel') as f:
        lines = f.readlines()
        for line in lines:
            split = line.split('\t')
            rel_german_ids.append(split[0])
            rel_english_ids.append(split[2])
            relevance_levels.append(split[3][0])
    time(start, datetime.now(), f'loading qrels on {dataset} data')
    return pd.DataFrame({'german_id': rel_german_ids, 'english_id': rel_english_ids, 'relevance_level': relevance_levels})


In [8]:
queries_train = get_queries('train')
docs_train = get_docs('train')
qrel_train = get_qrels('train')

---- TIME 2020-05-24 07:40:33.058566: Computation time loading queries on train data: 0:00:00.343730
---- TIME 2020-05-24 07:40:45.398901: Computation time loading documents on train data: 0:00:08.699685
---- TIME 2020-05-24 08:05:36.902874: Computation time loading qrels on train data: 0:00:02.960174


In [9]:
queries_dev = get_queries('dev')
docs_dev = get_docs('dev')
qrel_dev = get_qrels('dev')

---- TIME 2020-05-24 08:05:38.645653: Computation time loading queries on dev data: 0:00:00.030387
---- TIME 2020-05-24 08:05:40.081606: Computation time loading documents on dev data: 0:00:01.186323
---- TIME 2020-05-24 08:06:01.036076: Computation time loading qrels on dev data: 0:00:00.082432


In [10]:
queries_test = get_queries('test')
docs_test = get_docs('test')
qrel_test = get_qrels('test')

---- TIME 2020-05-24 08:06:01.100489: Computation time loading queries on test data: 0:00:00.016515
---- TIME 2020-05-24 08:06:01.981791: Computation time loading documents on test data: 0:00:00.741527
---- TIME 2020-05-24 08:06:24.704552: Computation time loading qrels on test data: 0:00:00.083691


In [32]:
train_data = pd.merge(left=queries_train, right=qrel_train, on='german_id').merge(docs_train, on='english_id')
dev_data = pd.merge(left=queries_dev, right=qrel_dev, on='german_id').merge(docs_dev, on='english_id')
test_data = pd.merge(left=queries_test, right=qrel_test, on='german_id').merge(docs_test, on='english_id')

In [33]:
for dataset in [train_data, dev_data, test_data]:
    dataset['relevance_level'].replace('3', 1, inplace=True)
    dataset['relevance_level'].replace('2', 0, inplace=True)

In [34]:
print('Number of queries in training data: {}'.format(len(train_data[train_data['relevance_level'] == 1])))
print('Number of queries in dev data: {}'.format(len(dev_data[dev_data['relevance_level'] == 1])))
print('Number of queries in test data: {}'.format(len(test_data[test_data['relevance_level'] == 1])))

Number of queries in training data: 225294
Number of queries in dev data: 10000
Number of queries in test data: 10000


In [36]:
train_data[train_data['relevance_level'] == 1]['query'].iloc[1242]

'das ( auch vektorprodukt , vektorielles produkt oder äußeres produkt genannt ) ist eine verknüpfung im euklidischen vektorraum , die im dreidimensionalen fall zwei vektoren wieder einen vektor zuordnet .'

In [38]:
train_data[train_data['relevance_level'] == 1]['document'].iloc[1242]

'cross product in mathematics the cross product or vector product is a binary operation on two vectors in three dimensional space it results in a vector which is perpendicular to both and therefore normal to the plane containing them it has many applications in mathematics physics and engineering if the vectors have the same direction or one has zero length then their cross product is zero more generally the magnitude of the product equals the area of a parallelogram with the vectors for sides in particular for perpendicular vectors this is a rectangle and the magnitude of the product is the product of their lengths the cross product is anticommutative distributive over addition and satisfies the jacobi identity the space and product form an algebra over a field which is neither commutative nor associative but is a lie algebra with the cross product being the lie bracket like the dot product it depends on the metric of euclidean space but unlike the dot product it also depends on the c

In [40]:
train_data.drop_duplicates(['german_id', 'relevance_level'], inplace=True, ignore_index=True)
test_data = test_data[test_data['relevance_level'] == 1]

In [45]:
train_translations = list(train_data['relevance_level'])
dev_translations = list(dev_data['relevance_level'])
test_translations = list(test_data['relevance_level'])

In [46]:
train_queries = list(train_data['query'])
train_documents = list(train_data['document'])
test_queries = list(test_data['query'])
test_documents = list(test_data['document'])

In [60]:
print('Number of queries in train dataset: {}'.format(len(train_queries)))
print('Number of documents in train dataset: {}'.format(len(train_documents)))
print('-' * 50)
print('Number of queries in test dataset: {}'.format(len(test_queries)))
print('Number of documents in test dataset: {}'.format(len(test_documents)))

Number of queries in train dataset: 433966
Number of documents in train dataset: 433966
--------------------------------------------------
Number of queries in test dataset: 10000
Number of documents in test dataset: 10000


In [61]:
train_data['relevance_level'].value_counts()

1    225294
0    208672
Name: relevance_level, dtype: int64

### Load word embeddings

In [62]:
german = embeddings.WordEmbeddings('de')
german.load_embeddings()

english = embeddings.WordEmbeddings('en')
english.load_embeddings()

In [63]:
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en


### Load sentence embeddings for training data

In [91]:
start = datetime.now()
train_sens = sentences.Sentences(src_words=german, trg_words=english)
prepared_features = list(text_based.PREPARED_FEATURES.keys())
train_data = train_sens.load_data(src_sentences=train_queries, trg_sentences=train_documents, single_source=False, features=prepared_features, agg_method='average', documents=True)
time(start, datetime.now(), 'loading sentences object')

---- DONE: Target sentences loaded
---- DONE: Source sentences loaded
---- DONE: Sentences preprocessed
---- INFO: Sentences embeddings extracted in de
---- INFO: Sentences embeddings extracted in en
---- INFO: Extracted word embeddings of found words
---- DONE: Sentences transformed
---- INFO: Embedding space of source language transformed according to projection matrix
---- DONE: Source words extracted
---- DONE: Target words extracted
---- INFO: Embeddings of found words added as a column
---- INFO: Start preparation of text-based feature translated_words
---- INFO: Start preparation of text-based feature num_words
---- INFO: Start preparation of text-based feature num_punctuation
---- INFO: Start preparation of text-based feature occ_question_mark
---- INFO: Start preparation of text-based feature occ_exclamation_mark
---- DONE: All features prepared
---- INFO: Dropped duplicates
---- INFO: Delete sentences containing only a '.'
---- DONE: Data loaded. Length of dataset after prepr

In [92]:
train_data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_words_found_embedding,trg_words_found_embedding,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark
0,"[-0.14160268235294118, 0.22068174705882354, -0...","[-0.12276546643300858, 0.047733963953612756, -...","[-0.08721950426540277, -0.07013083881516587, -...",steht als pseudonym für einen fiktiven regisse...,alan smithee alan smithee also allen smithee w...,"[steht, pseudonym, fiktiven, regisseur, ,, fil...","[alan, smithee, alan, smithee, also, allen, sm...","[steht, pseudonym, fiktiven, regisseur, filme,...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.06589690229429432, 0.11700426091010997, -...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...","[stands, alias, pseudonym, films, movies, name...","[alan, alan, ausserdem, ebenso, außerdem, eben...",14,236,3,5,False,False,False,False
1,"[-0.09725738031250002, 0.10076061875, -0.10252...","[-0.13914918744447388, 0.016411580786968003, -...","[-0.08721950426540277, -0.07013083881516587, -...","das ( el - latn ‚ tarn - ' , ‚ deckname ' ) is...",alan smithee alan smithee also allen smithee w...,"[(, el, -, latn, ‚, tarn, -, ', ,, ‚, deckname...","[alan, smithee, alan, smithee, also, allen, sm...","[el, latn, tarn, deckname, fingierter, name, m...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...","[name, authors, creator, copyright, identities...","[alan, alan, ausserdem, ebenso, außerdem, eben...",21,236,15,5,False,False,False,False
2,"[-0.024419767999999998, 0.2503159047619048, -0...","[-0.07746652968120009, -0.0106397692086347, -0...","[-0.08721950426540277, -0.07013083881516587, -...","der ( engl . film editing ) , oft auch synonym...",alan smithee alan smithee also allen smithee w...,"[(, engl, ., film, editing, ), ,, oft, synonym...","[alan, smithee, alan, smithee, also, allen, sm...","[engl, film, editing, oft, synonym, montage, b...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...","[engl, movie, film, frequently, oft, often, fr...","[alan, alan, ausserdem, ebenso, außerdem, eben...",15,236,7,5,False,False,False,False
3,"[-0.3159817777777778, 0.2578707777777778, -0.4...","[-0.12345786112681063, -0.1265453990910237, -0...","[-0.08721950426540277, -0.07013083881516587, -...",der originaltitel : dune ist ein us - amerikan...,alan smithee alan smithee also allen smithee w...,"[originaltitel, :, dune, us, -, amerikanischer...","[alan, smithee, alan, smithee, also, allen, sm...","[originaltitel, dune, us, amerikanischer, scie...","[alan, smithee, alan, smithee, also, allen, sm...","[[0.0011561304204515677, -0.12259614130011787,...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...","[sci, fiction, year, david, roman, novel, fran...","[alan, alan, ausserdem, ebenso, außerdem, eben...",15,236,5,5,False,False,False,False
4,"[-0.25748315624999996, 0.16391171875000002, -0...","[-0.11601057613976422, -0.13505681219389926, -...","[-0.08721950426540277, -0.07013083881516587, -...",mit ( originaltitel : meet ) ist ein us - amer...,alan smithee alan smithee also allen smithee w...,"[(, originaltitel, :, meet, ), us, -, amerikan...","[alan, smithee, alan, smithee, also, allen, sm...","[originaltitel, meet, us, amerikanischer, spie...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...","[drama, movie, film, martin, year, death, tod,...","[alan, alan, ausserdem, ebenso, außerdem, eben...",23,236,11,5,False,False,False,False


#### Feature extraction

In [93]:
features_dict = {'text_based': list(text_based.FEATURES.keys()), 
                 'vector_based': list(vector_based.FEATURES.keys())}

In [94]:
start = datetime.now()
train_data = train_sens.extract_features(features_dict=features_dict, data='all', drop_prepared=True)
time(start, datetime.now(), 'extracting features on training data')

---- INFO: No vector elements as features specified
---- INFO: Started extraction of text-based feature norm_diff_translated_words
---- INFO: Started extraction of text-based feature abs_diff_num_words
---- INFO: Started extraction of text-based feature abs_diff_num_punctuation
---- INFO: Started extraction of text-based feature abs_diff_occ_question_mark
---- INFO: Started extraction of text-based feature abs_diff_occ_exclamation_mark
---- INFO: Started extraction of text-based feature rel_diff_num_words
---- INFO: Started extraction of text-based feature rel_diff_num_punctuation
---- INFO: Started extraction of text-based feature norm_diff_num_words
---- INFO: Started extraction of text-based feature norm_diff_num_punctuation
---- INFO: Started extraction of vector-based feature euclidean_distance
---- INFO: Started extraction of vector-based feature cosine_similarity
---- TIME 2020-05-24 12:06:25.746468: Computation time extracting features on training data: 0:09:25.602591


In [95]:
train_data_raw = pd.merge(left=queries_train, right=qrel_train, on='german_id').merge(docs_train, on='english_id')

In [96]:
train_data_raw['relevance_level'].replace('3', 1, inplace=True)
train_data_raw['relevance_level'].replace('2', 0, inplace=True)

In [97]:
train_data = pd.merge(left=train_data, right=train_data_raw[['query', 'document', 'relevance_level']], left_on=['src_sentence', 'trg_sentence'], right_on=['query', 'document'])



In [98]:
train_data.drop(['query', 'document'], inplace=True, axis=1)

In [99]:
train_data.rename(columns={'relevance_level': 'translation'}, inplace=True)

In [102]:
train_data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_words_found_embedding,trg_words_found_embedding,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,rel_diff_num_words,rel_diff_num_punctuation,norm_diff_num_words,norm_diff_num_punctuation,euclidean_distance,cosine_similarity,translation
0,"[-0.14160268235294118, 0.22068174705882354, -0...","[-0.12276546643300858, 0.047733963953612756, -...","[-0.08721950426540277, -0.07013083881516587, -...",steht als pseudonym für einen fiktiven regisse...,alan smithee alan smithee also allen smithee w...,"[steht, pseudonym, fiktiven, regisseur, ,, fil...","[alan, smithee, alan, smithee, also, allen, sm...","[steht, pseudonym, fiktiven, regisseur, filme,...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.06589690229429432, 0.11700426091010997, -...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...",0.083293,222,2,1,1,-222,-2,-1.776,-0.5,0.9713,0.873807,1
1,"[-0.09725738031250002, 0.10076061875, -0.10252...","[-0.13914918744447388, 0.016411580786968003, -...","[-0.08721950426540277, -0.07013083881516587, -...","das ( el - latn ‚ tarn - ' , ‚ deckname ' ) is...",alan smithee alan smithee also allen smithee w...,"[(, el, -, latn, ‚, tarn, -, ', ,, ‚, deckname...","[alan, smithee, alan, smithee, also, allen, sm...","[el, latn, tarn, deckname, fingierter, name, m...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...",0.088502,215,10,1,1,-215,10,-1.679688,1.0,1.279038,0.752012,0
2,"[-0.024419767999999998, 0.2503159047619048, -0...","[-0.07746652968120009, -0.0106397692086347, -0...","[-0.08721950426540277, -0.07013083881516587, -...","der ( engl . film editing ) , oft auch synonym...",alan smithee alan smithee also allen smithee w...,"[(, engl, ., film, editing, ), ,, oft, synonym...","[alan, smithee, alan, smithee, also, allen, sm...","[engl, film, editing, oft, synonym, montage, b...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...",0.077798,221,2,1,1,-221,2,-1.768,0.333333,1.157203,0.810826,0
3,"[-0.3159817777777778, 0.2578707777777778, -0.4...","[-0.12345786112681063, -0.1265453990910237, -0...","[-0.08721950426540277, -0.07013083881516587, -...",der originaltitel : dune ist ein us - amerikan...,alan smithee alan smithee also allen smithee w...,"[originaltitel, :, dune, us, -, amerikanischer...","[alan, smithee, alan, smithee, also, allen, sm...","[originaltitel, dune, us, amerikanischer, scie...","[alan, smithee, alan, smithee, also, allen, sm...","[[0.0011561304204515677, -0.12259614130011787,...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...",0.0,221,0,1,1,-221,0,-1.768,0.0,1.713225,0.746799,0
4,"[-0.25748315624999996, 0.16391171875000002, -0...","[-0.11601057613976422, -0.13505681219389926, -...","[-0.08721950426540277, -0.07013083881516587, -...",mit ( originaltitel : meet ) ist ein us - amer...,alan smithee alan smithee also allen smithee w...,"[(, originaltitel, :, meet, ), us, -, amerikan...","[alan, smithee, alan, smithee, also, allen, sm...","[originaltitel, meet, us, amerikanischer, spie...","[alan, smithee, alan, smithee, also, allen, sm...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[0.22971, -0.28394, 0.070427, -0.074201, -0.0...",0.051824,213,6,1,1,-213,6,-1.651163,0.75,1.271268,0.803858,0


### Load sentence embeddings for test collection

In [104]:
start = datetime.now()
test_sens = sentences.Sentences(src_words=german, trg_words=english)
prepared_features = list(text_based.PREPARED_FEATURES.keys())
data = test_sens.load_data(src_sentences=test_queries, trg_sentences=test_documents, single_source=False, features=prepared_features, agg_method='average', documents=True)
time(start, datetime.now(), 'loading sentences object')

---- DONE: Target sentences loaded
---- DONE: Source sentences loaded
---- DONE: Sentences preprocessed
---- INFO: Sentences embeddings extracted in de
---- ERROR: Sentence embedding failed in en on ID 8835: hospet
---- INFO: Sentences embeddings extracted in en
---- INFO: Extracted word embeddings of found words
---- DONE: Sentences transformed
---- INFO: Embedding space of source language transformed according to projection matrix
---- DONE: Source words extracted
---- DONE: Target words extracted
---- INFO: Embeddings of found words added as a column
---- INFO: Start preparation of text-based feature translated_words
---- INFO: Start preparation of text-based feature num_words
---- INFO: Start preparation of text-based feature num_punctuation
---- INFO: Start preparation of text-based feature occ_question_mark
---- INFO: Start preparation of text-based feature occ_exclamation_mark
---- DONE: All features prepared
---- INFO: Dropped duplicates
---- INFO: Delete sentences containing o

In [105]:
data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_words_found_embedding,trg_words_found_embedding,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark
0,"[-0.2362156923076923, 0.08864512307692306, -0....","[-0.1801198842327313, -0.139909188680879, -0.0...","[-0.16333919276836156, -0.19536563276836164, -...",die afroasiatischen ( früher auch als hamito -...,afroasiatic languages afroasiatic afro asiatic...,"[afroasiatischen, (, früher, hamito, -, semiti...","[afroasiatic, languages, afroasiatic, afro, as...","[afroasiatischen, früher, hamito, semitisch, s...","[afroasiatic, languages, afroasiatic, afro, as...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[-0.043671, -0.29505, -0.20986, 0.18004, 0.02...","[previously, early, earlier, formerly, designa...","[programmiersprachen, fremdsprachen, sprachen,...",13,215,6,3,False,False,False,False
1,"[-0.18111494444444443, 0.1288052222222222, -0....","[-0.1606634734184596, -0.16624261872403223, 0....","[-0.09051589493670884, -0.10883404240506324, -...","auch dima , eigenbezeichnung dim - aaf , dim -...",dime language dime or dima is an afro asiatic ...,"[dima, ,, eigenbezeichnung, dim, -, aaf, ,, di...","[dime, language, dime, dima, afro, asiatic, la...","[dima, eigenbezeichnung, dim, aaf, dim, ko, aa...","[dime, language, dime, dima, afro, asiatic, la...","[[-0.17940864170372575, 0.0737311403301378, -0...","[[0.011212, 0.17819, 0.1576, -0.15137, 0.24949...","[southwestern, southwest, region, river, spoke...","[amtssprache, sprachen, sprache, amtssprache, ...",16,184,8,14,False,False,False,False
2,"[-0.13061995714285712, 0.08085614285714286, -0...","[-0.12835306032659194, -0.09953887462795796, -...","[-0.05739868661691542, -0.11036642383084572, -...","( von ‚ begnadigung , straferlass , amnestie '...",amnesty international amnesty international co...,"[(, ‚, begnadigung, ,, straferlass, ,, amnesti...","[amnesty, international, amnesty, internationa...","[begnadigung, straferlass, amnestie, nichtstaa...","[amnesty, international, amnesty, internationa...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[-0.49984, -0.18512, 0.20571, -0.148, -0.2802...","[non, organization, organisation, organizing, ...","[international, internationalen, international...",12,220,11,4,False,False,False,False
3,"[-0.10104247894736841, 0.07025387368421053, -0...","[-0.04052728631907548, 0.026286764620934443, -...","[0.026596005943396228, 0.007910488207547186, -...",die ( bíos ‚ leben ' ; auch als synonym zu bio...,biotechnology is the use of living systems and...,"[(, bíos, ‚, leben, ', ;, synonym, biotechnik,...","[biotechnology, use, living, systems, organism...","[bíos, leben, synonym, biotechnik, kurz, biote...","[biotechnology, use, living, systems, organism...","[[-0.15780262235327164, -0.1363245252964868, 0...","[[0.17028, 0.1322, 0.21386, 0.28217, -0.50662,...","[living, life, lifestyle, live, leben, synonym...","[verwendung, benutzen, nutzung, verwenden, geb...",17,220,7,2,False,False,False,False
4,"[-0.0073867083333333345, 0.09049094583333334, ...","[-0.060032190728019505, -0.05749486353309292, ...","[-0.03945234717948716, -0.002273775692307691, ...",wortkreuzung aus engl . pharmaceutical enginee...,pharming ( genetics ) for pharming in internet...,"[wortkreuzung, engl, ., pharmaceutical, engine...","[pharming, (, genetics, ), pharming, internet,...","[wortkreuzung, engl, pharmaceutical, engineeri...","[pharming, genetics, pharming, internet, see, ...","[[0.5040755879349637, -0.08830535297655097, -0...","[[-0.23309, -0.15296, 0.18574, -0.052825, -0.0...","[engl, development, evolution, farming, agricu...","[internetseiten, internetzugang, internet, seh...",19,225,10,5,False,False,False,False


In [106]:
start = datetime.now()
test_collection = test_sens.create_test_collection(n_queries=1000, n_docs=10000)
time(start, datetime.now(), 'creating the test collection')

---- INFO: Preliminary queries dataframe created
---- INFO: Preliminary documentes dataframe created
---- INFO: Merged queries and documents dataframe
---- INFO: Merged with test dataframe
---- INFO: Added translation indicator
---- DONE: Test collection created
---- TIME 2020-05-24 12:22:53.851715: Computation time creating the test collection: 0:01:23.949866


#### Feature extraction

In [107]:
start = datetime.now()
test_collection = test_sens.extract_features(features_dict=features_dict, data='test', drop_prepared=True)
time(start, datetime.now(), 'extracting features on test collection')

---- INFO: No vector elements as features specified
---- INFO: Started extraction of text-based feature norm_diff_translated_words
---- INFO: Started extraction of text-based feature abs_diff_num_words
---- INFO: Started extraction of text-based feature abs_diff_num_punctuation
---- INFO: Started extraction of text-based feature abs_diff_occ_question_mark
---- INFO: Started extraction of text-based feature abs_diff_occ_exclamation_mark
---- INFO: Started extraction of text-based feature rel_diff_num_words
---- INFO: Started extraction of text-based feature rel_diff_num_punctuation
---- INFO: Started extraction of text-based feature norm_diff_num_words
---- INFO: Started extraction of text-based feature norm_diff_num_punctuation
---- INFO: Started extraction of vector-based feature euclidean_distance
---- INFO: Started extraction of vector-based feature cosine_similarity
---- TIME 2020-05-24 16:00:20.557666: Computation time extracting features on test collection: 3:36:09.428113


In [108]:
test_collection.head()

Unnamed: 0,src_sentence,src_preprocessed,src_embedding,src_embedding_aligned,src_words,src_words_found_embedding,trg_sentence,trg_preprocessed,trg_embedding,trg_words,trg_words_found_embedding,translation,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,rel_diff_num_words,rel_diff_num_punctuation,norm_diff_num_words,norm_diff_num_punctuation,euclidean_distance,cosine_similarity
0,die afroasiatischen ( früher auch als hamito -...,"[afroasiatischen, (, früher, hamito, -, semiti...","[-0.2362156923076923, 0.08864512307692306, -0....","[-0.1801198842327313, -0.139909188680879, -0.0...","[afroasiatischen, früher, hamito, semitisch, s...","[[-0.15780262235327164, -0.1363245252964868, 0...",afroasiatic languages afroasiatic afro asiatic...,"[afroasiatic, languages, afroasiatic, afro, as...","[-0.16333919276836156, -0.19536563276836164, -...","[afroasiatic, languages, afroasiatic, afro, as...","[[-0.043671, -0.29505, -0.20986, 0.18004, 0.02...",1,0.046726,202,3,1,1,-202,3,-1.77193,0.75,1.340791,0.792458
1,die afroasiatischen ( früher auch als hamito -...,"[afroasiatischen, (, früher, hamito, -, semiti...","[-0.2362156923076923, 0.08864512307692306, -0....","[-0.1801198842327313, -0.139909188680879, -0.0...","[afroasiatischen, früher, hamito, semitisch, s...","[[-0.15780262235327164, -0.1363245252964868, 0...",dime language dime or dima is an afro asiatic ...,"[dime, language, dime, dima, afro, asiatic, la...","[-0.09051589493670884, -0.10883404240506324, -...","[dime, language, dime, dima, afro, asiatic, la...","[[0.011212, 0.17819, 0.1576, -0.15137, 0.24949...",0,0.042699,171,8,1,1,-171,-8,-1.744898,-0.8,1.288895,0.791433
2,die afroasiatischen ( früher auch als hamito -...,"[afroasiatischen, (, früher, hamito, -, semiti...","[-0.2362156923076923, 0.08864512307692306, -0....","[-0.1801198842327313, -0.139909188680879, -0.0...","[afroasiatischen, früher, hamito, semitisch, s...","[[-0.15780262235327164, -0.1363245252964868, 0...",amnesty international amnesty international co...,"[amnesty, international, amnesty, internationa...","[-0.05739868661691542, -0.11036642383084572, -...","[amnesty, international, amnesty, internationa...","[[-0.49984, -0.18512, 0.20571, -0.148, -0.2802...",0,0.0,207,2,1,1,-207,2,-1.784483,0.4,1.640474,0.651392
3,die afroasiatischen ( früher auch als hamito -...,"[afroasiatischen, (, früher, hamito, -, semiti...","[-0.2362156923076923, 0.08864512307692306, -0....","[-0.1801198842327313, -0.139909188680879, -0.0...","[afroasiatischen, früher, hamito, semitisch, s...","[[-0.15780262235327164, -0.1363245252964868, 0...",biotechnology is the use of living systems and...,"[biotechnology, use, living, systems, organism...","[0.026596005943396228, 0.007910488207547186, -...","[biotechnology, use, living, systems, organism...","[[0.17028, 0.1322, 0.21386, 0.28217, -0.50662,...",0,0.041606,207,4,1,1,-207,4,-1.784483,1.0,1.787893,0.638994
4,die afroasiatischen ( früher auch als hamito -...,"[afroasiatischen, (, früher, hamito, -, semiti...","[-0.2362156923076923, 0.08864512307692306, -0....","[-0.1801198842327313, -0.139909188680879, -0.0...","[afroasiatischen, früher, hamito, semitisch, s...","[[-0.15780262235327164, -0.1363245252964868, 0...",pharming ( genetics ) for pharming in internet...,"[pharming, (, genetics, ), pharming, internet,...","[-0.03945234717948716, -0.002273775692307691, ...","[pharming, genetics, pharming, internet, see, ...","[[-0.23309, -0.15296, 0.18574, -0.052825, -0.0...",0,0.04151,212,1,1,1,-212,1,-1.781513,0.2,1.700643,0.652745


## Drop irrelevant columns to enable saving the files

In [116]:
drop_columns = [f'{prefix}_{feature}' for prefix in ['src', 'trg'] for feature in ['preprocessed', 'words', 'words_found_embedding']]

In [117]:
train_data.drop(drop_columns, inplace=True, axis=1)
test_collection.drop(drop_columns, inplace=True, axis=1)

## Save data

In [120]:
# Drop sentence, word embeddings, words found columns

In [131]:
path = f'{paths.data_path}extracted_data/global/documents_v0.2'
if not os.path.exists(path):
    os.makedirs(f'{path}/train')
    os.makedirs(f'{path}/test')

In [132]:
train_chunks = [train_data[i:(i+math.ceil(len(train_data)/13))] 
          for i in range(0,len(train_data),math.ceil(len(train_data)/13))]

In [133]:
start = datetime.now()
for idx, data in enumerate(train_chunks):
    data.to_pickle(f'{path}/train/training_data_{idx:02d}_avg.pkl')
    print(f'---- INFO: Chunk {idx:02d} saved')
time(start, datetime.now(), 'saving training data')

---- INFO: Chunk 00 saved
---- INFO: Chunk 01 saved
---- INFO: Chunk 02 saved
---- INFO: Chunk 03 saved
---- INFO: Chunk 04 saved
---- INFO: Chunk 05 saved
---- INFO: Chunk 06 saved
---- INFO: Chunk 07 saved
---- INFO: Chunk 08 saved
---- INFO: Chunk 09 saved
---- INFO: Chunk 10 saved
---- INFO: Chunk 11 saved
---- INFO: Chunk 12 saved
---- TIME 2020-05-24 16:09:28.952292: Computation time saving training data: 0:01:17.033286


In [136]:
test_chunks = [test_collection[i:(i+math.ceil(len(test_collection)/40))] 
          for i in range(0,len(test_collection),math.ceil(len(test_collection)/40))]

In [138]:
start = datetime.now()
for idx, data in enumerate(test_chunks):
    data.to_pickle(f'{path}/test/test_collection_{idx:02d}_avg.pkl')
    print(f'---- INFO: Chunk {idx:02d} saved')
time(start, datetime.now(), 'saving test collection')

---- INFO: Chunk 00 saved
---- INFO: Chunk 01 saved
---- INFO: Chunk 02 saved
---- INFO: Chunk 03 saved
---- INFO: Chunk 04 saved
---- INFO: Chunk 05 saved
---- INFO: Chunk 06 saved
---- INFO: Chunk 07 saved
---- INFO: Chunk 08 saved
---- INFO: Chunk 09 saved
---- INFO: Chunk 10 saved
---- INFO: Chunk 11 saved
---- INFO: Chunk 12 saved
---- INFO: Chunk 13 saved
---- INFO: Chunk 14 saved
---- INFO: Chunk 15 saved
---- INFO: Chunk 16 saved
---- INFO: Chunk 17 saved
---- INFO: Chunk 18 saved
---- INFO: Chunk 19 saved
---- INFO: Chunk 20 saved
---- INFO: Chunk 21 saved
---- INFO: Chunk 22 saved
---- INFO: Chunk 23 saved
---- INFO: Chunk 24 saved
---- INFO: Chunk 25 saved
---- INFO: Chunk 26 saved
---- INFO: Chunk 27 saved
---- INFO: Chunk 28 saved
---- INFO: Chunk 29 saved
---- INFO: Chunk 30 saved
---- INFO: Chunk 31 saved
---- INFO: Chunk 32 saved
---- INFO: Chunk 33 saved
---- INFO: Chunk 34 saved
---- INFO: Chunk 35 saved
---- INFO: Chunk 36 saved
---- INFO: Chunk 37 saved
---- INFO: C