## Imports

In [1]:
import io, os, importlib
import datetime
import pandas as pd

import warnings

# Silence every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Lift pandas' display truncation so frames are rendered with all columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/03_Feature-Selection/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

In [3]:
def time(start, stop, label='loading the data'):
    """Print the elapsed time of a notebook stage and the current timestamp.

    Args:
        start: Timestamp taken before the stage ran (the notebook passes
            ``datetime.datetime.now()``).
        stop: Timestamp taken after the stage finished.
        label: Description of the measured stage, inserted after
            "Computation time". Defaults to the original wording so that
            existing two-argument calls print exactly what they did before.

    Returns:
        None. The two-line report is written to stdout.
    """
    print('Computation time {}: {}'.format(label, stop - start))
    # Wall-clock time at which the report is printed, not the value of ``stop``.
    print('Finished at: {}'.format(datetime.datetime.now()))

## Load word embeddings

In [4]:
# Start of the end-to-end timer; the final cell subtracts this from `overall_stop`.
overall_start = datetime.datetime.now()

In [5]:
# German word embeddings (language code 'de'); passed as `trg_words` to `Sentences` below.
german = embeddings.WordEmbeddings('de')
german.load_embeddings()

# English word embeddings (language code 'en'); passed as `src_words` to `Sentences` below.
english = embeddings.WordEmbeddings('en')
english.load_embeddings()

In [6]:
# Learn the projection matrices for both directions (the log reports en-de and de-en).
# The call goes through the class, not through the `english` / `german` instances.
# NOTE(review): W_ende / W_deen are not referenced again in the cells visible here —
# presumably the projection is also kept inside `WordEmbeddings` for later use; confirm.
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

Learn projection matrix for en-de
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)
Resulting subspace dimension: (13700, 300)
Learn projection matrix for de-en
Found 10604 valid translation pairs in expert dictionary.
262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
Resulting subspace dimension: (10604, 300)
Resulting subspace dimension: (10604, 300)


## Load sentence embeddings

In [7]:
# Timed stage: load the sentence data and prepare the text-based features.
start = datetime.datetime.now()
# English is the source side, German the target side.
sens = sentences.Sentences(src_words=english, trg_words=german)
# Names of all text-based features registered in `text_based.PREPARED_FEATURES`.
prepared_features = list(text_based.PREPARED_FEATURES.keys())
# NOTE(review): the meaning of single_source / n_max / agg_method is defined in
# `Sentences.load_data` — presumably n_max caps the number of sentence pairs and
# 'average' averages word vectors into a sentence vector; confirm.
data = sens.load_data(single_source=False, n_max=600000, features=prepared_features, agg_method='average')
stop = datetime.datetime.now()

---- DONE: Target sentences loaded
---- DONE: Source sentences loaded
---- DONE: Sentences preprocessed
---- INFO: Sentences embeddings extracted in en
---- ERROR: Sentence embedding failed in de on ID 162: Sicherheitsberater für den Gefahrguttransport
---- ERROR: Sentence embedding failed in de on ID 66: Arbeitsplan
---- INFO: Sentences embeddings extracted in de
---- INFO: Sentences transformed
---- INFO: Embedding space of source language transformed according to projection matrix
---- DONE: Source words extracted
---- DONE: Target words extracted
---- INFO: Start preparation of text-based feature translated_words
---- INFO: Start preparation of text-based feature num_words
---- INFO: Start preparation of text-based feature num_punctuation
---- INFO: Start preparation of text-based feature occ_question_mark
---- INFO: Start preparation of text-based feature occ_exclamation_mark
---- INFO: Start preparation of text-based feature num_noun
---- INFO: Start preparation of text-based fea

In [8]:
# Report how long loading the data took.
time(start, stop)

Computation time loading the data: 0:00:06.787439
Finished at: 2020-05-11 19:07:35.985834


In [9]:
# Preview the first rows of the loaded data.
data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark,src_num_noun,trg_num_noun,src_num_verb,trg_num_verb,src_num_adverb,trg_num_adverb,src_num_adjective,trg_num_adjective,src_num_wh,trg_num_wh,src_num_pronoun,trg_num_pronoun
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]","[resumption, session]","[wiederaufnahme, sitzungsperiode]","[sitzungsperiode, sitzung, tagung, session]",[],2,2,0,0,False,False,False,False,2,2,0,0,0,0,0,0,0,0,0,0
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522948, 0.2886577921005607, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...","[declare, resumed, session, european, parliame...","[erkläre, freitag, dezember, unterbrochene, si...","[sitzungsperiode, sitzung, tagung, session, eu...","[friday, fridays, december, dec, european, goo...",21,16,2,5,False,False,False,False,6,11,6,2,1,0,5,2,0,0,0,0
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712127, 0.07257094264933792, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...","[although, seen, dreaded, millennium, bug, fai...","[feststellen, konnten, gefürchtete, millenium,...","[zwar, obgleich, obwohl, gesehen, sah, geschei...","[detect, couldn, citizens, citizen, our, ours,...",17,13,6,6,False,False,False,False,8,11,4,2,2,0,2,2,0,0,1,0
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.16132638013366773, 0.22534862126716065, -0...","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[beantragt, angefordert, beantragte, gewünscht...","[parliament, parliaments, ep, consists, insist...",8,8,3,1,False,False,False,False,5,7,1,0,0,0,2,1,0,0,0,0
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.1976057223610604, 0.15322830212421176, -0....","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...","[meantime, like, observe, minute, silence, num...","[heute, möchte, bitten, wunsch, kolleginnen, k...","[mögen, gefällt, wie, like, minute, nummer, an...","[today, wants, pleas, beg, desire, wish, colle...",18,15,7,6,False,False,False,False,9,9,3,1,2,1,3,3,0,0,0,0


## Create training set

In [10]:
# Timed stage: build the training set from the loaded sentences.
start = datetime.datetime.now()
# NOTE(review): presumably n_train is the requested number of training pairs and
# frac_pos the share of true translation pairs among them — confirm in `create_train_set`.
train_data = sens.create_train_set(n_train=500000, frac_pos=0.5)
stop = datetime.datetime.now()

---- INFO: Translation dataframe created
---- INFO: Non-translation dataframe created
---- INFO: Determined and shuffled non-translation indices
---- INFO: Feature column trg_sentence appended
---- INFO: Feature column trg_preprocessed appended
---- INFO: Feature column trg_embedding appended
---- INFO: Feature column trg_words appended
---- INFO: Feature column trg_translated_words appended
---- INFO: Feature column trg_num_words appended
---- INFO: Feature column trg_num_punctuation appended
---- INFO: Feature column trg_occ_question_mark appended
---- INFO: Feature column trg_occ_exclamation_mark appended
---- INFO: Feature column trg_num_noun appended
---- INFO: Feature column trg_num_verb appended
---- INFO: Feature column trg_num_adverb appended
---- INFO: Feature column trg_num_adjective appended
---- INFO: Feature column trg_num_wh appended
---- INFO: Feature column trg_num_pronoun appended
---- INFO: All features appended
---- INFO: Added non-translation indicator
---- DONE: T

In [11]:
# Report the duration of the training-set creation.
# NOTE: `time` prints the fixed label "loading the data" although this is a different stage.
time(start, stop)

Computation time loading the data: 0:00:00.036564
Finished at: 2020-05-11 19:07:36.282370


## Create test collection

In [12]:
# Timed stage: build the test collection of query/document pairs.
start = datetime.datetime.now()
# NOTE(review): presumably n_queries source sentences are each paired with n_docs
# target sentences — confirm in `create_test_collection`.
test_collection = sens.create_test_collection(n_queries=50, n_docs=996)
stop = datetime.datetime.now()

---- INFO: Preliminary queries dataframe created
---- INFO: Preliminary documentes dataframe created
---- INFO: Merged queries and documents dataframe
---- INFO: Merged with test dataframe
---- INFO: Added translation indicator
---- DONE: Test collection created


In [13]:
# Report the duration of the test-collection creation.
# NOTE: `time` prints the fixed label "loading the data" although this is a different stage.
time(start, stop)

Computation time loading the data: 0:00:00.126788
Finished at: 2020-05-11 19:07:36.428823


## Feature extraction

In [14]:
# Names of every registered feature, grouped by feature family
# (text-based first, then vector-based).
features_dict = {
    family: list(feature_module.FEATURES.keys())
    for family, feature_module in (('text_based', text_based),
                                   ('vector_based', vector_based))
}

In [15]:
# Timed stage: compute the text-based and vector-based features listed in
# `features_dict` for the training set and the test collection.
start = datetime.datetime.now()
# Bind the returned training frame to `train_data`, the name every later cell reads
# (preview and pickle). The result was previously stored in an unused `train_test`
# variable, so downstream cells only worked if `extract_features` mutated the
# existing frame in place.
# NOTE(review): assumes the first return value is the feature-enriched training set — confirm.
train_data, test_collection = sens.extract_features(features_dict=features_dict, data='train_test', drop_prepared=False)
stop = datetime.datetime.now()


---- INFO: Started extraction of text-based feature norm_diff_translated_words
---- INFO: Started extraction of text-based feature diff_num_words
---- INFO: Started extraction of text-based feature diff_num_punctuation
---- INFO: Started extraction of text-based feature diff_occ_question_mark
---- INFO: Started extraction of text-based feature diff_occ_exclamation_mark
---- INFO: Started extraction of text-based feature diff_num_noun
---- INFO: Started extraction of text-based feature diff_num_verb
---- INFO: Started extraction of text-based feature diff_num_adverb
---- INFO: Started extraction of text-based feature diff_num_adjective
---- INFO: Started extraction of text-based feature diff_num_wh
---- INFO: Started extraction of text-based feature diff_num_pronoun
---- INFO: Started extraction of text-based feature rel_diff_num_words
---- INFO: Started extraction of text-based feature rel_diff_num_punctuation
---- INFO: Started extraction of text-based feature rel_diff_num_noun
---- I

In [16]:
# Report the duration of the feature extraction.
# NOTE: `time` prints the fixed label "loading the data" although this is a different stage.
time(start, stop)

Computation time loading the data: 0:01:03.114992
Finished at: 2020-05-11 19:08:39.570036


## View data

In [17]:
# Preview the first rows of the training set.
train_data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark,src_num_noun,trg_num_noun,src_num_verb,trg_num_verb,src_num_adverb,trg_num_adverb,src_num_adjective,trg_num_adjective,src_num_wh,trg_num_wh,src_num_pronoun,trg_num_pronoun,translation,norm_diff_translated_words,diff_num_words,diff_num_punctuation,diff_occ_question_mark,diff_occ_exclamation_mark,diff_num_noun,diff_num_verb,diff_num_adverb,diff_num_adjective,diff_num_wh,diff_num_pronoun,rel_diff_num_words,rel_diff_num_punctuation,rel_diff_num_noun,rel_diff_num_verb,rel_diff_num_adverb,rel_diff_num_adjective,rel_diff_num_wh,rel_diff_num_pronoun,norm_diff_num_words,norm_diff_num_punctuation,norm_diff_num_noun,norm_diff_num_verb,norm_diff_num_adverb,norm_diff_num_adjective,norm_diff_num_wh,norm_diff_num_pronoun,euclidean_distance,cosine_similarity
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]","[resumption, session]","[wiederaufnahme, sitzungsperiode]","[sitzungsperiode, sitzung, tagung, session]",[],2,2,0,0,False,False,False,False,2,2,0,0,0,0,0,0,0,0,0,0,1,0.25,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,2.806763,0.69953
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522948, 0.2886577921005607, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...","[declare, resumed, session, european, parliame...","[erkläre, freitag, dezember, unterbrochene, si...","[sitzungsperiode, sitzung, tagung, session, eu...","[friday, fridays, december, dec, european, goo...",21,16,2,5,False,False,False,False,6,11,6,2,1,0,5,2,0,0,0,0,1,0.305263,5,3,1,1,5,4,1,3,0,0,5,-3,-5,4,1,3,0,0,0.277778,-1.0,-0.625,1.0,0.0,1.0,0,0.0,0.989671,0.87535
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712127, 0.07257094264933792, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...","[although, seen, dreaded, millennium, bug, fai...","[feststellen, konnten, gefürchtete, millenium,...","[zwar, obgleich, obwohl, gesehen, sah, geschei...","[detect, couldn, citizens, citizen, our, ours,...",17,13,6,6,False,False,False,False,8,11,4,2,2,0,2,2,0,0,1,0,1,0.0,4,0,1,1,3,2,2,0,0,1,4,0,-3,2,2,0,0,1,0.266667,0.0,-0.333333,0.666667,2.0,0.0,0,0.0,1.046691,0.842287
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.16132638013366773, 0.22534862126716065, -0...","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[beantragt, angefordert, beantragte, gewünscht...","[parliament, parliaments, ep, consists, insist...",8,8,3,1,False,False,False,False,5,7,1,0,0,0,2,1,0,0,0,0,1,0.375,0,2,1,1,2,1,0,1,0,0,0,2,-2,1,0,1,0,0,0.0,1.0,-0.333333,0.0,0.0,1.0,0,0.0,1.517368,0.749109
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.1976057223610604, 0.15322830212421176, -0....","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...","[meantime, like, observe, minute, silence, num...","[heute, möchte, bitten, wunsch, kolleginnen, k...","[mögen, gefällt, wie, like, minute, nummer, an...","[today, wants, pleas, beg, desire, wish, colle...",18,15,7,6,False,False,False,False,9,9,3,1,2,1,3,3,0,0,0,0,1,0.338889,3,1,1,1,0,2,1,0,0,0,3,1,0,2,1,0,0,0,0.1875,0.166667,0.0,1.0,1.0,0.0,0,0.0,0.871747,0.887995


In [18]:
# Preview the first rows of the test collection.
test_collection.head()

Unnamed: 0,src_sentence,src_preprocessed,src_embedding,src_embedding_aligned,src_words,src_translated_words,src_num_words,src_num_punctuation,src_occ_question_mark,src_occ_exclamation_mark,src_num_noun,src_num_verb,src_num_adverb,src_num_adjective,src_num_wh,src_num_pronoun,trg_sentence,trg_preprocessed,trg_embedding,trg_words,trg_translated_words,trg_num_words,trg_num_punctuation,trg_occ_question_mark,trg_occ_exclamation_mark,trg_num_noun,trg_num_verb,trg_num_adverb,trg_num_adjective,trg_num_wh,trg_num_pronoun,translation,norm_diff_translated_words,diff_num_words,diff_num_punctuation,diff_occ_question_mark,diff_occ_exclamation_mark,diff_num_noun,diff_num_verb,diff_num_adverb,diff_num_adjective,diff_num_wh,diff_num_pronoun,rel_diff_num_words,rel_diff_num_punctuation,rel_diff_num_noun,rel_diff_num_verb,rel_diff_num_adverb,rel_diff_num_adjective,rel_diff_num_wh,rel_diff_num_pronoun,norm_diff_num_words,norm_diff_num_punctuation,norm_diff_num_noun,norm_diff_num_verb,norm_diff_num_adverb,norm_diff_num_adjective,norm_diff_num_wh,norm_diff_num_pronoun,euclidean_distance,cosine_similarity
0,Resumption of the session,"[resumption, session]","[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[resumption, session]","[sitzungsperiode, sitzung, tagung, session]",2,0,False,False,2,0,0,0,0,0,Wiederaufnahme der Sitzungsperiode,"[wiederaufnahme, sitzungsperiode]","[-0.22081, 0.49136, -0.09375700000000001, -0.6...","[wiederaufnahme, sitzungsperiode]",[],2,0,False,False,2,0,0,0,0,0,1,0.25,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,2.806763,0.69953
1,Resumption of the session,"[resumption, session]","[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[resumption, session]","[sitzungsperiode, sitzung, tagung, session]",2,0,False,False,2,0,0,0,0,0,"Ich erkläre die am Freitag, dem 17. Dezember u...","[erkläre, freitag, ,, 17, ., dezember, unterbr...","[-0.1368885, 0.29018044444444446, -0.152462499...","[erkläre, freitag, dezember, unterbrochene, si...","[friday, fridays, december, dec, european, goo...",16,5,False,False,11,2,0,2,0,0,0,0.033333,14,5,1,1,9,2,0,2,0,0,-14,-5,-9,-2,0,-2,0,0,-1.555556,-2.5,-1.5,-2.0,0.0,-2.0,0,0.0,2.720701,0.551078
2,Resumption of the session,"[resumption, session]","[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[resumption, session]","[sitzungsperiode, sitzung, tagung, session]",2,0,False,False,2,0,0,0,0,0,"Wie Sie feststellen konnten, ist der gefürchte...","[feststellen, konnten, ,, gefürchtete, "", mill...","[-0.1190726470588235, 0.13479876470588237, -0....","[feststellen, konnten, gefürchtete, millenium,...","[detect, couldn, citizens, citizen, our, ours,...",13,6,False,False,11,2,0,2,0,0,0,0.0,11,6,1,1,9,2,0,2,0,0,-11,-6,-9,-2,0,-2,0,0,-1.571429,-2.0,-1.5,-2.0,0.0,-2.0,0,0.0,2.91594,0.460577
3,Resumption of the session,"[resumption, session]","[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[resumption, session]","[sitzungsperiode, sitzung, tagung, session]",2,0,False,False,2,0,0,0,0,0,Im Parlament besteht der Wunsch nach einer Aus...,"[parlament, besteht, wunsch, aussprache, verla...","[-0.12862244444444446, 0.33829177777777786, -0...","[parlament, besteht, wunsch, aussprache, verla...","[parliament, parliaments, ep, consists, insist...",8,1,False,False,7,0,0,1,0,0,0,0.0625,6,1,1,1,5,0,0,1,0,0,-6,-1,-5,0,0,-1,0,0,-1.2,0.0,-1.25,0.0,0.0,0.0,0,0.0,2.556774,0.621284
4,Resumption of the session,"[resumption, session]","[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[resumption, session]","[sitzungsperiode, sitzung, tagung, session]",2,0,False,False,2,0,0,0,0,0,Heute möchte ich Sie bitten - das ist auch der...,"[heute, möchte, bitten, -, wunsch, kolleginnen...","[-0.21589275000000002, 0.14882689999999996, -0...","[heute, möchte, bitten, wunsch, kolleginnen, k...","[today, wants, pleas, beg, desire, wish, colle...",15,6,False,False,9,1,1,3,0,0,0,0.0,13,6,1,1,7,1,1,3,0,0,-13,-6,-7,-1,-1,-3,0,0,-1.625,-2.0,-1.4,0.0,0.0,-3.0,0,0.0,2.964772,0.433846


## Save data

In [19]:
# Directory that receives the pickled training set and test collection.
# NOTE(review): the f-string assumes `paths.data_path` already ends with a path separator — confirm.
path = f'{paths.data_path}extracted_data/global'
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [20]:
# Persist the training set as a pickle file in the output directory.
train_data.to_pickle(f'{path}/training_data_avg.pkl')
# Last expression of the cell: shown as the cell output to mark when the save finished.
'Finished at: {}'.format(datetime.datetime.now())

'Finished at: 2020-05-11 19:08:39.911648'

In [21]:
# Persist the test collection as a pickle file in the output directory.
test_collection.to_pickle(f'{path}/test_collection_avg.pkl')
# Last expression of the cell: shown as the cell output to mark when the save finished.
'Finished at: {}'.format(datetime.datetime.now())

'Finished at: 2020-05-11 19:08:39.993649'

In [22]:
# End of the end-to-end timer started in the `overall_start` cell.
overall_stop = datetime.datetime.now()
# Last expression of the cell: shown as the cell output with the total runtime.
'Total computation time: {}'.format(overall_stop-overall_start)

'Total computation time: 0:01:11.524554'

## TODO: For tf-idf, save the IDF values