## Imports

In [31]:
import io, os, importlib, pickle
import datetime
import pandas as pd

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# project modules used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Show all columns and all rows when a DataFrame is displayed (None = no limit).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [32]:
# Import the project modules and reload each one right away so that edits made
# to the package sources are picked up without restarting the kernel.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
# Last expression of the cell: its return value (the module) is echoed as output.
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/03_Feature-Selection/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

In [33]:
def time(start, stop, label='loading the data'):
    """Print the elapsed time between two timestamps and the current time.

    Args:
        start: Timestamp taken before the measured step (the notebook passes
            ``datetime.datetime.now()`` values).
        stop: Timestamp taken after the measured step.
        label: Description of the measured step inserted into the message.
            Defaults to ``'loading the data'`` so existing two-argument calls
            print exactly what they printed before.

    Returns:
        None. Both lines are written to stdout.
    """
    print('Computation time {}: {}'.format(label, stop - start))
    # "Finished at" is the moment of this call, not the value of ``stop``.
    print('Finished at: {}'.format(datetime.datetime.now()))

## Load word embeddings

In [34]:
# Wall-clock start of the whole pipeline; subtracted from overall_stop near the
# end of the notebook to report the total computation time.
overall_start = datetime.datetime.now()

In [35]:
# Create one WordEmbeddings object per language and load its embeddings.
# German is used as the target side and English as the source side further below.
german = embeddings.WordEmbeddings('de')
german.load_embeddings()

english = embeddings.WordEmbeddings('en')
english.load_embeddings()

In [36]:
# Learn the projection matrices between the English and German embedding spaces.
# According to the log output both directions are learned (en->de and de->en).
# NOTE(review): W_ende / W_deen are not referenced again in the visible cells —
# presumably the call also stores the matrices internally; confirm.
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix from de to en


## Load sentence embeddings

In [37]:
# start/stop bracket the load so the following cell can report its duration.
start = datetime.datetime.now()
# English word embeddings are the source side, German the target side.
sens = sentences.Sentences(src_words=english, trg_words=german)
# Names of all text-based features that need a preparation step.
prepared_features = list(text_based.PREPARED_FEATURES.keys())
# agg_method='average' — presumably averages word vectors into a sentence
# embedding; n_max presumably caps the number of sentence pairs. TODO confirm
# both against Sentences.load_data.
data = sens.load_data(single_source=False, n_max=600000, features=prepared_features, agg_method='average')
stop = datetime.datetime.now()

---- DONE: Target sentences loaded
---- DONE: Source sentences loaded
---- DONE: Sentences preprocessed
---- INFO: Sentences embeddings extracted in en
---- ERROR: Sentence embedding failed in de on ID 162: Sicherheitsberater für den Gefahrguttransport
---- ERROR: Sentence embedding failed in de on ID 2585: Lebensmittelsicherheit
---- ERROR: Sentence embedding failed in de on ID 66: Arbeitsplan
---- ERROR: Sentence embedding failed in de on ID 4925: Kapitalsteuer
---- INFO: Sentences embeddings extracted in de
---- DONE: Sentences transformed
---- INFO: Embedding space of source language transformed according to projection matrix
---- DONE: Source words extracted
---- DONE: Target words extracted
---- INFO: Start preparation of text-based feature translated_words
---- INFO: Start preparation of text-based feature num_words
---- INFO: Start preparation of text-based feature num_punctuation
---- INFO: Start preparation of text-based feature occ_question_mark
---- INFO: Start preparation 

In [38]:
# Print how long the load_data cell took and the current timestamp.
time(start, stop)

Computation time loading the data: 0:00:01.870446
Finished at: 2020-05-12 18:52:09.408573


In [39]:
# Preview the first rows of the frame returned by load_data.
data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]","[resumption, session]","[wiederaufnahme, sitzungsperiode]","[sitzungsperiode, sitzung, tagung, session]",[],2,2,0,0,False,False,False,False
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522948, 0.2886577921005607, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...","[declare, resumed, session, european, parliame...","[erkläre, freitag, dezember, unterbrochene, si...","[sitzungsperiode, sitzung, tagung, session, eu...","[friday, fridays, december, dec, european, goo...",21,16,2,5,False,False,False,False
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712127, 0.07257094264933792, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...","[although, seen, dreaded, millennium, bug, fai...","[feststellen, konnten, gefürchtete, millenium,...","[zwar, obgleich, obwohl, gesehen, sah, geschei...","[detect, couldn, citizens, citizen, our, ours,...",17,13,6,6,False,False,False,False
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.16132638013366773, 0.22534862126716065, -0...","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[beantragt, angefordert, beantragte, gewünscht...","[parliament, parliaments, ep, consists, insist...",8,8,3,1,False,False,False,False
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.1976057223610604, 0.15322830212421176, -0....","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...","[meantime, like, observe, minute, silence, num...","[heute, möchte, bitten, wunsch, kolleginnen, k...","[mögen, gefällt, wie, like, minute, nummer, an...","[today, wants, pleas, beg, desire, wish, colle...",18,15,7,6,False,False,False,False


## Create training set

In [40]:
# start/stop bracket the call so the following cell can report its duration.
start = datetime.datetime.now()
# frac_pos=0.5 — presumably the share of true translation pairs among the
# n_train rows; TODO confirm against Sentences.create_train_set.
train_data = sens.create_train_set(n_train=500000, frac_pos=0.5)
stop = datetime.datetime.now()

---- INFO: Translation dataframe created
---- INFO: Non-translation dataframe created
---- INFO: Determined and shuffled non-translation indices
---- INFO: Feature column trg_sentence appended
---- INFO: Feature column trg_preprocessed appended
---- INFO: Feature column trg_embedding appended
---- INFO: Feature column trg_words appended
---- INFO: Feature column trg_translated_words appended
---- INFO: Feature column trg_num_words appended
---- INFO: Feature column trg_num_punctuation appended
---- INFO: Feature column trg_occ_question_mark appended
---- INFO: Feature column trg_occ_exclamation_mark appended
---- INFO: All features appended
---- INFO: Added non-translation indicator
---- DONE: Training dataset created


In [41]:
# Elapsed time of create_train_set (start/stop were set in the previous cell).
# The message printed for this call still reads "loading the data".
time(start, stop)

Computation time loading the data: 0:00:00.053593
Finished at: 2020-05-12 18:52:15.504173


## Create test collection

In [42]:
# start/stop bracket the call so the following cell can report its duration.
start = datetime.datetime.now()
# Build the retrieval test collection from n_queries queries and n_docs documents;
# per the log output, queries and documents are merged and a translation
# indicator is added.
test_collection = sens.create_test_collection(n_queries=1000, n_docs=10000)
stop = datetime.datetime.now()

---- INFO: Preliminary queries dataframe created
---- INFO: Preliminary documentes dataframe created
---- INFO: Merged queries and documents dataframe
---- INFO: Merged with test dataframe
---- INFO: Added translation indicator
---- DONE: Test collection created


In [43]:
# Elapsed time of create_test_collection (start/stop were set in the previous cell).
# The message printed for this call still reads "loading the data".
time(start, stop)

Computation time loading the data: 0:00:00.183988
Finished at: 2020-05-12 18:52:31.159330


## Feature extraction

In [44]:
# Feature names to extract, grouped under the name of the module that defines them.
features_dict = {
    group: list(module.FEATURES.keys())
    for group, module in (('text_based', text_based), ('vector_based', vector_based))
}

In [45]:
# start/stop bracket the call so the following cell can report its duration.
start = datetime.datetime.now()
# data='train_test' requests extraction on both datasets; the call returns two
# frames. Note that test_collection is rebound to the returned frame here.
# drop_prepared=False — presumably keeps the intermediate "prepared" columns;
# TODO confirm against Sentences.extract_features.
train_test, test_collection = sens.extract_features(features_dict=features_dict, data='train_test', drop_prepared=False)
stop = datetime.datetime.now()


---- INFO: No vector elements as features specified
---- INFO: Started extraction of text-based feature norm_diff_translated_words
---- INFO: Started extraction of text-based feature abs_diff_num_words
---- INFO: Started extraction of text-based feature abs_diff_num_punctuation
---- INFO: Started extraction of text-based feature abs_diff_occ_question_mark
---- INFO: Started extraction of text-based feature abs_diff_occ_exclamation_mark
---- INFO: Started extraction of text-based feature rel_diff_num_words
---- INFO: Started extraction of text-based feature rel_diff_num_punctuation
---- INFO: Started extraction of text-based feature norm_diff_num_words
---- INFO: Started extraction of text-based feature norm_diff_num_punctuation
---- INFO: Started extraction of vector-based feature euclidean_distance
---- INFO: Started extraction of vector-based feature cosine_similarity
---- DONE: All given features extracted on train dataset
-----------------------
---- INFO: Started extraction of tex

In [46]:
# Elapsed time of extract_features (start/stop were set in the previous cell).
# The message printed for this call still reads "loading the data".
time(start, stop)

Computation time loading the data: 0:00:45.344105
Finished at: 2020-05-12 18:53:22.291500


## View data

In [26]:
# Preview the first rows of the training data.
# NOTE(review): this shows train_data (result of create_train_set), while
# extract_features returned its result as train_test — presumably train_data is
# updated in place; confirm that this is the frame that should be inspected.
train_data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark,src_num_noun,trg_num_noun,src_num_verb,trg_num_verb,src_num_adverb,trg_num_adverb,src_num_adjective,trg_num_adjective,src_num_wh,trg_num_wh,src_num_pronoun,trg_num_pronoun,translation,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,abs_diff_num_noun,abs_diff_num_verb,abs_diff_num_adverb,abs_diff_num_adjective,abs_diff_num_wh,abs_diff_num_pronoun,rel_diff_num_words,rel_diff_num_punctuation,rel_diff_num_noun,rel_diff_num_verb,rel_diff_num_adverb,rel_diff_num_adjective,rel_diff_num_wh,rel_diff_num_pronoun,norm_diff_num_words,norm_diff_num_punctuation,norm_diff_num_noun,norm_diff_num_verb,norm_diff_num_adverb,norm_diff_num_adjective,norm_diff_num_wh,norm_diff_num_pronoun,euclidean_distance,cosine_similarity
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.30886147324594976, 0.34727272417056637, -0...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]","[resumption, session]","[wiederaufnahme, sitzungsperiode]","[sitzungsperiode, sitzung, tagung, session]",[],2,2,0,0,False,False,False,False,2,2,0,0,0,0,0,0,0,0,0,0,1,0.25,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.806763,0.69953
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522948, 0.2886577921005607, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...","[declare, resumed, session, european, parliame...","[erkläre, freitag, dezember, unterbrochene, si...","[sitzungsperiode, sitzung, tagung, session, eu...","[friday, fridays, december, dec, european, goo...",21,16,2,5,False,False,False,False,6,11,6,2,1,0,5,2,0,0,0,0,1,0.305263,5,3,1,1,5,4,1,3,0,0,5,-3,-5,4,1,3,0,0,0.277778,-1.0,-0.625,1.0,0.0,1.0,0.0,0.0,0.989671,0.87535
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712127, 0.07257094264933792, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...","[although, seen, dreaded, millennium, bug, fai...","[feststellen, konnten, gefürchtete, millenium,...","[zwar, obgleich, obwohl, gesehen, sah, geschei...","[detect, couldn, citizens, citizen, our, ours,...",17,13,6,6,False,False,False,False,8,11,4,2,2,0,2,2,0,0,1,0,1,0.0,4,0,1,1,3,2,2,0,0,1,4,0,-3,2,2,0,0,1,0.266667,0.0,-0.333333,0.666667,2.0,0.0,0.0,0.0,1.046691,0.842287
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.16132638013366773, 0.22534862126716065, -0...","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[beantragt, angefordert, beantragte, gewünscht...","[parliament, parliaments, ep, consists, insist...",8,8,3,1,False,False,False,False,5,7,1,0,0,0,2,1,0,0,0,0,1,0.375,0,2,1,1,2,1,0,1,0,0,0,2,-2,1,0,1,0,0,0.0,1.0,-0.333333,0.0,0.0,1.0,0.0,0.0,1.517368,0.749109
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.1976057223610604, 0.15322830212421176, -0....","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...","[meantime, like, observe, minute, silence, num...","[heute, möchte, bitten, wunsch, kolleginnen, k...","[mögen, gefällt, wie, like, minute, nummer, an...","[today, wants, pleas, beg, desire, wish, colle...",18,15,7,6,False,False,False,False,9,9,3,1,2,1,3,3,0,0,0,0,1,0.338889,3,1,1,1,0,2,1,0,0,0,3,1,0,2,1,0,0,0,0.1875,0.166667,0.0,1.0,1.0,0.0,0.0,0.0,0.871747,0.887995


In [27]:
# Preview the first rows of the test collection returned by extract_features.
test_collection.head()

Unnamed: 0,src_sentence,src_preprocessed,src_embedding,src_embedding_aligned,src_words,src_translated_words,src_num_words,src_num_punctuation,src_occ_question_mark,src_occ_exclamation_mark,src_num_noun,src_num_verb,src_num_adverb,src_num_adjective,src_num_wh,src_num_pronoun,trg_sentence,trg_preprocessed,trg_embedding,trg_words,trg_translated_words,trg_num_words,trg_num_punctuation,trg_occ_question_mark,trg_occ_exclamation_mark,trg_num_noun,trg_num_verb,trg_num_adverb,trg_num_adjective,trg_num_wh,trg_num_pronoun,translation,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,abs_diff_num_noun,abs_diff_num_verb,abs_diff_num_adverb,abs_diff_num_adjective,abs_diff_num_wh,abs_diff_num_pronoun,rel_diff_num_words,rel_diff_num_punctuation,rel_diff_num_noun,rel_diff_num_verb,rel_diff_num_adverb,rel_diff_num_adjective,rel_diff_num_wh,rel_diff_num_pronoun,norm_diff_num_words,norm_diff_num_punctuation,norm_diff_num_noun,norm_diff_num_verb,norm_diff_num_adverb,norm_diff_num_adjective,norm_diff_num_wh,norm_diff_num_pronoun,euclidean_distance,cosine_similarity
0,We have made progress in recent months and wee...,"[made, progress, recent, months, weeks, ,, mr,...","[-0.049795181818181815, -0.12149147272727272, ...","[-0.15659878031849248, 0.13822096928117522, -0...","[made, progress, recent, months, weeks, mr, ga...","[hergestellt, machte, gemacht, made, geschafft...",9,2,False,False,4,2,1,2,0,0,"Wie Herr Gama eingangs sagte, sind in den letz...","[herr, gama, eingangs, sagte, ,, letzten, mona...","[-0.16466908333333333, 0.19261133333333336, -0...","[herr, gama, eingangs, sagte, letzten, monaten...","[lord, herr, sir, mister, mr, said, told, last...",10,2,False,False,8,0,1,1,0,0,1,0.316667,1,0,1,1,4,2,0,1,0,0,-1,0,-4,2,0,1,0,0,-0.111111,0.0,-0.666667,2.0,0.0,1.0,0.0,0.0,1.015293,0.86423
1,We have made progress in recent months and wee...,"[made, progress, recent, months, weeks, ,, mr,...","[-0.049795181818181815, -0.12149147272727272, ...","[-0.15659878031849248, 0.13822096928117522, -0...","[made, progress, recent, months, weeks, mr, ga...","[hergestellt, machte, gemacht, made, geschafft...",9,2,False,False,4,2,1,2,0,0,Dennoch wird dieser schwierige Prozeß zwangslä...,"[dennoch, schwierige, prozeß, zwangsläufig, pr...","[-0.2440902857142857, 0.15871371428571426, -0....","[dennoch, schwierige, prozeß, zwangsläufig, pr...","[nevertheless, nonetheless, yet, however, prob...",7,1,False,False,6,1,0,0,0,0,0,0.0,2,1,1,1,2,1,1,2,0,0,2,1,-2,1,1,2,0,0,0.25,1.0,-0.4,1.0,0.0,2.0,0.0,0.0,1.763604,0.633771
2,We have made progress in recent months and wee...,"[made, progress, recent, months, weeks, ,, mr,...","[-0.049795181818181815, -0.12149147272727272, ...","[-0.15659878031849248, 0.13822096928117522, -0...","[made, progress, recent, months, weeks, mr, ga...","[hergestellt, machte, gemacht, made, geschafft...",9,2,False,False,4,2,1,2,0,0,"Wir werden alles tun, damit dieser Prozeß erfo...","[tun, ,, prozeß, erfolgreich, abgeschlossen, a...","[-0.23712691666666666, 0.017124166666666666, -...","[tun, prozeß, erfolgreich, abgeschlossen, aufg...","[doing, do, tun, successful, successfully, suc...",9,3,False,False,7,0,0,2,0,0,0,0.0,0,1,1,1,3,2,1,0,0,0,0,-1,-3,2,1,0,0,0,0.0,-0.5,-0.6,2.0,0.0,0.0,0.0,0.0,1.566951,0.675959
3,We have made progress in recent months and wee...,"[made, progress, recent, months, weeks, ,, mr,...","[-0.049795181818181815, -0.12149147272727272, ...","[-0.15659878031849248, 0.13822096928117522, -0...","[made, progress, recent, months, weeks, mr, ga...","[hergestellt, machte, gemacht, made, geschafft...",9,2,False,False,4,2,1,2,0,0,"Abschließend möchte ich für diejenigen, die er...","[abschließend, möchte, diejenigen, ,, erst, le...","[-0.10956108787878788, 0.17325041212121217, -0...","[abschließend, möchte, diejenigen, erst, letzt...","[wants, last, mins, minutes, min, came, alread...",28,6,False,False,20,0,1,7,0,0,0,0.0,19,4,1,1,16,2,0,5,0,0,-19,-4,-16,2,0,-5,0,0,-1.055556,-1.0,-1.333333,2.0,0.0,-1.25,0.0,0.0,1.202448,0.790852
4,We have made progress in recent months and wee...,"[made, progress, recent, months, weeks, ,, mr,...","[-0.049795181818181815, -0.12149147272727272, ...","[-0.15659878031849248, 0.13822096928117522, -0...","[made, progress, recent, months, weeks, mr, ga...","[hergestellt, machte, gemacht, made, geschafft...",9,2,False,False,4,2,1,2,0,0,"Ich hoffe, die Mitglieder des Hauses haben daf...","[hoffe, ,, mitglieder, hauses, dafür, verständ...","[-0.009993571428571432, 0.15969957142857144, -...","[hoffe, mitglieder, hauses, dafür, verständnis]","[hoping, hope, member, membership, members, sy...",5,2,False,False,4,1,0,0,0,0,0,0.0,4,0,1,1,0,1,1,2,0,0,4,0,0,1,1,2,0,0,0.571429,0.0,0.0,1.0,0.0,2.0,0.0,0.0,1.640156,0.65404


## Save data

In [19]:
# Directory that receives every artifact written by the cells below.
path = f'{paths.data_path}extracted_data/global'
# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(path, exist_ok=True)

In [20]:
# Persist the training data as a pickle file in the output directory.
# NOTE(review): the file name carries a "tfidf" suffix, while the load_data call
# earlier in this notebook passes agg_method='average' — confirm the name matches
# the aggregation that actually produced train_data.
train_data.to_pickle(f'{path}/training_data_tfidf.pkl')
# Last expression of the cell: echoes the completion timestamp as cell output.
'Finished at: {}'.format(datetime.datetime.now())

'Finished at: 2020-05-12 12:11:05.748077'

In [21]:
# Persist the test collection as a pickle file in the output directory.
# NOTE(review): the file name carries a "tfidf" suffix, while the load_data call
# earlier in this notebook passes agg_method='average' — confirm the name matches
# the aggregation that actually produced test_collection.
test_collection.to_pickle(f'{path}/test_collection_tfidf.pkl')
# Last expression of the cell: echoes the completion timestamp as cell output.
'Finished at: {}'.format(datetime.datetime.now())

'Finished at: 2020-05-12 12:11:44.187262'

In [22]:
# Total wall-clock time since overall_start was taken before loading the embeddings.
overall_stop = datetime.datetime.now()
'Total computation time: {}'.format(overall_stop-overall_start)

'Total computation time: 16:52:53.781197'

## For tfidf only: Save idf values

In [50]:
# Map every vocabulary term of the English vectorizer to its IDF weight.
vectorizer_en = sens.vectorizer['en']
# Read idf_ once instead of once per term: the attribute does not change while
# the dict is built, and on scikit-learn's TfidfVectorizer (presumably what
# sens.vectorizer holds — confirm) idf_ is a property, so each access costs
# extra work and may rebuild the array in some versions.
idf_values_en = vectorizer_en.idf_
idf_en = {term: idf_values_en[index] for term, index in vectorizer_en.vocabulary_.items()}

In [54]:
# Map every vocabulary term of the German vectorizer to its IDF weight.
vectorizer_de = sens.vectorizer['de']
# Read idf_ once instead of once per term: the attribute does not change while
# the dict is built, and on scikit-learn's TfidfVectorizer (presumably what
# sens.vectorizer holds — confirm) idf_ is a property, so each access costs
# extra work and may rebuild the array in some versions.
idf_values_de = vectorizer_de.idf_
idf_de = {term: idf_values_de[index] for term, index in vectorizer_de.vocabulary_.items()}

In [60]:
# Write the English term -> IDF mapping to disk.
# NOTE(review): the payload is a binary pickle although the file is named *.json;
# readers must open it in 'rb' mode and use pickle.load, not json.load.
with open(f'{path}/idf_english.json', 'wb') as fp:
    pickle.dump(idf_en, fp, protocol=pickle.HIGHEST_PROTOCOL)
# Last expression of the cell: echoes the completion timestamp as cell output.
'Finished at: {}'.format(datetime.datetime.now())

'Finished at: 2020-05-12 13:02:05.264231'

In [61]:
# Write the German term -> IDF mapping to disk.
# NOTE(review): the payload is a binary pickle although the file is named *.json;
# readers must open it in 'rb' mode and use pickle.load, not json.load.
with open(f'{path}/idf_german.json', 'wb') as fp:
    pickle.dump(idf_de, fp, protocol=pickle.HIGHEST_PROTOCOL)
# Last expression of the cell: echoes the completion timestamp as cell output.
'Finished at: {}'.format(datetime.datetime.now())

'Finished at: 2020-05-12 13:02:06.882645'