## Imports

In [1]:
import io, os, importlib, pickle
import datetime
import pandas as pd

import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Remove pandas' display truncation in both directions (None = no limit).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Each project module is imported and then immediately reloaded, so edits made
# to the package source are picked up without restarting the kernel.
# The import -> reload order is kept per module on purpose.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
# Last expression of the cell: the reloaded module object becomes the cell output.
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/jani/PycharmProjects/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

In [3]:
def time(start, stop, task='loading the data'):
    """Print how long a timed step took and when it finished.

    Args:
        start: ``datetime.datetime`` captured immediately before the step.
        stop: ``datetime.datetime`` captured immediately after the step.
        task: Short description inserted into the duration message. The
            default keeps the original wording, so existing
            ``time(start, stop)`` calls print the same first line as before.

    Note:
        The name would shadow the standard-library ``time`` module if that
        were imported into the same namespace; it is kept unchanged so the
        existing call sites continue to work.
    """
    print('Computation time {}: {}'.format(task, stop - start))
    # Report the measured end of the step rather than the moment this helper
    # is called, which can be arbitrarily later than the timed cell.
    print('Finished at: {}'.format(stop))

## Load word embeddings

In [4]:
# Wall-clock start of the whole pipeline; subtracted from overall_stop in the
# final "Total computation time" cell.
overall_start = datetime.datetime.now()

In [5]:
# One WordEmbeddings object per language, keyed by language code.
# load_embeddings() is called for its side effect on the object; its return
# value is not used here.
# NOTE(review): presumably this reads pre-trained monolingual vectors from
# disk - confirm in ir_crosslingual.embeddings.
german = embeddings.WordEmbeddings('de')
german.load_embeddings()

english = embeddings.WordEmbeddings('en')
english.load_embeddings()

In [6]:
# Learn projection matrices between the two embedding spaces; the log below
# shows en-de being learned first, then de-en.
# NOTE(review): W_ende / W_deen are not referenced again in this notebook, so
# presumably the call also stores the matrices for later use by Sentences -
# confirm in ir_crosslingual.embeddings.
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix from de to en


## Load sentence embeddings

In [7]:
# Timed step: build the sentence-pair dataset (English source, German target)
# together with the prepared text-based features.
start = datetime.datetime.now()
sens = sentences.Sentences(src_words=english, trg_words=german)
# Names of all features that need a preparation pass before extraction.
prepared_features = list(text_based.PREPARED_FEATURES.keys())
# NOTE(review): n_max presumably caps the number of sentence pairs read; the
# log reports 589164 rows remaining after preprocessing and duplicate handling.
# NOTE(review): agg_method='average' is used here, yet the save cells write
# '*_tfidf.pkl' files and the last section reads sens.vectorizer - confirm
# which aggregation this notebook is meant to run with.
data = sens.load_data(single_source=False, n_max=600000, features=prepared_features, agg_method='average')
stop = datetime.datetime.now()

---- DONE: Target sentences loaded
---- DONE: Source sentences loaded
---- DONE: Sentences preprocessed
---- ERROR: Sentence embedding failed in en on ID 299392: and
---- ERROR: Sentence embedding failed in en on ID 548101: And:
---- ERROR: Sentence embedding failed in en on ID 543751: and
---- ERROR: Sentence embedding failed in en on ID 289416: and
---- ERROR: Sentence embedding failed in en on ID 407945: and
---- ERROR: Sentence embedding failed in en on ID 246928: and
---- ERROR: Sentence embedding failed in en on ID 279696: Banotti: 369
---- ERROR: Sentence embedding failed in en on ID 279698: Balfe: 259
---- ERROR: Sentence embedding failed in en on ID 15251: UCITS
---- ERROR: Sentence embedding failed in en on ID 77204: Hooliganism
---- ERROR: Sentence embedding failed in en on ID 181268: and
---- ERROR: Sentence embedding failed in en on ID 26774: Eurodac
---- ERROR: Sentence embedding failed in en on ID 185366: Transmissible spongiform encephalopathies
---- ERROR: Sentence emb

---- INFO: Sentences embeddings extracted in de
---- DONE: Sentences transformed
---- INFO: Embedding space of source language transformed according to projection matrix
---- DONE: Source words extracted
---- DONE: Target words extracted
---- INFO: Start preparation of text-based feature translated_words
---- INFO: Start preparation of text-based feature num_words
---- INFO: Start preparation of text-based feature num_punctuation
---- INFO: Start preparation of text-based feature occ_question_mark
---- INFO: Start preparation of text-based feature occ_exclamation_mark
---- DONE: All features prepared
---- DONE: Dropped duplicates and created full dataset
---- INFO: Length of dataset after preprocessing and duplicate handling: 589164


In [9]:
# Row count of the loaded dataset; should match the length reported at the
# end of the load_data log.
len(data)

589164

In [10]:
# Report the duration of the load_data step timed above.
time(start, stop)

Computation time loading the data: 0:09:10.688021
Finished at: 2020-05-12 22:12:17.003642


In [11]:
# Preview the first rows; all columns are shown because display.max_columns
# was set to None in the first cell.
data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.3088614732459478, 0.3472727241705731, -0.2...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]","[resumption, session]","[wiederaufnahme, sitzungsperiode]","[sitzungsperiode, sitzung, tagung, session]",[],2,2,0,0,False,False,False,False
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522993, 0.2886577921005622, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...","[declare, resumed, session, european, parliame...","[erkläre, freitag, dezember, unterbrochene, si...","[sitzungsperiode, sitzung, tagung, session, eu...","[friday, fridays, december, dec, european, goo...",21,16,2,5,False,False,False,False
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712227, 0.07257094264933953, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...","[although, seen, dreaded, millennium, bug, fai...","[feststellen, konnten, gefürchtete, millenium,...","[zwar, obgleich, obwohl, gesehen, sah, geschei...","[detect, couldn, citizens, citizen, our, ours,...",17,13,6,6,False,False,False,False
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.1613263801336713, 0.22534862126715746, -0....","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[beantragt, angefordert, beantragte, gewünscht...","[parliament, parliaments, ep, consists, insist...",8,8,3,1,False,False,False,False
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.19760572236106183, 0.15322830212421143, -0...","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...","[meantime, like, observe, minute, silence, num...","[heute, möchte, bitten, wunsch, kolleginnen, k...","[mögen, gefällt, wie, like, minute, nummer, an...","[today, wants, pleas, beg, desire, wish, colle...",18,15,7,6,False,False,False,False


## Create training set

In [12]:
# Timed step: build the training set from the loaded sentence pairs.
# NOTE(review): frac_pos presumably is the share of true translation pairs;
# the log reports that non-translation indices are shuffled, and no random
# seed is set in this notebook - confirm whether create_train_set seeds
# internally, otherwise the result is not reproducible.
start = datetime.datetime.now()
train_data = sens.create_train_set(n_train=500000, frac_pos=0.5)
stop = datetime.datetime.now()

---- INFO: Translation dataframe created
---- INFO: Non-translation dataframe created
---- INFO: Determined and shuffled non-translation indices
---- INFO: Feature column trg_sentence appended
---- INFO: Feature column trg_preprocessed appended
---- INFO: Feature column trg_embedding appended
---- INFO: Feature column trg_words appended
---- INFO: Feature column trg_translated_words appended
---- INFO: Feature column trg_num_words appended
---- INFO: Feature column trg_num_punctuation appended
---- INFO: Feature column trg_occ_question_mark appended
---- INFO: Feature column trg_occ_exclamation_mark appended
---- INFO: All features appended
---- INFO: Added non-translation indicator
---- DONE: Training dataset created


In [None]:
# Report the duration of create_train_set. The helper's message is fixed, so
# the output still reads "loading the data" for this step.
time(start, stop)

Computation time loading the data: 0:00:04.477559
Finished at: 2020-05-12 22:12:21.589882


## Create test collection

In [None]:
# Timed step: build the query/document test collection.
# NOTE(review): per the log, queries and documents are merged into one frame
# with a translation indicator; presumably that yields n_queries * n_docs
# rows - confirm, since it drives memory use and the later extraction time.
start = datetime.datetime.now()
test_collection = sens.create_test_collection(n_queries=1000, n_docs=10000)
stop = datetime.datetime.now()

---- INFO: Preliminary queries dataframe created
---- INFO: Preliminary documentes dataframe created
---- INFO: Merged queries and documents dataframe
---- INFO: Merged with test dataframe
---- INFO: Added translation indicator
---- DONE: Test collection created


In [None]:
# Report the duration of create_test_collection. The helper's message is
# fixed, so the output still reads "loading the data" for this step.
time(start, stop)

Computation time loading the data: 0:01:04.869158
Finished at: 2020-05-12 22:13:26.498023


## Feature extraction

In [None]:
# Feature names to extract, grouped by the module that defines them.
features_dict = {
    group: list(module.FEATURES.keys())
    for group, module in (('text_based', text_based), ('vector_based', vector_based))
}

In [None]:
# Timed step: extract all features listed in features_dict; the log shows the
# train dataset being processed first. drop_prepared=False keeps the prepared
# helper columns, which are still visible in the previews further down.
# NOTE(review): the first return value is bound to `train_test`, which is
# never used afterwards; later cells preview and pickle `train_data` instead.
# The train_data preview does contain the extracted feature columns, so the
# call appears to update the frame in place - confirm, and consider binding
# the result to `train_data` to make the lineage explicit.
start = datetime.datetime.now()
train_test, test_collection = sens.extract_features(features_dict=features_dict, data='train_test', drop_prepared=False)
stop = datetime.datetime.now()


---- INFO: No vector elements as features specified
---- INFO: Started extraction of text-based feature norm_diff_translated_words
---- INFO: Started extraction of text-based feature abs_diff_num_words
---- INFO: Started extraction of text-based feature abs_diff_num_punctuation
---- INFO: Started extraction of text-based feature abs_diff_occ_question_mark
---- INFO: Started extraction of text-based feature abs_diff_occ_exclamation_mark
---- INFO: Started extraction of text-based feature rel_diff_num_words
---- INFO: Started extraction of text-based feature rel_diff_num_punctuation
---- INFO: Started extraction of text-based feature norm_diff_num_words
---- INFO: Started extraction of text-based feature norm_diff_num_punctuation
---- INFO: Started extraction of vector-based feature euclidean_distance
---- INFO: Started extraction of vector-based feature cosine_similarity
---- DONE: All given features extracted on train dataset
-----------------------
---- INFO: Started extraction of tex

In [None]:
# Report the duration of extract_features (by far the slowest step in the
# recorded run). The helper's message still reads "loading the data".
time(start, stop)

Computation time loading the data: 2:51:42.662933
Finished at: 2020-05-13 01:05:09.303508


## View data

In [None]:
# Preview the training set; the extracted feature columns should appear after
# the `translation` label column.
train_data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark,translation,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,rel_diff_num_words,rel_diff_num_punctuation,norm_diff_num_words,norm_diff_num_punctuation,euclidean_distance,cosine_similarity
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.3088614732459478, 0.3472727241705731, -0.2...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]","[resumption, session]","[wiederaufnahme, sitzungsperiode]","[sitzungsperiode, sitzung, tagung, session]",[],2,2,0,0,False,False,False,False,1,0.25,0,0,1,1,0,0,0.0,0.0,2.806763,0.69953
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522993, 0.2886577921005622, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...","[declare, resumed, session, european, parliame...","[erkläre, freitag, dezember, unterbrochene, si...","[sitzungsperiode, sitzung, tagung, session, eu...","[friday, fridays, december, dec, european, goo...",21,16,2,5,False,False,False,False,1,0.305263,5,3,1,1,5,-3,0.277778,-1.0,0.989671,0.87535
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712227, 0.07257094264933953, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...","[although, seen, dreaded, millennium, bug, fai...","[feststellen, konnten, gefürchtete, millenium,...","[zwar, obgleich, obwohl, gesehen, sah, geschei...","[detect, couldn, citizens, citizen, our, ours,...",17,13,6,6,False,False,False,False,1,0.0,4,0,1,1,4,0,0.266667,0.0,1.046691,0.842287
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.1613263801336713, 0.22534862126715746, -0....","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...","[beantragt, angefordert, beantragte, gewünscht...","[parliament, parliaments, ep, consists, insist...",8,8,3,1,False,False,False,False,1,0.375,0,2,1,1,0,2,0.0,1.0,1.517368,0.749109
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.19760572236106183, 0.15322830212421143, -0...","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...","[meantime, like, observe, minute, silence, num...","[heute, möchte, bitten, wunsch, kolleginnen, k...","[mögen, gefällt, wie, like, minute, nummer, an...","[today, wants, pleas, beg, desire, wish, colle...",18,15,7,6,False,False,False,False,1,0.338889,3,1,1,1,3,1,0.1875,0.166667,0.871747,0.887995


In [None]:
# Preview the test collection; in the recorded output the same query sentence
# is paired with several different target sentences.
test_collection.head()

Unnamed: 0,src_sentence,src_preprocessed,src_embedding,src_embedding_aligned,src_words,src_translated_words,src_num_words,src_num_punctuation,src_occ_question_mark,src_occ_exclamation_mark,trg_sentence,trg_preprocessed,trg_embedding,trg_words,trg_translated_words,trg_num_words,trg_num_punctuation,trg_occ_question_mark,trg_occ_exclamation_mark,translation,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,rel_diff_num_words,rel_diff_num_punctuation,norm_diff_num_words,norm_diff_num_punctuation,euclidean_distance,cosine_similarity
0,I would also like to comment on the budget for...,"[would, also, like, comment, budget, culture, .]","[-0.13882914285714285, -0.017381557142857138, ...","[-0.05674507137870465, 0.04452169555670625, -0...","[would, also, like, comment, budget, culture]","[würden, wäre, hätte, würde, gäbe, ausserdem, ...",6,1,False,False,Eine Bemerkung zum Kulturetat.,"[bemerkung, kulturetat, .]","[-0.031845, 0.229315, 0.021962000000000002, -0...","[bemerkung, kulturetat]",[],2,1,False,False,1,0.25,4,0,1,1,4,0,1.0,0.0,1.88329,0.634981
1,I would also like to comment on the budget for...,"[would, also, like, comment, budget, culture, .]","[-0.13882914285714285, -0.017381557142857138, ...","[-0.05674507137870465, 0.04452169555670625, -0...","[would, also, like, comment, budget, culture]","[würden, wäre, hätte, würde, gäbe, ausserdem, ...",6,1,False,False,Ich bin nun seit ganz kurzer Zeit zweite stell...,"[seit, ganz, kurzer, zeit, zweite, stellvertre...","[-0.1876736827586207, 0.10062845862068968, -0....","[seit, ganz, kurzer, zeit, zweite, stellvertre...","[since, ganz, quite, short, brief, zeit, perio...",24,9,False,False,0,0.107143,18,8,1,1,-18,-8,-1.2,-1.6,1.194338,0.796495
2,I would also like to comment on the budget for...,"[would, also, like, comment, budget, culture, .]","[-0.13882914285714285, -0.017381557142857138, ...","[-0.05674507137870465, 0.04452169555670625, -0...","[would, also, like, comment, budget, culture]","[würden, wäre, hätte, würde, gäbe, ausserdem, ...",6,1,False,False,"Genau so ist es mit unseren Möglichkeiten, den...","[genau, möglichkeiten, ,, klassischen, teil, a...","[-0.07780442857142857, 0.045667428571428574, -...","[genau, möglichkeiten, klassischen, teil, agra...","[precisely, exact, exactly, possibilities, opt...",7,2,False,False,0,0.0,1,1,1,1,-1,-1,-0.166667,-1.0,1.461955,0.697996
3,I would also like to comment on the budget for...,"[would, also, like, comment, budget, culture, .]","[-0.13882914285714285, -0.017381557142857138, ...","[-0.05674507137870465, 0.04452169555670625, -0...","[would, also, like, comment, budget, culture]","[würden, wäre, hätte, würde, gäbe, ausserdem, ...",6,1,False,False,Wenn wir denn wirklich mit der Lissabon-Strate...,"[wirklich, lissabon, -, strategie, ,, nämlich,...","[-0.1879221111111111, 0.030884555555555557, -0...","[wirklich, lissabon, strategie, nämlich, ökono...","[truly, really, lisboa, lisbon, namely, rebuil...",13,6,False,False,0,0.0,7,5,1,1,-7,-5,-0.777778,-1.666667,1.394792,0.729719
4,I would also like to comment on the budget for...,"[would, also, like, comment, budget, culture, .]","[-0.13882914285714285, -0.017381557142857138, ...","[-0.05674507137870465, 0.04452169555670625, -0...","[would, also, like, comment, budget, culture]","[würden, wäre, hätte, würde, gäbe, ausserdem, ...",6,1,False,False,"Denn in Jugend und Intelligenz zu investieren,...","[jugend, intelligenz, investieren, ,, wirklich...","[-0.15326981818181815, 0.07748509090909092, -0...","[jugend, intelligenz, investieren, wirklichen,...","[youth, teens, teenager, young, adolescence, m...",9,3,False,False,0,0.0,3,2,1,1,-3,-2,-0.428571,-1.0,1.389512,0.733129


## Save data

In [None]:
# Output directory for every artefact written by the cells below.
# NOTE(review): the f-string joins without a separator, so paths.data_path is
# assumed to end with a path separator - confirm in ir_crosslingual.utils.paths.
path = f'{paths.data_path}extracted_data/global'
# exist_ok=True creates missing parent directories and is a no-op when the
# directory already exists; this replaces the separate os.path.exists check,
# which could race with another process creating the directory.
os.makedirs(path, exist_ok=True)

In [None]:
# Persist the training set (including extracted features) as a pickle.
# NOTE(review): the filename says "tfidf" while load_data above is called with
# agg_method='average' - confirm the name matches the run being saved.
train_data.to_pickle(f'{path}/training_data_tfidf.pkl')
# Bare string as last expression: displayed as the cell output.
'Finished at: {}'.format(datetime.datetime.now())

In [None]:
# Persist the test collection (including extracted features) as a pickle.
# NOTE(review): the filename says "tfidf" while load_data above is called with
# agg_method='average' - confirm the name matches the run being saved.
test_collection.to_pickle(f'{path}/test_collection_tfidf.pkl')
# Bare string as last expression: displayed as the cell output.
'Finished at: {}'.format(datetime.datetime.now())

In [None]:
# End-to-end duration since overall_start was taken before loading the word
# embeddings; the formatted string is displayed as the cell output.
overall_stop = datetime.datetime.now()
'Total computation time: {}'.format(overall_stop-overall_start)

## For tfidf only: Save idf values

In [None]:
# Map every English vocabulary term to its IDF weight: vocabulary_ yields
# (term, index) pairs and the index selects the matching entry of idf_.
# NOTE(review): presumably sens.vectorizer holds fitted scikit-learn
# TfidfVectorizer objects per language - confirm in ir_crosslingual.sentences.
idf_en = dict(
    (term, sens.vectorizer['en'].idf_[index])
    for term, index in sens.vectorizer['en'].vocabulary_.items()
)

In [None]:
# Map every German vocabulary term to its IDF weight: vocabulary_ yields
# (term, index) pairs and the index selects the matching entry of idf_.
# NOTE(review): presumably sens.vectorizer holds fitted scikit-learn
# TfidfVectorizer objects per language - confirm in ir_crosslingual.sentences.
idf_de = dict(
    (term, sens.vectorizer['de'].idf_[index])
    for term, index in sens.vectorizer['de'].vocabulary_.items()
)

In [None]:
# Persist the English term -> IDF mapping with pickle.
# NOTE(review): the file has a .json extension but contains a binary pickle,
# not JSON, so it must be read back with pickle.load. Consider renaming it
# once all consumers of this file have been checked.
with open(f'{path}/idf_english.json', 'wb') as fp:
    pickle.dump(idf_en, fp, protocol=pickle.HIGHEST_PROTOCOL)
# Bare string as last expression: displayed as the cell output.
'Finished at: {}'.format(datetime.datetime.now())

In [None]:
# Persist the German term -> IDF mapping with pickle.
# NOTE(review): the file has a .json extension but contains a binary pickle,
# not JSON, so it must be read back with pickle.load. Consider renaming it
# once all consumers of this file have been checked.
with open(f'{path}/idf_german.json', 'wb') as fp:
    pickle.dump(idf_de, fp, protocol=pickle.HIGHEST_PROTOCOL)
# Bare string as last expression: displayed as the cell output.
'Finished at: {}'.format(datetime.datetime.now())