## Imports

In [6]:
import io, os, importlib, pickle, math
import datetime
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [7]:
# Each project module is imported and then immediately reloaded, so edits made
# to the module files are picked up without restarting the kernel.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
# Last expression of the cell: the reloaded module object is echoed as the cell output.
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/jani/PycharmProjects/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

In [8]:
def time(start, stop, message):
    """Print how long a step took and the moment the report was produced.

    Args:
        start: Value marking the beginning of the measured step.
        stop: Value marking the end of the step; ``stop - start`` is printed
            as the duration.
        message: Short description of the step, inserted into the first
            output line.

    Returns:
        None. The function only writes two lines to standard output.
    """
    elapsed = stop - start
    print(f'Computation time {message}: {elapsed}')
    # The timestamp is taken at reporting time, not at `stop`.
    finished_at = datetime.datetime.now()
    print(f'Finished at: {finished_at}')

## Load word embeddings

In [9]:
# Wall-clock start of the run; read again when the total computation time is reported.
overall_start = datetime.datetime.now()

In [10]:
# Create one WordEmbeddings object per language ('fr', 'en') and call
# load_embeddings() on each.
# NOTE(review): where the vectors are read from is decided inside
# ir_crosslingual.embeddings and is not visible in this notebook.
french = embeddings.WordEmbeddings('fr')
french.load_embeddings()

english = embeddings.WordEmbeddings('en')
english.load_embeddings()

In [11]:
# Learn the projection matrices with English as source and French as target.
# The recorded log shows that this single call learns both directions
# (en-fr first, then fr-en).
# NOTE(review): W_enfr / W_fren are not referenced by any later cell in this
# notebook; presumably the call also registers the matrices for later use by
# Sentences — confirm in embeddings.WordEmbeddings.
W_enfr, W_fren = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='fr')

---- INFO: Learn projection matrix for en-fr
---- INFO: Found 10369 valid translation pairs in expert dictionary.
---- INFO: 503 other pairs contained at least one unknown word (0 in source language, 503 in target language).
---- DONE: Seed dictionary extracted for the languages: en-fr
---- INFO: Resulting subspace dimension: (10369, 300)
---- INFO: Resulting subspace dimension: (10369, 300)
---- DONE: Projection matrix learned from en to fr
---- INFO: Learn projection matrix for fr-en
---- INFO: Found 7938 valid translation pairs in expert dictionary.
---- INFO: 332 other pairs contained at least one unknown word (0 in source language, 332 in target language).
---- DONE: Seed dictionary extracted for the languages: fr-en
---- INFO: Resulting subspace dimension: (7938, 300)
---- INFO: Resulting subspace dimension: (7938, 300)
---- DONE: Projection matrix learned from fr to en


## Load sentence embeddings

In [12]:
# Build the sentence container with English as source and French as target,
# then load the data together with all prepared text-based features.
# start/stop bracket the step so its duration can be reported with time().
# NOTE(review): n_max=15000 presumably caps the number of sentence pairs read
# and agg_method='average' presumably averages word vectors into sentence
# vectors — confirm both against sentences.Sentences.load_data.
start = datetime.datetime.now()
sens = sentences.Sentences(src_words=english, trg_words=french)
prepared_features = list(text_based.PREPARED_FEATURES.keys())
data = sens.load_data(single_source=False, n_max=15000, features=prepared_features, agg_method='average')
stop = datetime.datetime.now()

---- DONE: Target sentences loaded
---- DONE: Source sentences loaded
---- DONE: Sentences preprocessed
---- ERROR: Sentence embedding failed in en on ID 6544: ALTENER
---- ERROR: Sentence embedding failed in en on ID 10259: Scrapie
---- ERROR: Sentence embedding failed in en on ID 7439: and
---- INFO: Sentences embeddings extracted in en
---- ERROR: Sentence embedding failed in fr on ID 6544: Altener
---- ERROR: Sentence embedding failed in fr on ID 10259: La tremblante
---- ERROR: Sentence embedding failed in fr on ID 7439: et
---- INFO: Sentences embeddings extracted in fr
---- INFO: Shape of source word embeddings
---- INFO: Extracted word embeddings of found words
---- DONE: Sentences transformed
---- INFO: Embedding space of source language transformed according to projection matrix
---- DONE: Source words extracted
---- DONE: Target words extracted
---- INFO: Embeddings of found words added as a column
---- INFO: Start preparation of text-based feature translated_words
---- INFO

In [13]:
# Number of entries in the object returned by load_data().
len(data)

14864

In [14]:
# Report the duration of the load_data() step measured by start/stop.
time(start, stop, 'loading the data')

Computation time loading the data: 0:00:05.836677
Finished at: 2020-05-23 02:57:26.054744


In [15]:
# Preview the first rows of the loaded data, including the prepared feature columns.
data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_words,trg_words,src_words_found_embedding,trg_words_found_embedding,src_translated_words,trg_translated_words,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.12310489002602223, -0.09114010555187518, -...","[-0.24092, 0.0007800000000000012, -0.203353, 0...",Resumption of the session,Reprise de la session,"[resumption, session]","[reprise, session]","[resumption, session]","[reprise, session]","[[0.13287063559975887, -0.039487647615057006, ...","[[-0.11368, -0.016772, -0.097256, -0.14195, 0....","[session, séance]","[resumed, resume, reprise, session]",2,2,0,0,False,False,False,False
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.15369624684953903, -0.02870744029014017, -...","[-0.16036031250000002, -0.0502420625, -0.25120...",I declare resumed the session of the European ...,Je déclare reprise la session du Parlement eur...,"[declare, resumed, session, european, parliame...","[déclare, reprise, session, parlement, europée...","[declare, resumed, session, european, parliame...","[déclare, reprise, session, parlement, europée...","[[-0.06874171352754704, 0.06292740460440661, -...","[[0.11086, -0.02769, -0.077919, 0.16068, 0.286...","[session, séance, européens, européennes, euro...","[declares, declare, resumed, resume, reprise, ...",21,17,2,1,False,False,False,False
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.1387243927412599, 0.022607814078046323, -0...","[-0.08278590769230772, 0.042981399999999996, -...","Although, as you will have seen, the dreaded'm...","Comme vous avez pu le constater, le grand ""bog...","[although, ,, seen, ,, dreaded, ', millennium,...","[comme, pu, constater, ,, grand, "", bogue, ', ...","[although, seen, dreaded, millennium, bug, fai...","[comme, pu, constater, grand, bogue, an, produ...","[[-0.005225076408981788, 0.1502183274231433, -...","[[-0.021896, -0.075727, 0.18826, 0.21511, 0.07...","[quoique, vu, vus, échec, raté, échoué, still,...","[as, like, large, big, great, grand, product, ...",17,18,6,9,False,False,False,False
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.02876501347855113, 0.043874074162248124, -...","[-0.0774160909090909, -0.11717536363636362, -0...",You have requested a debate on this subject in...,Vous avez souhaité un débat à ce sujet dans le...,"[requested, debate, subject, course, next, day...","[souhaité, débat, sujet, prochains, jours, ,, ...","[requested, debate, subject, course, next, day...","[souhaité, débat, sujet, prochains, jours, cou...","[[-0.19922813348606175, -0.30227105407400734, ...","[[-0.17082, -0.37877, -0.10921, 0.13303, 0.076...","[demandée, demandé, sollicité, demandés, débat...","[debates, discussion, debate, topic, subject, ...",8,9,3,2,False,False,False,False
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.10589229490151733, -0.030987626808951958, ...","[-0.13213240269230775, -0.05523615384615385, -...","In the meantime, I should like to observe a mi...","En attendant, je souhaiterais, comme un certai...","[meantime, ,, like, observe, minute, ', silenc...","[attendant, ,, souhaiterais, ,, comme, certain...","[meantime, like, observe, minute, silence, num...","[attendant, souhaiterais, comme, certain, nomb...","[[-0.2649453375682037, 0.030093366664549925, 0...","[[-0.12316, -0.109, -0.058061, 0.001211, 0.231...","[comme, aime, aiment, genre, aimez, minute, no...","[waiting, as, like, certain, number, requested...",18,19,7,8,False,False,False,False


## Create test collection

In [16]:
# Build the test collection from the loaded sentences; start/stop bracket the
# step for the timing report.
# NOTE(review): n_queries=1000 and n_docs=10000 presumably set how many source
# sentences act as queries and how many target sentences act as documents —
# confirm against sentences.Sentences.create_test_collection.
start = datetime.datetime.now()
test_collection = sens.create_test_collection(n_queries=1000, n_docs=10000)
stop = datetime.datetime.now()

---- INFO: Preliminary queries dataframe created
---- INFO: Preliminary documentes dataframe created
---- INFO: Merged queries and documents dataframe
---- INFO: Merged with test dataframe
---- INFO: Added translation indicator
---- DONE: Test collection created


In [17]:
# Report the duration of create_test_collection() measured by start/stop.
time(start, stop, 'creating the test collection')

Computation time creating the test collection: 0:01:30.253024
Finished at: 2020-05-23 02:58:56.644508


## Feature extraction

In [18]:
# Map each feature group to the names of all features its module declares.
features_dict = {
    group: list(module.FEATURES.keys())
    for group, module in (('text_based', text_based),
                          ('vector_based', vector_based))
}

In [19]:
# Extract all text-based and vector-based features for the test collection and
# rebind test_collection to the returned frame. start/stop bracket the step.
# In the recorded run this was by far the slowest step (about 2.5 hours).
# NOTE(review): drop_prepared=True presumably removes the intermediate
# "prepared" feature columns from the result — confirm in extract_features.
start = datetime.datetime.now()
test_collection = sens.extract_features(features_dict=features_dict, data='test', drop_prepared=True)
stop = datetime.datetime.now()


---- INFO: No vector elements as features specified
---- INFO: Started extraction of text-based feature norm_diff_translated_words
---- INFO: Started extraction of text-based feature abs_diff_num_words
---- INFO: Started extraction of text-based feature abs_diff_num_punctuation
---- INFO: Started extraction of text-based feature abs_diff_occ_question_mark
---- INFO: Started extraction of text-based feature abs_diff_occ_exclamation_mark
---- INFO: Started extraction of text-based feature rel_diff_num_words
---- INFO: Started extraction of text-based feature rel_diff_num_punctuation
---- INFO: Started extraction of text-based feature norm_diff_num_words
---- INFO: Started extraction of text-based feature norm_diff_num_punctuation
---- INFO: Started extraction of vector-based feature euclidean_distance
---- INFO: Started extraction of vector-based feature cosine_similarity


In [20]:
# Report the duration of extract_features() measured by start/stop.
time(start, stop, 'extracting the features')

Computation time extracting the features: 2:26:29.298251
Finished at: 2020-05-23 05:25:26.023951


## View data

In [21]:
# Preview the first rows of the test collection with the extracted feature columns.
test_collection.head()

Unnamed: 0,src_sentence,src_preprocessed,src_embedding,src_embedding_aligned,src_words,src_words_found_embedding,trg_sentence,trg_preprocessed,trg_embedding,trg_words,trg_words_found_embedding,translation,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,rel_diff_num_words,rel_diff_num_punctuation,norm_diff_num_words,norm_diff_num_punctuation,euclidean_distance,cosine_similarity
0,"After this interesting exchange of opinions, a...","[interesting, exchange, opinions, ,, accordanc...","[-0.07247762499999999, -0.05773741250000001, -...","[-0.07755082538309126, 0.026599344197241616, -...","[interesting, exchange, opinions, accordance, ...","[[0.0160077380810206, 0.24196988362157357, -0....","Après cet intéressant échange d'opinions, conf...","[après, cet, intéressant, échange, ', opinions...","[-0.0852523125, 0.010679999999999997, -0.11436...","[après, cet, intéressant, échange, opinions, c...","[[-0.070378, 0.17455, -0.16929, 0.19593, -0.08...",1,0.282051,1,1,1,1,1,-1,0.071429,-0.333333,0.882359,0.881754
1,"After this interesting exchange of opinions, a...","[interesting, exchange, opinions, ,, accordanc...","[-0.07247762499999999, -0.05773741250000001, -...","[-0.07755082538309126, 0.026599344197241616, -...","[interesting, exchange, opinions, accordance, ...","[[0.0160077380810206, 0.24196988362157357, -0....",L'heure des questions au conseil est close.,"[', heure, questions, conseil, close, .]","[-0.06069066666666667, -0.017596166666666677, ...","[heure, questions, conseil, close]","[[-0.14007, -0.03196, -0.065519, 0.051289, 0.0...",0,0.163462,11,1,1,1,11,1,1.222222,0.5,1.488621,0.72115
2,"After this interesting exchange of opinions, a...","[interesting, exchange, opinions, ,, accordanc...","[-0.07247762499999999, -0.05773741250000001, -...","[-0.07755082538309126, 0.026599344197241616, -...","[interesting, exchange, opinions, accordance, ...","[[0.0160077380810206, 0.24196988362157357, -0....",Impôt sur le capital,"[impôt, capital]","[-0.129457, -0.12695800000000002, -0.0954415, ...","[impôt, capital]","[[-0.21623, -0.23299, -0.20862, -0.1637, 0.175...",0,0.0,13,3,1,1,13,3,1.625,3.0,3.805504,0.409065
3,"After this interesting exchange of opinions, a...","[interesting, exchange, opinions, ,, accordanc...","[-0.07247762499999999, -0.05773741250000001, -...","[-0.07755082538309126, 0.026599344197241616, -...","[interesting, exchange, opinions, accordance, ...","[[0.0160077380810206, 0.24196988362157357, -0....",L'ordre du jour appelle en discussion commune:,"[', ordre, jour, appelle, discussion, commune, :]","[-0.12453099999999999, -0.039995, -0.042993183...","[ordre, jour, appelle, discussion, commune]","[[-0.14007, -0.03196, -0.065519, 0.051289, 0.0...",0,0.0,10,1,1,1,10,1,1.0,0.5,1.714162,0.634488
4,"After this interesting exchange of opinions, a...","[interesting, exchange, opinions, ,, accordanc...","[-0.07247762499999999, -0.05773741250000001, -...","[-0.07755082538309126, 0.026599344197241616, -...","[interesting, exchange, opinions, accordance, ...","[[0.0160077380810206, 0.24196988362157357, -0....",la question orale (B5-0004/ 2000) de M. Désir ...,"[question, orale, (, b5, -, 0004, /, 2000, ), ...","[-0.10444363157894736, 0.012410052631578952, -...","[question, orale, désir, autres, conseil, posi...","[[-0.12886, -0.14088, 0.036697, -0.060468, -0....",0,0.0,2,6,1,1,2,-6,0.142857,-1.0,1.28126,0.741908


## Create chunks

In [22]:
# Split the test collection into at most N_CHUNKS consecutive row slices so
# that each slice can be written to its own file.
N_CHUNKS = 20
# The chunk size is computed once. max(1, ...) keeps the range() step valid
# for an empty collection (a step of 0 would raise ValueError); for any
# non-empty collection the result is unchanged.
chunk_size = max(1, math.ceil(len(test_collection) / N_CHUNKS))
chunks = [test_collection[offset:offset + chunk_size]
          for offset in range(0, len(test_collection), chunk_size)]

## Save data

In [23]:
# Output directory for the en-fr test collection chunks.
# NOTE(review): the f-string assumes paths.data_path already ends with a path
# separator — confirm in ir_crosslingual.utils.paths.
path = f'{paths.data_path}extracted_data/global/en-fr'
# exist_ok=True creates the directory (including parents) only when it is
# missing, without the check-then-create gap of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [24]:
# Persist every chunk as its own pickle file and report the total time taken.
# The loop variable is named `chunk` so that the notebook-level `data` frame
# loaded earlier is not silently overwritten by the last chunk.
start = datetime.datetime.now()
for idx, chunk in enumerate(chunks):
    chunk.to_pickle(f'{path}/test_collection_{idx:02d}_avg.pkl')
    print(f'---- INFO: Chunk {idx:02d} saved')
time(start, datetime.datetime.now(), 'saving test collection')

---- INFO: Chunk 00 saved
---- INFO: Chunk 01 saved
---- INFO: Chunk 02 saved
---- INFO: Chunk 03 saved
---- INFO: Chunk 04 saved
---- INFO: Chunk 05 saved
---- INFO: Chunk 06 saved
---- INFO: Chunk 07 saved
---- INFO: Chunk 08 saved
---- INFO: Chunk 09 saved
---- INFO: Chunk 10 saved
---- INFO: Chunk 11 saved
---- INFO: Chunk 12 saved
---- INFO: Chunk 13 saved
---- INFO: Chunk 14 saved
---- INFO: Chunk 15 saved
---- INFO: Chunk 16 saved
---- INFO: Chunk 17 saved
---- INFO: Chunk 18 saved
---- INFO: Chunk 19 saved
Computation time saving test collection: 0:01:19.946966
Finished at: 2020-05-23 05:26:47.384690


In [25]:
# Wall-clock end of the run; the formatted duration is the cell's displayed value.
overall_stop = datetime.datetime.now()
total_runtime = overall_stop - overall_start
f'Total computation time: {total_runtime}'

'Total computation time: 2:29:27.862083'

In [26]:
# Second pass, reversed direction: learn the projection matrices with French
# as source and English as target. The recorded log shows both directions are
# learned again (fr-en first, then en-fr), hence the swapped unpacking order.
# NOTE(review): W_fren / W_enfr are not referenced by any later cell in this
# notebook; presumably the call also registers the matrices for later use by
# Sentences — confirm in embeddings.WordEmbeddings.
W_fren, W_enfr = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='fr', trg_lang='en')

---- INFO: Learn projection matrix for fr-en
---- INFO: Found 7938 valid translation pairs in expert dictionary.
---- INFO: 332 other pairs contained at least one unknown word (0 in source language, 332 in target language).
---- DONE: Seed dictionary extracted for the languages: fr-en
---- INFO: Resulting subspace dimension: (7938, 300)
---- INFO: Resulting subspace dimension: (7938, 300)
---- DONE: Projection matrix learned from fr to en
---- INFO: Learn projection matrix for en-fr
---- INFO: Found 10369 valid translation pairs in expert dictionary.
---- INFO: 503 other pairs contained at least one unknown word (0 in source language, 503 in target language).
---- DONE: Seed dictionary extracted for the languages: en-fr
---- INFO: Resulting subspace dimension: (10369, 300)
---- INFO: Resulting subspace dimension: (10369, 300)
---- DONE: Projection matrix learned from en to fr


In [27]:
# Same loading step as for en-fr, but with French as source and English as
# target. Rebinds the notebook-level names sens, prepared_features and data.
# NOTE(review): start/stop are overwritten by the next cell without being
# passed to time(), so this step's duration is never reported.
start = datetime.datetime.now()
sens = sentences.Sentences(src_words=french, trg_words=english)
prepared_features = list(text_based.PREPARED_FEATURES.keys())
data = sens.load_data(single_source=False, n_max=15000, features=prepared_features, agg_method='average')
stop = datetime.datetime.now()

---- DONE: Target sentences loaded
---- DONE: Source sentences loaded
---- DONE: Sentences preprocessed
---- ERROR: Sentence embedding failed in fr on ID 6544: Altener
---- ERROR: Sentence embedding failed in fr on ID 10259: La tremblante
---- ERROR: Sentence embedding failed in fr on ID 7439: et
---- INFO: Sentences embeddings extracted in fr
---- ERROR: Sentence embedding failed in en on ID 6544: ALTENER
---- ERROR: Sentence embedding failed in en on ID 10259: Scrapie
---- ERROR: Sentence embedding failed in en on ID 7439: and
---- INFO: Sentences embeddings extracted in en
---- INFO: Shape of source word embeddings
---- INFO: Extracted word embeddings of found words
---- DONE: Sentences transformed
---- INFO: Embedding space of source language transformed according to projection matrix
---- DONE: Source words extracted
---- DONE: Target words extracted
---- INFO: Embeddings of found words added as a column
---- INFO: Start preparation of text-based feature translated_words
---- INFO

In [28]:
# Build the fr-en test collection with the same sizes as the en-fr run and
# rebind test_collection to it.
# NOTE(review): start/stop are overwritten by a later cell without being
# passed to time(), so this step's duration is never reported.
start = datetime.datetime.now()
test_collection = sens.create_test_collection(n_queries=1000, n_docs=10000)
stop = datetime.datetime.now()

---- INFO: Preliminary queries dataframe created
---- INFO: Preliminary documentes dataframe created
---- INFO: Merged queries and documents dataframe
---- INFO: Merged with test dataframe
---- INFO: Added translation indicator
---- DONE: Test collection created


In [29]:
# Map each feature group to the names of all features its module declares
# (same selection as used for the en-fr run).
features_dict = {
    group: list(module.FEATURES.keys())
    for group, module in (('text_based', text_based),
                          ('vector_based', vector_based))
}

In [30]:
# Extract all text-based and vector-based features for the fr-en test
# collection and rebind test_collection to the returned frame.
# NOTE(review): `stop` from this cell is never passed to time(), so the
# duration of this step is not reported.
start = datetime.datetime.now()
test_collection = sens.extract_features(features_dict=features_dict, data='test', drop_prepared=True)
stop = datetime.datetime.now()

---- INFO: No vector elements as features specified
---- INFO: Started extraction of text-based feature norm_diff_translated_words
---- INFO: Started extraction of text-based feature abs_diff_num_words
---- INFO: Started extraction of text-based feature abs_diff_num_punctuation
---- INFO: Started extraction of text-based feature abs_diff_occ_question_mark
---- INFO: Started extraction of text-based feature abs_diff_occ_exclamation_mark
---- INFO: Started extraction of text-based feature rel_diff_num_words
---- INFO: Started extraction of text-based feature rel_diff_num_punctuation
---- INFO: Started extraction of text-based feature norm_diff_num_words
---- INFO: Started extraction of text-based feature norm_diff_num_punctuation
---- INFO: Started extraction of vector-based feature euclidean_distance
---- INFO: Started extraction of vector-based feature cosine_similarity


In [31]:
# Split the fr-en test collection into at most N_CHUNKS consecutive row slices
# so that each slice can be written to its own file.
N_CHUNKS = 20
# The chunk size is computed once. max(1, ...) keeps the range() step valid
# for an empty collection (a step of 0 would raise ValueError); for any
# non-empty collection the result is unchanged.
chunk_size = max(1, math.ceil(len(test_collection) / N_CHUNKS))
chunks = [test_collection[offset:offset + chunk_size]
          for offset in range(0, len(test_collection), chunk_size)]

In [32]:
# Output directory for the fr-en test collection chunks.
# NOTE(review): the f-string assumes paths.data_path already ends with a path
# separator — confirm in ir_crosslingual.utils.paths.
path = f'{paths.data_path}extracted_data/global/fr-en'
# exist_ok=True creates the directory (including parents) only when it is
# missing, without the check-then-create gap of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

In [33]:
# Persist every fr-en chunk as its own pickle file and report the total time.
# The loop variable is named `chunk` so that the notebook-level `data` frame
# returned by load_data() is not silently overwritten by the last chunk.
start = datetime.datetime.now()
for idx, chunk in enumerate(chunks):
    chunk.to_pickle(f'{path}/test_collection_{idx:02d}_avg.pkl')
    print(f'---- INFO: Chunk {idx:02d} saved')
time(start, datetime.datetime.now(), 'saving test collection')

---- INFO: Chunk 00 saved
---- INFO: Chunk 01 saved
---- INFO: Chunk 02 saved
---- INFO: Chunk 03 saved
---- INFO: Chunk 04 saved
---- INFO: Chunk 05 saved
---- INFO: Chunk 06 saved
---- INFO: Chunk 07 saved
---- INFO: Chunk 08 saved
---- INFO: Chunk 09 saved
---- INFO: Chunk 10 saved
---- INFO: Chunk 11 saved
---- INFO: Chunk 12 saved
---- INFO: Chunk 13 saved
---- INFO: Chunk 14 saved
---- INFO: Chunk 15 saved
---- INFO: Chunk 16 saved
---- INFO: Chunk 17 saved
---- INFO: Chunk 18 saved
---- INFO: Chunk 19 saved
Computation time saving test collection: 0:01:09.492008
Finished at: 2020-05-23 07:56:37.035586


## For TF-IDF only: Save IDF values

In [34]:
#idf_en = {k: sens.vectorizer['en'].idf_[v] for k,v in sens.vectorizer['en'].vocabulary_.items()}

In [35]:
#idf_de = {k: sens.vectorizer['de'].idf_[v] for k,v in sens.vectorizer['de'].vocabulary_.items()}

In [36]:
#with open(f'{path}/idf_english.json', 'wb') as fp:
#    pickle.dump(idf_en, fp, protocol=pickle.HIGHEST_PROTOCOL)
#'Finished at: {}'.format(datetime.datetime.now())

In [37]:
#with open(f'{path}/idf_german.json', 'wb') as fp:
#    pickle.dump(idf_de, fp, protocol=pickle.HIGHEST_PROTOCOL)
#'Finished at: {}'.format(datetime.datetime.now())