In [95]:
import pandas as pd
import numpy as np
import importlib, os, math

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any that scikit-learn may raise while fitting or scoring below.
warnings.filterwarnings('ignore')

# Lift pandas' display limits so frames are rendered without truncating rows or columns.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [65]:
# Import the project modules and reload each one right after importing it, so that
# edits made to the package source are picked up without restarting the kernel.
from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/jani/PycharmProjects/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

# Load data

## Word embeddings

In [66]:
# German word embeddings (language code 'de').
german = embeddings.WordEmbeddings('de')
german.load_embeddings()

# English word embeddings (language code 'en').
english = embeddings.WordEmbeddings('en')
english.load_embeddings()

In [67]:
# Learn the projection matrices between the English and German embedding spaces;
# the logged output reports one fit for en-de and one for de-en.
# NOTE(review): presumably W_ende maps en->de and W_deen maps de->en -- confirm in learn_projection_matrix.
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

Learn projection matrix for en-de
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)
Resulting subspace dimension: (13700, 300)
Learn projection matrix for de-en
Found 10604 valid translation pairs in expert dictionary.
262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
Resulting subspace dimension: (10604, 300)
Resulting subspace dimension: (10604, 300)


## Sentence embeddings and feature engineering preparation

In [68]:
# Per-sentence features that are handed to load_data for preparation; the
# 'diff_*' feature names built further down are derived from these names.
prepared_features = ['num_words', 'num_punctuation', 'occ_question_mark', 'occ_exclamation_mark']

In [69]:
# Sentence container with the English embeddings as source and the German embeddings as target.
sens = sentences.Sentences(src_words=english, trg_words=german)

In [70]:
# Load the sentence data. According to the log output this also preprocesses the sentences,
# extracts sentence embeddings, applies the projection matrix to the source embeddings and
# prepares the features listed in `prepared_features`.
# NOTE(review): n_max presumably caps the number of loaded sentence pairs -- confirm in Sentences.load_data.
data = sens.load_data(single_source=False, n_max=50000, features=prepared_features)

Target sentences loaded
Source sentences loaded
Sentences preprocessed
Could not find a term of the sentence 'Additionality' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'IGC' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'Scrapie' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'Comitology:' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'Transmissible spongiform encephalopathies' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'UCITS' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'Eurodac' in 

Sentences embeddings extracted in de
Sentences transformed
Embedding space of source language transformed according to projection matrix
Start preparation of feature num_words
Start preparation of feature num_punctuation
Start preparation of feature occ_question_mark
Start preparation of feature occ_exclamation_mark


# Create training dataset and do feature engineering

In [71]:
len(sens.data)

49949

In [72]:
# Build the training set from the loaded sentences.
# NOTE(review): presumably n_train is the number of training pairs and frac_pos the share of
# true translation pairs (label 1) -- confirm in Sentences.create_train_set.
train_data = sens.create_train_set(n_train=40000, frac_pos=0.5)

In [73]:
train_data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark,translation
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.3088614732459478, 0.3472727241705731, -0.2...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]",2,2,0,0,False,False,False,False,1
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522993, 0.2886577921005622, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...",21,16,2,5,False,False,False,False,1
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712227, 0.07257094264933953, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...",17,13,6,6,False,False,False,False,1
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.1613263801336713, 0.22534862126715746, -0....","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...",8,8,3,1,False,False,False,False,1
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.19760572236106183, 0.15322830212421143, -0...","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...",18,15,7,6,False,False,False,False,1


In [74]:
# dim_emb controls how many sentence-embedding components become features:
#   0       -> no embedding components are extracted as features
#   1..299  -> the embedding is reduced to dim_emb dimensions and its components are extracted
#   300     -> the components of the original embedding are extracted unchanged
dim_emb = 0
features_dict = {
    'text_based': ['diff_{}'.format(name) for name in prepared_features],
    'vector_based': ['cosine_similarity'],
    # Source components first, then target components.
    'vector_elements': (
        ['src_embedding_{}'.format(idx) for idx in range(dim_emb)]
        + ['trg_embedding_{}'.format(idx) for idx in range(dim_emb)]
    ),
}

In [75]:
# Compute the features named in `features_dict` on the training data and rebind train_data
# to the returned frame (the head() output below shows the diff_* and cosine_similarity columns).
train_data = sens.extract_features(features_dict=features_dict, data='train')

Started diff_num_words
Started diff_num_punctuation
Started diff_occ_question_mark
Started diff_occ_exclamation_mark
Started cosine_similarity


In [76]:
train_data.head()

Unnamed: 0,src_embedding,src_embedding_aligned,trg_embedding,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,translation,diff_num_words,diff_num_punctuation,diff_occ_question_mark,diff_occ_exclamation_mark,cosine_similarity
0,"[-0.07040999999999999, 0.094428, 0.1178755, -0...","[-0.3088614732459478, 0.3472727241705731, -0.2...","[-0.22081, 0.49136, -0.09375700000000001, -0.6...",Resumption of the session,Wiederaufnahme der Sitzungsperiode,"[resumption, session]","[wiederaufnahme, sitzungsperiode]",1,0,0,0,0,0.69953
1,"[-0.05200671428571431, 0.016875466666666665, -...","[-0.19811859957522993, 0.2886577921005622, -0....","[-0.1368885, 0.29018044444444446, -0.152462499...",I declare resumed the session of the European ...,"Ich erkläre die am Freitag, dem 17. Dezember u...","[declare, resumed, session, european, parliame...","[erkläre, freitag, ,, 17, ., dezember, unterbr...",1,5,3,0,0,0.87535
2,"[-0.08509222727272729, -0.03279808636363638, -...","[-0.19086391830712227, 0.07257094264933953, -0...","[-0.1190726470588235, 0.13479876470588237, -0....","Although, as you will have seen, the dreaded '...","Wie Sie feststellen konnten, ist der gefürchte...","[although, ,, seen, ,, dreaded, ', millennium,...","[feststellen, konnten, ,, gefürchtete, "", mill...",1,4,0,0,0,0.842287
3,"[-0.064901, -0.08438565454545456, -0.008655272...","[-0.1613263801336713, 0.22534862126715746, -0....","[-0.12862244444444446, 0.33829177777777786, -0...",You have requested a debate on this subject in...,Im Parlament besteht der Wunsch nach einer Aus...,"[requested, debate, subject, course, next, day...","[parlament, besteht, wunsch, aussprache, verla...",1,0,2,0,0,0.749109
4,"[-0.07612752, -0.011232175999999993, -0.131726...","[-0.19760572236106183, 0.15322830212421143, -0...","[-0.21589275000000002, 0.14882689999999996, -0...","In the meantime, I should like to observe a mi...",Heute möchte ich Sie bitten - das ist auch der...,"[meantime, ,, like, observe, minute, ', silenc...","[heute, möchte, bitten, -, wunsch, kolleginnen...",1,3,1,0,0,0.887995


# Create logistic regression model

### Fit a logistic regression model on training data

In [77]:
# Name of the target column.
label = 'translation'
# Flatten the per-group feature lists into one list of model input columns (dict order kept).
features = [name for group in features_dict.values() for name in group]

In [78]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
logisticRegr = LogisticRegression()

In [79]:
# Feature matrix: the engineered feature columns collected in `features`.
X_train = train_data[features]
# Label vector as a 1-D Series. Selecting with [[label]] would yield a single-column
# DataFrame, which scikit-learn has to flatten and flags with a DataConversionWarning.
y_train = train_data[label]

In [80]:
# Fit on a flat 1-D label array; np.ravel accepts both a Series and a single-column
# DataFrame, so scikit-learn never receives a column vector for y.
logisticRegr.fit(X_train, np.ravel(y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [81]:
logisticRegr.coef_

array([[-0.43057101, -0.35255732, -0.28223152,  0.29064566, 19.08996513]])

### Save model

In [82]:
#sup_model.SupModel.save_model(model=logisticRegr, name='logReg_v0.2', prepared_features=sens.prepared_features, features=sens.features_dict)


# Evaluate logistic regression model

### Create test collection, annotate translation label and do feature engineering

In [83]:
# Build the test collection. With n_queries=100 and n_docs=9949 the resulting frame has
# 100 * 9949 = 994900 rows -- presumably one row per query/document pair.
test_data = sens.create_test_collection(n_queries=100, n_docs=9949)

In [84]:
len(test_data)

994900

In [85]:
# Compute the same features as for the training data, this time on the test collection,
# and rebind test_data to the returned frame.
test_data = sens.extract_features(features_dict=features_dict, data='test')

Started diff_num_words
Started diff_num_punctuation
Started diff_occ_question_mark
Started diff_occ_exclamation_mark
Started cosine_similarity


In [86]:
test_data.head()

Unnamed: 0,src_sentence,src_preprocessed,src_embedding,src_embedding_aligned,trg_sentence,trg_preprocessed,trg_embedding,translation,diff_num_words,diff_num_punctuation,diff_occ_question_mark,diff_occ_exclamation_mark,cosine_similarity
0,It was decided to try and achieve greater cohe...,"[decided, try, achieve, greater, coherence, cu...","[-0.048211736842105266, 0.05641728421052632, -...","[-0.0969772502481965, 0.06966935894749798, -0....","Es ging genau darum, eine Aktion im Sinne eine...","[ging, genau, darum, ,, aktion, sinne, größere...","[-0.11921721739130432, 0.055093369565217404, -...",1,3,2,0,0,0.871409
1,It was decided to try and achieve greater cohe...,"[decided, try, achieve, greater, coherence, cu...","[-0.048211736842105266, 0.05641728421052632, -...","[-0.0969772502481965, 0.06966935894749798, -0....",Darüber hinaus muß versucht werden - wie wir e...,"[darüber, hinaus, muß, versucht, -, bereits, b...","[-0.16698228372093024, 0.013159534883720923, -...",0,24,7,0,0,0.836974
2,It was decided to try and achieve greater cohe...,"[decided, try, achieve, greater, coherence, cu...","[-0.048211736842105266, 0.05641728421052632, -...","[-0.0969772502481965, 0.06966935894749798, -0....",In diesem Prozeß haben die Verpflichtungen in ...,"[prozeß, verpflichtungen, bezug, ziele, nation...","[-0.21648888888888887, 0.008203444444444449, -...",0,1,1,0,0,0.769093
3,It was decided to try and achieve greater cohe...,"[decided, try, achieve, greater, coherence, cu...","[-0.048211736842105266, 0.05641728421052632, -...","[-0.0969772502481965, 0.06966935894749798, -0....",In einer zweiten Dimension - und hier ist sehr...,"[zweiten, dimension, -, wohl, rückgriff, europ...","[-0.09003026592592593, -0.07708211111111112, -...",0,8,2,0,0,0.788372
4,It was decided to try and achieve greater cohe...,"[decided, try, achieve, greater, coherence, cu...","[-0.048211736842105266, 0.05641728421052632, -...","[-0.0969772502481965, 0.06966935894749798, -0....",Die Kommission war auf diesem Gebiet äußerst a...,"[kommission, gebiet, äußerst, aktiv, ., kommis...","[-0.1492267619047619, 0.028525047619047617, -0...",0,5,2,0,0,0.723034


### Evaluate boolean classifier

#### "Hard coded" (calling the scikit-learn metrics directly)

In [87]:
# Class predictions of the logistic regression model on the test collection,
# and the corresponding ground-truth labels.
# NOTE(review): the metric cells below score y_pred_forest, not y_pred.
y_pred = logisticRegr.predict(test_data[features])
y_true = test_data[label]

In [97]:
# Random forest with 1000 trees as a second classifier on the same features.
# random_state pins the bootstrap samples and feature sub-sampling so the metrics below are
# reproducible across runs; n_jobs=-1 only parallelises fitting and prediction.
forest = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
# np.ravel hands scikit-learn the 1-D label array it expects, whether y_train is a Series
# or a single-column DataFrame.
forest.fit(X_train, np.ravel(y_train))
y_pred_forest = forest.predict(test_data[features])
y_true_forest = test_data[label]

In [98]:
# Class-balance check: number of predicted positives, number of positives in the
# ground-truth labels, and total number of scored pairs.
# The containers' own .sum() is vectorised; the builtin sum() would iterate ~1M elements in Python.
display(y_pred_forest.sum())
display(y_true.sum())
display(len(y_pred_forest))

131188

100

994900

In [100]:
precision_score(y_true, y_pred_forest)

0.0006631704119279202

In [101]:
recall_score(y_true, y_pred_forest)

0.87

In [102]:
accuracy_score(y_true, y_pred_forest)

0.8682138908433008

In [103]:
f1_score(y_true, y_pred_forest)

0.0013253305709585036

#### Using SupModel instance

In [104]:
# Evaluation helper from the project's sup_model module.
sup = sup_model.SupModel()
# NOTE(review): evaluate_boolean is commented out; the accuracy/precision/recall/f1 attributes
# read in the following cells are presumably set by it -- confirm in SupModel.
#sup.evaluate_boolean(logisticRegr, sens)

In [None]:
sup.accuracy

In [None]:
sup.precision

In [None]:
sup.recall

In [None]:
sup.f1

In [None]:
sens.features_dict

### Evaluate ranking

In [105]:
# Ranking quality of the random forest on the sentences held by `sens`.
# NOTE(review): compute_map presumably returns the mean average precision -- confirm in SupModel.
sup.compute_map(forest, sens)

0.1031920623378471

### Load a previously saved model and apply ranking

In [133]:
# Load the model stored under the name 'logReg_v0.2'.
# NOTE(review): this rebinds logisticRegr, prepared_features and features, so later cells no
# longer see the model and feature lists created earlier in this notebook.
logisticRegr, prepared_features, features = sup_model.SupModel.load_model(name='logReg_v0.2')

### Rank target sentences in test dataset

In [173]:
# Rank the target sentences with the logistic regression model bound to logisticRegr.
# NOTE(review): the head() output below shows prediction columns on sens.test_data, so this
# call presumably stores its results there in place -- confirm in SupModel.rank_trg_sentences.
sup_model.SupModel.rank_trg_sentences(logisticRegr, sens)


Done with index: 0
Done with index: 100


In [126]:
sens.test_data.head()

Unnamed: 0,src_sentence,trg_sentence,src_preprocessed,trg_preprocessed,src_embedding,trg_embedding,src_num_words,trg_num_words,src_num_punctuation,trg_num_punctuation,src_occ_question_mark,trg_occ_question_mark,src_occ_exclamation_mark,trg_occ_exclamation_mark,src_num_noun,trg_num_noun,src_num_verb,trg_num_verb,src_num_adverb,trg_num_adverb,src_num_adjective,trg_num_adjective,src_num_wh,trg_num_wh,src_num_pronoun,trg_num_pronoun,translation,predictions,predicted_sentences,predicted_probabilities
0,"On the other hand, we can be fairly confident ...",Hingegen kann man der Zukunft einigermaßen zuv...,"[hand, ,, fairly, confident, future, know, eff...","[hingegen, zukunft, einigermaßen, zuversichtli...","[-0.03125365286703291, 0.05021663067572581, -0...","[-0.12881409285714285, -0.026504692857142853, ...",19,16,6,3,False,False,False,False,12,15,3,0,1,0,3,0,0,0,0,0,1,"[[Unterstreichen möchte ich auch, daß es notwe...","[Unterstreichen möchte ich auch, daß es notwen...","[0.9831631508796376, 0.9831631508796376, 0.983..."
1,I should also like to stress the need to maint...,"Unterstreichen möchte ich auch, daß es notwend...","[also, like, stress, need, maintain, flexible,...","[unterstreichen, möchte, ,, notwendig, ,, ents...","[-0.025572795222382205, -0.0009202446333488115...","[-0.03415418181818181, 0.03397977727272728, -0...",23,19,3,6,False,False,False,False,10,14,4,2,3,1,4,2,0,0,0,0,1,[[Doch diese notwendige realistische und pragm...,[Doch diese notwendige realistische und pragma...,"[0.9380140491536678, 0.9380140491536678, 0.912..."
2,This is a problem of compatibility between our...,Dies ist eine Frage der Vereinbarkeit unserer ...,"[problem, compatibility, directives, end, -, -...","[frage, vereinbarkeit, unserer, richtlinien, a...","[-0.13964157818371498, 0.03865048480077487, -0...","[-0.12898427777777777, -0.00664022222222222, -...",15,14,6,5,False,False,False,False,8,12,4,0,1,0,1,2,0,0,0,0,1,"[[Die vom Parlament vorgeschlagene Lösung, d. ...","[Die vom Parlament vorgeschlagene Lösung, d. h...","[0.9428284887565496, 0.9256622022827817, 0.923..."
3,We must therefore congratulate the Committee o...,Man kann sich daher nur über die Klugheit des ...,"[must, therefore, congratulate, committee, env...","[daher, klugheit, ausschusses, umweltfragen, ,...","[-0.10510580457469121, 0.08141651421038351, -0...","[-0.13002725, 0.12540335, -0.15950131875, -0.2...",25,16,5,5,False,False,False,False,13,9,4,1,1,1,6,2,0,0,0,0,1,[[Ich möchte nochmals kurz auf die wirtschaftl...,[Ich möchte nochmals kurz auf die wirtschaftli...,"[0.9774905186008319, 0.9733422658640649, 0.973..."
4,"In conclusion, let me say that this directive ...","Abschließend möchte ich feststellen, daß diese...","[conclusion, ,, let, say, directive, ambitious...","[abschließend, möchte, feststellen, ,, richtli...","[-0.18686343283558546, 0.1048829425718163, -0....","[-0.14413220833333332, -0.0016888750000000005,...",22,21,3,7,False,False,False,False,10,18,5,2,3,0,4,0,0,0,0,0,1,[[Aus meiner eigenen kommunalpolitischen Erfah...,[Aus meiner eigenen kommunalpolitischen Erfahr...,"[0.9214032349946576, 0.9179825901790721, 0.909..."


In [193]:
# Train: 4800, Test: 200 | Only true translations
sup_model.SupModel.evaluate_at_k(sens, 1)

0.44

In [189]:
# Train: 4800, Test: 200 | Only true translations
sup_model.SupModel.evaluate_at_k(sens, 3)

0.62

In [190]:
# Train: 4800, Test: 200 | Only true translations
sup_model.SupModel.evaluate_at_k(sens, 5)

0.68

In [191]:
# Train: 4800, Test: 200 | Only true translations
sup_model.SupModel.evaluate_at_k(sens, 10)

0.78

In [None]:
# Train: 4800, Test: 200
sup_model.SupModel.evaluate_at_k(sens, 1)

In [177]:
# Train: 4800, Test: 200
sup_model.SupModel.evaluate_at_k(sens, 3)

0.31

In [178]:
# Train: 4800, Test: 200
sup_model.SupModel.evaluate_at_k(sens, 5)

0.34

In [179]:
# Train: 4800, Test: 200
sup_model.SupModel.evaluate_at_k(sens, 10)

0.41

In [127]:
# Train: 7000, Test: 3000
sup_model.SupModel.evaluate_at_k(sens, 1)

0.176

In [128]:
# Train: 7000, Test: 3000
sup_model.SupModel.evaluate_at_k(sens, 3)

0.22166666666666668

In [129]:
# Train: 7000, Test: 3000
sup_model.SupModel.evaluate_at_k(sens, 5)

0.253

In [130]:
# Train: 7000, Test: 3000
sup_model.SupModel.evaluate_at_k(sens, 10)

0.2966666666666667

In [101]:
# Train: 4000, Test: 1000
sup_model.SupModel.evaluate_at_k(sens, 1)

0.248

In [102]:
# Train: 4000, Test: 1000
sup_model.SupModel.evaluate_at_k(sens, 3)

0.292

In [103]:
# Train: 4000, Test: 1000
sup_model.SupModel.evaluate_at_k(sens, 5)

0.318

In [104]:
# Train: 4000, Test: 1000
sup_model.SupModel.evaluate_at_k(sens, 10)

0.365