## Imports

In [21]:
import importlib, pickle, joblib
from datetime import datetime
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import warnings
# Suppress every warning for the rest of the session (note: this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Remove pandas' display truncation so all columns and all rows of a frame are rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Import the project modules used below; each one is reloaded right after its import so that
# edits made to the module files are picked up without restarting the kernel.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.unsupervised_classification import unsup_model
importlib.reload(unsup_model)

from ir_crosslingual.features import element_based
importlib.reload(element_based)

<module 'ir_crosslingual.features.element_based' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/06_Documents/ir-crosslingual/ir_crosslingual/features/element_based.py'>

In [14]:
def time(start, stop, message):
    """Print one log line with the current timestamp and the elapsed time of a step.

    :param start: value captured before the step (the notebook passes ``datetime.now()``)
    :param stop: value captured after the step; must support ``stop - start``
    :param message: short description printed after "Computation time"
    """
    elapsed = stop - start
    printed_at = datetime.now()
    print('---- TIME {}: Computation time {}: {}'.format(printed_at, message, elapsed))

## Load models

In [4]:
# Load the persisted supervised model stored under the name 'mlp_avg_best'.
# load_model returns three values; only mlp_model is used again in the visible part of this notebook.
mlp_model, mlp_prepared_features, mlp_features_dict = sup_model.SupModel.load_model(name='mlp_avg_best')

In [5]:
# Feature columns handed to the evaluation calls together with mlp_model.
# The order is kept exactly as listed (presumably it has to match the order used at training time - verify).
mlp_features = [
    'norm_diff_num_words',
    'euclidean_distance',
    'abs_diff_occ_exclamation_mark_0',
    'abs_diff_occ_question_mark_2',
    'abs_diff_occ_question_mark_0',
    'cosine_similarity',
    'norm_diff_translated_words',
    'abs_diff_occ_exclamation_mark_1',
    'abs_diff_occ_question_mark_1',
    'abs_diff_num_words',
    'abs_diff_occ_exclamation_mark_2',
    'abs_diff_num_punctuation',
    # ten PCA components of the source embedding, then ten of the target embedding
    *['src_embedding_pca_{}'.format(k) for k in range(10)],
    *['trg_embedding_pca_{}'.format(k) for k in range(10)],
]

In [6]:
# Column sets used when scaling the test collection:
#   model_features - every column that is passed to the scaler
#   meta_features  - the raw sentence columns
#   label          - the target column
model_features = [
    'src_sentence', 'trg_sentence', 'translation',
    'norm_diff_translated_words',
    'abs_diff_num_words', 'abs_diff_num_punctuation',
    'abs_diff_occ_question_mark', 'abs_diff_occ_exclamation_mark',
    'rel_diff_num_words', 'rel_diff_num_punctuation',
    'norm_diff_num_words', 'norm_diff_num_punctuation',
    'euclidean_distance', 'cosine_similarity',
    *['src_embedding_pca_{}'.format(k) for k in range(10)],
    *['trg_embedding_pca_{}'.format(k) for k in range(10)],
]
label = 'translation'
meta_features = ['src_sentence', 'trg_sentence']

In [7]:
# Load the fitted preprocessing artifacts from disk:
#   scaler      - a single object from models/scaler/ct.pkl (presumably a ColumnTransformer - confirm)
#   mean_scaler - one object per side, keyed by 'src' / 'trg'
#   pca         - one object per side, keyed by 'src' / 'trg'
# NOTE: joblib.load unpickles arbitrary Python objects - only load files from a trusted source.
# The path is passed directly so joblib opens and closes the file itself; the previous
# joblib.load(open(...)) form left every file handle open.
pca = {}
mean_scaler = {}
scaler = joblib.load('../main/models/scaler/ct.pkl')
for prefix in ['src', 'trg']:
    mean_scaler[prefix] = joblib.load('../main/models/mean_scaler/mean_scaler_{}.pkl'.format(prefix))
    pca[prefix] = joblib.load('../main/models/pca/pca_{}.pkl'.format(prefix))

In [8]:
# define a function that returns the feature names of the transformed columns
def get_transformer_feature_names(columnTransformer):
    """Return the output column names of all non-remainder transformers of a fitted ColumnTransformer.

    The names are collected in the order of ``columnTransformer.transformers_``. For every
    transformer the input column names are passed through its steps: a step that exposes
    ``categories_`` (e.g. a fitted one-hot encoder) replaces the names by its expanded
    names, any other step leaves them unchanged.

    :param columnTransformer: fitted object exposing ``transformers_`` as an iterable of
        ``(name, transformer, columns)`` triples
    :return: list with the feature names produced by the non-remainder transformers
    """
    output_features = []

    for name, pipe, features in columnTransformer.transformers_:
        if name == 'remainder':
            continue

        # Work out the sequence of steps for this entry
        if isinstance(pipe, str):
            if pipe == 'drop':
                # dropped columns do not appear in the transformed output
                continue
            steps = []  # 'passthrough': column names are kept as they are
        elif hasattr(pipe, 'steps'):
            steps = [step for _, step in pipe.steps]
        else:
            try:
                steps = list(pipe)
            except TypeError:
                # a bare estimator that is not wrapped in a pipeline
                steps = [pipe]

        # Carry the names from step to step instead of resetting them on every step,
        # so an encoder followed by e.g. a scaler keeps its expanded names
        trans_features = features
        for step in steps:
            if hasattr(step, 'categories_'):
                # newer scikit-learn only offers get_feature_names_out, older only get_feature_names
                name_fn = getattr(step, 'get_feature_names_out', None) or step.get_feature_names
                trans_features = list(name_fn(trans_features))
        output_features.extend(trans_features)

    return output_features

## Load documents data

In [9]:
# Load the documents data (docs=True, train=True): returns a Sentences object and the data chunks,
# which are concatenated into one test collection in the next cell.
sens, chunks = sentences.Sentences.load_chunks_from_file(docs=True, train=True)

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- DONE: All chunks loaded


In [10]:
# Combine all loaded chunks into one test collection on the Sentences object.
# NOTE(review): unlike the German/English section below, no ignore_index=True is passed here,
# so the original index labels of the chunks are kept - confirm this is intended.
sens.test_collection = pd.concat(chunks)

## Unsupervised Classification

In [15]:
# Evaluate the unsupervised model on the documents data via the MAP score.
# compute_map is a SupModel method, so a SupModel instance is created only to call it.
model = unsup_model.UnsupModel()
sup = sup_model.SupModel()
# The timer starts after both objects exist, so the reported time covers the prints and compute_map only.
start = datetime.now()
print('Unsupervised evaluation on documents data')
print('-' * 60)
print('---- DONE: MAP Score = {}'.format(sup.compute_map(model, sens, ['src_embedding_aligned', 'trg_embedding', 'cosine_similarity'])))
time(start, datetime.now(), 'computing the MAP score')


Unsupervised evaluation on documents data
------------------------------------------------------------
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
---- DONE: MAP Score = 0.5401538175621837
---- TIME 2020-05-24 05:11:25.857894: Computation time computing the MAP score: 0:00:00.333873


## Supervised Classification

#### Train the best sentence-level model on the features that are reasonable for documents

In [19]:
# Unfitted MLP classifier with a single hidden layer of 9 units; it is only constructed here
# and not fitted in the visible part of the notebook.
# NOTE(review): the argument list looks like a pasted estimator repr - confirm. With solver='lbfgs',
# the scikit-learn docs describe batch_size, learning_rate, learning_rate_init, beta_1, beta_2,
# epsilon, momentum, nesterovs_momentum, early_stopping, validation_fraction and n_iter_no_change
# as only relevant for the 'sgd'/'adam' solvers.
documents_model = MLPClassifier(activation='tanh', alpha=0.1, batch_size=2000, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(9,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=1200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [20]:
# Features selected for the documents model: the two embedding distance measures
# followed by the ten PCA components of the source and of the target embedding.
documents_features = [
    'euclidean_distance',
    'cosine_similarity',
    *['src_embedding_pca_{}'.format(k) for k in range(10)],
    *['trg_embedding_pca_{}'.format(k) for k in range(10)],
]

In [None]:
# Run vec2features on the documents data with the loaded PCA / mean-scaler objects (train=False).
# Note that `sens` is rebound to the returned object, so re-running this cell feeds the
# already processed object back in.
sens = element_based.vec2features(sens, pca, mean_scaler, train=False)

In [None]:
# Split the test collection into consecutive row slices of at most n rows each;
# working on smaller chunks allows for faster operations on the test collection.
n = 500000  # maximum number of rows per chunk
chunks_test_collection = [
    sens.test_collection[offset:offset + n]
    for offset in range(0, sens.test_collection.shape[0], n)
]


## German/English

In [9]:
# Load the chunks for the language pair passed as ('de', 'en'); returns a Sentences object and the data chunks.
deen_sens, deen_chunks = sentences.Sentences.load_chunks_from_file('de', 'en')

---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- DONE: All chunks loaded


In [14]:
# Combine all chunks into one test collection; ignore_index=True gives the result a fresh 0..N-1 index.
deen_sens.test_collection = pd.concat(deen_chunks, ignore_index=True)

In [15]:
# Preview the first rows of the combined test collection.
deen_sens.test_collection.head()

Unnamed: 0,src_sentence,src_preprocessed,src_embedding,src_embedding_aligned,src_words,src_words_found_embedding,trg_sentence,trg_preprocessed,trg_embedding,trg_words,trg_words_found_embedding,translation,norm_diff_translated_words,abs_diff_num_words,abs_diff_num_punctuation,abs_diff_occ_question_mark,abs_diff_occ_exclamation_mark,rel_diff_num_words,rel_diff_num_punctuation,norm_diff_num_words,norm_diff_num_punctuation,euclidean_distance,cosine_similarity
0,Man braucht nur ein Entwicklungs-Glossar zu ne...,"[braucht, entwicklungs, -, glossar, nehmen, fr...","[-0.170302625, 0.08421487500000001, -0.0884655...","[0.07061240360856913, -0.042766405454618996, -...","[braucht, entwicklungs, glossar, nehmen, frage...","[[0.13183987810598768, 0.002547861925903911, -...",All we would have to do is get a directory of ...,"[would, get, directory, development, put, item...","[-0.049815000000000005, -0.013216419999999998,...","[would, get, directory, development, put, item...","[[-0.1718, 0.20407, -0.12805, -0.1194, -0.0034...",1,0.0,1,0,1,1,-1,0,-0.142857,0.0,1.318805,0.785875
1,Man braucht nur ein Entwicklungs-Glossar zu ne...,"[braucht, entwicklungs, -, glossar, nehmen, fr...","[-0.170302625, 0.08421487500000001, -0.0884655...","[0.07061240360856913, -0.042766405454618996, -...","[braucht, entwicklungs, glossar, nehmen, frage...","[[0.13183987810598768, 0.002547861925903911, -...",We are sufficiently responsible to realise tha...,"[sufficiently, responsible, realise, include, ...","[-0.05318141428571429, -0.07261509047619048, -...","[sufficiently, responsible, realise, include, ...","[[-0.2702, 0.0608, -0.1625, 0.43151, -0.24751,...",0,0.0,10,2,1,1,-10,-2,-0.833333,-0.666667,1.488705,0.723256
2,Man braucht nur ein Entwicklungs-Glossar zu ne...,"[braucht, entwicklungs, -, glossar, nehmen, fr...","[-0.170302625, 0.08421487500000001, -0.0884655...","[0.07061240360856913, -0.042766405454618996, -...","[braucht, entwicklungs, glossar, nehmen, frage...","[[0.13183987810598768, 0.002547861925903911, -...",We must understand that there are limitations ...,"[must, understand, limitations, achieve, ,, pa...","[-0.05662485, -0.04663293333333333, -0.1941054...","[must, understand, limitations, achieve, parti...","[[-0.013209, 0.13582, -0.17634, 0.14703, -0.14...",0,0.0,3,0,1,1,-3,0,-0.375,0.0,1.470648,0.735352
3,Man braucht nur ein Entwicklungs-Glossar zu ne...,"[braucht, entwicklungs, -, glossar, nehmen, fr...","[-0.170302625, 0.08421487500000001, -0.0884655...","[0.07061240360856913, -0.042766405454618996, -...","[braucht, entwicklungs, glossar, nehmen, frage...","[[0.13183987810598768, 0.002547861925903911, -...","You have mentioned Ethiopia, which falls withi...","[mentioned, ethiopia, ,, falls, within, scope,...","[-0.06602332142857144, -0.0758303214285714, -0...","[mentioned, ethiopia, falls, within, scope, na...","[[-0.0396, -0.12417, -0.016515, 0.35388, -0.02...",0,0.0,13,6,1,1,-13,-6,-1.0,-1.2,1.461765,0.728323
4,Man braucht nur ein Entwicklungs-Glossar zu ne...,"[braucht, entwicklungs, -, glossar, nehmen, fr...","[-0.170302625, 0.08421487500000001, -0.0884655...","[0.07061240360856913, -0.042766405454618996, -...","[braucht, entwicklungs, glossar, nehmen, frage...","[[0.13183987810598768, 0.002547861925903911, -...","Indeed, in terms of development aid policy, I ...","[indeed, ,, terms, development, aid, policy, ,...","[-0.11380196190476187, -0.10416507142857141, -...","[indeed, terms, development, aid, policy, thin...","[[-0.2022, 0.025995, -0.22349, 0.25912, -0.079...",0,0.0,11,1,1,1,-11,-1,-0.916667,-0.5,1.448506,0.738626


In [20]:
# Run vec2features with the loaded PCA / mean-scaler objects (train=False); per the log output below,
# this extracts the src/trg embedding PCA elements and merges them into the test collection.
# `deen_sens` is rebound to the returned object.
deen_sens = element_based.vec2features(deen_sens, pca, mean_scaler, train=False)

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection


In [21]:
# Split the test collection into consecutive row slices of at most n rows each;
# working on smaller chunks allows for faster operations on the test collection.
n = 500000  # maximum number of rows per chunk
chunks_test_collection = [
    deen_sens.test_collection[offset:offset + n]
    for offset in range(0, deen_sens.test_collection.shape[0], n)
]

In [22]:
# scale columns of test collection
# Each chunk in chunks_test_collection is replaced in place by its scaled version, so this cell
# must not be re-run on already scaled chunks (the input columns are renamed by the scaling).
# The output column names are identical for every chunk, so they are computed once up front:
# transformed columns first, then the sentence columns and the label.
scaled_columns = get_transformer_feature_names(scaler) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(scaler.transform(chunk[model_features]), columns=scaled_columns)
    # infer_objects restores concrete dtypes for columns that came back as generic objects
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

Chunk 0 scaled.
Chunk 1 scaled.
Chunk 2 scaled.
Chunk 3 scaled.
Chunk 4 scaled.
Chunk 5 scaled.
Chunk 6 scaled.
Chunk 7 scaled.
Chunk 8 scaled.
Chunk 9 scaled.
Chunk 10 scaled.
Chunk 11 scaled.
Chunk 12 scaled.
Chunk 13 scaled.
Chunk 14 scaled.
Chunk 15 scaled.
Chunk 16 scaled.
Chunk 17 scaled.
Chunk 18 scaled.
Chunk 19 scaled.


In [23]:
# update data attribute of Sentences object: stitch the scaled chunks back together
# into a single frame with a fresh 0..N-1 index
deen_sens.test_collection = pd.concat(chunks_test_collection, ignore_index=True)

In [24]:
# Evaluation of the loaded model (mlp_model) on the German/English test collection:
# first the boolean metrics (accuracy, precision, recall, F1), then the MAP score, each timed separately.
# NOTE(review): the printed headline still says "logistic regression" although mlp_model is evaluated.
print('Evaluation on logistic regression model')
# `datetime` is the class imported via `from datetime import datetime`, so the call is
# datetime.now(); datetime.datetime.now() raises AttributeError with that import.
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_model, deen_sens, mlp_features)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))
stop = datetime.now()
time(start, stop, 'evaluating boolean')
start = datetime.now()
print('MAP: {}'.format(sup.compute_map(mlp_model, deen_sens, mlp_features)))
stop = datetime.now()
time(start, stop, 'computing the MAP score')

Evaluation on logistic regression model
Accuracy: 0.5912253
Precision: 0.00024604126694132306
Recall: 0.9921104536489151
F1: 0.0004919605287255313
Computation time evaluating boolean: 0:00:24.749746
Finished at: 2020-05-23 23:46:26.109598
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.7857719010404002
Computation time computing the MAP score: 0:09:12.013899
Finished at: 2020-05-23 23:55:38.123721
