In [1]:
import io, os, importlib, pickle
import datetime
import pandas as pd
import numpy as np
import joblib

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Never truncate displayed DataFrames: None removes the column and row limits.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.features import element_based
importlib.reload(element_based)

<module 'ir_crosslingual.features.element_based' from '/Users/jani/PycharmProjects/ir-crosslingual/ir_crosslingual/features/element_based.py'>

In [3]:
def time(start, stop, message):
    """Print how long a step took, followed by the current wall-clock time.

    Args:
        start: Timestamp taken before the measured step.
        stop: Timestamp taken after the measured step; ``stop - start`` is printed.
        message: Short description of the step, inserted into the first output line.

    Returns:
        None. The two lines are written to standard output.
    """
    elapsed = stop - start
    print('Computation time {}: {}'.format(message, elapsed))
    finished_at = datetime.datetime.now()
    print('Finished at: {}'.format(finished_at))

## Load models

In [4]:
# Load the model stored under the name 'mlp_avg_best'; load_model returns three values, unpacked here.
# NOTE(review): only mlp_model is referenced by the cells visible in this notebook.
mlp_model, mlp_prepared_features, mlp_features_dict = sup_model.SupModel.load_model(name='mlp_avg_best')

In [5]:
# Columns handed to the MLP model during evaluation.
# NOTE(review): the order below reproduces the hand-written list exactly; it presumably has to
# match the column order used when the model was trained, so confirm before reordering.
mlp_features = [
    'norm_diff_num_words',
    'euclidean_distance',
    'abs_diff_occ_exclamation_mark_0',
    'abs_diff_occ_question_mark_2',
    'abs_diff_occ_question_mark_0',
    'cosine_similarity',
    'norm_diff_translated_words',
    'abs_diff_occ_exclamation_mark_1',
    'abs_diff_occ_question_mark_1',
    'abs_diff_num_words',
    'abs_diff_occ_exclamation_mark_2',
    'abs_diff_num_punctuation',
] + ['src_embedding_pca_{}'.format(i) for i in range(10)] \
  + ['trg_embedding_pca_{}'.format(i) for i in range(10)]

In [6]:
# Column groups used when scaling the test collection.
label = 'translation'
# Columns that are appended, together with the label, after the transformed columns when renaming.
meta_features = ['src_sentence', 'trg_sentence']
# Every column selected from a chunk before it is passed to the scaler.
model_features = [
    *meta_features,
    label,
    'norm_diff_translated_words',
    'abs_diff_num_words',
    'abs_diff_num_punctuation',
    'abs_diff_occ_question_mark',
    'abs_diff_occ_exclamation_mark',
    'rel_diff_num_words',
    'rel_diff_num_punctuation',
    'norm_diff_num_words',
    'norm_diff_num_punctuation',
    'euclidean_distance',
    'cosine_similarity',
    *('src_embedding_pca_{}'.format(i) for i in range(10)),
    *('trg_embedding_pca_{}'.format(i) for i in range(10)),
]

In [7]:
# Load the stored preprocessing objects: one column scaler plus, per side ('src' / 'trg'),
# a mean scaler and a PCA object.
# Each file is opened in a `with` block so the handle is closed right after loading
# (previously the handles returned by open() were never closed).
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load files from a trusted source.
pca = {}
mean_scaler = {}
with open('../main/models/scaler/ct.pkl', 'rb') as scaler_file:
    scaler = joblib.load(scaler_file)
for prefix in ['src', 'trg']:
    with open('../main/models/mean_scaler/mean_scaler_{}.pkl'.format(prefix), 'rb') as mean_scaler_file:
        mean_scaler[prefix] = joblib.load(mean_scaler_file)
    with open('../main/models/pca/pca_{}.pkl'.format(prefix), 'rb') as pca_file:
        pca[prefix] = joblib.load(pca_file)

In [8]:
# Helper that derives the output column names of the transformed (non-remainder) columns.
def get_transformer_feature_names(columnTransformer):
    """Return the output column names of a fitted column transformer, excluding the remainder.

    Walks ``columnTransformer.transformers_`` (triples of name, transformer, columns) and
    collects one group of names per entry:

    * the entry named ``'remainder'`` is skipped (callers append those names themselves);
    * a ``'drop'`` transformer contributes nothing, because its columns are not in the output;
    * ``'passthrough'`` contributes its input column names unchanged;
    * otherwise the last step decides: if it exposes ``categories_`` its generated feature
      names are used, else the input column names are kept.

    The transformer of an entry may be an iterable of steps (e.g. a pipeline) or a single
    estimator; a single column given as a plain string is treated as a one-element list.

    Args:
        columnTransformer: Fitted object exposing a ``transformers_`` attribute.

    Returns:
        list: Output feature names in transformer order.
    """
    output_features = []

    for name, pipe, features in columnTransformer.transformers_:
        if name == 'remainder':
            continue

        if isinstance(pipe, str):
            if pipe == 'drop':
                # dropped columns never reach the transformed output
                continue
            steps = []  # 'passthrough': no steps, names stay as they are
        else:
            try:
                steps = list(pipe)
            except TypeError:
                # a bare estimator instead of an iterable pipeline
                steps = [pipe]

        # a single string selector must not be split into characters by extend()
        input_features = [features] if isinstance(features, str) else list(features)

        final_step = steps[-1] if steps else None
        if final_step is not None and hasattr(final_step, 'categories_'):
            # prefer the older accessor when present, fall back to the newer one
            name_getter = getattr(final_step, 'get_feature_names', None)
            if name_getter is None:
                name_getter = final_step.get_feature_names_out
            output_features.extend(name_getter(input_features))
        else:
            output_features.extend(input_features)

    return output_features

## English/Finnish

#### Load test collection

In [17]:
# Load the en-fi data; the call returns two values: the object whose test_collection is set
# in the next cell, and the chunks that are concatenated there.
# NOTE(review): judging by the logged output, this call also learns the projection matrices for both directions.
enfi_sens, enfi_chunks = sentences.Sentences.load_chunks_from_file('en', 'fi')

---- INFO: Learn projection matrix for en-fi
---- INFO: Found 10141 valid translation pairs in expert dictionary.
---- INFO: 1355 other pairs contained at least one unknown word (0 in source language, 1355 in target language).
---- DONE: Seed dictionary extracted for the languages: en-fi
---- INFO: Resulting subspace dimension: (10141, 300)
---- INFO: Resulting subspace dimension: (10141, 300)
---- DONE: Projection matrix learned from en to fi
---- INFO: Learn projection matrix for fi-en
---- INFO: Found 6946 valid translation pairs in expert dictionary.
---- INFO: 185 other pairs contained at least one unknown word (0 in source language, 185 in target language).
---- DONE: Seed dictionary extracted for the languages: fi-en
---- INFO: Resulting subspace dimension: (6946, 300)
---- INFO: Resulting subspace dimension: (6946, 300)
---- DONE: Projection matrix learned from fi to en


KeyboardInterrupt: 

In [None]:
# Concatenate the loaded chunks into one frame with a fresh 0..n-1 index and store it as the test collection.
enfi_sens.test_collection = pd.concat(enfi_chunks, ignore_index=True)

#### Apply PCA

In [None]:
# Run vec2features with the pre-loaded PCA objects and mean scalers (train=False).
# NOTE(review): presumably this adds the src_/trg_embedding_pca_* columns selected later - confirm in element_based.
enfi_sens = element_based.vec2features(enfi_sens, pca, mean_scaler, train=False)

#### Apply scaling of features

In [None]:
# Cut the test collection into consecutive row slices so the scaling step can process it piecewise.
n = 500000  # rows per chunk
chunks_test_collection = [
    enfi_sens.test_collection[offset:offset + n]
    for offset in range(0, len(enfi_sens.test_collection), n)
]

In [None]:
# Scale the feature columns chunk by chunk with the pre-loaded column transformer.
# The output column names are identical for every chunk, so they are derived once up front
# instead of on every iteration.
# NOTE(review): assumes the transformer emits the untransformed columns (meta features, then
# label) after the transformed ones - confirm against how ct.pkl was built.
scaled_columns = get_transformer_feature_names(scaler) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(scaler.transform(chunk[model_features]), columns=scaled_columns)
    # let pandas re-infer column dtypes for the freshly built frame
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

In [None]:
# Reassemble the scaled chunks into a single test collection with a fresh 0..n-1 index.
enfi_sens.test_collection = pd.concat(chunks_test_collection, ignore_index=True)

#### Evaluate test collection on best MLP model

In [None]:
# Evaluate the MLP model on the en-fi test collection: boolean metrics first, then MAP.
# NOTE: the model scored here is mlp_model, not a logistic regression.
print('Evaluation on MLP model')
start = datetime.datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_model, enfi_sens, mlp_features)
# the metrics are read from attributes of `sup` after evaluate_boolean has run
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))
stop = datetime.datetime.now()
time(start, stop, 'evaluating boolean')
# the MAP computation is timed separately
start = datetime.datetime.now()
print('MAP: {}'.format(sup.compute_map(mlp_model, enfi_sens, mlp_features)))
stop = datetime.datetime.now()
time(start, stop, 'computing the MAP score')

## Finnish/English

In [None]:
# Load the fi-en data; the call returns two values: the object whose test_collection is set
# in the next cell, and the chunks that are concatenated there.
fien_sens, fien_chunks = sentences.Sentences.load_chunks_from_file('fi', 'en')

In [None]:
# Concatenate the loaded chunks into one frame with a fresh 0..n-1 index and store it as the test collection.
fien_sens.test_collection = pd.concat(fien_chunks, ignore_index=True)

In [None]:
# Run vec2features with the pre-loaded PCA objects and mean scalers (train=False).
# NOTE(review): presumably this adds the src_/trg_embedding_pca_* columns selected later - confirm in element_based.
fien_sens = element_based.vec2features(fien_sens, pca, mean_scaler, train=False)

In [None]:
# Cut the test collection into consecutive row slices so the scaling step can process it piecewise.
n = 500000  # rows per chunk
chunks_test_collection = [
    fien_sens.test_collection[offset:offset + n]
    for offset in range(0, len(fien_sens.test_collection), n)
]

In [None]:
# Scale the feature columns chunk by chunk with the pre-loaded column transformer.
# The output column names are identical for every chunk, so they are derived once up front
# instead of on every iteration.
# NOTE(review): assumes the transformer emits the untransformed columns (meta features, then
# label) after the transformed ones - confirm against how ct.pkl was built.
scaled_columns = get_transformer_feature_names(scaler) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(scaler.transform(chunk[model_features]), columns=scaled_columns)
    # let pandas re-infer column dtypes for the freshly built frame
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

In [None]:
# Reassemble the scaled chunks into a single test collection with a fresh 0..n-1 index.
fien_sens.test_collection = pd.concat(chunks_test_collection, ignore_index=True)

In [None]:
# Evaluate the MLP model on the fi-en test collection: boolean metrics first, then MAP.
# NOTE: the model scored here is mlp_model, not a logistic regression.
print('Evaluation on MLP model')
start = datetime.datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_model, fien_sens, mlp_features)
# the metrics are read from attributes of `sup` after evaluate_boolean has run
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))
stop = datetime.datetime.now()
time(start, stop, 'evaluating boolean')
# the MAP computation is timed separately
start = datetime.datetime.now()
print('MAP: {}'.format(sup.compute_map(mlp_model, fien_sens, mlp_features)))
stop = datetime.datetime.now()
time(start, stop, 'computing the MAP score')

## English/French

In [None]:
# Load the en-fr data; the call returns two values: the object whose test_collection is set
# in the next cell, and the chunks that are concatenated there.
enfr_sens, enfr_chunks = sentences.Sentences.load_chunks_from_file('en', 'fr')

In [None]:
# Concatenate the loaded chunks into one frame with a fresh 0..n-1 index and store it as the test collection.
enfr_sens.test_collection = pd.concat(enfr_chunks, ignore_index=True)

In [None]:
# Run vec2features with the pre-loaded PCA objects and mean scalers (train=False).
# NOTE(review): presumably this adds the src_/trg_embedding_pca_* columns selected later - confirm in element_based.
enfr_sens = element_based.vec2features(enfr_sens, pca, mean_scaler, train=False)

In [None]:
# Cut the test collection into consecutive row slices so the scaling step can process it piecewise.
n = 500000  # rows per chunk
chunks_test_collection = [
    enfr_sens.test_collection[offset:offset + n]
    for offset in range(0, len(enfr_sens.test_collection), n)
]

In [None]:
# Scale the feature columns chunk by chunk with the pre-loaded column transformer.
# The output column names are identical for every chunk, so they are derived once up front
# instead of on every iteration.
# NOTE(review): assumes the transformer emits the untransformed columns (meta features, then
# label) after the transformed ones - confirm against how ct.pkl was built.
scaled_columns = get_transformer_feature_names(scaler) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(scaler.transform(chunk[model_features]), columns=scaled_columns)
    # let pandas re-infer column dtypes for the freshly built frame
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

In [None]:
# Reassemble the scaled chunks into a single test collection with a fresh 0..n-1 index.
enfr_sens.test_collection = pd.concat(chunks_test_collection, ignore_index=True)

In [None]:
# Evaluate the MLP model on the en-fr test collection: boolean metrics first, then MAP.
# NOTE: the model scored here is mlp_model, not a logistic regression.
print('Evaluation on MLP model')
start = datetime.datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_model, enfr_sens, mlp_features)
# the metrics are read from attributes of `sup` after evaluate_boolean has run
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))
stop = datetime.datetime.now()
time(start, stop, 'evaluating boolean')
# the MAP computation is timed separately
start = datetime.datetime.now()
print('MAP: {}'.format(sup.compute_map(mlp_model, enfr_sens, mlp_features)))
stop = datetime.datetime.now()
time(start, stop, 'computing the MAP score')

## French/English

In [9]:
# Load the fr-en data; the call returns two values: the object whose test_collection is set
# in the next cell, and the chunks that are concatenated there.
# NOTE(review): judging by the logged output, this call also learns the projection matrices for both directions.
fren_sens, fren_chunks = sentences.Sentences.load_chunks_from_file('fr', 'en')

---- INFO: Learn projection matrix for fr-en
---- INFO: Found 7938 valid translation pairs in expert dictionary.
---- INFO: 332 other pairs contained at least one unknown word (0 in source language, 332 in target language).
---- DONE: Seed dictionary extracted for the languages: fr-en
---- INFO: Resulting subspace dimension: (7938, 300)
---- INFO: Resulting subspace dimension: (7938, 300)
---- DONE: Projection matrix learned from fr to en
---- INFO: Learn projection matrix for en-fr
---- INFO: Found 10369 valid translation pairs in expert dictionary.
---- INFO: 503 other pairs contained at least one unknown word (0 in source language, 503 in target language).
---- DONE: Seed dictionary extracted for the languages: en-fr
---- INFO: Resulting subspace dimension: (10369, 300)
---- INFO: Resulting subspace dimension: (10369, 300)
---- DONE: Projection matrix learned from en to fr
---- DONE: All chunks loaded


In [10]:
# Concatenate the loaded chunks into one frame with a fresh 0..n-1 index and store it as the test collection.
fren_sens.test_collection = pd.concat(fren_chunks, ignore_index=True)

In [11]:
# Run vec2features with the pre-loaded PCA objects and mean scalers (train=False).
# Per the logged output, src_/trg_embedding_pca elements are extracted for the unique
# queries/documents and merged into the test collection.
fren_sens = element_based.vec2features(fren_sens, pca, mean_scaler, train=False)

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection


In [12]:
# Cut the test collection into consecutive row slices so the scaling step can process it piecewise.
n = 500000  # rows per chunk
chunks_test_collection = [
    fren_sens.test_collection[offset:offset + n]
    for offset in range(0, len(fren_sens.test_collection), n)
]

In [13]:
# Scale the feature columns chunk by chunk with the pre-loaded column transformer.
# The output column names are identical for every chunk, so they are derived once up front
# instead of on every iteration.
# NOTE(review): assumes the transformer emits the untransformed columns (meta features, then
# label) after the transformed ones - confirm against how ct.pkl was built.
scaled_columns = get_transformer_feature_names(scaler) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(scaler.transform(chunk[model_features]), columns=scaled_columns)
    # let pandas re-infer column dtypes for the freshly built frame
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

Chunk 0 scaled.
Chunk 1 scaled.
Chunk 2 scaled.
Chunk 3 scaled.
Chunk 4 scaled.
Chunk 5 scaled.
Chunk 6 scaled.
Chunk 7 scaled.
Chunk 8 scaled.
Chunk 9 scaled.
Chunk 10 scaled.
Chunk 11 scaled.
Chunk 12 scaled.
Chunk 13 scaled.
Chunk 14 scaled.
Chunk 15 scaled.
Chunk 16 scaled.
Chunk 17 scaled.
Chunk 18 scaled.
Chunk 19 scaled.


In [14]:
# Reassemble the scaled chunks into a single test collection with a fresh 0..n-1 index.
fren_sens.test_collection = pd.concat(chunks_test_collection, ignore_index=True)

In [15]:
# Evaluate the MLP model on the fr-en test collection: boolean metrics first, then MAP.
# NOTE: the model scored here is mlp_model, not a logistic regression.
print('Evaluation on MLP model')
start = datetime.datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_model, fren_sens, mlp_features)
# the metrics are read from attributes of `sup` after evaluate_boolean has run
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))
stop = datetime.datetime.now()
time(start, stop, 'evaluating boolean')
# the MAP computation is timed separately
start = datetime.datetime.now()
print('MAP: {}'.format(sup.compute_map(mlp_model, fren_sens, mlp_features)))
stop = datetime.datetime.now()
time(start, stop, 'computing the MAP score')

Evaluation on logistic regression model
Accuracy: 0.6560946
Precision: 0.00029330909342490546
Recall: 0.994088669950739
F1: 0.0005864451543007527
Computation time evaluating boolean: 0:00:41.533548
Finished at: 2020-05-24 12:05:26.380415
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.8590039263547499
Computation time computing the MAP score: 0:07:57.005346
Finished at: 2020-05-24 12:13:23.392081
