## Imports

In [1]:
import importlib, pickle, joblib
from datetime import datetime
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier

from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import warnings
# Silence every warning for the rest of the session (deliberately broad).
warnings.filterwarnings('ignore')

# Show DataFrames in full: no column or row truncation in rendered output.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.unsupervised_classification import unsup_model
importlib.reload(unsup_model)

from ir_crosslingual.features import element_based
importlib.reload(element_based)

<module 'ir_crosslingual.features.element_based' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/06_Dev/ir-crosslingual/ir_crosslingual/features/element_based.py'>

In [3]:
def time(start, stop, message):
    """Print a timestamped line reporting how long a step took.

    :param start: value marking the beginning of the step (the notebook passes ``datetime`` objects)
    :param stop: value marking the end of the step; ``stop - start`` is printed as the duration
    :param message: short description of the step, inserted into the log line
    :return: None; the line is written to stdout

    Note: the name shadows the stdlib ``time`` module, which this notebook does not import.
    """
    elapsed = stop - start
    print(f'---- TIME {datetime.now()}: Computation time {message}: {elapsed}')

## Load documents data

In [4]:
# Load the documents-level data as a Sentences object plus lists of train/test chunks.
# NOTE(review): `train=[13, 40]` is passed through unchanged; its exact meaning is
# defined by load_chunks_from_file — confirm there.
sens, train_chunks, test_chunks = sentences.Sentences.load_chunks_from_file(
    docs=True,
    train=[13, 40],
)

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: Files loaded containing training data
---- INFO: Files loaded containing test collection
---- DONE: All chunks loaded


In [5]:
# Merge the loaded chunks into one frame per split.
# NOTE(review): the original chunk indices are kept here, whereas the later reload
# of the same data concatenates with ignore_index=True — confirm this is intended.
sens.train_data = pd.concat(objs=train_chunks)
sens.test_collection = pd.concat(objs=test_chunks)

## Unsupervised Classification

In [12]:
# Score the documents test collection with the unsupervised model and report the MAP.
# The SupModel instance is only used for its compute_map method here.
model = unsup_model.UnsupModel()
sup = sup_model.SupModel()
start = datetime.now()
print('Unsupervised evaluation on documents data')
print(60 * '-')
print(f"---- DONE: MAP Score = {sup.compute_map(model, sens, ['src_embedding_aligned', 'trg_embedding', 'cosine_similarity'])}")
time(start, datetime.now(), 'computing the MAP score')


Unsupervised evaluation on documents data
------------------------------------------------------------
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
---- DONE: MAP Score = 0.2941381472817646
---- TIME 2020-05-24 18:24:52.612839: Computation time computing the MAP score: 0:10:26.706793


## Supervised Classification

### 1) Pre-trained model from sentence level

#### Load pre-trained model from sentence level

In [13]:
# Fetch the stored sentence-level MLP together with the two feature objects
# that load_model returns alongside it.
mlp_model, mlp_prepared_features, mlp_features_dict = sup_model.SupModel.load_model(
    name='mlp_avg_best',
)

In [14]:
# Feature columns fed to the MLP. The order is kept exactly as listed because the
# same list is used for fitting and for prediction.
mlp_features = [
    'norm_diff_num_words',
    'euclidean_distance',
    'abs_diff_occ_exclamation_mark_0',
    'abs_diff_occ_question_mark_2',
    'abs_diff_occ_question_mark_0',
    'cosine_similarity',
    'norm_diff_translated_words',
    'abs_diff_occ_exclamation_mark_1',
    'abs_diff_occ_question_mark_1',
    'abs_diff_num_words',
    'abs_diff_occ_exclamation_mark_2',
    'abs_diff_num_punctuation',
]
# Ten PCA components per side: all source components first, then all target ones.
mlp_features += [f'{side}_embedding_pca_{k}' for side in ('src', 'trg') for k in range(10)]

In [12]:
# Columns handed to the scaler: identifying text columns and the label first,
# then the pairwise features, then ten PCA components per side.
meta_features = ['src_sentence', 'trg_sentence']
label = 'translation'
model_features = (
    meta_features
    + [label]
    + [
        'norm_diff_translated_words',
        'abs_diff_num_words',
        'abs_diff_num_punctuation',
        'abs_diff_occ_question_mark',
        'abs_diff_occ_exclamation_mark',
        'rel_diff_num_words',
        'rel_diff_num_punctuation',
        'norm_diff_num_words',
        'norm_diff_num_punctuation',
        'euclidean_distance',
        'cosine_similarity',
    ]
    + [f'{side}_embedding_pca_{k}' for side in ('src', 'trg') for k in range(10)]
)

In [16]:
# Load the transformers that were fitted on sentence-level data: the column
# scaler plus one mean scaler and one PCA per side ('src' / 'trg').
# The path is passed straight to joblib.load so joblib opens AND closes the file;
# the previous `joblib.load(open(...))` form left every file handle open.
# NOTE: these are pickle-based artifacts — only load files from a trusted source.
pca = {}
mean_scaler = {}
scaler = joblib.load('../main/models/scaler/ct.pkl')
for prefix in ['src', 'trg']:
    mean_scaler[prefix] = joblib.load('../main/models/mean_scaler/mean_scaler_{}.pkl'.format(prefix))
    pca[prefix] = joblib.load('../main/models/pca/pca_{}.pkl'.format(prefix))

In [13]:
# Helper that recovers the output column names of a fitted ColumnTransformer.
def get_transformer_feature_names(columnTransformer):
    """Return the output column names produced by the named transformers.

    Walks ``columnTransformer.transformers_`` and, for every entry except the
    one named ``'remainder'``, follows the input column names through the
    steps of its pipeline. A step exposing ``categories_`` (e.g. a fitted
    one-hot encoder) expands the names via ``get_feature_names_out`` (or the
    legacy ``get_feature_names`` when only that exists); every other step is
    assumed to keep the names unchanged.

    :param columnTransformer: fitted object exposing ``transformers_`` as an
        iterable of ``(name, transformer, columns)`` triples
    :return: list of output column names, in transformer order; remainder
        columns are NOT included and must be appended by the caller
    """
    output_features = []

    for name, pipe, features in columnTransformer.transformers_:
        if name == 'remainder':
            continue

        if isinstance(pipe, str):
            # String specs: 'drop' emits no columns, anything else
            # ('passthrough') forwards the columns untouched.
            if pipe != 'drop':
                output_features.extend(features)
            continue

        # A pipeline can be iterated step by step; a bare transformer cannot,
        # so treat it as a single-step pipeline.
        try:
            steps = list(pipe)
        except TypeError:
            steps = [pipe]

        # Thread the names through all steps so an encoder followed by another
        # step still contributes its expanded names.
        trans_features = list(features)
        for step in steps:
            if not hasattr(step, 'categories_'):
                continue
            if hasattr(step, 'get_feature_names_out'):
                expanded = step.get_feature_names_out(trans_features)
            else:
                # Older encoder API.
                expanded = step.get_feature_names(trans_features)
            trans_features = [str(col) for col in expanded]
        output_features.extend(trans_features)

    return output_features

In [18]:
# Add the PCA embedding element columns using the pre-fitted mean scalers and PCAs.
# NOTE(review): with train=False the logged output only mentions the test
# collection (unique queries/documents) — confirm train data is left untouched.
sens = element_based.vec2features(
    sens,
    pca,
    mean_scaler,
    train=False,
)

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection


#### Scale the documents test collection and evaluate the pre-trained sentence-level model

In [19]:
# Cut the test collection into consecutive row blocks so it can be scaled piece by piece.
n = 250000  # rows per chunk
chunks_test_collection = [
    sens.test_collection[offset:offset + n]
    for offset in range(0, len(sens.test_collection), n)
]

In [None]:
# Scale every chunk of the test collection with the pre-fitted column scaler.
# The output column names depend only on the fitted scaler, so they are computed
# once up front instead of once per chunk. Names of the transformed columns come
# first; the untransformed columns (meta features and label) follow.
scaled_columns = get_transformer_feature_names(scaler) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(scaler.transform(chunk[model_features]), columns=scaled_columns)
    # The transform output is a generic array; re-infer proper column dtypes.
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

In [30]:
# Reassemble the scaled chunks into one frame (fresh 0..n-1 index) and store it
# back on the Sentences object.
sens.test_collection = pd.concat(objs=chunks_test_collection, ignore_index=True)

In [33]:
# Evaluate the pre-trained sentence-level MLP on the documents test collection:
# first the boolean classification metrics, then the ranking metric (MAP).
print('Supervised evaluation on documents data using pre-trained model from sentence level')
print(80 * '-')
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_model, sens, mlp_features)
print(f'Accuracy: {sup.accuracy}')
print(f'Precision: {sup.precision}')
print(f'Recall: {sup.recall}')
print(f'F1: {sup.f1}')
time(start, datetime.now(), 'evaluating boolean')

# Restart the clock so the MAP timing is reported on its own.
start = datetime.now()
print(f'MAP: {sup.compute_map(mlp_model, sens, mlp_features)}')
time(start, datetime.now(), 'computing the MAP score')

Supervised evaluation on documents data using pre-trained model from sentence level
--------------------------------------------------------------------------------
Accuracy: 0.9823317331733173
Precision: 0.0009432193325870916
Recall: 0.16533864541832669
F1: 0.0018757380068588733
---- TIME 2020-05-24 18:50:16.218380: Computation time evaluating boolean: 0:00:33.296554
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.01217209786811421
---- TIME 2020-05-24 18:58:54.565830: Computation time computing the MAP score: 0:08:38.346174


### 2) Explicitly trained model on document level

In [34]:
# Reload the documents-level data from file: `sens.test_collection` was
# overwritten with scaled columns by the evaluation above.
sens, train_chunks, test_chunks = sentences.Sentences.load_chunks_from_file(
    docs=True,
    train=[13, 40],
)

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: Files loaded containing training data
---- INFO: Files loaded containing test collection
---- DONE: All chunks loaded


In [35]:
# Merge the chunks into one frame per split, replacing the per-chunk indices
# with a single continuous one.
sens.train_data = pd.concat(objs=train_chunks, ignore_index=True)
sens.test_collection = pd.concat(objs=test_chunks, ignore_index=True)

#### Reduce embedding dimensionality and extract elements as features

In [36]:
# Fit one centering scaler per side on the documents training embeddings.
# with_std=False: only the mean is removed, the variance is left as is.
mean_scaler = {}
for prefix in ['src', 'trg']:
    # Stack the per-row embedding vectors into one 2-D matrix.
    X = np.vstack(sens.train_data[f'{prefix}_embedding'])
    mean_scaler[prefix] = StandardScaler(with_std=False).fit(X)

In [39]:
# Fit a 10-component PCA per side on the mean-centred documents training embeddings.
pca = {}
for prefix in ['src', 'trg']:
    X = mean_scaler[prefix].transform(np.vstack(sens.train_data[f'{prefix}_embedding']))
    # Fixed random_state keeps the fitted components reproducible across runs.
    pca[prefix] = PCA(n_components=10, random_state=42).fit(X)

In [40]:
# Add the PCA embedding element columns, using the scalers and PCAs fitted above.
# Per the logged output this covers both the train data and the test collection.
sens = element_based.vec2features(
    sens,
    pca,
    mean_scaler,
)

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: Started extraction for src language.
---- INFO: src_embedding_pca elements extracted for train data.
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: Started extraction for trg language.
---- INFO: trg_embedding_pca elements extracted for train data.
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection


#### Scale features (z-scores for numerical, OneHotEncoding for categorical variables)

In [14]:
# Column groups used by the scaler:
#   meta_features / label -> passed through untouched
#   cat_features          -> one-hot encoded
#   num_features          -> standardised
meta_features = ['src_sentence', 'trg_sentence']
label = 'translation'
cat_features = ['abs_diff_occ_question_mark', 'abs_diff_occ_exclamation_mark']

num_features = [
    'norm_diff_translated_words',
    'abs_diff_num_words',
    'abs_diff_num_punctuation',
    'rel_diff_num_words',
    'rel_diff_num_punctuation',
    'norm_diff_num_words',
    'norm_diff_num_punctuation',
    'euclidean_distance',
    'cosine_similarity',
] + [f'{side}_embedding_pca_{k}' for side in ('src', 'trg') for k in range(10)]

# Full input column list, in the order the scaler is fitted on.
model_features = (
    meta_features
    + [label]
    + [
        'norm_diff_translated_words',
        'abs_diff_num_words',
        'abs_diff_num_punctuation',
        'abs_diff_occ_question_mark',
        'abs_diff_occ_exclamation_mark',
        'rel_diff_num_words',
        'rel_diff_num_punctuation',
        'norm_diff_num_words',
        'norm_diff_num_punctuation',
        'euclidean_distance',
        'cosine_similarity',
    ]
    + [f'{side}_embedding_pca_{k}' for side in ('src', 'trg') for k in range(10)]
)

##### Fit scaler on training data

In [42]:
# Build the (still unfitted) column scaler: z-scores for the numeric columns and
# one-hot encoding for the categorical ones. Every other input column is passed
# through unchanged (remainder='passthrough').
numeric_pipeline = make_pipeline(StandardScaler())
cat_pipeline = make_pipeline(OneHotEncoder())

ct = ColumnTransformer(
    [
        ('num', numeric_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
    remainder='passthrough',
)

In [13]:
# Helper that recovers the output column names of a fitted ColumnTransformer.
def get_transformer_feature_names(columnTransformer):
    """Return the output column names produced by the named transformers.

    Walks ``columnTransformer.transformers_`` and, for every entry except the
    one named ``'remainder'``, follows the input column names through the
    steps of its pipeline. A step exposing ``categories_`` (e.g. a fitted
    one-hot encoder) expands the names via ``get_feature_names_out`` (or the
    legacy ``get_feature_names`` when only that exists); every other step is
    assumed to keep the names unchanged.

    :param columnTransformer: fitted object exposing ``transformers_`` as an
        iterable of ``(name, transformer, columns)`` triples
    :return: list of output column names, in transformer order; remainder
        columns are NOT included and must be appended by the caller
    """
    output_features = []

    for name, pipe, features in columnTransformer.transformers_:
        if name == 'remainder':
            continue

        if isinstance(pipe, str):
            # String specs: 'drop' emits no columns, anything else
            # ('passthrough') forwards the columns untouched.
            if pipe != 'drop':
                output_features.extend(features)
            continue

        # A pipeline can be iterated step by step; a bare transformer cannot,
        # so treat it as a single-step pipeline.
        try:
            steps = list(pipe)
        except TypeError:
            steps = [pipe]

        # Thread the names through all steps so an encoder followed by another
        # step still contributes its expanded names.
        trans_features = list(features)
        for step in steps:
            if not hasattr(step, 'categories_'):
                continue
            if hasattr(step, 'get_feature_names_out'):
                expanded = step.get_feature_names_out(trans_features)
            else:
                # Older encoder API.
                expanded = step.get_feature_names(trans_features)
            trans_features = [str(col) for col in expanded]
        output_features.extend(trans_features)

    return output_features

In [45]:
# Fit the column scaler on the training split only; the test collection is
# transformed later with the statistics learned here.
ct.fit(sens.train_data.loc[:, model_features])
print('Fitted.')

Fitted.


In [46]:
# Replace the training frame by its scaled version. Column names: transformed
# columns first, then the passed-through meta features and label.
# infer_objects() restores proper dtypes after the generic transform output.
# Note: this overwrites sens.train_data, so the cell must not be run twice.
sens.train_data = pd.DataFrame(
    ct.transform(sens.train_data[model_features]),
    columns=get_transformer_feature_names(ct) + meta_features + [label],
).infer_objects()
print('Train data scaled.')

Train data scaled.


In [48]:
# Cut the test collection into consecutive row blocks so it can be scaled piece by piece.
n = 250000  # rows per chunk
chunks_test_collection = [
    sens.test_collection[offset:offset + n]
    for offset in range(0, len(sens.test_collection), n)
]

In [None]:
# Scale every chunk of the test collection with the scaler fitted on the train data.
# The output column names depend only on the fitted scaler, so they are computed
# once up front instead of once per chunk. Names of the transformed columns come
# first; the untransformed columns (meta features and label) follow.
scaled_columns = get_transformer_feature_names(ct) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(ct.transform(chunk[model_features]), columns=scaled_columns)
    # The transform output is a generic array; re-infer proper column dtypes.
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

In [50]:
# Reassemble the scaled chunks into one frame (fresh 0..n-1 index) and store it
# back on the Sentences object.
sens.test_collection = pd.concat(list(chunks_test_collection), ignore_index=True)

In [53]:
# Best model from sentence level, re-created unfitted with the same hyperparameters.
documents_mlp = MLPClassifier(
    # network shape
    hidden_layer_sizes=(9,),
    activation='tanh',
    # optimiser and stopping criteria
    solver='lbfgs',
    max_iter=1200,
    max_fun=15000,
    tol=0.0001,
    alpha=0.1,
    random_state=42,
    # Remaining arguments are copied verbatim from the stored configuration.
    # NOTE(review): per the scikit-learn docs several of them (e.g. momentum,
    # beta_1/beta_2, learning_rate) only apply to the 'sgd'/'adam' solvers —
    # confirm they are inert with solver='lbfgs'.
    batch_size=2000,
    learning_rate='adaptive',
    learning_rate_init=0.001,
    power_t=0.5,
    momentum=0.9,
    nesterovs_momentum=True,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=10,
    shuffle=True,
    verbose=False,
    warm_start=False,
)

In [None]:
# Train the MLP on the scaled documents training data and report the wall time.
# `mlp_features` is the column list defined for the pre-trained model in section 1.
start = datetime.now()
documents_mlp.fit(
    sens.train_data[mlp_features],
    sens.train_data[label],
)
time(start, datetime.now(), 'fitting the multilayer percptron')

In [55]:
# Evaluate the document-trained MLP on the documents test collection:
# first the boolean classification metrics, then the ranking metric (MAP).
print('Supervised evaluation on documents data using best sentence level model trained on document data')
print(80 * '-')
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(documents_mlp, sens, mlp_features)
print(f'Accuracy: {sup.accuracy}')
print(f'Precision: {sup.precision}')
print(f'Recall: {sup.recall}')
print(f'F1: {sup.f1}')
time(start, datetime.now(), 'evaluating boolean')

# Restart the clock so the MAP timing is reported on its own.
start = datetime.now()
print(f'MAP: {sup.compute_map(documents_mlp, sens, mlp_features)}')
time(start, datetime.now(), 'computing the MAP score')

Supervised evaluation on documents data using best sentence level model trained on document data
--------------------------------------------------------------------------------
Accuracy: 0.7192008200820083
Precision: 0.00022863321143123323
Recall: 0.6394422310756972
F1: 0.0004571029852313728
---- TIME 2020-05-24 20:04:58.623353: Computation time evaluating boolean: 0:00:27.926235
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.0011508661640863384
---- TIME 2020-05-24 20:14:42.849330: Computation time computing the MAP score: 0:09:44.225033


### 3) Train on sentence-level data using reasonable document features and evaluate on document data

In [6]:
# Locate the pickled sentence-level (en-de) training data and test collection
# below the project data path, then load both through the Sentences loader.
train_file = '{}extracted_data/global/en-de/training_data_avg.pkl'.format(paths.data_path)
test_file = '{}extracted_data/global/en-de/test_collection_avg.pkl'.format(paths.data_path)
sens_level, train_data_sens, test_collection_sens, features_sens = sentences.Sentences.load_from_file(
    train_file,
    test_file,
)

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: File loaded containing training data
---- INFO: File loaded containing test collection
---- DONE: All files loaded and features extracted


In [7]:
# Swap the sentence-level test collection for the documents test collection:
# `test_chunks` comes from the earlier load_chunks_from_file(docs=True, ...) call,
# so the model trained on sentence data below is evaluated on documents.
sens_level.test_collection = pd.concat(objs=test_chunks, ignore_index=True)

In [8]:
# Reduce embedding dimensionality and extract elements as features.
# Step 1: fit one centering scaler per side on the sentence-level training
# embeddings (with_std=False removes the mean only).
mean_scaler = {}
for prefix in ['src', 'trg']:
    # Stack the per-row embedding vectors into one 2-D matrix.
    X = np.vstack(sens_level.train_data[f'{prefix}_embedding'])
    mean_scaler[prefix] = StandardScaler(with_std=False).fit(X)

In [9]:
# Fit a 10-component PCA per side on the mean-centred sentence-level training embeddings.
pca = {}
for prefix in ['src', 'trg']:
    X = mean_scaler[prefix].transform(np.vstack(sens_level.train_data[f'{prefix}_embedding']))
    # Fixed random_state keeps the fitted components reproducible across runs.
    pca[prefix] = PCA(n_components=10, random_state=42).fit(X)

In [10]:
# Add the PCA embedding element columns, using the scalers and PCAs fitted on
# sentence-level data. Per the logged output this covers the train data and
# the (documents) test collection.
sens_level = element_based.vec2features(
    sens_level,
    pca,
    mean_scaler,
)

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: Started extraction for src language.
---- INFO: src_embedding_pca elements extracted for train data.
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: Started extraction for trg language.
---- INFO: trg_embedding_pca elements extracted for train data.
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection


In [15]:
# Build a fresh (unfitted) column scaler: z-scores for the numeric columns and
# one-hot encoding for the categorical ones. Every other input column is passed
# through unchanged (remainder='passthrough').
numeric_pipeline = make_pipeline(StandardScaler())
cat_pipeline = make_pipeline(OneHotEncoder())

ct = ColumnTransformer(
    [
        ('num', numeric_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ],
    remainder='passthrough',
)

In [16]:
# Fit the column scaler on the sentence-level training split only; the test
# collection is transformed later with the statistics learned here.
ct.fit(sens_level.train_data.loc[:, model_features])
print('Fitted.')

Fitted.


In [17]:
# Replace the training frame by its scaled version. Column names: transformed
# columns first, then the passed-through meta features and label.
# infer_objects() restores proper dtypes after the generic transform output.
# Note: this overwrites sens_level.train_data, so the cell must not be run twice.
sens_level.train_data = pd.DataFrame(
    ct.transform(sens_level.train_data[model_features]),
    columns=get_transformer_feature_names(ct) + meta_features + [label],
).infer_objects()
print('Train data scaled.')

Train data scaled.


In [18]:
# Cut the test collection into consecutive row blocks so it can be scaled piece by piece.
n = 250000  # rows per chunk
chunks_test_collection = [
    sens_level.test_collection[offset:offset + n]
    for offset in range(0, len(sens_level.test_collection), n)
]

In [None]:
# Scale every chunk of the test collection with the scaler fitted on sentence-level train data.
# The output column names depend only on the fitted scaler, so they are computed
# once up front instead of once per chunk. Names of the transformed columns come
# first; the untransformed columns (meta features and label) follow.
scaled_columns = get_transformer_feature_names(ct) + meta_features + [label]
for i, chunk in enumerate(chunks_test_collection):
    scaled_chunk = pd.DataFrame(ct.transform(chunk[model_features]), columns=scaled_columns)
    # The transform output is a generic array; re-infer proper column dtypes.
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

In [21]:
# Reassemble the scaled chunks into one frame (fresh 0..n-1 index) and store it
# back on the Sentences object.
sens_level.test_collection = pd.concat(objs=chunks_test_collection, ignore_index=True)

In [24]:
# Best model from sentence level, re-created unfitted with the same hyperparameters.
documents_mlp = MLPClassifier(
    # network shape
    hidden_layer_sizes=(9,),
    activation='tanh',
    # optimiser and stopping criteria
    solver='lbfgs',
    max_iter=1200,
    max_fun=15000,
    tol=0.0001,
    alpha=0.1,
    random_state=42,
    # Remaining arguments are copied verbatim from the stored configuration.
    # NOTE(review): per the scikit-learn docs several of them (e.g. momentum,
    # beta_1/beta_2, learning_rate) only apply to the 'sgd'/'adam' solvers —
    # confirm they are inert with solver='lbfgs'.
    batch_size=2000,
    learning_rate='adaptive',
    learning_rate_init=0.001,
    power_t=0.5,
    momentum=0.9,
    nesterovs_momentum=True,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    early_stopping=False,
    validation_fraction=0.1,
    n_iter_no_change=10,
    shuffle=True,
    verbose=False,
    warm_start=False,
)

In [25]:
# Fit on sentence level training data
# Trains the MLP on the scaled sentence-level training frame and prints the
# elapsed wall time.
# NOTE(review): `documents_features` is not defined in any visible cell of this
# notebook, so this cell raises NameError after "Restart & Run All". Define the
# feature list (a subset of the scaled column names) in a cell above — its
# intended contents cannot be recovered from the notebook itself.
start = datetime.now()
documents_mlp.fit(sens_level.train_data[documents_features], sens_level.train_data[label])
time(start, datetime.now(), 'fitting the multilayer percptron')

---- TIME 2020-05-25 01:16:45.010695: Computation time fitting the multilayer percptron: 0:01:21.312951


In [27]:
# Evaluate the sentence-trained MLP on the documents test collection:
# first the boolean classification metrics, then the ranking metric (MAP).
# NOTE(review): `documents_features` must already exist in the kernel; no
# visible cell defines it.
print('Supervised evaluation on documents data using reasonable documents features only, trained on sentence level data')
print(100 * '-')
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(documents_mlp, sens_level, documents_features)
print(f'Accuracy: {sup.accuracy}')
print(f'Precision: {sup.precision}')
print(f'Recall: {sup.recall}')
print(f'F1: {sup.f1}')
time(start, datetime.now(), 'evaluating boolean')

# Restart the clock so the MAP timing is reported on its own.
start = datetime.now()
print(f'MAP: {sup.compute_map(documents_mlp, sens_level, documents_features)}')
time(start, datetime.now(), 'computing the MAP score')

Supervised evaluation on documents data using reasonable documents features only, trained on sentence level data
----------------------------------------------------------------------------------------------------
Accuracy: 0.43374157415741577
Precision: 0.00017376004373667443
Recall: 0.9800796812749004
F1: 0.00034745848594964747
---- TIME 2020-05-25 01:19:35.182085: Computation time evaluating boolean: 0:00:34.930362
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.2384825701666099
---- TIME 2020-05-25 01:29:07.241004: Computation time computing the MAP score: 0:09:32.057834
