## Imports

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import importlib, datetime
import copy
from sklearn.externals import joblib
import pickle

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides convergence / deprecation warnings raised while fitting the
# models below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Lift the pandas display limits so that displayed frames show all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)



In [2]:
# Import the project modules used in this notebook and reload each one right after importing it,
# so that edits made to the module sources are picked up without restarting the kernel.
from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.features import element_based
# Last expression of the cell: the reloaded module object is echoed as the cell output.
importlib.reload(element_based)

<module 'ir_crosslingual.features.element_based' from '/Users/jani/PycharmProjects/ir-crosslingual/ir_crosslingual/features/element_based.py'>

## Load and preprocess data

### Load from file

In [3]:
# Pickle files holding the training data and the test collection
train_file_avg = f'{paths.data_path}extracted_data/global/training_data_avg.pkl'
test_file_avg = f'{paths.data_path}extracted_data/global/test_collection_avg.pkl'

# load_from_file returns four values: the Sentences object, the training data,
# the test collection and the features
loaded_avg = sentences.Sentences.load_from_file(train_file_avg, test_file_avg)
sens_avg, train_data_avg, test_collection_avg, features_avg = loaded_avg

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: File loaded containing training data
---- INFO: File loaded containing test collection
---- DONE:

### Reduce embedding dimensionality and extract elements as features

In [4]:
# One centering-only scaler (with_std=False) per language side, fitted on the stacked
# sentence embeddings of the training data.
mean_scaler = {}
for prefix in ('src', 'trg'):
    embedding_matrix = np.vstack(sens_avg.train_data['{}_embedding'.format(prefix)])
    mean_scaler[prefix] = StandardScaler(with_std=False).fit(embedding_matrix)

In [5]:
# One 10-component PCA per language side, fitted on the mean-centered training embeddings.
pca = {}
for prefix in ('src', 'trg'):
    embedding_matrix = np.vstack(sens_avg.train_data['{}_embedding'.format(prefix)])
    centered_matrix = mean_scaler[prefix].transform(embedding_matrix)
    pca[prefix] = PCA(n_components=10, random_state=42).fit(centered_matrix)

In [6]:
# Hand the fitted PCA models and mean scalers to element_based.vec2features and keep the
# returned object as the new sens_avg.
# NOTE(review): the cell rebinds its own input, so re-running it applies vec2features to the
# already-processed object - confirm that this is safe before re-executing.
sens_avg = element_based.vec2features(sens_avg, pca, mean_scaler)

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: Started extraction for src language.
---- INFO: src_embedding_pca elements extracted for train data.
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: Started extraction for trg language.
---- INFO: trg_embedding_pca elements extracted for train data.
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection


### Remove pairs that contain sentences equal to '.'

In [7]:
# Keep only the training pairs in which neither the source nor the target sentence is '.'
train_keep = (sens_avg.train_data['src_sentence'] != '.') & (sens_avg.train_data['trg_sentence'] != '.')
sens_avg.train_data = sens_avg.train_data[train_keep]

# Same filter for the test collection
test_keep = (sens_avg.test_collection['src_sentence'] != '.') & (sens_avg.test_collection['trg_sentence'] != '.')
sens_avg.test_collection = sens_avg.test_collection[test_keep]

### Scale features (z-scores for numerical, OneHotEncoding for categorical)

In [8]:
# Feature definitions, grouped by how the column transformer treats them.

# Target column and the raw sentence columns that are only carried along
label = 'translation'
meta_features = ['src_sentence', 'trg_sentence']

# Columns that are one-hot encoded
cat_features = ['abs_diff_occ_question_mark', 'abs_diff_occ_exclamation_mark']

# Names of the 10 PCA elements per language side
src_pca_features = list(map('src_embedding_pca_{}'.format, range(10)))
trg_pca_features = list(map('trg_embedding_pca_{}'.format, range(10)))

# Columns that are standardized
scalar_num_features = [
    'norm_diff_translated_words',
    'abs_diff_num_words',
    'abs_diff_num_punctuation',
    'rel_diff_num_words',
    'rel_diff_num_punctuation',
    'norm_diff_num_words',
    'norm_diff_num_punctuation',
    'euclidean_distance',
    'cosine_similarity',
]
num_features = scalar_num_features + src_pca_features + trg_pca_features

# Every column handed to the column transformer; the categorical columns sit after the
# first three numerical ones, exactly as in the original ordering
model_features = (
    meta_features
    + [label]
    + scalar_num_features[:3]
    + cat_features
    + scalar_num_features[3:]
    + src_pca_features
    + trg_pca_features
)

#### Fit scaler on training data

In [9]:
# Column transformer: standardize the numerical columns, one-hot encode the categorical ones
# and pass every remaining column through unchanged.
numeric_pipeline = make_pipeline(StandardScaler())
cat_pipeline = make_pipeline(OneHotEncoder())

transformers = [('num', numeric_pipeline, num_features), ('cat', cat_pipeline, cat_features)]
ct = ColumnTransformer(transformers=transformers, remainder='passthrough')

In [10]:
# define function that gets the feature names of the transformed columns
def get_transformer_feature_names(columnTransformer):
    """Return the output column names of a fitted ColumnTransformer, in output order.

    Walks over ``columnTransformer.transformers_`` and collects, per entry, the names of
    the columns it produces. Steps exposing ``categories_`` (one-hot style encoders)
    expand each input column into one name per category; all other steps are assumed to
    keep the column names unchanged.

    The ``'remainder'`` entry is skipped on purpose (callers append those names
    themselves), and ``'drop'`` entries contribute no names because they produce no
    output columns.

    Parameters
    ----------
    columnTransformer : fitted transformer exposing ``transformers_``, an iterable of
        ``(name, transformer, columns)`` triples. ``transformer`` may be a pipeline
        (iterable of steps), a single transformer, or the strings ``'passthrough'`` /
        ``'drop'``.

    Returns
    -------
    list
        Names of the transformed columns (excluding the remainder).
    """
    output_features = []

    for name, pipe, features in columnTransformer.transformers_:
        # Remainder columns are named by the caller; dropped columns produce no output.
        if name == 'remainder' or (isinstance(pipe, str) and pipe == 'drop'):
            continue

        # Normalize the transformer to a list of steps: 'passthrough' has none, a pipeline
        # is iterated step by step, and a bare transformer is treated as a single step.
        if isinstance(pipe, str):
            steps = []
        else:
            try:
                steps = list(pipe)
            except TypeError:
                steps = [pipe]

        # A scalar string column spec names exactly one column - do not split it into characters.
        trans_features = [features] if isinstance(features, str) else list(features)

        for step in steps:
            if hasattr(step, 'categories_'):
                # Prefer the current scikit-learn API and fall back to the legacy method name.
                if hasattr(step, 'get_feature_names_out'):
                    trans_features = list(step.get_feature_names_out(trans_features))
                else:
                    trans_features = list(step.get_feature_names(trans_features))
            # Steps without categories_ leave the (possibly already expanded) names untouched.

        output_features.extend(trans_features)

    return output_features

In [11]:
# Fit the column transformer on the model feature columns of the training data.
# This cell only fits; the transformed frames are produced in the cells that follow.
ct.fit(sens_avg.train_data[model_features])
print('Fitted.')

Fitted.


In [12]:
# Transform the training data and label the resulting columns: transformed feature names first,
# then the passed-through sentence columns and the label.
scaled_train_columns = get_transformer_feature_names(ct) + meta_features + [label]
scaled_train = pd.DataFrame(ct.transform(sens_avg.train_data[model_features]), columns=scaled_train_columns)

# infer_objects() converts the object-typed columns of the frame back to better-fitting dtypes
sens_avg.train_data = scaled_train.infer_objects()
print('Train data scaled.')

Train data scaled.


In [13]:
# Split the test collection into consecutive row chunks so that it can be transformed piecewise
n = 500000  # rows per chunk
chunk_starts = range(0, sens_avg.test_collection.shape[0], n)
chunks_test_collection = [sens_avg.test_collection[first:first + n] for first in chunk_starts]

In [14]:
# Transform every chunk of the test collection with the fitted column transformer.
# The output column names do not depend on the chunk, so they are computed once.
scaled_chunk_columns = get_transformer_feature_names(ct) + meta_features + [label]
for i in range(len(chunks_test_collection)):
    scaled_chunk = pd.DataFrame(ct.transform(chunks_test_collection[i][model_features]),
                                columns=scaled_chunk_columns)
    chunks_test_collection[i] = scaled_chunk.infer_objects()
    print('Chunk {} scaled.'.format(i))

Chunk 0 scaled.
Chunk 1 scaled.
Chunk 2 scaled.
Chunk 3 scaled.
Chunk 4 scaled.
Chunk 5 scaled.
Chunk 6 scaled.
Chunk 7 scaled.
Chunk 8 scaled.
Chunk 9 scaled.
Chunk 10 scaled.
Chunk 11 scaled.
Chunk 12 scaled.
Chunk 13 scaled.
Chunk 14 scaled.
Chunk 15 scaled.
Chunk 16 scaled.
Chunk 17 scaled.
Chunk 18 scaled.
Chunk 19 scaled.


In [15]:
# Reassemble the scaled chunks into one frame with a fresh 0..n-1 index and store it
# on the Sentences object
sens_avg.test_collection = pd.concat(list(chunks_test_collection), ignore_index=True)

## Baseline MLP models using "average" aggregation

#### 1) Use self-selected features based on intuition

In [33]:
# Hand-picked feature columns (names as they appear after scaling / one-hot encoding)
features_1 = [
    'norm_diff_translated_words',
    'norm_diff_num_punctuation',
    'abs_diff_occ_question_mark_0',
    'abs_diff_occ_question_mark_1',
    'abs_diff_occ_question_mark_2',
    'abs_diff_occ_exclamation_mark_0',
    'abs_diff_occ_exclamation_mark_1',
    'abs_diff_occ_exclamation_mark_2',
    'norm_diff_num_words',
    'euclidean_distance',
    'cosine_similarity',
]

In [34]:
# Baseline MLP on the hand-picked features: a single hidden layer with 5 tanh units
mlp_1 = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(5,),
    random_state=1,
    learning_rate='adaptive',
    activation='tanh',
)

# Train on the scaled training data and report how long the fit took
fit_started = datetime.datetime.now()
mlp_1.fit(sens_avg.train_data[features_1], sens_avg.train_data[label])
fit_duration = datetime.datetime.now() - fit_started
print('Computation time fitting the multilayer perceptron: {}'.format(fit_duration))

# Boolean evaluation on the test collection; the metrics are read from the SupModel object
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_1, sens_avg, features_1)
for template, attribute in (('Accuracy: {}', 'accuracy'), ('Precision: {}', 'precision'),
                            ('Recall: {}', 'recall'), ('F1: {}', 'f1')):
    print(template.format(getattr(sup, attribute)))

# Ranking evaluation
map_score = sup.compute_map(mlp_1, sens_avg, features_1)
print('MAP: {}'.format(map_score))

Computation time fitting the multilayer perceptron: 0:00:12.432663
Accuracy: 0.9737696696696697
Precision: 0.003631841552830935
Recall: 0.955955955955956
F1: 0.007236191565858815
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.8000676445473803


#### 2) Use features selected through RFECV (see logistic_regression)

In [29]:
# Feature set 2: features_1 without 'norm_diff_num_punctuation', plus the two absolute
# difference features. The list is built in a fixed order instead of via set arithmetic:
# iterating a set of strings depends on hash randomization, so the column order (and with it
# the fitted, seeded MLP) could differ from one kernel session to the next.
features_2 = [feature for feature in features_1 if feature != 'norm_diff_num_punctuation']
features_2 += [feature for feature in ['abs_diff_num_words', 'abs_diff_num_punctuation']
               if feature not in features_2]

In [31]:
# Baseline MLP on feature set 2: a single hidden layer with 5 tanh units
mlp_2 = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(5,),
    random_state=42,
    learning_rate='adaptive',
    activation='tanh',
)

# Train on the scaled training data and report how long the fit took
fit_started = datetime.datetime.now()
mlp_2.fit(sens_avg.train_data[features_2], sens_avg.train_data[label])
fit_duration = datetime.datetime.now() - fit_started
print('Computation time fitting the multilayer perceptron: {}'.format(fit_duration))

# Boolean evaluation on the test collection; the metrics are read from the SupModel object
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_2, sens_avg, features_2)
for template, attribute in (('Accuracy: {}', 'accuracy'), ('Precision: {}', 'precision'),
                            ('Recall: {}', 'recall'), ('F1: {}', 'f1')):
    print(template.format(getattr(sup, attribute)))

# Ranking evaluation
map_score = sup.compute_map(mlp_2, sens_avg, features_2)
print('MAP: {}'.format(map_score))

Computation time fitting the multilayer perceptron: 0:00:13.530632
Accuracy: 0.9767011011011011
Precision: 0.00415916063688217
Recall: 0.972972972972973
F1: 0.008282914358755859
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.8277222438347304


#### 3) Use correlation-based features

In [27]:
# Correlation-based feature columns (names as they appear after scaling / one-hot encoding)
features_corr = [
    'norm_diff_translated_words',
    'abs_diff_num_punctuation',
    'abs_diff_occ_question_mark_0',
    'abs_diff_occ_question_mark_1',
    'abs_diff_occ_question_mark_2',
    'abs_diff_occ_exclamation_mark_0',
    'abs_diff_occ_exclamation_mark_1',
    'abs_diff_occ_exclamation_mark_2',
    'abs_diff_num_words',
    'cosine_similarity',
]

In [32]:
# Baseline MLP on the correlation-based features: a single hidden layer with 5 tanh units
mlp_corr = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(5,),
    random_state=42,
    learning_rate='adaptive',
    activation='tanh',
)

# Train on the scaled training data and report how long the fit took
fit_started = datetime.datetime.now()
mlp_corr.fit(sens_avg.train_data[features_corr], sens_avg.train_data[label])
fit_duration = datetime.datetime.now() - fit_started
print('Computation time fitting the multilayer perceptron: {}'.format(fit_duration))

# Boolean evaluation on the test collection; the metrics are read from the SupModel object
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_corr, sens_avg, features_corr)
for template, attribute in (('Accuracy: {}', 'accuracy'), ('Precision: {}', 'precision'),
                            ('Recall: {}', 'recall'), ('F1: {}', 'f1')):
    print(template.format(getattr(sup, attribute)))

# Ranking evaluation
map_score = sup.compute_map(mlp_corr, sens_avg, features_corr)
print('MAP: {}'.format(map_score))

Computation time fitting the multilayer perceptron: 0:00:07.112758
Accuracy: 0.9745493493493493
Precision: 0.0037193661792428796
Recall: 0.94994994994995
F1: 0.00740972086667968
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.8256640284049531


#### 4) Use all features

In [22]:
# "All features" variant: feature set 2 extended by the punctuation norm difference and the
# two relative difference features (PCA embedding elements are not part of this set).
features_3 = features_2 + ['norm_diff_num_punctuation', 'rel_diff_num_words', 'rel_diff_num_punctuation']

In [23]:
# Baseline MLP on feature set 3: a single hidden layer with 5 tanh units
mlp_3 = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(5,),
    random_state=1,
    learning_rate='adaptive',
    activation='tanh',
)

# Train on the scaled training data and report how long the fit took
fit_started = datetime.datetime.now()
mlp_3.fit(sens_avg.train_data[features_3], sens_avg.train_data[label])
fit_duration = datetime.datetime.now() - fit_started
print('Computation time fitting the multilayer perceptron: {}'.format(fit_duration))

# Boolean evaluation on the test collection; the metrics are read from the SupModel object
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_3, sens_avg, features_3)
for template, attribute in (('Accuracy: {}', 'accuracy'), ('Precision: {}', 'precision'),
                            ('Recall: {}', 'recall'), ('F1: {}', 'f1')):
    print(template.format(getattr(sup, attribute)))

# Ranking evaluation
map_score = sup.compute_map(mlp_3, sens_avg, features_3)
print('MAP: {}'.format(map_score))

Computation time fitting the multilayer perceptron: 0:00:14.093721
Accuracy: 0.9767005005005005
Precision: 0.004154810551763976
Recall: 0.9719719719719719
F1: 0.008274251823573523
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.821184407892351


#### 5) Use best of {1,2,3,4} + extracted elements of dimension reduced sentence embedding (10-dim) as features

In [25]:
# Feature set 2 extended by the 10 PCA elements of the source and of the target embedding
pca_element_features = (list(map('src_embedding_pca_{}'.format, range(10)))
                        + list(map('trg_embedding_pca_{}'.format, range(10))))
features_4 = features_2 + pca_element_features

In [26]:
# Baseline MLP on feature set 4: a single hidden layer with 8 tanh units
mlp_4 = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(8,),
    random_state=1,
    learning_rate='adaptive',
    activation='tanh',
)

# Train on the scaled training data and report how long the fit took
fit_started = datetime.datetime.now()
mlp_4.fit(sens_avg.train_data[features_4], sens_avg.train_data[label])
fit_duration = datetime.datetime.now() - fit_started
print('Computation time fitting the multilayer perceptron: {}'.format(fit_duration))

# Boolean evaluation on the test collection; the metrics are read from the SupModel object
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_4, sens_avg, features_4)
for template, attribute in (('Accuracy: {}', 'accuracy'), ('Precision: {}', 'precision'),
                            ('Recall: {}', 'recall'), ('F1: {}', 'f1')):
    print(template.format(getattr(sup, attribute)))

# Ranking evaluation
map_score = sup.compute_map(mlp_4, sens_avg, features_4)
print('MAP: {}'.format(map_score))

Computation time fitting the multilayer perceptron: 0:00:20.210128
Accuracy: 0.9838556556556557
Precision: 0.006040322232700332
Recall: 0.980980980980981
F1: 0.012006713958417565
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.850281183980444


## Hyperparameter tuning of best model (variant 5)

### Create random grid for tuning best model (1st run)

In [97]:
# Candidate values for the first randomized hyperparameter search.

# Hidden layer layouts (one tuple entry per hidden layer)
hidden_layers = [
    (4, 4), (3, 3, 2), (8,), (2, 2, 2, 1),
    (8, 3, 2), (7,), (9,), (4, 3, 2),
]

# Activation functions and optimizers
activation = ['identity', 'logistic', 'tanh', 'relu']
optimizer = ['lbfgs', 'sgd', 'adam']

# Alpha values
alpha = [0.1, 0.01, 0.001, 0.0001]

# Batch sizes: 10 evenly spaced integers from 200 to 2000
batch_size = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()

# Learning rate schedules and initial learning rates
learning_rate = ['constant', 'invscaling', 'adaptive']
learning_rate_init = [0.1, 0.01, 0.001, 0.0001]

# Maximum iterations: 10 evenly spaced integers from 200 to 2000
max_iter = np.linspace(start=200, stop=2000, num=10).astype(int).tolist()

In [98]:
# Map each MLPClassifier parameter name to its candidate values for the first random search
parameter_grid = dict(
    hidden_layer_sizes=hidden_layers,
    activation=activation,
    solver=optimizer,
    alpha=alpha,
    batch_size=batch_size,
    learning_rate=learning_rate,
    learning_rate_init=learning_rate_init,
    max_iter=max_iter,
)

### Perform random search for optimal hyperparameters (1st run)

In [None]:
# 5-fold stratified cross-validation. Without shuffling the folds are already deterministic, so
# random_state is omitted: combined with shuffle=False it has no effect, and newer scikit-learn
# releases reject that combination with a ValueError.
cv = StratifiedKFold(n_splits=5)

# Seeded base estimator; the searched parameters are set on top of it for every candidate
mlp = MLPClassifier(random_state=42)

# Sample 50 parameter combinations from parameter_grid, using all available cores (n_jobs=-1)
randomized_search = RandomizedSearchCV(estimator=mlp, param_distributions=parameter_grid, n_iter=50,
                                       cv=cv, verbose=20, random_state=42, n_jobs=-1)
randomized_search.fit(sens_avg.train_data[features_4], sens_avg.train_data[label])

### Evaluate random search to identify optimal hyperparameters and resulting MAP (1st run)

In [100]:
# Show the hyperparameter combination selected by the first random search
randomized_search.best_params_

{'solver': 'lbfgs',
 'max_iter': 1200,
 'learning_rate_init': 0.001,
 'learning_rate': 'adaptive',
 'hidden_layer_sizes': (9,),
 'batch_size': 2000,
 'alpha': 0.1,
 'activation': 'tanh'}

In [101]:
# Show the score of the best parameter combination found by the first random search
# (RandomizedSearchCV.best_score_) - this is a score, not a hyperparameter setting
randomized_search.best_score_

0.9742503350737163

In [102]:
# Retrieve the estimator with the best hyperparameters found by the first random search
# (nothing is fitted in this cell)
best_model = randomized_search.best_estimator_

In [107]:
# Fit the best model of the first random search on the training data and time the fit
fit_started = datetime.datetime.now()
best_model.fit(sens_avg.train_data[features_4], sens_avg.train_data[label])
fit_duration = datetime.datetime.now() - fit_started
print('Computation time fitting the multilayer perceptron: {}'.format(fit_duration))

# Boolean evaluation on the test collection; the metrics are read from the SupModel object
sup = sup_model.SupModel()
sup.evaluate_boolean(best_model, sens_avg, features_4)
for template, attribute in (('Accuracy: {}', 'accuracy'), ('Precision: {}', 'precision'),
                            ('Recall: {}', 'recall'), ('F1: {}', 'f1')):
    print(template.format(getattr(sup, attribute)))

# Ranking evaluation
map_score = sup.compute_map(best_model, sens_avg, features_4)
print('MAP: {}'.format(map_score))

Computation time fitting the multilayer perceptron: 0:01:35.418652
Accuracy: 0.9847024024024024
Precision: 0.006391833019051954
Recall: 0.983983983983984
F1: 0.012701160935208573
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.8666164677324514


### Create random grid for tuning best model (variant 5) (2nd run)

In [36]:
# Candidate values for the second randomized hyperparameter search.

# Hidden layer layouts: a single hidden layer with 10, 12, ..., 30 units
hidden_layers_2 = [(units,) for units in range(10, 32, 2)]

# Activation functions and optimizers
activation_2 = ['identity', 'logistic', 'tanh', 'relu']
optimizer_2 = ['lbfgs', 'sgd', 'adam']

# Alpha values
alpha_2 = [0.001, 0.005, 0.05, 0.1, 0.15, 0.2]

# Batch sizes: 5 evenly spaced integers from 1500 to 2500
batch_size_2 = np.linspace(start=1500, stop=2500, num=5).astype(int).tolist()

# Learning rate schedules and initial learning rates
learning_rate_2 = ['constant', 'invscaling', 'adaptive']
learning_rate_init_2 = [0.0005, 0.001, 0.0015, 0.002]

# Maximum iterations: 5 evenly spaced integers from 500 to 1500
max_iter_2 = np.linspace(start=500, stop=1500, num=5).astype(int).tolist()

In [37]:
# Map each MLPClassifier parameter name to its candidate values for the second random search
parameter_grid_2 = dict(
    hidden_layer_sizes=hidden_layers_2,
    activation=activation_2,
    solver=optimizer_2,
    alpha=alpha_2,
    batch_size=batch_size_2,
    learning_rate=learning_rate_2,
    learning_rate_init=learning_rate_init_2,
    max_iter=max_iter_2,
)

### Perform random search for optimal hyperparameters (2nd run)

In [None]:
# 5-fold stratified cross-validation. Without shuffling the folds are already deterministic, so
# random_state is omitted: combined with shuffle=False it has no effect, and newer scikit-learn
# releases reject that combination with a ValueError.
cv = StratifiedKFold(n_splits=5)

# Seed the base estimator so that the weight initialisation - and therefore the outcome of the
# search - is reproducible across runs
mlp_final = MLPClassifier(random_state=42)

# Sample 50 parameter combinations from parameter_grid_2, using all available cores (n_jobs=-1)
randomized_search_2 = RandomizedSearchCV(estimator=mlp_final, param_distributions=parameter_grid_2, n_iter=50,
                                         cv=cv, verbose=20, random_state=42, n_jobs=-1)
randomized_search_2.fit(sens_avg.train_data[features_4], sens_avg.train_data[label])

### Evaluate random search to identify optimal hyperparameters (2nd run)

In [40]:
# Show the hyperparameter combination selected by the second random search
randomized_search_2.best_params_

{'solver': 'lbfgs',
 'max_iter': 1250,
 'learning_rate_init': 0.001,
 'learning_rate': 'adaptive',
 'hidden_layer_sizes': (26,),
 'batch_size': 2250,
 'alpha': 0.001,
 'activation': 'tanh'}

In [41]:
# Show the score of the best parameter combination found by the second random search
# (RandomizedSearchCV.best_score_) - this is a score, not a hyperparameter setting
randomized_search_2.best_score_

0.9759167016743684

In [42]:
# Retrieve the estimator with the best hyperparameters found by the second random search
# (nothing is fitted in this cell)
best_model_2 = randomized_search_2.best_estimator_

In [43]:
# Fit the best model of the second random search on the training data and time the fit
fit_started = datetime.datetime.now()
best_model_2.fit(sens_avg.train_data[features_4], sens_avg.train_data[label])
fit_duration = datetime.datetime.now() - fit_started
print('Computation time fitting the multilayer perceptron: {}'.format(fit_duration))

# Boolean evaluation on the test collection; the metrics are read from the SupModel object
sup = sup_model.SupModel()
sup.evaluate_boolean(best_model_2, sens_avg, features_4)
for template, attribute in (('Accuracy: {}', 'accuracy'), ('Precision: {}', 'precision'),
                            ('Recall: {}', 'recall'), ('F1: {}', 'f1')):
    print(template.format(getattr(sup, attribute)))

# Ranking evaluation
map_score = sup.compute_map(best_model_2, sens_avg, features_4)
print('MAP: {}'.format(map_score))

Computation time fitting the multilayer perceptron: 0:02:12.130973
Accuracy: 0.9855294294294294
Precision: 0.006775049129413058
Recall: 0.986986986986987
F1: 0.013457719421563744
---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.8584644422131052


### Did not further improve w.r.t. MAP -> choose model from first random search as best model

#### Save best model

In [29]:
# Persist best_model (the estimator taken from the first random search) under the name
# 'mlp_avg_best', together with the prepared features and the features dict of sens_avg.
sup_model.SupModel.save_model(model=best_model, name='mlp_avg_best', prepared_features=sens_avg.prepared_features, features_dict=sens_avg.features_dict)

#### Save scaler

In [16]:
# Serialize the fitted column transformer so that new data can be transformed the same way.
# The path is relative to the notebook's working directory.
joblib.dump(ct, '../main/models/scaler/ct.pkl')

['../main/models/scaler/ct.pkl']

#### Save PCA

In [8]:
# Serialize the fitted PCA of each language side to its own file
for prefix in ('src', 'trg'):
    pca_path = '../main/models/pca/pca_{}.pkl'.format(prefix)
    joblib.dump(pca[prefix], pca_path)

#### Save mean_scaler

In [9]:
# Serialize the fitted mean scaler of each language side to its own file
for prefix in ('src', 'trg'):
    mean_scaler_path = '../main/models/mean_scaler/mean_scaler_{}.pkl'.format(prefix)
    joblib.dump(mean_scaler[prefix], mean_scaler_path)