## Imports

In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import importlib, datetime

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/jani/PycharmProjects/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

## Load data

### Load from file

In [3]:
train_file_avg = f'{paths.data_path}extracted_data/global/training_data_avg.pkl'
test_file_avg = f'{paths.data_path}extracted_data/global/test_collection_avg.pkl'
sens_avg, train_data_avg, test_collection_avg, features_avg = sentences.Sentences.load_from_file(train_file_avg, test_file_avg)

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix from de to en


In [4]:
len(train_data_avg)

500000

In [5]:
len(test_collection_avg)

10000000

## Baseline MLP Model (averaging)

### Extract sentence embedding elements as features

In [None]:
# extract for target and source language:
# 1) vector elements of 300-dim sentence embeddings
# 2) vector elements of dimension reduced sentence embeddings
dim = 20
for prefix in ['src', 'trg']:
    train_data_avg[['{}_embedding_pca_{}'.format(prefix, i) for i in range(dim)]] = pd.DataFrame(sens_avg.reduce_dim(train_data_avg['{}_embedding'.format(prefix)], dim, use_ppa=False).tolist())
    print('1')
    test_collection_avg[['{}_embedding_pca_{}'.format(prefix, i) for i in range(dim)]] = pd.DataFrame(sens_avg.reduce_dim(test_collection_avg['{}_embedding'.format(prefix)], dim, use_ppa=False).tolist())
    print('2')
    train_data_avg[['{}_embedding_{}'.format(prefix, i) for i in range(300)]] = pd.DataFrame(train_data_avg['{}_embedding'.format(prefix)].tolist())
    print('3')
    test_collection_avg[['{}_embedding_{}'.format(prefix, i) for i in range(300)]] = pd.DataFrame(test_collection_avg['{}_embedding'.format(prefix)].tolist())
    print('4')

1


In [None]:
#### 1) Use self-selected features based on intuition

In [6]:
features_1 = 'norm_diff_translated_words norm_diff_num_punctuation abs_diff_occ_question_mark abs_diff_occ_exclamation_mark norm_diff_num_words euclidean_distance cosine_similarity'.split()
label = 'translation'

In [14]:
# Create baseline MLP classifier
mlp_1 = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(4,), random_state=1, learning_rate='adaptive', activation='tanh')

# Fit baseline model on training data
start = datetime.datetime.now()
mlp_1.fit(train_data_avg[features_1], train_data_avg[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluate baseline model on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_1, sens_avg, features_1)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(mlp_1, sens_avg, features_1)))

Computation time fitting the multilayer perceptron: 0:00:11.665122
Accuracy: 0.9707365
Precision: 0.0032398070424070643
Recall: 0.9500499500499501
F1: 0.006457592764236751
MAP: 0.7844755124611402


In [None]:
#### 2) Use extracted elements of full sentence embedding (300-dim) as features

In [None]:
features_2 = ['src_embedding_{}'.format(i) for i in range(300)] + ['trg_embedding_{}'.format(i) for i in range(300)]

In [None]:
# Create baseline MLP classifier
mlp_2 = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(15,15,10), random_state=1, learning_rate='adaptive', activation='tanh')

# Fit baseline model on training data
start = datetime.datetime.now()
mlp_2.fit(train_data_avg[features_1], train_data_avg[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluate baseline model on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_2, sens_avg, features_2)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(mlp_2, sens_avg, features_2)))

In [None]:
#### 3) Use self-selected features + extracted elements of dimension reduced sentence embedding (20-dim) as features

In [None]:
features_3 = features_1 + ['src_embedding_pca_{}'.format(i) for i in range(dim)] + ['trg_embedding_pca_{}'.format(i) for i in range(dim)]

In [None]:
# Create baseline MLP classifier
mlp_3 = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(4,3), random_state=1, learning_rate='adaptive', activation='tanh')

# Fit baseline model on training data
start = datetime.datetime.now()
mlp_3.fit(train_data_avg[features_3], train_data_avg[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluate baseline model on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp_3, sens_avg, features_3)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(mlp_3, sens_avg, features_3)

## Create random grid for tuning best model (variant 1)

In [15]:
# Hidden layers
hidden_layers = [(i,) for i in range(10)] + [(2,2), (2,2,2), (3,3), (4,2,1), (3,3,1)]

# Activation function
activation = ['identity', 'logistic', 'tanh', 'relu']

# Optimizer
optimizer = ['lbfgs', 'sgd', 'adam']

# Alpha
alpha = [0.1, 0.01, 0.001, 0.0001]

# Batch size
batch_size = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Learning rate
learning_rate = ['constant', 'invscaling', 'adaptive']

# Learning rate init
learning_rate_init = [0.1, 0.01, 0.001, 0.0001]

# Maximum iterations
max_iter = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

In [16]:
parameter_grid = {
    'hidden_layer_sizes': hidden_layers,
    'activation': activation,
    'solver': optimizer,
    'alpha': alpha,
    'batch_size': batch_size,
    'learning_rate': learning_rate,
    'learning_rate_init': learning_rate_init,
    'max_iter': max_iter
}

## Perform random search for optimal hyperparameters

In [17]:
cv = StratifiedKFold(n_splits=5, random_state=42)
mlp = MLPClassifier()
randomized_search = RandomizedSearchCV(estimator = mlp, param_distributions = parameter_grid, n_iter = 10, 
                                cv = cv, verbose=20, random_state=42, n_jobs = -1)
randomized_search.fit(train_data_avg[features_1], train_data_avg[label])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
                   error_score=nan,
                   estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                           batch_size='auto', beta_1=0.9,
                                           beta_2=0.999, early_stopping=False,
                                           epsilon=1e-08,
                                           hidden_layer_sizes=(100,),
                                           learning_rate='constant',
                                           learning_rate_init=0.001,
                                           max_fun=15000, max_iter=200,
                                           momentum=0.9, n_iter...
                                                               (3,), (4,), (5,),
                                                               (6,), (7,), (8,),
                                                               (9,), (2, 2),
        

## Evaluate random search to identify optimal hyperparameters

In [18]:
# Identify best hyperparameters retrieved by random search
randomized_search.best_params_

{'solver': 'lbfgs',
 'max_iter': 200,
 'learning_rate_init': 0.001,
 'learning_rate': 'adaptive',
 'hidden_layer_sizes': (8,),
 'batch_size': 1200,
 'alpha': 0.01,
 'activation': 'relu'}

In [19]:
# Identify best hyperparameters retrieved by random search
randomized_search.best_score_

0.9471400000000001

In [20]:
# Fit best model on training data
best_model = randomized_search.best_estimator_
best_model.fit(train_data_avg[features_1], train_data_avg[label])

MLPClassifier(activation='relu', alpha=0.01, batch_size=1200, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(8,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='lbfgs',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [21]:
# Evaluate best model on test collection (2nd run)
start = datetime.datetime.now()
best_model.fit(train_data_avg[features_1], train_data_avg[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluation on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(best_model, sens_avg, features_1)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(best_model, sens_avg, features_1)))

Computation time fitting the multilayer perceptron: 0:00:16.033537
Accuracy: 0.9698826
Precision: 0.003158093359065946
Recall: 0.9530469530469531
F1: 0.006295326017381435
MAP: 0.7851291324638691
