## Imports

In [1]:
import pandas as pd
import numpy as np
import importlib, datetime

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/03_Feature-Selection/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

## Load data

### Load from file

In [3]:
train_file = f'{paths.data_path}extracted-data/training_data_v0.0.pkl'
test_file = f'{paths.data_path}extracted-data/test_collection_v0.0.pkl'
sens, train_data, test_collection, features = sentences.Sentences.load_from_file(train_file, test_file)

Learn projection matrix for en-de
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)
Resulting subspace dimension: (13700, 300)
Learn projection matrix for de-en
Found 10604 valid translation pairs in expert dictionary.
262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
Resulting subspace dimension: (10604, 300)
Resulting subspace dimension: (10604, 300)


### Load manually

In [None]:
german = embeddings.WordEmbeddings('de')
german.load_embeddings()

english = embeddings.WordEmbeddings('en')
english.load_embeddings()

W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

prepared_features = ['num_words', 'num_punctuation', 'occ_question_mark', 'occ_exclamation_mark']
sens = sentences.Sentences(src_words=english, trg_words=german)
data = sens.load_data(single_source=False, n_max=5000, features=prepared_features, agg_method='average')

features_dict = {'text_based': ['diff_{}'.format(feat) for feat in prepared_features], 
                 'vector_based': ['cosine_similarity']}
features = [feature for values in features_dict.values() for feature in values]

train_data = sens.create_train_set(n_train=4000, frac_pos=0.5)
test_data = sens.create_test_collection(n_queries=50, n_docs=996)

train_data, test_data = sens.extract_features(features_dict=features_dict, data='train_test')

## Baseline MLP Model

In [4]:
features = 'diff_num_words diff_num_punctuation diff_occ_question_mark diff_occ_exclamation_mark cosine_similarity'.split()
label = 'translation'

In [8]:
# Create baseline MLP classifier
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 4, 3, 2), random_state=1, learning_rate='adaptive', activation='tanh')

# Fit baseline model on training data
start = datetime.datetime.now()
mlp.fit(train_data[features], train_data[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluate baseline model on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp, sens, features)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(mlp, sens, features)))

Computation time fitting the multilayer perceptron: 0:00:00.631952
Accuracy: 0.8780923694779117
Precision: 0.00784698381559588
Recall: 0.96
F1: 0.015566726122912275
MAP: 0.8122558280755896


In [37]:
# Create baseline MLP classifier
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 4, 3, 2), random_state=1, learning_rate='adaptive', activation='tanh')

# Fit baseline model on training data
start = datetime.datetime.now()
mlp.fit(train_data[features], train_data[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluate baseline model on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(mlp, sens)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(mlp, sens)))

Computation time fitting the multilayer perceptron: 0:00:00.649427
Accuracy: 0.8926305220883534
Precision: 0.008900426478768774
Recall: 0.96
F1: 0.017637332353481534
MAP: 0.8279756873623236


## Create random grid

In [9]:
# Hidden layers
hidden_layers = [(5,4,3,2), (5,4,2), (5,3,2), (4,3,2), (5,2), (4,2), (3,2)]

# Activation function
activation = ['identity', 'logistic', 'tanh', 'relu']

# Optimizer
optimizer = ['lbfgs', 'sgd', 'adam']

# Alpha
alpha = [0.1, 0.01, 0.001, 0.0001]

# Batch size
batch_size = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Learning rate
learning_rate = ['constant', 'invscaling', 'adaptive']

# Learning rate init
learning_rate_init = [0.1, 0.01, 0.001, 0.0001]

# Maximum iterations
max_iter = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

In [10]:
parameter_grid = {
    'hidden_layer_sizes': hidden_layers,
    'activation': activation,
    'solver': optimizer,
    'alpha': alpha,
    'batch_size': batch_size,
    'learning_rate': learning_rate,
    'learning_rate_init': learning_rate_init,
    'max_iter': max_iter
}

## Perform random search for optimal hyperparameters

In [11]:
cv = StratifiedKFold(n_splits=5, random_state=42)
mlp = MLPClassifier()
randomized_search = RandomizedSearchCV(estimator = mlp, param_distributions = parameter_grid, n_iter = 100, 
                                cv = cv, verbose=20, random_state=42, n_jobs = -1)
randomized_search.fit(train_data[features], train_data[label])

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
                   error_score=nan,
                   estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                           batch_size='auto', beta_1=0.9,
                                           beta_2=0.999, early_stopping=False,
                                           epsilon=1e-08,
                                           hidden_layer_sizes=(100,),
                                           learning_rate='constant',
                                           learning_rate_init=0.001,
                                           max_fun=15000, max_iter=200,
                                           momentum=0.9, n_iter...
                                        'hidden_layer_sizes': [(5, 4, 3, 2),
                                                               (5, 4, 2),
                                                               (5, 3, 2),
                      

## Evaluate random search to identify optimal hyperparameters

In [12]:
# Identify best hyperparameters retrieved by random search
randomized_search.best_params_

{'solver': 'adam',
 'max_iter': 600,
 'learning_rate_init': 0.01,
 'learning_rate': 'adaptive',
 'hidden_layer_sizes': (5, 3, 2),
 'batch_size': 1800,
 'alpha': 0.1,
 'activation': 'logistic'}

In [13]:
# Identify best hyperparameters retrieved by random search (2nd run)
randomized_search.best_score_

0.8172499999999999

In [55]:
# Identify best hyperparameters retrieved by random search (1st run)
randomized_search.best_score_

0.8937499999999998

In [14]:
# Fit best model on training data
best_model = randomized_search.best_estimator_
best_model.fit(train_data[features], train_data[label])

MLPClassifier(activation='logistic', alpha=0.1, batch_size=1800, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(5, 3, 2), learning_rate='adaptive',
              learning_rate_init=0.01, max_fun=15000, max_iter=600,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [16]:
# Evaluate best model on test collection (2nd run)
start = datetime.datetime.now()
best_model.fit(train_data[features], train_data[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluation on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(best_model, sens, features)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(best_model, sens, features)))

Computation time fitting the multilayer perceptron: 0:00:00.873283
Accuracy: 0.8624497991967871
Precision: 0.007103508263264714
Recall: 0.98
F1: 0.014104778353483017
MAP: 0.7354742504409171


In [59]:
# Evaluate best model on test collection (1st run)
start = datetime.datetime.now()
best_model.fit(train_data[features], train_data[label])
stop = datetime.datetime.now()
print('Computation time fitting the multilayer perceptron: {}'.format(stop-start))

# Evaluation on test collection
sup = sup_model.SupModel()
sup.evaluate_boolean(best_model, sens)
print('Accuracy: {}'.format(sup.accuracy))
print('Precision: {}'.format(sup.precision))
print('Recall: {}'.format(sup.recall))
print('F1: {}'.format(sup.f1))

print('MAP: {}'.format(sup.compute_map(best_model, sens)))

Computation time fitting the multilayer perceptron: 0:00:00.625341
Accuracy: 0.8960040160642571
Precision: 0.008998659774076202
Recall: 0.94
F1: 0.01782666413806182
MAP: 0.776559977059977
