## Imports

In [1]:
import pandas as pd
import numpy as np
import importlib
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

import warnings
# Suppress every warning category globally (this also hides warnings raised by the libraries used below)
warnings.filterwarnings(action='ignore', category=Warning)

# Lift the pandas display limits so that DataFrames are rendered with all columns and rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Project-local modules. Each module is reloaded right after its import so that
# source changes made while the kernel is running take effect without a restart.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.features import element_based
# Last expression of the cell: the reloaded module object is shown as the cell output
importlib.reload(element_based)

<module 'ir_crosslingual.features.element_based' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/06_Logistic-Regression/ir-crosslingual/ir_crosslingual/features/element_based.py'>

In [3]:
def time(start, stop, message):
    """Print a log line with the current timestamp and the elapsed time of a step.

    :param start: value taken before the step ran (the cells pass ``datetime.now()``)
    :param stop: value taken after the step finished; ``stop - start`` is printed
    :param message: short description of the step, inserted into the log line
    :return: None, the line is only printed
    """
    timestamp = datetime.now().strftime('%d-%m-%Y %H:%M:%S')
    elapsed = stop - start
    print(f"---- TIME: {timestamp} Computation time {message}: {elapsed}")

## Load and preprocess data

#### Load from file

In [4]:
# Load the sentence container, training data, test collection and feature list for en-de.
# NOTE(review): the inputs are .pkl files, presumably unpickled by load_from_file -
# only load files that come from a trusted source.
start = datetime.now()
train_file, test_file = (
    f'{paths.data_path}extracted_data/global/en-de/training_data_avg.pkl',
    f'{paths.data_path}extracted_data/global/en-de/test_collection_avg.pkl',
)
loaded = sentences.Sentences.load_from_file(train_file, test_file)
sens, train_data, test_collection, features = loaded
stop = datetime.now()
time(start, stop, 'loading data from file')

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: File loaded containing training data
---- INFO: File loaded containing test collection
---- DONE:

In [5]:
# Second argument of vec2features; presumably the number of PCA-reduced embedding
# elements to extract per sentence (the models below use *_embedding_pca_0..9) - TODO confirm
num_vector_elements = 10

start = datetime.now()
sens = element_based.vec2features(sens, num_vector_elements)
time(start, datetime.now(), 'extracting vector elements')

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: Started extraction for src language.
---- INFO: src_embedding_pca elements extracted for train data.
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: Started extraction for trg language.
---- INFO: trg_embedding_pca elements extracted for train data.
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection
---- TIME: 20-05-2020 03:41:15 Computation time extracting vector elements: 0:04:18.950814


#### Remove pairs that contain sentences equal to '.'

In [6]:
# Keep only pairs in which neither the source nor the target sentence is a lone '.'
train_keep = sens.train_data['src_sentence'].ne('.') & sens.train_data['trg_sentence'].ne('.')
train_data = sens.train_data[train_keep]

test_keep = sens.test_collection['src_sentence'].ne('.') & sens.test_collection['trg_sentence'].ne('.')
test_collection = sens.test_collection[test_keep]

In [7]:
# Write the filtered frames back to the sentence container so that the evaluation
# helpers, which receive `sens`, work on the same data as the models
sens.train_data, sens.test_collection = train_data, test_collection

## Baseline LR Models using "avg" aggregation

In [8]:
# Name of the target column in train_data that every model below is fitted against
label = 'translation'

#### 1) All text/vector-based features

In [10]:
# Feature set 1: every key registered in text_based.FEATURES followed by every key in vector_based.FEATURES
features_1 = [*text_based.FEATURES.keys(), *vector_based.FEATURES.keys()]

In [11]:
# Model 1: logistic regression with default hyperparameters on feature set 1 (fit() returns the estimator)
start = datetime.now()
lr_1 = LogisticRegression(random_state=42).fit(X=train_data[features_1], y=train_data[label])
time(start, datetime.now(), 'fitting the baseline model')

---- TIME: 19-05-2020 15:23:32 Computation time fitting the baseline model: 0:00:03.348732


In [12]:
# Classification metrics for model 1: run evaluate_boolean, then print the metric
# attributes of `sup` (presumably populated by that call)
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_1, sens, features_1)
for metric_template, metric_value in (('Accuracy: {}', sup.accuracy),
                                      ('Precision: {}', sup.precision),
                                      ('Recall: {}', sup.recall),
                                      ('F1: {}', sup.f1)):
    print(metric_template.format(metric_value))
time(start, datetime.now(), 'evaluating boolean')

# MAP score as returned by compute_map, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_1, sens, features_1)
print('MAP: {}'.format(map_score))
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.969378978978979
Precision: 0.003096429328074836
Recall: 0.950950950950951
F1: 0.0061727592883783185
---- TIME: 19-05-2020 15:24:03 Computation time evaluating boolean: 0:00:31.315991
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6579165696100835
---- TIME: 19-05-2020 15:34:26 Computation time computing the MAP score: 0:10:23.008112


#### 2) Self-selected features based on intuition

In [13]:
# Feature set 2 (hand-picked): difference features first, then the distance/similarity features
diff_features_2 = [
    'norm_diff_translated_words',
    'norm_diff_num_words',
    'norm_diff_num_punctuation',
    'abs_diff_occ_question_mark',
    'abs_diff_occ_exclamation_mark',
]
distance_features_2 = ['euclidean_distance', 'cosine_similarity']
features_2 = diff_features_2 + distance_features_2

In [14]:
# Model 2: logistic regression with default hyperparameters on feature set 2 (fit() returns the estimator)
start = datetime.now()
lr_2 = LogisticRegression(random_state=42).fit(X=train_data[features_2], y=train_data[label])
time(start, datetime.now(), 'fitting the baseline model')

---- TIME: 19-05-2020 15:34:28 Computation time fitting the baseline model: 0:00:01.573912


In [15]:
# Classification metrics for model 2: run evaluate_boolean, then print the metric
# attributes of `sup` (presumably populated by that call)
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_2, sens, features_2)
for metric_template, metric_value in (('Accuracy: {}', sup.accuracy),
                                      ('Precision: {}', sup.precision),
                                      ('Recall: {}', sup.recall),
                                      ('F1: {}', sup.f1)):
    print(metric_template.format(metric_value))
time(start, datetime.now(), 'evaluating boolean')

# MAP score as returned by compute_map, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_2, sens, features_2)
print('MAP: {}'.format(map_score))
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9607871871871871
Precision: 0.002394190776252684
Recall: 0.9409409409409409
F1: 0.004776228608593147
---- TIME: 19-05-2020 15:34:46 Computation time evaluating boolean: 0:00:17.782429
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.48391072285080006
---- TIME: 19-05-2020 15:45:57 Computation time computing the MAP score: 0:11:10.985105


#### 3) Use selected features from RFECV

In [9]:
# Feature set 3 (taken from the RFECV selection): difference features first,
# then the distance/similarity features
diff_features_3 = [
    'norm_diff_translated_words',
    'abs_diff_num_words',
    'abs_diff_num_punctuation',
    'abs_diff_occ_question_mark',
    'abs_diff_occ_exclamation_mark',
    'norm_diff_num_words',
]
distance_features_3 = ['euclidean_distance', 'cosine_similarity']
features_3 = diff_features_3 + distance_features_3

In [17]:
# Model 3: logistic regression with default hyperparameters on feature set 3 (fit() returns the estimator)
start = datetime.now()
lr_3 = LogisticRegression(random_state=42).fit(X=train_data[features_3], y=train_data[label])
time(start, datetime.now(), 'fitting the baseline model')

---- TIME: 19-05-2020 15:45:59 Computation time fitting the baseline model: 0:00:02.598792


In [18]:
# Classification metrics for model 3: run evaluate_boolean, then print the metric
# attributes of `sup` (presumably populated by that call)
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_3, sens, features_3)
for metric_template, metric_value in (('Accuracy: {}', sup.accuracy),
                                      ('Precision: {}', sup.precision),
                                      ('Recall: {}', sup.recall),
                                      ('F1: {}', sup.f1)):
    print(metric_template.format(metric_value))
time(start, datetime.now(), 'evaluating boolean')

# MAP score as returned by compute_map, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_3, sens, features_3)
print('MAP: {}'.format(map_score))
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9693517517517517
Precision: 0.0030969229416534507
Recall: 0.9519519519519519
F1: 0.0061737611903479
---- TIME: 19-05-2020 15:46:22 Computation time evaluating boolean: 0:00:22.996031
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6584851843421424
---- TIME: 19-05-2020 15:57:00 Computation time computing the MAP score: 0:10:37.367608


#### 4) Use best of {1,2,3} + extracted elements of dimension-reduced sentence embedding (10-dim) as features

In [19]:
# Feature set 4: feature set 3, followed by the ten src_embedding_pca_* columns
# and then the ten trg_embedding_pca_* columns
features_4 = [
    *features_3,
    *(f'src_embedding_pca_{i}' for i in range(10)),
    *(f'trg_embedding_pca_{i}' for i in range(10)),
]

In [11]:
# Model 4: default logistic regression on feature set 4.
# The fit call stays the last expression so the fitted estimator is shown as cell output.
lr_4 = LogisticRegression(random_state=42)
lr_4.fit(X=train_data[features_4], y=train_data[label])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
# Classification metrics for model 4: run evaluate_boolean, then print the metric
# attributes of `sup` (presumably populated by that call)
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_4, sens, features_4)
for metric_template, metric_value in (('Accuracy: {}', sup.accuracy),
                                      ('Precision: {}', sup.precision),
                                      ('Recall: {}', sup.recall),
                                      ('F1: {}', sup.f1)):
    print(metric_template.format(metric_value))
time(start, datetime.now(), 'evaluating boolean')

# MAP score as returned by compute_map, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_4, sens, features_4)
print('MAP: {}'.format(map_score))
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9721572572572572
Precision: 0.0034649066230955554
Recall: 0.9679679679679679
F1: 0.006905095989403142
---- TIME: 19-05-2020 17:43:37 Computation time evaluating boolean: 0:00:42.238030
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6601763795967865
---- TIME: 19-05-2020 17:52:23 Computation time computing the MAP score: 0:08:46.540573


#### 5) Correlation-reduced features

In [29]:
# Feature set 5 (correlation-reduced): difference features first, then the single similarity feature
diff_features_5 = [
    'norm_diff_translated_words',
    'abs_diff_occ_question_mark',
    'abs_diff_occ_exclamation_mark',
    'abs_diff_num_words',
    'abs_diff_num_punctuation',
]
similarity_features_5 = ['cosine_similarity']
features_5 = diff_features_5 + similarity_features_5

In [30]:
# Model 5: default logistic regression on the correlation-reduced feature set 5.
# The fit call stays the last expression so the fitted estimator is shown as cell output.
lr_5 = LogisticRegression(random_state=42)
lr_5.fit(X=train_data[features_5], y=train_data[label])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [31]:
# Classification metrics for model 5: run evaluate_boolean, then print the metric
# attributes of `sup` (presumably populated by that call)
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_5, sens, features_5)
for metric_template, metric_value in (('Accuracy: {}', sup.accuracy),
                                      ('Precision: {}', sup.precision),
                                      ('Recall: {}', sup.recall),
                                      ('F1: {}', sup.f1)):
    print(metric_template.format(metric_value))
time(start, datetime.now(), 'evaluating boolean')

# MAP score as returned by compute_map, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_5, sens, features_5)
print('MAP: {}'.format(map_score))
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9673895895895895
Precision: 0.0028928364797394302
Recall: 0.9459459459459459
F1: 0.005768033497320458
---- TIME: 20-05-2020 10:35:46 Computation time evaluating boolean: 0:00:22.173894
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6843722994123714
---- TIME: 20-05-2020 10:46:44 Computation time computing the MAP score: 0:10:57.214786


#### 6) Correlation-reduced features + extracted elements of dimension-reduced sentence embedding (10-dim) as features

In [32]:
# Feature set 6: feature set 5, followed by the ten src_embedding_pca_* columns
# and then the ten trg_embedding_pca_* columns
features_6 = [
    *features_5,
    *(f'src_embedding_pca_{i}' for i in range(10)),
    *(f'trg_embedding_pca_{i}' for i in range(10)),
]

In [33]:
# Model 6: default logistic regression on feature set 6 (correlation-reduced features
# plus the PCA embedding elements). The model has to be trained on features_6 because
# that is the feature set it is evaluated with; training on features_5 would merely
# reproduce model 5 and mismatch the number of columns passed at prediction time.
# The fit call stays the last expression so the fitted estimator is shown as cell output.
lr_6 = LogisticRegression(random_state=42)
lr_6.fit(train_data[features_6], train_data[label])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
# Classification metrics for model 6: run evaluate_boolean, then print the metric
# attributes of `sup` (presumably populated by that call)
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_6, sens, features_6)
for metric_template, metric_value in (('Accuracy: {}', sup.accuracy),
                                      ('Precision: {}', sup.precision),
                                      ('Recall: {}', sup.recall),
                                      ('F1: {}', sup.f1)):
    print(metric_template.format(metric_value))
time(start, datetime.now(), 'evaluating boolean')

# MAP score as returned by compute_map, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_6, sens, features_6)
print('MAP: {}'.format(map_score))
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9636202202202202
Precision: 0.002615681463244598
Recall: 0.953953953953954
F1: 0.005217058082881699
---- TIME: 20-05-2020 11:22:06 Computation time evaluating boolean: 0:00:55.435638
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6781147670879518
---- TIME: 20-05-2020 11:32:58 Computation time computing the MAP score: 0:10:51.552589


## Hyperparameter tuning of model with correlation-reduced features

#### Create parameter grid

In [35]:
# Penalty: norm used for the regularization term of LogisticRegression
# NOTE(review): not every penalty is supported by every solver
# (e.g. 'elasticnet' requires solver='saga' and an l1_ratio)
penalty = ['l1', 'l2', 'elasticnet']

# Regularization parameter C: inverse of the regularization strength
# (smaller values regularize more strongly); candidates span 1e-3 to 1e3 in powers of ten
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Algorithm used in the optimization problem
solver = ['newton-cg', 'lbfgs', 'sag', 'saga']

In [36]:
# Search space for GridSearchCV, given as a list of sub-grids so that only penalty/solver
# pairs accepted by LogisticRegression are tried:
#   * 'l2' is supported by every solver in `solver`
#   * 'l1' is supported by 'saga' only (among the solvers listed in `solver`)
#   * 'elasticnet' is left out: it needs solver='saga' AND an `l1_ratio`, which is not tuned here
# The plain cross-product penalty x C x solver contained many combinations that raise at fit
# time; with warnings silenced those failures were invisible and only cost grid-search time.
# The set of valid candidates, and therefore the selected model, is unchanged.
parameter_grid = [
    {'penalty': ['l2'], 'C': C, 'solver': solver},
    {'penalty': ['l1'], 'C': C, 'solver': ['saga']},
]

#### Perform grid search for best hyperparameters

In [None]:
# Exhaustive grid search over parameter_grid with 5-fold stratified cross-validation
# on the correlation-reduced feature set 5.
start = datetime.now()
lr = LogisticRegression(random_state=42)
# The folds are not shuffled, so the split is deterministic on its own. Passing
# random_state without shuffle=True has no effect and raises a ValueError in
# scikit-learn >= 0.24, hence it is omitted here.
cv = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(estimator=lr, param_grid=parameter_grid, n_jobs=-1, cv=cv, return_train_score=True, verbose=20)
grid_search.fit(train_data[features_5], train_data[label])
time(start, datetime.now(), 'performing the GridSearch')

#### Evaluate grid search to identify optimal hyperparameters and resulting MAP

In [38]:
# Hyperparameter combination that achieved the best mean cross-validated score
# in the grid search (shown as the cell output)
grid_search.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}

In [39]:
# Mean cross-validated score of the best hyperparameter combination. No `scoring` was
# passed to GridSearchCV, so the estimator's default score is used
# (mean accuracy for LogisticRegression).
grid_search.best_score_

0.9455580227650083

In [40]:
# Retrieve the estimator with the best hyperparameters from the grid search
# (this cell only fetches the object; no fitting happens here)
best_model = grid_search.best_estimator_

In [41]:
# Fit the selected model on the complete training data.
# NOTE(review): GridSearchCV was created without `refit`, whose default already refits
# best_estimator_ on the full data, so this step is presumably redundant - confirm.
start = datetime.now()
best_model.fit(X=train_data[features_5], y=train_data[label])
time(start, datetime.now(), 'fitting the best model retrieved by grid search')

---- TIME: 20-05-2020 11:50:07 Computation time fitting the best model retrieved by grid search: 0:00:02.683978


In [42]:
# Classification metrics for the tuned model: run evaluate_boolean, then print the
# metric attributes of `sup` (presumably populated by that call)
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(best_model, sens, features_5)
for metric_template, metric_value in (('Accuracy: {}', sup.accuracy),
                                      ('Precision: {}', sup.precision),
                                      ('Recall: {}', sup.recall),
                                      ('F1: {}', sup.f1)):
    print(metric_template.format(metric_value))
time(start, datetime.now(), 'evaluating boolean')

# MAP score as returned by compute_map, timed separately
start = datetime.now()
map_score = sup.compute_map(best_model, sens, features_5)
print('MAP: {}'.format(map_score))
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9735627627627628
Precision: 0.0035623044789186292
Recall: 0.944944944944945
F1: 0.007097851095505196
---- TIME: 20-05-2020 11:50:37 Computation time evaluating boolean: 0:00:30.505540
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.7482786619932548
---- TIME: 20-05-2020 12:00:10 Computation time computing the MAP score: 0:09:32.126954


## Save best model

In [43]:
# Free-text description stored alongside the model
info = 'Logistic Regression Model 5, averaged'

# Feature names handed to save_model as `prepared_features`; presumably the per-sentence
# base features from which the difference features are derived - TODO confirm
prepared_features_ = [
    'translated_words',
    'num_words',
    'num_punctuation',
    'occ_question_mark',
    'occ_exclamation_mark',
]

# Features used by the saved model (feature set 5), grouped by feature family.
# NOTE(review): 'vector_based' holds a bare string while 'text_based' holds a list;
# if the consumer iterates over the value it would see single characters - confirm
# against SupModel.save_model / the code loading the model whether a list is expected.
features_dict_ = dict(
    text_based=[
        'norm_diff_translated_words',
        'abs_diff_occ_question_mark',
        'abs_diff_occ_exclamation_mark',
        'abs_diff_num_words',
        'abs_diff_num_punctuation',
    ],
    vector_based='cosine_similarity',
)

In [44]:
# Hand the tuned model and its feature metadata to SupModel.save_model under the name 'log_reg_best_avg'
sup_model.SupModel.save_model(
    name='log_reg_best_avg',
    model=best_model,
    prepared_features=prepared_features_,
    features_dict=features_dict_,
    info=info,
)
