## Imports

In [31]:
import pandas as pd
import numpy as np
import importlib, os, math
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import validation_curve
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

import warnings

# Blanket filter: every warning category is silenced for the rest of the session.
# NOTE(review): this would also hide solver warnings raised while fitting -- confirm that is intended.
warnings.filterwarnings(action='ignore')

# Lift pandas' truncation limits so DataFrames are rendered with all columns and all rows
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Project-local modules. Each module is passed to importlib.reload() directly after its
# import, which re-executes the module's code in the running kernel.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.features import element_based
# Last expression of the cell: the notebook displays the reloaded module object
importlib.reload(element_based)

<module 'ir_crosslingual.features.element_based' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/06_Logistic-Regression/ir-crosslingual/ir_crosslingual/features/element_based.py'>

In [3]:
def time(start, stop, message):
    """Print a log line reporting how long a step took.

    The line starts with the current wall-clock time (day-month-year hour:minute:second),
    followed by ``message`` and the difference ``stop - start``.

    :param start: value taken before the measured step (the callers pass ``datetime.now()``)
    :param stop: value taken after the measured step
    :param message: short description of the step, inserted after "Computation time"
    :return: None, the line is only printed
    """
    timestamp = datetime.now().strftime('%d-%m-%Y %H:%M:%S')
    elapsed = stop - start
    print(f"---- TIME: {timestamp} Computation time {message}: {elapsed}")

## Load and preprocess data

### Load from file

In [4]:
# Time the loading step from the first statement to the return of load_from_file
start = datetime.now()
# Pickle files for the en-de training data and test collection, located below paths.data_path
train_file = f'{paths.data_path}extracted_data/global/en-de/training_data_tfidf.pkl'
test_file = f'{paths.data_path}extracted_data/global/en-de/test_collection_tfidf.pkl'
# load_from_file returns four values; the last one (`features`) is not referenced again in the
# cells below, which build their own feature lists.
sens, train_data, test_collection, features = sentences.Sentences.load_from_file(train_file, test_file)
stop = datetime.now()
time(start, stop, 'loading data from file')

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: File loaded containing training data
---- INFO: File loaded containing test collection
---- DONE:

In [5]:
start = datetime.now()
# `sens` is rebound to the return value of vec2features, so re-running this cell passes the
# already processed object back in.
# NOTE(review): the second argument (10) presumably is the number of PCA elements extracted per
# sentence embedding -- it matches the range(10) used for the *_embedding_pca_{i} feature names
# further down; confirm against element_based.vec2features.
sens = element_based.vec2features(sens, 10)
time(start, datetime.now(), 'extracting vector elements')

---- INFO: Unique queries extracted
---- INFO: Unique documents extracted
---- INFO: Started extraction for src language.
---- INFO: src_embedding_pca elements extracted for train data.
---- INFO: src_embedding_pca elements extracted for unique queries.
---- INFO: Unique queries merged to test collection
---- INFO: Started extraction for trg language.
---- INFO: trg_embedding_pca elements extracted for train data.
---- INFO: trg_embedding_pca elements extracted for unique documents.
---- INFO: Unique documents merged to test collection
---- DONE: Extracted all vector elements and merged to test collection
---- TIME: 20-05-2020 01:57:31 Computation time extracting vector elements: 0:01:11.946094


### Remove pairs that contain sentences equal to '.'

In [6]:
# A pair is kept only when neither its source nor its target sentence is the bare string '.'
train_mask = sens.train_data['src_sentence'].ne('.') & sens.train_data['trg_sentence'].ne('.')
test_mask = sens.test_collection['src_sentence'].ne('.') & sens.test_collection['trg_sentence'].ne('.')

train_data = sens.train_data[train_mask]
test_collection = sens.test_collection[test_mask]

In [7]:
# Write the filtered frames back onto the Sentences object, which is what the evaluation calls receive
sens.train_data, sens.test_collection = train_data, test_collection

## Baseline LR Models using "tfidf" aggregation

In [8]:
# Name of the target column; every model below is fitted on train_data[label]
label = 'translation'

#### 1) All text/vector-based features

In [40]:
# Feature set 1: every key of the text-based FEATURES mapping followed by every key of the vector-based one
features_1 = [*text_based.FEATURES.keys(), *vector_based.FEATURES.keys()]

In [41]:
start = datetime.now()
# fit() hands back the estimator itself, so construction and training are chained into one statement
lr_1 = LogisticRegression(random_state=42).fit(train_data[features_1], train_data[label])
time(start, datetime.now(), 'fitting the baseline model')

---- TIME: 20-05-2020 04:04:07 Computation time fitting the baseline model: 0:00:03.230510


In [11]:
# Classification metrics for model 1: run evaluate_boolean, then read the four scores from `sup`
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_1, sens, features_1)
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    # The printed label is the capitalised attribute name ('f1' -> 'F1')
    print(f'{metric.capitalize()}: {getattr(sup, metric)}')
time(start, datetime.now(), 'evaluating boolean')

# MAP score for the same model and feature set, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_1, sens, features_1)
print(f'MAP: {map_score}')
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9742427427427427
Precision: 0.0037213733067945074
Recall: 0.9619619619619619
F1: 0.0074140651218768915
---- TIME: 19-05-2020 16:38:49 Computation time evaluating boolean: 0:00:16.263245
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6565315849754441
---- TIME: 19-05-2020 16:48:33 Computation time computing the MAP score: 0:09:44.257536


#### 2) Self-selected features based on intuition

In [12]:
# Feature set 2: hand-picked columns (see section heading), listed one per line
features_2 = [
    # text-based differences
    'norm_diff_translated_words',
    'norm_diff_num_words',
    'norm_diff_num_punctuation',
    'abs_diff_occ_question_mark',
    'abs_diff_occ_exclamation_mark',
    # vector-based measures
    'euclidean_distance',
    'cosine_similarity',
]

In [13]:
start = datetime.now()
# fit() hands back the estimator itself, so construction and training are chained into one statement
lr_2 = LogisticRegression(random_state=42).fit(train_data[features_2], train_data[label])
time(start, datetime.now(), 'fitting the baseline model')

---- TIME: 19-05-2020 16:48:35 Computation time fitting the baseline model: 0:00:01.368845


In [14]:
# Classification metrics for model 2: run evaluate_boolean, then read the four scores from `sup`
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_2, sens, features_2)
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    # The printed label is the capitalised attribute name ('f1' -> 'F1')
    print(f'{metric.capitalize()}: {getattr(sup, metric)}')
time(start, datetime.now(), 'evaluating boolean')

# MAP score for the same model and feature set, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_2, sens, features_2)
print(f'MAP: {map_score}')
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9657991991991992
Precision: 0.002764471145285073
Recall: 0.9479479479479479
F1: 0.005512865292816392
---- TIME: 19-05-2020 16:48:55 Computation time evaluating boolean: 0:00:20.346357
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.4491584744319259
---- TIME: 19-05-2020 16:57:50 Computation time computing the MAP score: 0:08:55.198464


#### 3) Use selected features from RFECV

In [15]:
# Feature set 3: the columns reported as selected by RFECV (see section heading); order is kept as given
features_3 = [
    # text-based differences
    'norm_diff_translated_words',
    'abs_diff_num_words',
    'abs_diff_num_punctuation',
    'abs_diff_occ_question_mark',
    'abs_diff_occ_exclamation_mark',
    'norm_diff_num_words',
    # vector-based measures
    'euclidean_distance',
    'cosine_similarity',
]

In [16]:
start = datetime.now()
# fit() hands back the estimator itself, so construction and training are chained into one statement
lr_3 = LogisticRegression(random_state=42).fit(train_data[features_3], train_data[label])
time(start, datetime.now(), 'fitting the baseline model')

---- TIME: 19-05-2020 16:57:54 Computation time fitting the baseline model: 0:00:03.931974


In [17]:
# Classification metrics for model 3: run evaluate_boolean, then read the four scores from `sup`
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_3, sens, features_3)
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    # The printed label is the capitalised attribute name ('f1' -> 'F1')
    print(f'{metric.capitalize()}: {getattr(sup, metric)}')
time(start, datetime.now(), 'evaluating boolean')

# MAP score for the same model and feature set, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_3, sens, features_3)
print(f'MAP: {map_score}')
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9725983983983983
Precision: 0.003506038176860148
Recall: 0.963963963963964
F1: 0.006986665118911154
---- TIME: 19-05-2020 16:58:18 Computation time evaluating boolean: 0:00:24.021501
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6202932314241427
---- TIME: 19-05-2020 17:07:36 Computation time computing the MAP score: 0:09:17.383051


#### 4) Use best of {1,2,3} + extracted elements of the dimension-reduced sentence embedding (10-dim) as features

In [9]:
# Feature set 4: all text-based and vector-based feature names, followed by the ten
# src_embedding_pca_* and the ten trg_embedding_pca_* column names (in that order)
features_4 = list(text_based.FEATURES.keys())
features_4.extend(vector_based.FEATURES.keys())
features_4.extend(f'src_embedding_pca_{i}' for i in range(10))
features_4.extend(f'trg_embedding_pca_{i}' for i in range(10))

In [10]:
# Logistic regression with a fixed seed and otherwise default parameters, trained on feature set 4
lr_4 = LogisticRegression(random_state=42)
# Kept as the cell's last expression: the notebook displays the returned estimator
lr_4.fit(train_data[features_4], train_data[label])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
# Classification metrics for model 4: run evaluate_boolean, then read the four scores from `sup`
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_4, sens, features_4)
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    # The printed label is the capitalised attribute name ('f1' -> 'F1')
    print(f'{metric.capitalize()}: {getattr(sup, metric)}')
time(start, datetime.now(), 'evaluating boolean')

# MAP score for the same model and feature set, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_4, sens, features_4)
print(f'MAP: {map_score}')
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9741466466466466
Precision: 0.0037037608605071065
Recall: 0.960960960960961
F1: 0.0073790810738100275
---- TIME: 20-05-2020 02:11:42 Computation time evaluating boolean: 0:00:37.812396
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.6301510088951957
---- TIME: 20-05-2020 02:21:16 Computation time computing the MAP score: 0:09:34.281931


#### 5) Correlation-reduced features

In [42]:
# Show the fitted coefficients of model 1; lr_1 was trained on train_data[features_1],
# so the columns of this array line up with the entries of features_1
lr_1.coef_

array([[21.97079138, -0.2840837 , -0.23724321,  1.31774805,  0.20552533,
         0.03726664, -0.04395643, -0.23821477,  0.0966708 ,  4.3831724 ,
        23.46416402]])

In [43]:
# Show the feature names of model 1 for comparison with the coefficient array
features_1

['norm_diff_translated_words',
 'abs_diff_num_words',
 'abs_diff_num_punctuation',
 'abs_diff_occ_question_mark',
 'abs_diff_occ_exclamation_mark',
 'rel_diff_num_words',
 'rel_diff_num_punctuation',
 'norm_diff_num_words',
 'norm_diff_num_punctuation',
 'euclidean_distance',
 'cosine_similarity']

In [44]:
# Feature set 5: the correlation-reduced selection (see section heading), listed one per line
features_5 = [
    # text-based differences
    'norm_diff_translated_words',
    'abs_diff_occ_question_mark',
    'abs_diff_occ_exclamation_mark',
    'abs_diff_num_words',
    'abs_diff_num_punctuation',
    # vector-based measure
    'cosine_similarity',
]

In [45]:
# Logistic regression with a fixed seed and otherwise default parameters, trained on feature set 5
lr_5 = LogisticRegression(random_state=42)
# Kept as the cell's last expression: the notebook displays the returned estimator
lr_5.fit(train_data[features_5], train_data[label])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
# Classification metrics for model 5: run evaluate_boolean, then read the four scores from `sup`
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_5, sens, features_5)
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    # The printed label is the capitalised attribute name ('f1' -> 'F1')
    print(f'{metric.capitalize()}: {getattr(sup, metric)}')
time(start, datetime.now(), 'evaluating boolean')

# MAP score for the same model and feature set, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_5, sens, features_5)
print(f'MAP: {map_score}')
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.9686908908908909
Precision: 0.003012729979691968
Recall: 0.9459459459459459
F1: 0.006006330481650501
---- TIME: 20-05-2020 04:07:46 Computation time evaluating boolean: 0:00:34.143862
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.7074447005417038
---- TIME: 20-05-2020 04:17:39 Computation time computing the MAP score: 0:09:52.397639


#### 6) Correlation-reduced features + extracted elements of the dimension-reduced sentence embedding (10-dim) as features

In [57]:
# Feature set 6: a copy of feature set 5, followed by the ten src_embedding_pca_* and the
# ten trg_embedding_pca_* column names (features_5 itself is left untouched)
features_6 = list(features_5)
features_6 += [f'src_embedding_pca_{i}' for i in range(10)]
features_6 += [f'trg_embedding_pca_{i}' for i in range(10)]

In [58]:
# Logistic regression with a fixed seed and otherwise default parameters, trained on feature set 6
lr_6 = LogisticRegression(random_state=42)
# Kept as the cell's last expression: the notebook displays the returned estimator
lr_6.fit(train_data[features_6], train_data[label])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
# Classification metrics for model 6: run evaluate_boolean, then read the four scores from `sup`
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(lr_6, sens, features_6)
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    # The printed label is the capitalised attribute name ('f1' -> 'F1')
    print(f'{metric.capitalize()}: {getattr(sup, metric)}')
time(start, datetime.now(), 'evaluating boolean')

# MAP score for the same model and feature set, timed separately
start = datetime.now()
map_score = sup.compute_map(lr_6, sens, features_6)
print(f'MAP: {map_score}')
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.969034934934935
Precision: 0.003068532712750527
Recall: 0.9529529529529529
F1: 0.006117367347266623
---- TIME: 20-05-2020 11:20:31 Computation time evaluating boolean: 0:00:37.516193
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.695793385447464
---- TIME: 20-05-2020 11:33:08 Computation time computing the MAP score: 0:12:36.698118


## Hyperparameter tuning of the model with correlation-reduced features

#### Create parameter grid

In [49]:
# Candidate regularisation types
penalty = [
    'l1',
    'l2',
    'elasticnet',
]

# Candidate values for C, spanning 1e-3 to 1e3 in factors of ten
C = [
    0.001, 0.01, 0.1,
    1,
    10, 100, 1000,
]

# Candidate optimisation algorithms
solver = [
    'newton-cg',
    'lbfgs',
    'sag',
    'saga',
]

In [50]:
# Full cross product of the candidate lists defined above (penalty x C x solver).
# NOTE(review): this includes penalty/solver pairs that LogisticRegression may not support
# (e.g. 'l1' with 'lbfgs', or 'elasticnet' without an l1_ratio) -- confirm that those candidates
# are meant to fail inside the search, or split the grid per solver.
parameter_grid = dict(penalty=penalty, C=C, solver=solver)

#### Perform grid search for best hyperparameters

In [None]:
start = datetime.now()
lr = LogisticRegression(random_state=42)
# The folds are not shuffled, so no random_state is passed: without shuffle=True the seed has no
# effect on the splits, and scikit-learn 0.24+ rejects the combination with a ValueError.
cv = StratifiedKFold(n_splits=5)
# Exhaustive search over parameter_grid on feature set 5, using all available cores.
# No `scoring` is given, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(estimator=lr, param_grid=parameter_grid, n_jobs=-1, cv=cv,
                           return_train_score=True, verbose=20)
grid_search.fit(train_data[features_5], train_data[label])
time(start, datetime.now(), 'performing the GridSearch')

#### Evaluate grid search to identify optimal hyperparameters and resulting MAP

In [52]:
# Show the hyperparameter combination that scored best in the grid search
grid_search.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}

In [53]:
# Show the cross-validated score of that best combination
# (this is the search's own score, not the MAP computed further down)
grid_search.best_score_

0.9474344355758267

In [54]:
# Retrieve the estimator configured with the best hyperparameters found by the grid search
# (nothing is fitted in this cell; the explicit fit happens in the next one)
best_model = grid_search.best_estimator_

In [55]:
start = datetime.now()
# Fit the estimator selected by the grid search on the full training data.
# The variable is `best_model`; the previous `best_model_` was never defined and raised a
# NameError on a fresh kernel.
# NOTE(review): GridSearchCV was created without `refit=...`; if its default already refits
# best_estimator_ on the whole training set, this explicit fit is redundant -- confirm.
best_model.fit(train_data[features_5], train_data[label])
time(start, datetime.now(), 'fitting the best model retrieved by grid search')

---- TIME: 20-05-2020 11:05:18 Computation time fitting the best model retrieved by grid search: 0:00:02.749947


In [56]:
# Classification metrics for the tuned model: run evaluate_boolean, then read the four scores from `sup`
start = datetime.now()
sup = sup_model.SupModel()
sup.evaluate_boolean(best_model, sens, features_5)
for metric in ('accuracy', 'precision', 'recall', 'f1'):
    # The printed label is the capitalised attribute name ('f1' -> 'F1')
    print(f'{metric.capitalize()}: {getattr(sup, metric)}')
time(start, datetime.now(), 'evaluating boolean')

# MAP score for the same model and feature set, timed separately
start = datetime.now()
map_score = sup.compute_map(best_model, sens, features_5)
print(f'MAP: {map_score}')
time(start, datetime.now(), 'computing the MAP score')

Accuracy: 0.968468968968969
Precision: 0.0029915855720101556
Recall: 0.9459459459459459
F1: 0.00596430881865661
---- TIME: 20-05-2020 11:05:54 Computation time evaluating boolean: 0:00:35.629633
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.7039245191121866
---- TIME: 20-05-2020 11:15:23 Computation time computing the MAP score: 0:09:29.715826


## Save best model

In [63]:
# Names handed to save_model as `prepared_features`
prepared_features_ = [
    'translated_words',
    'num_words',
    'num_punctuation',
    'occ_question_mark',
    'occ_exclamation_mark',
]

# Model features grouped by family; together these are the six names of feature set 5.
# NOTE(review): 'vector_based' holds a bare string while 'text_based' holds a list -- confirm
# that the consumer of features_dict does not iterate over the value (a string would yield characters).
features_dict_ = dict(
    text_based=[
        'norm_diff_translated_words',
        'abs_diff_occ_question_mark',
        'abs_diff_occ_exclamation_mark',
        'abs_diff_num_words',
        'abs_diff_num_punctuation',
    ],
    vector_based='cosine_similarity',
)

# Free-text description stored alongside the model
info = 'Logistic Regression Model 5'

In [65]:
# Persist model 5: `info` and `features_dict_` describe the six correlation-reduced features that
# lr_5 was trained on, and lr_5 has the highest MAP of the models evaluated above. Saving lr_6
# (trained on 26 columns including the PCA elements) would pair the model with a mismatched
# feature description.
sup_model.SupModel.save_model(name='log_reg_best', model=lr_5, prepared_features=prepared_features_,
                              features_dict=features_dict_, info=info)
