## Unsupervised Similarity Measure

### Imports

In [1]:
import copy
import importlib, random
from datetime import datetime

import pandas as pd

import warnings

# Silence all warnings for the remainder of the session.
warnings.filterwarnings('ignore')

# Remove pandas' display limits for both columns and rows (None = unlimited).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Project-local modules. Each module is reloaded right after it is imported so that
# edits made to the package source are picked up without restarting the kernel.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.unsupervised_classification import unsup_model
# The trailing semicolon stops IPython from echoing the return value of reload(), i.e. the
# module repr, which would otherwise store an absolute local file path in the cell output.
importlib.reload(unsup_model);

<module 'ir_crosslingual.unsupervised_classification.unsup_model' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/04_Unsupervised/ir-crosslingual/ir_crosslingual/unsupervised_classification/unsup_model.py'>

In [3]:
def time(start, stop, message):
    """Print one timestamped log line reporting how long a computation took.

    Parameters
    ----------
    start
        Value marking the beginning of the computation.
    stop
        Value marking its end; ``stop - start`` must be defined.
    message
        Description of the computation, inserted after "Computation time".

    The line is prefixed with the current wall-clock time. Nothing is returned.
    """
    logged_at = datetime.now()
    elapsed = stop - start
    print(f'---- TIME {logged_at}: Computation time {message}: {elapsed}')

### TF-IDF aggregation of word embeddings

#### Load data

In [4]:
# Locations of the en-de training data and test collection built from tf-idf weighted
# sentence embeddings, resolved relative to the project's configured data path.
# NOTE(review): the .pkl extension suggests pickle files - only load files from a trusted source.
train_file_tfidf = f'{paths.data_path}extracted_data/global/en-de/training_data_tfidf.pkl'
test_file_tfidf = f'{paths.data_path}extracted_data/global/en-de/test_collection_tfidf.pkl'
# load_from_file returns four values; the fourth is not needed here and is discarded.
sens_tfidf, train_data_tfidf, test_collection_tfidf, _ = sentences.Sentences.load_from_file(train_file_tfidf, test_file_tfidf)

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: File loaded containing training data
---- INFO: File loaded containing test collection
---- DONE:

#### Evaluation on test collection

In [5]:
# Compute the MAP score of the unsupervised model on the tf-idf weighted sentence
# embeddings, and report how long the computation took.
model = unsup_model.UnsupModel()
sup = sup_model.SupModel()
start = datetime.now()
map_tfidf = sup.compute_map(
    model,
    sens_tfidf,
    ['src_embedding', 'trg_embedding', 'cosine_similarity'],
)
print('MAP: {}'.format(map_tfidf))
time(start, datetime.now(), 'computing the MAP score on tf-idf weighted sentence embeddings')


---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.522049738737564
---- TIME 2020-05-20 00:52:50.522809: Computation time computing the MAP score on tf-idf weighted sentence embeddings: 0:09:40.032475


### Averaged aggregation of word embeddings

#### Load data

In [6]:
# Locations of the en-de training data and test collection built from averaged
# sentence embeddings, resolved relative to the project's configured data path.
# NOTE(review): the .pkl extension suggests pickle files - only load files from a trusted source.
train_file_avg = f'{paths.data_path}extracted_data/global/en-de/training_data_avg.pkl'
test_file_avg = f'{paths.data_path}extracted_data/global/en-de/test_collection_avg.pkl'
# load_from_file returns four values; the fourth is not needed here and is discarded.
sens_avg, train_data_avg, test_collection_avg, _ = sentences.Sentences.load_from_file(train_file_avg, test_file_avg)

---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- DONE: Seed dictionary extracted for the languages: en-de
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- DONE: Seed dictionary extracted for the languages: de-en
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: File loaded containing training data
---- INFO: File loaded containing test collection
---- DONE:

#### Evaluation on test collection

In [7]:
# Compute the MAP score of the unsupervised model on the averaged sentence
# embeddings, and report how long the computation took.
model = unsup_model.UnsupModel()
sup = sup_model.SupModel()
start = datetime.now()
map_avg = sup.compute_map(
    model,
    sens_avg,
    ['src_embedding', 'trg_embedding', 'cosine_similarity'],
)
print('MAP: {}'.format(map_avg))
time(start, datetime.now(), 'computing the MAP score on averaged sentence embeddings')


---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.552430787515781
---- TIME 2020-05-20 01:04:58.779117: Computation time computing the MAP score on averaged sentence embeddings: 0:11:12.003907


### Random Baseline

#### Create random values to simulate cosine similarity

In [10]:
# Seed the generator so the random baseline is identical on every run of this cell.
random.seed(42)

# Work on a copy: a plain `sens_rand = sens_avg` only creates an alias, so writing the random
# column would overwrite the real cosine similarities stored on sens_avg and corrupt any later
# re-evaluation of the averaged embeddings.
# NOTE(review): assumes test_collection is a pandas DataFrame (supports .copy()) - confirm.
sens_rand = copy.copy(sens_avg)
sens_rand.test_collection = sens_avg.test_collection.copy()

# Replace the similarity score of every row with a uniform draw from [-1, 1].
sens_rand.test_collection['cosine_similarity'] = [
    random.uniform(-1, 1) for _ in range(len(sens_rand.test_collection))
]

#### Evaluation on test collection

In [11]:
# Compute the MAP score for the random baseline (randomly drawn similarity values),
# and report how long the computation took.
model = unsup_model.UnsupModel()
sup = sup_model.SupModel()
start = datetime.now()
map_rand = sup.compute_map(
    model,
    sens_rand,
    ['src_embedding', 'trg_embedding', 'cosine_similarity'],
)
print('MAP: {}'.format(map_rand))
time(start, datetime.now(), 'computing the MAP score on a random baseline')


---- INFO: Start computing the MAP
---- INFO: Probabilities predicted
---- INFO: Dataframe with evaluation ranking created
---- INFO: Probabilities sorted for each query
---- INFO: Index of ranking of true translation retrieved
MAP: 0.0008385511672483858
---- TIME 2020-05-20 01:17:36.189654: Computation time computing the MAP score on a random baseline: 0:09:02.703443
