## Unsupervised Similarity Measure

### Imports

In [1]:
import pandas as pd
import importlib, random
from datetime import datetime

import warnings

# Blanket-ignore every warning for the rest of the kernel session so the
# evaluation logs below stay readable. Note that this also hides warnings
# that might matter (deprecations, numerical issues).
warnings.filterwarnings('ignore')

# Lift pandas' display limits: None means "no truncation" for both axes.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Project-local modules. Every import is followed by importlib.reload so the
# module code is re-executed from disk when this cell is re-run, i.e. edits to
# the package are picked up without restarting the kernel.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.supervised_classification import sup_model
importlib.reload(sup_model)

from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

from ir_crosslingual.unsupervised_classification import unsup_model
# Trailing semicolon: stops the cell from echoing the reloaded module's repr,
# which would store the absolute path of the local checkout in the notebook.
importlib.reload(unsup_model);

<module 'ir_crosslingual.unsupervised_classification.unsup_model' from '/Users/jani/PycharmProjects/ir-crosslingual/ir_crosslingual/unsupervised_classification/unsup_model.py'>

In [3]:
def time(start, stop):
    """
    Print a log line with the current timestamp and the elapsed time.

    :param start: Value taken when the measured computation began
    :param stop: Value taken when the measured computation finished;
                 must support ``stop - start``
    :return: None, the line is written to stdout
    """
    elapsed = stop - start
    print(f'---- TIME {datetime.now()}: Computation time: {elapsed}')

In [4]:
def rank_unsupervised(src_lang='en', trg_lang='de'):
    """
    Run the unsupervised evaluation for one language pair and print the MAP.

    Loads the chunks for the pair via ``Sentences.load_chunks_from_file``,
    concatenates them into ``sens.test_collection``, and prints the value
    returned by ``SupModel.compute_map`` for an ``UnsupModel`` instance,
    followed by the elapsed time. Only the MAP computation is timed; loading
    and concatenating the chunks happens before the clock starts.

    :param src_lang: Source language code passed to the chunk loader
    :param trg_lang: Target language code passed to the chunk loader
    :return: None, all results are printed
    """
    sens, chunks = sentences.Sentences.load_chunks_from_file(src_lang, trg_lang)
    print(f'---- DONE: Chunks loaded for {src_lang}-{trg_lang}')

    # Merge all chunks into one frame with a fresh, continuous index.
    sens.test_collection = pd.concat(chunks, ignore_index=True)

    unsup = unsup_model.UnsupModel()
    evaluator = sup_model.SupModel()
    # Presumably the test-collection columns compute_map works on --
    # confirm against SupModel.compute_map.
    fields = ['src_embedding_aligned', 'trg_embedding', 'cosine_similarity']

    started_at = datetime.now()
    map_score = evaluator.compute_map(unsup, sens, fields)
    print(f"---- MAP: {map_score}")
    time(started_at, datetime.now())



### German

In [5]:
# Banner (title + 60-dash rule), then evaluate the de -> en direction.
print('Unsupervised evaluation on German/English', '-' * 60, sep='\n')
rank_unsupervised('de', 'en')

Unsupervised evaluation on German/English
------------------------------------------------------------
---- INFO: Learn projection matrix for de-en
---- INFO: Found 10604 valid translation pairs in expert dictionary.
---- INFO: 262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
---- INFO: Resulting subspace dimension: (10604, 300)
---- INFO: Resulting subspace dimension: (10604, 300)
---- DONE: Projection matrix learned from de to en
---- INFO: Learn projection matrix for en-de
---- INFO: Found 13700 valid translation pairs in expert dictionary.
---- INFO: 977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
---- INFO: Resulting subspace dimension: (13700, 300)
---- INFO: Resulting subspace dimension: (13700, 300)
---- DONE: Projection matrix learned from en to de
---- DONE: All chunks loaded
---- DONE: Chunks loaded for de-en

---- INFO: Start computing the MAP
---- INFO: Probabilities pred

### French

In [6]:
# Banner (title + 60-dash rule), then evaluate the en -> fr direction.
print('Unsupervised evaluation on English/French', '-' * 60, sep='\n')
rank_unsupervised('en', 'fr')

Unsupervised evaluation on English/French
------------------------------------------------------------
---- INFO: Learn projection matrix for en-fr
---- INFO: Found 10369 valid translation pairs in expert dictionary.
---- INFO: 503 other pairs contained at least one unknown word (0 in source language, 503 in target language).
---- INFO: Resulting subspace dimension: (10369, 300)
---- INFO: Resulting subspace dimension: (10369, 300)
---- DONE: Projection matrix learned from en to fr
---- INFO: Learn projection matrix for fr-en
---- INFO: Found 7938 valid translation pairs in expert dictionary.
---- INFO: 332 other pairs contained at least one unknown word (0 in source language, 332 in target language).
---- INFO: Resulting subspace dimension: (7938, 300)
---- INFO: Resulting subspace dimension: (7938, 300)
---- DONE: Projection matrix learned from fr to en
---- DONE: All chunks loaded
---- DONE: Chunks loaded for en-fr

---- INFO: Start computing the MAP
---- INFO: Probabilities predict

In [7]:
# Banner (title + 60-dash rule), then evaluate the fr -> en direction.
print('Unsupervised evaluation on French/English', '-' * 60, sep='\n')
rank_unsupervised('fr', 'en')

Unsupervised evaluation on French/English
------------------------------------------------------------
---- INFO: Learn projection matrix for fr-en
---- INFO: Found 7938 valid translation pairs in expert dictionary.
---- INFO: 332 other pairs contained at least one unknown word (0 in source language, 332 in target language).
---- INFO: Resulting subspace dimension: (7938, 300)
---- INFO: Resulting subspace dimension: (7938, 300)
---- DONE: Projection matrix learned from fr to en
---- INFO: Learn projection matrix for en-fr
---- INFO: Found 10369 valid translation pairs in expert dictionary.
---- INFO: 503 other pairs contained at least one unknown word (0 in source language, 503 in target language).
---- INFO: Resulting subspace dimension: (10369, 300)
---- INFO: Resulting subspace dimension: (10369, 300)
---- DONE: Projection matrix learned from en to fr
---- DONE: All chunks loaded
---- DONE: Chunks loaded for fr-en

---- INFO: Start computing the MAP
---- INFO: Probabilities predict

### Finnish

In [8]:
# Banner (title + 60-dash rule), then evaluate the en -> fi direction.
print('Unsupervised evaluation on English/Finnish', '-' * 60, sep='\n')
rank_unsupervised('en', 'fi')

Unsupervised evaluation on English/Finnish
------------------------------------------------------------
---- INFO: Learn projection matrix for en-fi
---- INFO: Found 10141 valid translation pairs in expert dictionary.
---- INFO: 1355 other pairs contained at least one unknown word (0 in source language, 1355 in target language).
---- INFO: Resulting subspace dimension: (10141, 300)
---- INFO: Resulting subspace dimension: (10141, 300)
---- DONE: Projection matrix learned from en to fi
---- INFO: Learn projection matrix for fi-en
---- INFO: Found 6946 valid translation pairs in expert dictionary.
---- INFO: 185 other pairs contained at least one unknown word (0 in source language, 185 in target language).
---- INFO: Resulting subspace dimension: (6946, 300)
---- INFO: Resulting subspace dimension: (6946, 300)
---- DONE: Projection matrix learned from fi to en
---- DONE: All chunks loaded
---- DONE: Chunks loaded for en-fi

---- INFO: Start computing the MAP
---- INFO: Probabilities pred

In [9]:
# Banner (title + 60-dash rule), then evaluate the fi -> en direction.
print('Unsupervised evaluation on Finnish/English', '-' * 60, sep='\n')
rank_unsupervised('fi', 'en')

Unsupervised evaluation on Finnish/English
------------------------------------------------------------
---- INFO: Learn projection matrix for fi-en
---- INFO: Found 6946 valid translation pairs in expert dictionary.
---- INFO: 185 other pairs contained at least one unknown word (0 in source language, 185 in target language).
---- INFO: Resulting subspace dimension: (6946, 300)
---- INFO: Resulting subspace dimension: (6946, 300)
---- DONE: Projection matrix learned from fi to en
---- INFO: Learn projection matrix for en-fi
---- INFO: Found 10141 valid translation pairs in expert dictionary.
---- INFO: 1355 other pairs contained at least one unknown word (0 in source language, 1355 in target language).
---- INFO: Resulting subspace dimension: (10141, 300)
---- INFO: Resulting subspace dimension: (10141, 300)
---- DONE: Projection matrix learned from en to fi
---- DONE: All chunks loaded
---- DONE: Chunks loaded for fi-en

---- INFO: Start computing the MAP
---- INFO: Probabilities pred