In [1]:
import pandas as pd
import numpy as np
import importlib, os, math

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Lift pandas' truncation limits so frames are displayed with all of their
# columns and all of their rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [145]:
from ir_crosslingual.embeddings import embeddings
from ir_crosslingual.sentences import sentences
from ir_crosslingual.features import text_based, vector_based
from ir_crosslingual.utils import strings

# Re-execute every project module so that edits made to their source files
# after the first import are picked up without restarting the kernel.
# The reload order is the same as before: embeddings, sentences, text_based,
# vector_based, strings.
# Reloading inside a loop (instead of ending the cell on a bare reload call)
# also keeps the cell from echoing a module repr, which would embed the
# absolute local path of the checkout in the saved notebook.
for _module in (embeddings, sentences, text_based, vector_based, strings):
    importlib.reload(_module)

<module 'ir_crosslingual.utils.strings' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/04_Feature-Extraction/ir-crosslingual/ir_crosslingual/utils/strings.py'>

# Load data

## Word embeddings

In [156]:
def _load_word_embeddings(lang_code):
    """Create a WordEmbeddings object for `lang_code` and load its vectors.

    The object is constructed first and `load_embeddings()` is called on it
    right afterwards; the loaded object is returned.
    """
    word_emb = embeddings.WordEmbeddings(lang_code)
    word_emb.load_embeddings()
    return word_emb


# German is set up before English, one language fully loaded at a time.
german = _load_word_embeddings('de')
english = _load_word_embeddings('en')

In [157]:
# Learn the projection matrices for the English/German pair. The call returns
# two objects, bound here as W_ende and W_deen; the log it prints reports one
# pass for "en-de" and one for "de-en".
# NOTE(review): judging by the values displayed in the following cells, W_deen
# looks like the transpose of W_ende - confirm against the implementation.
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

Learn projection matrix for en-de
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)
Resulting subspace dimension: (13700, 300)
Learn projection matrix for de-en
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (977 in source language, 0 in target language).
Resulting subspace dimension: (13700, 300)
Resulting subspace dimension: (13700, 300)


In [158]:
# Display the first matrix returned by learn_projection_matrix (the repr is
# abbreviated with "..." for large arrays).
W_ende

array([[ 0.0837359 , -0.00533531, -0.08310661, ..., -0.00022553,
         0.03983511, -0.01515955],
       [ 0.07748925,  0.03427794, -0.00176127, ..., -0.0621672 ,
         0.04705337, -0.00781926],
       [-0.00330077, -0.01819852, -0.02009503, ...,  0.03127514,
         0.00050988, -0.00330782],
       ...,
       [-0.08098747, -0.03583977, -0.02924371, ...,  0.03696562,
        -0.10345663,  0.05307951],
       [-0.03358475,  0.04922254,  0.09542661, ...,  0.0717814 ,
        -0.04759582, -0.01552653],
       [-0.01005702,  0.0915472 , -0.0616054 , ...,  0.00101319,
         0.0084034 , -0.00668943]])

In [159]:
# Display the second matrix returned by learn_projection_matrix (the repr is
# abbreviated with "..." for large arrays).
W_deen

array([[ 0.0837359 ,  0.07748925, -0.00330077, ..., -0.08098747,
        -0.03358475, -0.01005702],
       [-0.00533531,  0.03427794, -0.01819852, ..., -0.03583977,
         0.04922254,  0.0915472 ],
       [-0.08310661, -0.00176127, -0.02009503, ..., -0.02924371,
         0.09542661, -0.0616054 ],
       ...,
       [-0.00022553, -0.0621672 ,  0.03127514, ...,  0.03696562,
         0.0717814 ,  0.00101319],
       [ 0.03983511,  0.04705337,  0.00050988, ..., -0.10345663,
        -0.04759582,  0.0084034 ],
       [-0.01515955, -0.00781926, -0.00330782, ...,  0.05307951,
        -0.01552653, -0.00668943]])

## Sentence embeddings and feature extraction

In [184]:
# Re-execute the sentences module so that edits made to its source since it was
# imported take effect without restarting the kernel.
# The trailing semicolon stops IPython from echoing the returned module repr,
# which would embed the absolute local path of the checkout in the notebook.
importlib.reload(sentences);

<module 'ir_crosslingual.sentences.sentences' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/04_Feature-Extraction/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

In [185]:
# Names of the features requested from sens.load_data(); the same names are
# later prefixed with 'diff_' to build the text-based feature list.
ind_features = [
    'num_words',
    'num_punctuation',
    'occ_question_mark',
    'occ_exclamation_mark',
]

In [197]:
# Create the Sentences object, passing the English word embeddings as source
# (src_words) and the German word embeddings as target (trg_words).
sens = sentences.Sentences(src_words=english, trg_words=german)

In [198]:
# Load the sentence data through `sens`, passing n_max=10000 and the feature
# names in ind_features. The recorded log shows that sentences are loaded,
# preprocessed and embedded during this call.
# NOTE(review): 10000 equals n_train + n_test (7000 + 3000) used by
# create_datasets further down - presumably intentional; confirm.
# NOTE(review): `data` is not referenced again in the cells visible here.
data = sens.load_data(n_max=10000, features=ind_features)

Target sentences loaded
Source sentences loaded
Sentences preprocessed
Could not find a term of the sentence 'altener' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Sentences embeddings extracted in en
Could not find a term of the sentence 'regierungskonferenz' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'sicherheitsberater für den gefahrguttransport' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'arbeitsplan' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'altfahrzeuge' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence 'kapitalsteuer' in word embedding vocabulary and thus, could not calculate the respective embed

In [199]:
# Feature names grouped by extractor family:
#   'text_based'   - one 'diff_<name>' entry per name in ind_features
#   'vector_based' - the cosine similarity feature
features_dict = {
    'text_based': list(map('diff_{}'.format, ind_features)),
    'vector_based': ['cosine_similarity'],
}

In [200]:
# Ask `sens` to create the datasets with n_train=7000 and n_test=3000; the two
# returned objects are bound as train_data and test_data.
# NOTE(review): both names are rebound by the extract_features call in the
# next cell.
train_data, test_data = sens.create_datasets(n_train=7000, n_test=3000)

In [201]:
# Extract the features named in features_dict. The call returns two objects
# that replace the previous train_data / test_data bindings; later cells index
# them by column name, so they are presumably pandas DataFrames.
# NOTE(review): the meaning of single_source=False and train=True is defined
# in the Sentences class, not in this notebook - confirm there.
train_data, test_data = sens.extract_features(features_dict, single_source=False, train=True)

# Supervised classifier for cross-lingual retrieval (L2R) - DEMO

### Fit a logistic regression model on training data

In [202]:
# Column holding the target value the classifier is trained on.
label = 'translation'

# Flatten features_dict into a single list of feature names, keeping the
# dictionary's group order and the order within each group.
features = []
for group_features in features_dict.values():
    features.extend(group_features)

In [203]:
# Create an unfitted logistic-regression classifier; no arguments are passed,
# so scikit-learn's default hyperparameters apply.
logisticRegr = LogisticRegression()

In [204]:
# Feature matrix: one column per name in `features`.
X_train = train_data[features]

# Target vector. Single-bracket selection returns a 1-D Series, which is the
# shape scikit-learn estimators expect for y. Selecting with a list
# (train_data[[label]]) would produce an (n, 1) DataFrame that fit() has to
# ravel, emitting a DataConversionWarning - one that the notebook's global
# warning filter would hide.
y_train = train_data[label]

In [205]:
# Fit the classifier on the training features and labels. fit() returns the
# estimator, whose repr is echoed as this cell's output.
logisticRegr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Apply binary classifier to test samples

In [206]:
# Predict the label of every test row with a single batched predict() call.
# predict() accepts an (n_samples, n_features) input and returns one label per
# row in the same order, so the result can be assigned directly as a column.
# This avoids one Python-level predict() call (plus array reshaping) per row,
# which DataFrame.apply(..., axis=1) would incur.
test_data['predictions'] = logisticRegr.predict(test_data[features])

In [211]:
# Accuracy on the test set: the share of rows whose predicted label equals the
# value in the 'translation' column.
is_correct = test_data['translation'] == test_data['predictions']
int(is_correct.sum()) / len(test_data)

0.8786666666666667