In [255]:
import pandas as pd
import numpy as np
import importlib, os, math

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression

from ir_crosslingual.induce_multilingual_embedding_space import mono_embedding_loading as mono
from ir_crosslingual.induce_multilingual_embedding_space import multi_embedding_learning as multi
from ir_crosslingual.induce_multilingual_embedding_space import subspace_creation as sub

In [256]:
from ir_crosslingual.features import text_based
importlib.reload(text_based)

<module 'ir_crosslingual.features.text_based' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/04_Feature-Extraction/ir-crosslingual/ir_crosslingual/features/text_based.py'>

In [257]:
from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

<module 'ir_crosslingual.features.vector_based' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/04_Feature-Extraction/ir-crosslingual/ir_crosslingual/features/vector_based.py'>

In [258]:
from ir_crosslingual.supervised_classification import sentence_vector_creation as sentence
importlib.reload(sentence)

<module 'ir_crosslingual.supervised_classification.sentence_vector_creation' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/04_Feature-Extraction/ir-crosslingual/ir_crosslingual/supervised_classification/sentence_vector_creation.py'>

# Global variables and functions

In [259]:
data_path = '../../../../../04_Data/'

In [260]:
# TODO/IDEA: Include ":" as special punctuation to look for in equal_occurrence_punctuation
tb_features = {'difference_count_words': [text_based.difference_count_tokens, 'preprocessed', False],
            'difference_count_punctuation': [text_based.difference_count_tokens, 'preprocessed', True],
            'equal_occ_question': [text_based.equal_occurrence_punctuation, 'preprocessed', '?'],
            'equal_occ_exclamation': [text_based.equal_occurrence_punctuation, 'preprocessed', '!']
           }
for word_group in 'noun verb adverb adjective wh pronoun'.split():
    tb_features['difference_count_{}'.format(word_group)] = [text_based.difference_count_nltk_tags, 'preprocessed', word_group]

In [261]:
vb_features = {'cosine_similarity': [vector_based.cosine_sim, 'vec']
            # 'euclidean_distance': [vector_based.euclidean_dist, 'vec'],
            # 'jenson_shannon_distance': [vector_based.jenson_shannon_dist, 'vec'],
            # 'wasserstein_distance': [vector_based.wasserstein_dist, 'vec'],
            # 'greedy_association_similarity': [vector_based.gas, 'word_embeddings']
           }

In [262]:
# Copied from dataset_creation.py as I haven't merged it into this branch yet
def create_datasets(europarl_dataset: str, n_train: int, frac_pos: float, n_test: int):
    if isinstance(europarl_dataset, str) and os.path.isfile(europarl_dataset):
        _, df = tmx_dataframe(europarl_dataset)
    else:
        df = europarl_dataset
    n_train_pos = math.ceil(n_train*frac_pos)
    df_train_pos = df[:n_train_pos]
    df_train_pos.loc[:, 'translation'] = 1
    multiple = math.ceil(n_train/n_train_pos)
    df_train_neg = pd.concat([df_train_pos[['source_sentence']]] * multiple, ignore_index=True)[:n_train-n_train_pos]
    df_train_neg['target_sentence'] = np.random.choice(df[n_train_pos:-n_test]['target_sentence'],
                                                       n_train-n_train_pos)
    df_train_neg.loc[:, 'translation'] = 0
    df_train = df_train_pos.append(df_train_neg, ignore_index=True)
    df_test = df[-n_test:].reset_index(drop=True)
    return df_train, df_test

In [263]:
# TODO: Parameterize by source and target language
def get_sen_embedding(sentence, language):
    if language == 'english':
        return sen_emb_en[sen_en.index(sentence)]
    elif language == 'german':
        return sen_emb_de[sen_de.index(sentence)]

In [264]:
def get_list_of_word_embeddings(idx, word_embeddings, words_found):
    embeddings = list()
    for word in words_found[idx]:
        embeddings.append(word_embeddings[word])
    return embeddings

In [287]:
# TODO: Improve code by removing multiple for loops
# TODO: Parameterize by source and target language
def load_data():
    data = pd.DataFrame({'source_sentence': sen_en, 'target_sentence': sen_de})
    # Extract train and test data
    train_data, test_data = create_datasets(data, n_train=4800, frac_pos=0.7, n_test=200)
    data_sets = [train_data, test_data]
    origins = {'source': 'english', 'target': 'german'}
    for data in data_sets:
        for origin, language in origins.items():
            data['{}_preprocessed'.format(origin)] = data.apply(lambda row: sentence.preprocess_sentences(sentences=[row['{}_sentence'.format(origin)]], language=language)[0], axis=1)
            data['{}_vec'.format(origin)] = data.apply(lambda row: get_sen_embedding(row['{}_sentence'.format(origin)], language), axis=1)
    for data in data_sets:
        data['source_word_embeddings'] = data.apply(lambda row: get_list_of_word_embeddings(row.name, emb_en, words_found_en), axis=1)
        data['target_word_embeddings'] = data.apply(lambda row: get_list_of_word_embeddings(row.name, emb_de, words_found_de), axis=1)
    
    
    # Extract text-based features
    for data in [train_data, test_data]:
        for name, function in tb_features.items():
            print('Start extraction feature {}'.format(name))
            data[name] = data.apply(lambda row: function[0](row['source_{}'.format(function[1])], row['target_{}'.format(function[1])], function[2]), axis=1)
    
    # Extract vector-based features
    for data in [train_data, test_data]:
        for name, function in vb_features.items():
            print('Start extraction feature {}'.format(name))
            data[name] = data.apply(lambda row: function[0](row['source_{}'.format(function[1])] @ W, row['target_{}'.format(function[1])]), axis=1)
            
    return train_data, test_data

In [266]:
def predict_most_similar_sentence(idx, sentence, test_data, regressor):
    tmp = pd.DataFrame({'source_sentence': sentence,
                       'target_sentence': test_data['target_sentence'], 'target_preprocessed': test_data['target_preprocessed'], 'target_vec': test_data['target_vec'], 'target_word_embeddings': test_data['target_word_embeddings']})
    tmp['source_preprocessed'] = tmp.apply(lambda row: test_data['source_preprocessed'].iloc[idx], axis=1)
    tmp['source_vec'] = tmp.apply(lambda row: test_data['source_vec'].iloc[idx], axis=1)
    tmp['source_word_embeddings'] = tmp.apply(lambda row: test_data['source_word_embeddings'].iloc[idx], axis=1)
    for name, function in tb_features.items():
        tmp[name] = tmp.apply(lambda row: function[0](row['source_{}'.format(function[1])], row['target_{}'.format(function[1])], function[2]), axis=1)
    for name, function in vb_features.items():
        tmp[name] = tmp.apply(lambda row: function[0](row['source_{}'.format(function[1])] @ W, row['target_{}'.format(function[1])]), axis=1)
    tmp['prediction_probability'] = tmp.apply(lambda row: regressor.predict_proba(np.asarray(row[features]).reshape(1, -1))[0][1], axis=1)
    tmp.sort_values('prediction_probability', ascending=False, inplace=True)
    prob = tmp['prediction_probability'].iloc[0]
    sen = tmp['target_sentence'].iloc[0]
    print('Done with sentence id {}, probability of {}'.format(idx, prob))
    return [sen, prob]

# Load raw data
TODO: Parameterize by source and target language

### Load word embeddings and projection matrix

In [267]:
emb_en, id2word_en, word2id_en = mono.load_monolingual_embedding('{}fastText_mon_emb/wiki.en.vec'.format(data_path), 50000)
emb_de, id2word_de, word2id_de = mono.load_monolingual_embedding('{}fastText_mon_emb/wiki.de.vec'.format(data_path), 50000)

In [268]:
W = multi.learn_projection_matrix(emb_en, emb_de, '{}expert_dictionaries/en-de/MUSE_en-de.0-5000.txt'.format(data_path), word2id_en, word2id_de)


Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)


### Load Europarl sentences and transform to sentence vector embeddings

In [269]:
sen_en = sentence.load_sentences('{}corpus/de-en/Europarl.de-en.en'.format(data_path), 5000)
sen_de = sentence.load_sentences('{}corpus/de-en/Europarl.de-en.de'.format(data_path), 5000)

In [270]:
sen_en_preprocessed = sentence.preprocess_sentences(sentences=sen_en, language='english')
sen_de_preprocessed = sentence.preprocess_sentences(sentences=sen_de, language='german')

In [271]:
sen_emb_de, id2sentence_de, words_found_de, invalid_sentences_de = sentence.transform_into_sentence_vectors(sen_de_preprocessed, 'german', emb_de, word2id_de, preprocessed=True)
sen_emb_en, id2sentence_en, words_found_en, invalid_sentences_en = sentence.transform_into_sentence_vectors(sen_en_preprocessed, 'english', emb_en, word2id_en, preprocessed=True, prev_invalid_sentences=invalid_sentences_de)


Could not find a term of the sentence '['sicherheitsberater', 'gefahrguttransport']' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence '['lebensmittelsicherheit']' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence '['arbeitsplan']' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence '['kapitalsteuer']' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence '['safety', 'advisers', 'transport', 'dangerous', 'goods']' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of the sentence '['food', 'safety']' in word embedding vocabulary and thus, could not calculate the respective embedding vector.
Could not find a term of 

In [272]:
# Delete invalid sentences from list of raw sentences
for idx in list(invalid_sentences_de):
    del sen_en[idx]
    del sen_de[idx]
    del sen_en_preprocessed[idx]
    del sen_de_preprocessed[idx]

# Supervised classifier for cross-lingual retrieval (L2R) - DEMO

### Data preparation and feature extraction

In [273]:
# Load data
train_data, test_data = load_data()

Start extraction feature difference_count_words
Start extraction feature difference_count_punctuation
Start extraction feature equal_occ_question
Start extraction feature equal_occ_exclamation
Start extraction feature difference_count_noun
Start extraction feature difference_count_verb
Start extraction feature difference_count_adverb
Start extraction feature difference_count_adjective
Start extraction feature difference_count_wh
Start extraction feature difference_count_pronoun
Start extraction feature difference_count_words
Start extraction feature difference_count_punctuation
Start extraction feature equal_occ_question
Start extraction feature equal_occ_exclamation
Start extraction feature difference_count_noun
Start extraction feature difference_count_verb
Start extraction feature difference_count_adverb
Start extraction feature difference_count_adjective
Start extraction feature difference_count_wh
Start extraction feature difference_count_pronoun
Start extraction feature cosine_si

### Fit a logistic regression model on training data

In [274]:
features = list(tb_features.keys()) + list(vb_features.keys())
label = 'translation'

In [275]:
logisticRegr = LogisticRegression()

In [276]:
X_train = train_data[features]
y_train = train_data[[label]]

In [277]:
logisticRegr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [278]:
predictions = test_data.copy().drop(columns=features, axis=1)

### Predict on positive test samples only to get an idea of the confidence values

In [279]:
test_data['predictions_proba'] = test_data.apply(lambda row: logisticRegr.predict_proba(np.asarray(row[features]).reshape(1,-1))[0][1],axis=1)

In [280]:
test_data['predictions'] = test_data.apply(lambda row: logisticRegr.predict(np.asarray(row[features]).reshape(1,-1))[0], axis=1)

In [281]:
test_data['predictions'].sum() / len(test_data)

0.96

In [313]:
len(test_data.loc[test_data['predictions_proba'] < 0.9])

68

In [318]:
len(test_data.loc[(test_data['predictions'] == 1) & (test_data['predictions_proba'] < 0.9)])

60

### Predict most similar test target sentence for each test source sentence

In [283]:
predictions['prediction'] = predictions.apply(lambda row: predict_most_similar_sentence(row.name, row['source_sentence'], test_data, logisticRegr), axis=1)

Done with sentence id 0, probability of 0.9813236115218967
Done with sentence id 1, probability of 0.888794582153693
Done with sentence id 2, probability of 0.9869983793041734
Done with sentence id 3, probability of 0.9737796548725615
Done with sentence id 4, probability of 0.9332835186627954
Done with sentence id 5, probability of 0.976765865661742
Done with sentence id 6, probability of 0.9782118116558263
Done with sentence id 7, probability of 0.9586714875428086
Done with sentence id 8, probability of 0.9598901888083166
Done with sentence id 9, probability of 0.9800617449106159
Done with sentence id 10, probability of 0.9165729093004251
Done with sentence id 11, probability of 0.9868509888364159
Done with sentence id 12, probability of 0.9715304358137095
Done with sentence id 13, probability of 0.9763108973630142
Done with sentence id 14, probability of 0.9813425467328555
Done with sentence id 15, probability of 0.9893995157226325
Done with sentence id 16, probability of 0.976390538

Done with sentence id 137, probability of 0.9915917621656068
Done with sentence id 138, probability of 0.8564769330023544
Done with sentence id 139, probability of 0.9660776926020023
Done with sentence id 140, probability of 0.9763469681314573
Done with sentence id 141, probability of 0.9609096053396675
Done with sentence id 142, probability of 0.9546252550491855
Done with sentence id 143, probability of 0.9823835122006856
Done with sentence id 144, probability of 0.9734117823654234
Done with sentence id 145, probability of 0.9634904885632632
Done with sentence id 146, probability of 0.9426951965688531
Done with sentence id 147, probability of 0.9737625608248124
Done with sentence id 148, probability of 0.9863187463361657
Done with sentence id 149, probability of 0.9126341374271237
Done with sentence id 150, probability of 0.9647664251775642
Done with sentence id 151, probability of 0.7284354320650875
Done with sentence id 152, probability of 0.9570922021595129
Done with sentence id 15

In [284]:
predictions['prediction_sentence'] = predictions.apply(lambda row: row['prediction'][0], axis=1)
predictions['prediction_probability'] = predictions.apply(lambda row: row['prediction'][1], axis=1)

In [285]:
predictions['true_prediction'] = predictions.apply(lambda row: 1 if row['target_sentence'] == row['prediction_sentence'] else 0, axis=1)

In [286]:
# Fourth run: Without greedy association similarity and with count of pos tags
# Training size: 4800 instances
sum(predictions['true_prediction']) / len(predictions)

0.375

In [310]:
len(predictions.loc[predictions['prediction_probability'] < 0.9])

18

In [319]:
len(predictions.loc[(predictions['true_prediction'] == 1) & (predictions['prediction_probability'] < 0.9)])

8

In [240]:
# Third run: Without greedy association similarity and with count of pos tags
# Training size: 800 instances
# sum(predictions['true_prediction']) / len(predictions)

0.25

In [154]:
# Second run: With greedy association similarity but without count of pos tags 
# However, computation of the greedy association similarity was wrong in this run
# Extraction of the greedy association similarity feature very time consuming
# Training size: 800 instances
# sum(predictions['true_prediction']) / len(predictions)

0.23

In [41]:
# First run: Without greedy association similarity and without count of pos tags
# Training size: 800 instances
# sum(predictions['true_prediction']) / len(predictions)

0.305