## Imports

In [1]:
import io, os, importlib
import datetime
import pandas as pd

import warnings
# Notebook-wide configuration.
# Suppress every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Lift pandas' display limits so frames are rendered with all columns and all rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [2]:
# Import the project modules used by this notebook. Each import is followed by
# importlib.reload so that the module's source is re-executed on every run of this
# cell, which picks up edits made to the package without restarting the kernel.
from ir_crosslingual.utils import paths
importlib.reload(paths)

from ir_crosslingual.features import text_based
importlib.reload(text_based)

from ir_crosslingual.features import vector_based
importlib.reload(vector_based)

from ir_crosslingual.embeddings import embeddings
importlib.reload(embeddings)

# Last expression of the cell: the reloaded module object is echoed as the cell output.
from ir_crosslingual.sentences import sentences
importlib.reload(sentences)

<module 'ir_crosslingual.sentences.sentences' from '/Users/i500969/Desktop/Admin/Uni-Mannheim/02_Courses/2020_FSS/Information-Retrieval/03_Project/03_Implementation/03_Feature-Selection/ir-crosslingual/ir_crosslingual/sentences/sentences.py'>

In [3]:
def time(start, stop):
    """Print the elapsed time between two measurements and the current timestamp.

    Args:
        start: Value taken before the measured step (the notebook passes
            ``datetime.datetime`` objects).
        stop: Value taken after the measured step; must support ``stop - start``.

    Returns:
        None. Two lines are written to standard output.
    """
    elapsed = stop - start
    print('Computation time loading the data: {}'.format(elapsed))

    # The completion time is sampled here, at call time, not taken from `stop`.
    finished_at = datetime.datetime.now()
    print('Finished at: {}'.format(finished_at))

## Load word embeddings

In [4]:
# Wall-clock start of the whole run; the final cell subtracts it from overall_stop
# to report the total computation time.
overall_start = datetime.datetime.now()

In [5]:
# Create one WordEmbeddings object per language code and load its embeddings.
# NOTE(review): what load_embeddings() reads, and from where, is defined in the
# embeddings module and not visible in this notebook.
german = embeddings.WordEmbeddings('de')
german.load_embeddings()

english = embeddings.WordEmbeddings('en')
english.load_embeddings()

In [6]:
# Learn the projection matrices with English as source and German as target language.
# The call is made on the class, not on the `english`/`german` instances, and returns two values;
# judging by the names and the log output these are the en->de and de->en matrices — TODO confirm.
# NOTE(review): neither W_ende nor W_deen is read again in the cells of this notebook.
W_ende, W_deen = embeddings.WordEmbeddings.learn_projection_matrix(src_lang='en', trg_lang='de')

Learn projection matrix for en-de
Found 13700 valid translation pairs in expert dictionary.
977 other pairs contained at least one unknown word (0 in source language, 977 in target language).
Resulting subspace dimension: (13700, 300)
Resulting subspace dimension: (13700, 300)
Learn projection matrix for de-en
Found 10604 valid translation pairs in expert dictionary.
262 other pairs contained at least one unknown word (0 in source language, 262 in target language).
Resulting subspace dimension: (10604, 300)
Resulting subspace dimension: (10604, 300)


## Load sentence embeddings

In [None]:
# `start`/`stop` bracket the loading step; the next cell passes them to time().
start = datetime.datetime.now()
# Sentences object with the English embeddings as source and the German embeddings as target.
sens = sentences.Sentences(src_words=english, trg_words=german)
# Names of all entries registered in text_based.PREPARED_FEATURES.
prepared_features = list(text_based.PREPARED_FEATURES.keys())
# NOTE(review): n_max presumably caps the number of loaded sentence pairs and
# agg_method='average' presumably selects how word vectors are aggregated per sentence —
# confirm against Sentences.load_data.
data = sens.load_data(single_source=False, n_max=600000, features=prepared_features, agg_method='average')
stop = datetime.datetime.now()

In [None]:
# Print the elapsed time of the loading step and the current timestamp.
time(start, stop)

In [None]:
# Preview the first rows of the loaded data.
data.head()

## Create training set

In [None]:
# `start`/`stop` bracket the training-set creation; the next cell passes them to time().
start = datetime.datetime.now()
# NOTE(review): n_train is presumably the number of training samples and frac_pos the share
# of positive (matching) sentence pairs — confirm against Sentences.create_train_set.
train_data = sens.create_train_set(n_train=500000, frac_pos=0.5)
stop = datetime.datetime.now()

In [None]:
# Print the elapsed time of the training-set creation. time() always prints the label
# 'loading the data', so here that text refers to this step, not to data loading.
time(start, stop)

## Create test collection

In [None]:
# `start`/`stop` bracket the test-collection creation; the next cell passes them to time().
start = datetime.datetime.now()
# NOTE(review): n_queries and n_docs presumably set the number of queries and of candidate
# documents in the collection — confirm against Sentences.create_test_collection.
test_collection = sens.create_test_collection(n_queries=50, n_docs=996)
stop = datetime.datetime.now()

In [None]:
# Print the elapsed time of the test-collection creation. time() always prints the label
# 'loading the data', so here that text refers to this step, not to data loading.
time(start, stop)

## Feature extraction

In [None]:
# Feature names to extract, grouped by feature family: every key registered in the
# FEATURES mapping of the text_based and vector_based modules.
features_dict = dict(
    text_based=[*text_based.FEATURES.keys()],
    vector_based=[*vector_based.FEATURES.keys()],
)

In [None]:
# `start`/`stop` bracket the feature extraction; the next cell passes them to time().
start = datetime.datetime.now()
# Bind the first returned frame to `train_data`. It was previously bound to `train_test`, a name
# nothing else in this notebook reads, so the later preview and pickle cells kept using the
# `train_data` object from before feature extraction.
# NOTE(review): assumes extract_features(data='train_test') returns (training data, test collection)
# in that order — confirm against Sentences.extract_features.
train_data, test_collection = sens.extract_features(features_dict=features_dict, data='train_test', drop_prepared=False)
stop = datetime.datetime.now()


In [None]:
# Print the elapsed time of the feature extraction. time() always prints the label
# 'loading the data', so here that text refers to this step, not to data loading.
time(start, stop)

## View data

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Preview the first rows of the test collection.
test_collection.head()

## Save data

In [None]:
# Output directory for the extracted data sets; the save cells below write their pickles here.
# NOTE(review): the f-string assumes paths.data_path already ends with a path separator — confirm.
path = f'{paths.data_path}extracted_data/global'
# exist_ok=True creates the directory (and any missing parents) only when needed, replacing the
# separate os.path.exists() check and the check-then-create race it leaves open.
os.makedirs(path, exist_ok=True)

In [None]:
# Persist the training data as a pickle file in the output directory.
train_data.to_pickle(f'{path}/training_data_avg.pkl')
# Bare string expression: rendered as the cell output to record when saving finished.
'Finished at: {}'.format(datetime.datetime.now())

In [None]:
# Persist the test collection as a pickle file in the output directory.
test_collection.to_pickle(f'{path}/test_collection_avg.pkl')
# Bare string expression: rendered as the cell output to record when saving finished.
'Finished at: {}'.format(datetime.datetime.now())

In [None]:
# End of the run: the difference to overall_start (taken before the embeddings were loaded)
# is rendered as the cell output.
overall_stop = datetime.datetime.now()
'Total computation time: {}'.format(overall_stop-overall_start)

## TODO: For tf-idf, save the IDF values