In [1]:
from data import get_cleaned_dataframes, get_dataframes
from models import ttl_word2vec, classification

SpaCy pipeline loaded


## Word2Vec
Load the preprocessed texts in all languages into a single dataframe.

In [None]:
# Fetch the preprocessed corpus via the project helper imported from `data`.
# Later cells read the 'cleaned_text' and 'label' columns from this dataframe.
# NOTE(review): 'cleaned_text' is later passed to Word2Vec as `sentences`, so it is
# presumably one list of tokens per row — confirm against get_cleaned_dataframes().
cleaned_multilingual_dataframe = get_cleaned_dataframes()

Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the texts for evaluation.
# A fixed random_state makes the split (and therefore the trained embeddings and all
# classification reports below) reproducible across "Restart & Run All".
# TODO(review): the label distribution is highly imbalanced; consider adding
# stratify=cleaned_multilingual_dataframe['label'] once every class has >= 2 samples.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_multilingual_dataframe['cleaned_text'],
    cleaned_multilingual_dataframe['label'],
    test_size=0.3,
    random_state=42,
)

Train a Word2Vec model with the train set

In [None]:
from gensim.models import Word2Vec

# Learn 100-dimensional embeddings from the training texts only (sentences = wikipedia articles!).
# min_count=1 keeps every token in the vocabulary, even those seen a single time.
model = Word2Vec(
    sentences=X_train,
    vector_size=100,
    min_count=1,
)

save that model

In [None]:
# Persist two artefacts: the full Word2Vec model and its word vectors on their own.
full_model_path = "../models/multilingual_word2vec.model"
keyed_vectors_path = '../models/word_vectors/multilingual_word2vec.kv'

model.save(full_model_path)
# save model as KeyedVectors
wv = model.wv
wv.save(keyed_vectors_path)

load model

In [None]:
# Restore both artefacts from disk; this rebinds `model` and `wv`.
saved_model_path = '../models/multilingual_word2vec.model'
saved_vectors_path = '../models/word_vectors/multilingual_word2vec.kv'

model = Word2Vec.load(saved_model_path)
wv = ttl_word2vec.load_word2vec_keyed_vectors(saved_vectors_path)

## Text Classification on manually trained Word2Vec model

### Create features from text for machine learning model.
Steps:
- Turn words in texts into word vectors.
- Method 1: averaging word vectors for each text

In [None]:
# Map the train and test texts to word vectors with the same embeddings (`wv`),
# train split first, then test split.
X_train_vect, X_test_vect = (
    classification.text2vec(texts, wv) for texts in (X_train, X_test)
)

In [None]:
# Method 1: collapse the word vectors of each text via the project's average_vector helper,
# train split first, then test split.
X_train_vect_avg, X_test_vect_avg = (
    classification.average_vector(vectors) for vectors in (X_train_vect, X_test_vect)
)

### Classification reports:
- Recall: of all samples that truly belong to a class, the fraction that was predicted as that class
- Precision: of all samples predicted as a class, the fraction that truly belongs to that class
- f1-score: harmonic mean of precision and recall
- support: number of occurrences of the given class in the evaluated dataset

Fit a basic Random Forest, a Multinomial Naive Bayes and an SVM classifier on top of the averaged vectors and print a classification report for each.

In [None]:
# Train each classifier on the averaged training vectors, predict the test split and
# print its report. A header is printed first so the three reports can be told apart.
# After the loop, `y_pred` holds the predictions of the last classifier ("SVM").
# NOTE(review): scikit-learn's MultinomialNB rejects negative feature values, which
# averaged word vectors normally contain — confirm classify_predict rescales its input.
for classifier_name in ("Random Forest", "Multinomial Naive Bayes", "SVM"):
    print(f"=== {classifier_name} ===")
    y_pred = classification.classify_predict(X_train_vect_avg, X_test_vect_avg, y_train, classifier_name)
    classification.print_classification_report(y_test, y_pred)

In [None]:
# Share of each label in the training split, shown as percentages on a pie chart.
train_label_counts = y_train.value_counts()
train_label_counts.plot.pie(autopct='%0.2f')

Highly Imbalanced Dataset!
Ideas:
- Undersampling: reduce majority class to make it equal to minority class
- Oversampling: increase minority class to make it equal to majority class through resampling
- K-fold cross validation

## Oversampling

In [None]:
!pip install imblearn
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(sampling_strategy="not majority")
X_train_2d = X_train.values.reshape(-1, 1) # puts each elem of array in own array
X_res, y_res = ros.fit_resample(X_train_2d, y_train)

In [None]:
# Class distribution after oversampling. Only the last expression of a cell is rendered
# automatically, so the counts table is shown explicitly with display() before plotting.
resampled_label_counts = y_res.value_counts()
display(resampled_label_counts)
resampled_label_counts.plot.pie(autopct='%0.2f')

In [None]:
# Undo the (-1, 1) reshape that was applied before resampling: back to one entry per text.
X_res_1d = X_res.flatten(order="C")

In [None]:
# Build features for the oversampled training texts with the same two steps used for the
# original splits: word vectors first, then one averaged vector per text.
X_res_vect = classification.text2vec(
    X_res_1d,
    wv,
)
X_res_vect_avg = classification.average_vector(
    X_res_vect,
)

### Classification reports:
- Recall: of all samples that truly belong to a class, the fraction that was predicted as that class
- Precision: of all samples predicted as a class, the fraction that truly belongs to that class
- f1-score: harmonic mean of precision and recall
- support: number of occurrences of the given class in the evaluated dataset

In [None]:
# Same evaluation as for the original training data, but the classifiers are now fitted
# on the oversampled training vectors; the test split is unchanged.
# A header is printed first so the three reports can be told apart.
# After the loop, `y_res_pred` holds the predictions of the last classifier ("SVM").
for classifier_name in ("Random Forest", "Multinomial Naive Bayes", "SVM"):
    print(f"=== {classifier_name} (oversampled training data) ===")
    y_res_pred = classification.classify_predict(X_res_vect_avg, X_test_vect_avg, y_res, classifier_name)
    classification.print_classification_report(y_test, y_res_pred)

In [None]:
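# TODO: try an imbalanced-learn Pipeline that chains the oversampler and the classifier
# (DecisionTreeClassifier would additionally have to be imported from sklearn.tree)
#from imblearn.pipeline import Pipeline
# create pipeline for handling imbalanced data
#steps = [('over', RandomOverSampler()), ('model', DecisionTreeClassifier())]
#pipeline = Pipeline(steps=steps)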

## Classification with pretrained model
to be continued..

In [None]:
# Model with pretrained vectors (just english)
# Ideas for later:
#   - try fasttext vectors in several languages
#   - bert has multilingual CONTEXTUAL word vectors (later;))
# Here: use pretrained glove vectors, downloaded manually into ../models/word_vectors/
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from pathlib import Path
from gensim.scripts.glove2word2vec import glove2word2vec

PRETRAINED_KV_PATH = "../models/word_vectors/pretrained_model.kv"

# The GloVe -> word2vec conversion is only needed once: convert and save when the cached
# .kv file is missing, otherwise go straight to loading it.
if not Path(PRETRAINED_KV_PATH).exists():
    file_path: Path = Path("../models/word_vectors/glove.6B.100d.txt").resolve()
    # file_path is absolute, so datapath() effectively passes it through unchanged.
    glove_file = datapath(file_path)
    tmp_file = get_tmpfile("glove2word2vec.txt")
    _ = glove2word2vec(glove_file, tmp_file)

    # NOTE(review): the same helper is used elsewhere in this notebook to load a native
    # .kv file; confirm it also understands the word2vec text format written to tmp_file.
    pretrained_model = ttl_word2vec.load_word2vec_keyed_vectors(tmp_file)

    # save when created first
    pretrained_model.save(PRETRAINED_KV_PATH)

# load when file already exists (or has just been created above)
pretrained_model = ttl_word2vec.load_word2vec_keyed_vectors(PRETRAINED_KV_PATH)

#missing_words = ttl_word2vec.get_words_in_static_embeddings(pretrained_model, words, '../models/words/pretrained_model.txt')

# TODOS (kept as comments so the cell does not echo a string as its output):
# - add bigram detector?
# - add missing word if occurs e.g. 3 times? if possible and makes sense...
# sentences =[["test"]]
# model.build_vocab(sentences)
# model.intersect_word2vec_format(pretrained_model, lockf=1.0, binary=True)
# model.train(sentences, total_examples=3, epochs = 5)