this script is an attempt to classify good and bad translations from RusLTC

Representation and learning method proposed:

use trained bi-lingual word embeddings (/home/masha/MUSE/rig1_res/vector-en.vec and vector-ru.vec) to capture accuracy of translations against their sources


In [81]:
import io ### https://docs.python.org/3/library/io.html
import sys,os
import pandas as pd
import time
import random
from collections import OrderedDict

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import precision_recall_fscore_support

import gensim


# # TensorFlow and tf.keras
# import logging

# import tensorflow as tf
# from tensorflow import keras

# from keras.callbacks import TensorBoard, EarlyStopping
# from keras import backend, preprocessing
# from keras.layers import Dense, Input, LSTM, Bidirectional
# from keras.models import Model
# from keras.models import load_model as load_keras_model
# from keras.layers import Embedding
# from keras.utils.vis_utils import plot_model
# from IPython.display import SVG
# from keras.utils.vis_utils import model_to_dot
# import pydot
# from keras.utils import to_categorical
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix


# print(tf.__version__)

# ## to avoid reloading kernel after changes to imported modules
# # import importlib
# # import HTQ_functions as mm
# # importlib.reload(mm)

# # import functions as mm1
# # importlib.reload(mm1)

# ## import the functions from the helper scripts
# # from acc_functions import 


%matplotlib inline
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# logger = logging.getLogger(__name__)

In [82]:
def load_vec(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    
    # both open methods return class '_io.TextIOWrapper'
    
    if emb_path.endswith('.gz'): 
        f = gzip.open(emb_path, 'rt', encoding='utf-8', newline='\n', errors='ignore')
        
    elif emb_path.endswith('.vec') or emb_path.endswith('.txt'):
        f = io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore')
        
    for i, line in enumerate(f):
        ## to skip the first line (header) one can use next(f) or the condition below
        if i == 0:
            split = line.split()
            assert len(split) == 2
        else:
            word, vect = line.rstrip().split(' ', 1) # stops the splitting after the first occurence
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, word #'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors) # stack the sequence of input arrays vertically to make a single array
    # the list of vectors is aligned by dimentions in an array
    return embeddings, id2word, word2id

In [83]:
def preparedata(directory):
    ourdic = []
    print('Collecting data from the files...')
    for subdir in os.listdir(directory):
        files = [f for f in os.listdir(os.path.join(directory, subdir)) if f.endswith('.conllu')]
        for f in files:
            rowdic = {'doc': f.strip(), 'group': subdir}
            doc = open(os.path.join(directory, subdir, f))
            text = doc.read().strip() #.replace('\n', ' ')
            doc.close()
            rowdic['text'] = text
            ourdic.append(rowdic)
    ourdic = pd.DataFrame(ourdic)
    return ourdic

In [84]:
def visual(data, labels, classes):
    # Here goes the 2-D plotting of the data...
    pca = PCA(n_components=2)
    x_r = pca.fit_transform(data)
    plt.figure()
    # consistent colors
    cols = ['red', 'green', 'orange', 'blue', 'grey']
    colors = {}
    for i,name in enumerate(classes):
        colors[name] = cols[i]
#     colors = {'bad': 'red', 'good': 'green'}
    lw = 2

    for target_name in classes:
        plt.scatter(x_r[labels == target_name, 0], x_r[labels == target_name, 1], s=1, color=colors[target_name],
                    label=target_name, alpha=.8, lw=lw)
    plt.legend(loc='best', scatterpoints=1, prop={'size': 15})
    plt.tick_params(axis='x', which='both', bottom='off', top='off', labelbottom='off')
    plt.tick_params(axis='y', which='both', left='off', right='off', labelleft='off')
    #plt.savefig('plot.png', dpi=300)
    plt.show()
    plt.close()
    return x_r

In [85]:
def get_dic_value(key, dic=None):
    # Get the values for keys
    if key in dic:
        return dic[key]
    else:
        return 0

In [87]:
## preprosessing from https://github.com/akutuzov/webvectors/blob/master/preprocessing/rus_preprocessing_udpipe.py
datadir = '/home/masha/accuracy/data/lempos/'  # The path to where subdirectories whith text files are

df = preparedata(datadir)
print(df.head())
labels = df['group']
print(set(labels))

Collecting data from the files...
                  doc group                                               text
0  RU_1_244_35.conllu   bad  весь_DET правда_NOUN о_ADP гмый_PROPN сторонни...
1   RU_1_271_4.conllu   bad  не_PART посещать_VERB 'д_NOUN странный_ADJ реб...
2   RU_1_150_2.conllu   bad  последний_ADJ книга_NOUN роджер::пенроуз_PROPN...
3   RU_1_244_1.conllu   bad  весь_DET правда_NOUN о_ADP генетически_ADV мод...
4  RU_1_274_41.conllu   bad  газета_PROPN нью-йорк::таймз_PROPN xx_NUM октя...
{'good', 'source', 'bad'}


In [88]:
src_path = '/home/masha/MUSE/rig1_res/vectors-en.vec'
tgt_path = '/home/masha/MUSE/rig1_res/vectors-ru.vec'

nmax = 240000  # maximum number of word embeddings to load

src_embeddings, src_id2word, src_word2id = load_vec(src_path, nmax)
print(len(src_embeddings), len(src_id2word))

tgt_embeddings, tgt_id2word, tgt_word2id = load_vec(tgt_path, nmax)
print(len(tgt_embeddings), len(tgt_id2word))

from itertools import islice

print(list(islice(src_word2id.items(),5)))
print(list(islice(tgt_word2id.items(),5)))

## the size is controlled by a thres at uploading time
print(tgt_embeddings.shape)

240000 240000
240000 240000
[('Paul::Evans_PROPN', 152516), ('flu_NOUN', 13856), ('banc_NOUN', 59570), ('hilariously_ADV', 78145), ('common_ADJ', 348)]
[('промстройбанк_PROPN', 209428), ('гант_PROPN', 118875), ('osterreichische_PROPN', 90067), ('власьевский_ADJ', 119995), ('розвадовский_PROPN', 232237)]
(240000, 300)


In [9]:
print(type(tgt_embeddings))
print(tgt_embeddings[:5])
# print(tgt_embeddings[15115])

<class 'numpy.ndarray'>
[[ 0.02954  0.11362 -0.0327  ... -0.07555 -0.06493 -0.01583]
 [-0.02412  0.00086  0.04388 ...  0.01334  0.04463 -0.01691]
 [ 0.01785  0.0805  -0.01598 ... -0.03036 -0.03552 -0.01519]
 [ 0.10832  0.05242 -0.04355 ... -0.10485 -0.03382 -0.03843]
 [ 0.01436 -0.04104  0.09711 ... -0.03474  0.02216  0.10514]]


In [16]:
## transform the df to align each target with its source: doc, src, tgt, label
targets_df = df[~(df['group']=='source')]
print('Lose sources:\n',targets_df.shape)
sources_df = df[df['group']=='source']
sources_df = sources_df.sort_values('doc')
print('Separate df for sources:')
sources_df.head(3)

Lose sources:
 (542, 3)
Separate df for sources:


Unnamed: 0,doc,group,text
608,EN_1_101.conllu,source,pussy_PROPN riot_PROPN putin_ADJ v_NOUN punk_N...
580,EN_1_102.conllu,source,anti_PROPN drug_PROPN agency_PROPN seeks_PROPN...
576,EN_1_114.conllu,source,product_NOUN placement_NOUN on_ADV broadway_VE...


In [19]:
aligned = targets_df.copy()
aligned.insert(loc=2, column='source', value=None)
aligned.columns = ['doc', 'group', 'source', 'target']
print('Aligned text pairs:\n')
# print(aligned.head())
aligned['temp'] = aligned['doc'].replace(to_replace='RU',value=r'EN', regex=True)
aligned['temp'] = aligned['temp'].replace(to_replace='_\d+\.conllu',value=r'.conllu', regex=True)
aligned = aligned.sort_values('temp')
# print(aligned.head())
sfns = aligned['temp'].tolist()
for i in sfns:
    aligned.loc[aligned.temp == i, 'source'] = sources_df.loc[sources_df['doc'] == i, 'text'].item()
aligned.head()

Aligned text pairs:



Unnamed: 0,doc,group,source,target,temp
254,RU_1_101_9.conllu,good,pussy_PROPN riot_PROPN putin_ADJ v_NOUN punk_N...,pussy_PROPN riot_PROPN путин_PROPN против_ADP ...,EN_1_101.conllu
62,RU_1_101_3.conllu,bad,pussy_PROPN riot_PROPN putin_ADJ v_NOUN punk_N...,пусси::райот_PROPN путин_PROPN против_ADP панк...,EN_1_101.conllu
11,RU_1_101_8.conllu,bad,pussy_PROPN riot_PROPN putin_ADJ v_NOUN punk_N...,пусси::райот_PROPN судебный_ADJ разбирательств...,EN_1_101.conllu
471,RU_1_102_6.conllu,good,anti_PROPN drug_PROPN agency_PROPN seeks_PROPN...,служба_NOUN по_ADP борьба_NOUN с_ADP наркотик_...,EN_1_102.conllu
119,RU_1_102_15.conllu,bad,anti_PROPN drug_PROPN agency_PROPN seeks_PROPN...,управление_PROPN по_ADP борьба_NOUN с_ADP нарк...,EN_1_102.conllu


In [23]:
ru_xtrain0 = aligned['target']
en_xtrain0 = aligned['source']

y_train = aligned['group'].tolist()
classes = sorted(list(set(aligned['group'].tolist())))
print('===========================')
print('Distribution of classes in the dataset:')
print(aligned.groupby('group').count())
print('===========================')

Distribution of classes in the dataset:
       doc  source  target  temp
group                           
bad    213     213     213   213
good   329     329     329   329


In [24]:
## Have a look at a random document 
doc = random.choice(range(20))
# print(type(ru_xtrain0[doc]))
print('Random target doc:', ru_xtrain0[doc].split()[:20])
## or at the first text to avoid an extra variable
print('\nFirst target doc:', ru_xtrain0[0].split()[:20])

Random target doc: ['ученый_PROPN', 'против_ADP', 'нобелевский_ADJ', 'премия_NOUN', 'нобелевский_ADJ', 'премия_NOUN', 'по_ADP', 'психология_NOUN', 'медицина_NOUN', 'физик_NOUN', 'или_CCONJ', 'химия_NOUN', 'это_PRON', 'самый_ADJ', 'авторитетный_ADJ', 'и_CCONJ', 'почетный_ADJ', 'награда_NOUN', 'в_ADP', 'область_NOUN']

First target doc: ['весь_DET', 'правда_NOUN', 'о_ADP', 'гмый_PROPN', 'сторонник_NOUN', 'генномодифицировать_ADJ', 'зерно_NOUN', 'настаивать_VERB', 'что_SCONJ', 'это_PRON', 'единственный_ADJ', 'способ_NOUN', 'справляться_VERB', 'с_ADP', 'то_PRON', 'что_SCONJ', 'прокармливать_VERB', 'густонаселенный_ADJ', 'мир_NOUN', 'критика_NOUN']


In [25]:
### manual inspection
ru_indexed = [[get_dic_value(w,tgt_word2id) for w in text.split()] for text in ru_xtrain0]
en_indexed = [[get_dic_value(w,src_word2id) for w in text.split()] for text in en_xtrain0]
print('Source text represented with word vectors indices:', en_indexed[0][:7])
print('Target text represented with word vectors indices:', ru_indexed[0][:7])
## find summed text vectors: for that stack a list of vectors of size = text length
ru_embedded = [[tgt_embeddings[idx] for idx in itm] for itm in ru_indexed]
en_embedded = [[src_embeddings[idx] for idx in itm] for itm in en_indexed]
print(type(ru_embedded[0]))
print('number of words=itm:',len(en_embedded[0]),len(ru_embedded[0]))
print(type(en_embedded[0][0][:5]))
print(type(ru_embedded[0][0][:5]))
print('dims of each word embedding', ru_embedded[0][0].shape)
print(len(ru_embedded[0][0]))
print('Same text as a list of embeddings (stored as arrays):', type(ru_embedded[doc][:20][0]))

## 542 itms, each is a list of arrays to be summed up
en_av_texts_emb0 = np.sum(en_embedded[0], axis=0)
ru_av_texts_emb0 = np.sum(ru_embedded[0], axis=0)

print('I expect 300 here!',len(en_av_texts_emb0),len(ru_av_texts_emb0))
print('========')
print(en_av_texts_emb0[:5],ru_av_texts_emb0[:5])

## and concatenate to be added to the df
# paired = list(zip(list(en_av_texts_emb0), list(ru_av_texts_emb0)))
# print(paired[:3])

X = np.array(np.concatenate((en_av_texts_emb0, ru_av_texts_emb0), axis=None))
print('This is the list of summed and concatenated embeddings for each text pair', file=sys.stderr)
print(len(X))
print(X[:5], X[300:305])

# print('Same text as a summed embedding of size 300:', ru_av_texts_emb0[doc])

Source text represented with word vectors indices: [0, 0, 0, 0, 3814, 0, 87]
Target text represented with word vectors indices: [38362, 26358, 5961, 0, 18779, 1532, 364]
<class 'list'>
number of words=itm: 706 570
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
dims of each word embedding (300,)
300
Same text as a list of embeddings (stored as arrays): <class 'numpy.ndarray'>
I expect 300 here! 300 300
[-14.47449  -2.46295  22.62501  -7.47731  27.04624] [  4.3076   21.94698   1.03974 -11.85549   7.48334]
600
[-14.47449  -2.46295  22.62501  -7.47731  27.04624] [  4.3076   21.94698   1.03974 -11.85549   7.48334]


This is the list of summed and concatenated embeddings for each text pair


from here on I am constructing a discriminative classifier that is going to use (pca-transformed to size 100)  summed and joint 200 embedding components as features for 542 texts

This does not make theoretical sense, because there are more features than observations (therefore pca-transform and gridsearch are added)

To address the problem of our model/task being underdetermined (the number of features is greater than the number of data points) we use regularization: hyperparameter tuning via grid search (see https://www.oreilly.com/library/view/feature-engineering-for/9781491953235/ch04.html)

====Future=====

What I need (as a real model that would heed word order and syntactic properties and not only BOW semantics as in the above scenario) is a neural model with an LSTM layer that is going to look at every word in ST+TT represented by corresponding bilingual embeddings of size 300

Number of features is for the LSTM input is 300 as it looks at texts as a sequence of embeddings, processig one at a time

In [27]:
## represent texts with embeddings, get summed up vectors, maybe reduce their dimensionality 
def get_summed_emb(embs, w2idx, texts, reduce=0):
    indexed0 = [[get_dic_value(w,w2idx) for w in text.split()] for text in texts]
    ## ignore OOV words by deleting id=0
    indexed = [[idx for idx in text if idx != 0] for text in indexed0]
    print('OOV lost:', indexed[0][:7])
    embedded = [[embs[idx] for idx in itm] for itm in indexed]
    out = [np.sum(itm, 0) for itm in embedded]
    if reduce:
        pca = PCA(n_components=100)
        out = pca.fit_transform(out)
    else:
        out = out
        
    return out

In [28]:
en_av_texts_emb = get_summed_emb(src_embeddings, src_word2id, en_xtrain0)
print(type(en_av_texts_emb))
print(len(en_av_texts_emb))
print(len(en_av_texts_emb[0]))
ru_av_texts_emb = get_summed_emb(tgt_embeddings, tgt_word2id, ru_xtrain0)
print(len(ru_av_texts_emb))

OOV lost: [3814, 87, 948, 163, 5402, 9, 807]
<class 'list'>
542
300
OOV lost: [38362, 26358, 5961, 18779, 1532, 364, 42515]
542


In [29]:
## and concatenate to be added to the df
paired = list(zip(list(en_av_texts_emb), list(ru_av_texts_emb)))
print(len(paired))

XX = np.array([np.concatenate((p[0],p[1]), axis=None) for p in paired])
print('This is the list of summed and concatenated embeddings for each text pair', file=sys.stderr)
print(len(XX))
print(XX[0][:5], XX[0][300:305])

542
542
[ -2.26905  -0.66583   6.94597 -10.57235   3.26352] [-2.10258 -2.70856  8.13564 -9.80701 -0.97098]


This is the list of summed and concatenated embeddings for each text pair


In [30]:
print(len(y_train))
print(set(y_train))
print(XX.shape)

542
{'good', 'bad'}
(542, 600)


In [89]:
## lets classify!
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def my_SVM(x,y, algo='SVM', grid=False):
    ## StandardScaler scales each column (feature) to have 0 mean and unit variance.
    sc = StandardScaler() ## same as z-transformation; is your data normally distributed to use this?
    x = sc.fit_transform(x)
    
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    if algo == 'SVM':
        clf = SVC(decision_function_shape='ovo', kernel='rbf', gamma='auto',
                  random_state=100, verbose=False, probability=True, class_weight="balanced")
        if grid:
            refit_score='f1_score'
            scoring='f1_macro'
            tuned_parameters = {'kernel': ['rbf','linear'], 'gamma': [0.1, 1, 10, 100],'C': [1, 10, 100, 1000]}
            grid_search = GridSearchCV(clf, tuned_parameters, refit=refit_score, cv=skf, scoring=scoring)
            grid_search.fit(x, y)

            clf.set_params(**grid_search.best_params_)
            best_dict = grid_search.best_params_
            print('Best param:\n', best_dict)
        print('we are classifying with:\n', clf)
        
    elif algo == 'dummy':
        strategy='stratified'
        print('\n====DummyBaseline (%s)====' % strategy)
        clf = DummyClassifier(strategy=strategy,random_state=42) # 'stratified','uniform', 'most_frequent'

    ## this function is simpler than cross_validate, it returns a list of scores on just one specified metric, rather than a dict and tons of other info in that dict
    scores_f = cross_val_score(clf, x, y, cv=skf, scoring='f1_macro')  # cv = loo f1_macro
    scores_acc = cross_val_score(clf, x, y, cv=skf, scoring='accuracy')

    print("F1 over 10folds: ", scores_f.mean())
    print("Accuracy over 10folds: ", scores_acc.mean())
    
    preds = cross_val_predict(clf, x, y, cv=skf)
    print('Cross-validated estimates for data points')
    print(classification_report(y, preds))
    
    print('===ConfusionMatrix===')
    cnf_matrix = confusion_matrix(y, preds)
    print(cnf_matrix)
    
    
    # get measures on the minority class
    print('\n====MinorityClass====')
    my_dict = classification_report(y, preds, output_dict=True)
    minorf1 = my_dict['bad']['f1-score']
    print('F1 on the minority class:', np.average(minorf1).round(3))

In [90]:
print(XX.shape)
my_SVM(XX, y_train, algo='SVM', grid=False)
my_SVM(XX, y_train, algo='dummy', grid=False)

(542, 600)
we are classifying with:
 SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=100, shrinking=True,
  tol=0.001, verbose=False)
F1 over 10folds:  0.6068528732661612
Accuracy over 10folds:  0.6088609364081062
Cross-validated estimates for data points
              precision    recall  f1-score   support

         bad       0.50      0.77      0.61       213
        good       0.77      0.51      0.61       329

   micro avg       0.61      0.61      0.61       542
   macro avg       0.64      0.64      0.61       542
weighted avg       0.66      0.61      0.61       542

===ConfusionMatrix===
[[163  50]
 [162 167]]

====MinorityClass====
F1 on the minority class: 0.606

====DummyBaseline (stratified)====
F1 over 10folds:  0.4782165369017871
Accuracy over 10folds:  0.5145086080935137
Cross-validated estimates for data points
              precision    

lets try cosine similarity between ST and TT as a single feature

In [78]:
## smaller cosine = bigger angle/distance
def cosine_similarity(x, y):
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

In [79]:
sims = [cosine_similarity(p[0],p[1]) for p in paired]
print(len(sims))
print(sims[:2])
Xsims = np.array([sims]).transpose()
print(Xsims.shape)

542
[0.9579350638592249, 0.9524320243437777]
(542, 1)


In [80]:
my_SVM(Xsims, y_train, grid=True)
my_SVM(Xsims, y_train, algo='dummy', grid=False)



Best param:
 {'C': 1, 'gamma': 0.1, 'kernel': 'linear'}
we are classifying with:
 SVC(C=1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma=0.1, kernel='linear',
  max_iter=-1, probability=True, random_state=100, shrinking=True,
  tol=0.001, verbose=False)
F1 over 10folds:  0.5106311003943864
Accuracy over 10folds:  0.5128962581792769
Cross-validated estimates for data points
              precision    recall  f1-score   support

         bad       0.43      0.72      0.54       213
        good       0.68      0.38      0.48       329

   micro avg       0.51      0.51      0.51       542
   macro avg       0.55      0.55      0.51       542
weighted avg       0.58      0.51      0.51       542

===ConfusionMatrix===
[[154  59]
 [205 124]]

====MinorityClass====
F1 on the minority class: 0.538

====DummyBaseline (most_frequent)====
F1 over 10folds:  0.37773326572008115
Accuracy over 10folds:  0.6070440251572327
Cross-validated estima

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


another baseline is QuEst features from https://www.quest.dcs.shef.ac.uk/

I need to update RusLTC to have all quality-labeled data in the bitext format

see a separate script for:

a neural model with an LSTM layer that is going to look at every word in ST+TT represented by corresponding bilingual embeddings of size 300

Number of features is for the LSTM input is 300 as it looks at texts as a sequence of embeddings, processig one at a time