In [1]:
#libraries 
import numpy as np
import pandas as pd
import re

#preprocessing
import string

#translation 
from nltk.translate.bleu_score import sentence_bleu
from gensim import models

#ignore warnings
import warnings
warnings.filterwarnings("ignore")

#visualization 
import seaborn as sns



Base: https://medium.com/@adriensieg/text-similarities-da019229c894

REF: 
    
https://arxiv.org/pdf/1301.3781.pdf
    
https://www.aclweb.org/anthology/N19-1181.pdf
    
http://proceedings.mlr.press/v37/kusnerb15.pdf

https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf

https://arxiv.org/pdf/1609.08144.pdf

In [2]:
df = pd.read_csv(r"C:\Users\hirom\OneDrive - NOVAIMS\NOVA IMS\OneDrive_1_25-02-2021\Text Mining\Project\corpus\de-en\scores.csv")
df.head(4)

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2


In [3]:
df.shape

(21704, 6)

In [4]:
df.describe()

Unnamed: 0,z-score,avg-score,annotators
count,21704.0,21704.0,21704.0
mean,0.000898,71.85289,1.502995
std,0.85514,26.348469,0.810923
min,-5.806322,0.0,1.0
25%,-0.499574,56.0,1.0
50%,0.216756,79.0,1.0
75%,0.640273,94.0,2.0
max,2.535434,100.0,9.0


In [5]:
pd.pivot_table(df, values = 'annotators', index =['translation'], aggfunc = 'count')

Unnamed: 0_level_0,annotators
translation,Unnamed: 1_level_1
"""A Generation Is Protesting"" in Ethiopia, Long a U.S. Ally",7
"""A Square Meal"" is a feast of historical tidbits.",4
"""A cultural change, a mental change, a physical change,"" Bam said.",4
"""A lot of former customers got priced out of Williamsburg too,"" he said.",5
"""A lot of the stuff I get is late 60s, early 70s, things that came out when I was 11 or 12, things I was probably a bit too young to get,"" he says.",10
...,...
"You're not singing by yourself - and in a group of 50 people singing, who'll hear if a note's out here or there?",9
YouTube Gaming is owned by Google's parent company Alphabet.,9
Young people are particularly vulnerable to this.,5
Your doctor will examine you to see if the problems have a physical cause.,6


In [6]:
reference =df['reference']
translation = df['translation']

## Preprocessing 

In [7]:
df.isnull().sum()

source         0
reference      0
translation    0
z-score        0
avg-score      0
annotators     0
dtype: int64

In [8]:
def preprocessing(column):
    pp_column = []
    for sentence in column:
        #exclude punctuation 
        sentence = re.sub(r'[^A-Za-z0-9]',' ',sentence)
        #transform everything in lowercase
        sentence = sentence.lower()
        pp_column.append(sentence)
        
    processed = pd.Series(pp_column)
    return processed

In [9]:
p_reference = preprocessing(reference)
p_translation = preprocessing(translation)

In [10]:
df_cp = df.copy()

In [11]:
df_cp = pd.concat([df_cp,p_reference.rename('p_reference')], axis=1)

In [12]:
df_cp = pd.concat([df_cp,p_translation.rename('p_translation')], axis=1)

In [13]:
df_cp.head(4)

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,p_reference,p_translation
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1,her timeless pace measures them when they equi...,their slow speed was measured by researchers o...
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.9038,97.5,2,he said the areas offer quiet meeting points b...,he said the spaces provided calm meeting point...
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1,for businessmen at the b 27 it s only a small...,this is only a small consolation for businesse...
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2,this ability may be born or developed with gen...,this ability may be innate or may develop as ...


In [14]:
df_cp.iloc[21,:]

source            Olympia: Dreister Betrug bei Doping-Test
reference           Olympia: threesty fraud in doping test
translation      Olympics: Brazen cheating in doping tests
z-score                                          -0.980437
avg-score                                               39
annotators                                               1
p_reference         olympia  threesty fraud in doping test
p_translation    olympics  brazen cheating in doping tests
Name: 21, dtype: object

In [15]:
teste = df_cp[df_cp['p_translation'] == 'olympics  brazen cheating in doping tests']
teste

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,p_reference,p_translation
21,Olympia: Dreister Betrug bei Doping-Test,Olympia: threesty fraud in doping test,Olympics: Brazen cheating in doping tests,-0.980437,39.0,1,olympia threesty fraud in doping test,olympics brazen cheating in doping tests
2240,Olympia: Dreister Betrug bei Doping-Test,Olympia: Brazen fraud with doping test,Olympics: Brazen cheating in doping tests,0.582416,95.25,4,olympia brazen fraud with doping test,olympics brazen cheating in doping tests
4775,Olympia: Dreister Betrug bei Doping-Test,Olympia: Three scams on doping test,Olympics: Brazen cheating in doping tests,-0.085961,73.0,1,olympia three scams on doping test,olympics brazen cheating in doping tests
6867,Olympia: Dreister Betrug bei Doping-Test,Olympia: Trieste cheating on doping test,Olympics: Brazen cheating in doping tests,-0.594918,43.0,1,olympia trieste cheating on doping test,olympics brazen cheating in doping tests
8509,Olympia: Dreister Betrug bei Doping-Test,Olympia: Brazen fraud in doping test,Olympics: Brazen cheating in doping tests,0.758588,95.0,1,olympia brazen fraud in doping test,olympics brazen cheating in doping tests
13797,Olympia: Dreister Betrug bei Doping-Test,Olympia: Dreist fraud in doping test,Olympics: Brazen cheating in doping tests,0.317103,76.0,1,olympia dreist fraud in doping test,olympics brazen cheating in doping tests
15245,Olympia: Dreister Betrug bei Doping-Test,Olympics: triple fraud in doping test,Olympics: Brazen cheating in doping tests,0.099333,77.0,1,olympics triple fraud in doping test,olympics brazen cheating in doping tests
21368,Olympia: Dreister Betrug bei Doping-Test,Olympics: Triple fraud in doping test,Olympics: Brazen cheating in doping tests,-1.098754,25.0,1,olympics triple fraud in doping test,olympics brazen cheating in doping tests


In [23]:
ref = teste['p_reference'].to_list()

['olympia  threesty fraud in doping test',
 'olympia  brazen fraud with doping test',
 'olympia  three scams on doping test',
 'olympia  trieste cheating on doping test',
 'olympia  brazen fraud in doping test',
 'olympia  dreist fraud in doping test',
 'olympics  triple fraud in doping test',
 'olympics  triple fraud in doping test']

In [26]:
cand = 'olympics  brazen cheating in doping tests'

## Cosine Similarity 

REF: https://www.machinelearningplus.com/nlp/cosine-similarity/

The cosine similarity is advantageous because even if the two similar documents are far apart by the Euclidean distance (due to the size of the document), chances are they may still be oriented closer together. The smaller the angle, higher the cosine similarity.

In [121]:
#Libraries 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [17]:
count_vectorizer = CountVectorizer()

In [24]:
ref[0]

'olympia  threesty fraud in doping test'

In [27]:
cand

'olympics  brazen cheating in doping tests'

In [28]:
sparse_matrix = count_vectorizer.fit_transform([ref[0],cand])
sparse_matrix

<2x10 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [120]:
cosine_similarity(sparse_matrix)

array([[1.        , 0.33333333],
       [0.33333333, 1.        ]])

In [30]:
sparse_matrix6 = count_vectorizer.fit_transform([ref[6],cand])
sparse_matrix6

<2x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [31]:
pd.DataFrame(cosine_similarity(sparse_matrix6))

Unnamed: 0,0,1
0,1.0,0.5
1,0.5,1.0


In [126]:
#Libraries
import re
import math
from collections import Counter

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    word = re.compile(r'\w+')
    words = word.findall(text)
    return Counter(words)

def get_result(content_a, content_b):
    text1 = content_a
    text2 = content_b

    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)

    cosine_result = get_cosine(vector1, vector2)
    return cosine_result

In [130]:
get_result(ref[0],cand)

0.33333333333333337

In [131]:
get_result(ref[6],cand)

0.5000000000000001

**Interpretation** : A cosine value of 0 means that the two vectors are at 90 degrees to each other (orthogonal) and have no match. The closer the cosine value to 1, the smaller the angle and the greater the match between vectors. 

## Word Mover Distance

REF: https://towardsdatascience.com/word-movers-distance-for-text-similarity-7492aeca71b0


In [32]:
cand_f = 'olympics  brazen cheating in doping tests'.split()

In [33]:
#libraries 
from time import time 
#remove stop words 
import os
from gensim import models
import gensim.downloader as api
from pyemd import emd

In [50]:
#tokenized data 
def delete_sw(lista):
    stop_words = stopwords.words('english')
    finalresult = []
    for sentence in lista:
        processed = sentence.split()
        clean = []
        for word in processed:
            if word not in stop_words:
                clean.append(word)
        finalresult.append(clean)
    return finalresult 

In [52]:
stop_words = stopwords.words('english')

In [53]:
clean_ref = delete_sw(ref)

In [55]:
clean_cand = [w for w in cand_f if w not in stop_words]

In [36]:
clean_ref = delete_sw(ref)

In [49]:
clean_ref

[['olympia', 'threesty', 'fraud', 'doping', 'test'],
 ['olympia', 'brazen', 'fraud', 'doping', 'test'],
 ['olympia', 'three', 'scams', 'doping', 'test'],
 ['olympia', 'trieste', 'cheating', 'doping', 'test'],
 ['olympia', 'brazen', 'fraud', 'doping', 'test'],
 ['olympia', 'dreist', 'fraud', 'doping', 'test'],
 ['olympics', 'triple', 'fraud', 'doping', 'test'],
 ['olympics', 'triple', 'fraud', 'doping', 'test']]

In [148]:
model = api.load('word2vec-google-news-300')


In [41]:
distance = model.wmdistance(clean_ref[0],clean_cand)

https://towardsdatascience.com/word-distance-between-word-embeddings-cc3e9cf1d632
https://radimrehurek.com/gensim/auto_examples/tutorials/run_wmd.html
    
The sentence can have no similar words, but by the relevand words in each sentence, it's possible to identify the "semantical distance" between them. 
With this method we can evaluate how the reference of translation is close to the real translation.

In [42]:
clean_ref

[['olympia', 'threesty', 'fraud', 'doping', 'test'],
 ['olympia', 'brazen', 'fraud', 'doping', 'test'],
 ['olympia', 'three', 'scams', 'doping', 'test'],
 ['olympia', 'trieste', 'cheating', 'doping', 'test'],
 ['olympia', 'brazen', 'fraud', 'doping', 'test'],
 ['olympia', 'dreist', 'fraud', 'doping', 'test'],
 ['olympics', 'triple', 'fraud', 'doping', 'test'],
 ['olympics', 'triple', 'fraud', 'doping', 'test']]

In [54]:
clean_cand

['olympics', 'brazen', 'cheating', 'doping', 'tests']

In [44]:
distance

0.8289396995482206

In [45]:
distance6 = model.wmdistance(clean_ref[6],clean_cand)
distance6

0.5927309122713804

**Interpretation**: more similar words between sentences, less distant they are. 

## ADD the embedding comparation in the table 

In [146]:
df_cp['cos_similarity'] = df_cp.apply(lambda x:get_result(x['p_reference'],x['p_translation']), axis=1)

In [149]:
df_cp['WMD'] = df_cp.apply(lambda x:model.wmdistance(x['p_reference'],x['p_translation']), axis=1)

In [150]:
df_cp

Unnamed: 0,source,reference,translation,z-score,avg-score,annotators,p_reference,p_translation,cos_similarity,WMD
0,"Ihr Zeitlupentempo maßen sie, als sie vor Spit...",Her timeless pace measures them when they equi...,Their slow speed was measured by researchers o...,-0.345024,76.0,1,her timeless pace measures them when they equi...,their slow speed was measured by researchers o...,0.258199,0.216127
1,"Er sagte, dass die Bereiche ruhige Treffpunkte...",He said the areas offer quiet meeting points b...,He said the spaces provided calm meeting point...,0.903800,97.5,2,he said the areas offer quiet meeting points b...,he said the spaces provided calm meeting point...,0.750000,0.153579
2,Für die Geschäftsleute an der B 27 ist es nur ...,"For businessmen at the B 27, it's only a small...",This is only a small consolation for businesse...,0.700503,94.0,1,for businessmen at the b 27 it s only a small...,this is only a small consolation for businesse...,0.679366,0.110323
3,Diese Fähigkeit sei möglicherweise angeboren o...,This ability may be born or developed with gen...,"This ability may be innate, or may develop as ...",-1.256572,51.5,2,this ability may be born or developed with gen...,this ability may be innate or may develop as ...,0.553399,0.197048
4,Weil sie Wassertemperaturen um die sechs Grad ...,Because they prefer water temperatures around ...,They generally only come to the surface in win...,0.293909,87.0,2,because they prefer water temperatures around ...,they generally only come to the surface in win...,0.783547,0.077533
...,...,...,...,...,...,...,...,...,...,...
21699,"Lt. Cmdr. Patrick Evans, ein Pressesprecher de...","Lt. Cmdr. Patrick Evans, a press officer at th...","Lt. Cmdr. Patrick Evans, a Pentagon spokesman,...",1.246459,100.0,1,lt cmdr patrick evans a press officer at th...,lt cmdr patrick evans a pentagon spokesman ...,0.802260,0.066873
21700,"""Um ein Beispiel zu geben: Wenn ich ihn etwas ...","""To give an example: If I ask him something th...","""To give an example: If I ask him what happene...",0.792878,98.0,1,to give an example if i ask him something th...,to give an example if i ask him what happene...,0.724569,0.104635
21701,"Ein Grund dafür, dass nicht alle Nachbarn das ...",One reason that not all neighbours view this a...,One reason for not all neighbours seeing this ...,0.597068,76.0,1,one reason that not all neighbours view this a...,one reason for not all neighbours seeing this ...,0.814174,0.088088
21702,Der Gewinn vor Zinsen und Steuern erhöhte sich...,Profit before interest and tax increased from ...,Profits before interest and taxes increased fr...,-0.305719,61.0,1,profit before interest and tax increased from ...,profits before interest and taxes increased fr...,0.729397,0.100805


## Predictive Model 

In [156]:
X = df_cp[['cos_similarity','WMD']]
y = df_cp['z-score']

In [187]:
#Libraries
    
#https://medium.com/codex/step-by-step-guide-to-simple-and-multiple-linear-regression-in-python-867ac9a30298    
    
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, confusion_matrix

In [157]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [179]:
# Note the difference in argument order
model = LinearRegression().fit(X_train,y_train)
predictions = model.predict(X_test)

In [180]:
slr_slope = model.coef_
slr_intercept = model.intercept_

In [181]:
model.score(X_train, y_train)

0.08907757252656401

In [182]:
model.score(X_test, y_test)

0.08773109836871795

In [None]:
## BERT 

REF:https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#what-is-bert

https://medium.com/analytics-vidhya/bert-word-embeddings-deep-dive-32f6214f02bf

#libraries 
import torch
from pytorch_transformers import BertTokenizer
from pytorch_transformers import BertModel

from nltk.corpus import stopwords

import matplotlib.pyplot as plt
%matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_bert(text):
    lista = []
    for sentence in text: 
        processed = tokenizer.tokenize("[CLS] " + sentence + " [SEP]")
        lista.append(processed)
    return lista

def convert(lst):
      
    return '/'.join(lst)

# Split the sentence into tokens.
tokenized_ref = tokenize_bert(ref)

# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_ref[0])
indexed_tokens

for tup in zip(tokenized_ref[0], indexed_tokens):
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))

# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_ref[0])
segments_ids

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

tokens_tensor

segments_tensors

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()

