In [None]:
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


# Experiment dimensions referenced by the analysis cells below.
metrics = ['f1_cd','f1_ln']
mappings = ['incremental','procrustes','twec']
w2vec_algorithms = ['word2vec','lda2vec']
embeddings = ['pretrained','None']
# NOTE(review): results_path is not used in the visible notebook and points at './output',
# while the loader below reads from '../output' -- confirm which location is intended.
results_path = './output/**/**/results'


path = '../output/**/**/results'
# Sort the matches: iterating a set of strings gives a different order on every
# interpreter run, and the paired tests further down compare rows positionally.
text_files = sorted(set(glob.glob(path + "/**/*.pkl", recursive=True)))
if not text_files:
    raise FileNotFoundError("No result pickles found under " + path)

# Read every pickle once and concatenate in a single call instead of growing the
# frame inside the loop (which re-copies all previous rows on each iteration).
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
results_df = pd.concat([pd.read_pickle(file) for file in text_files], ignore_index=True, axis=0)

results_df.to_csv('../language_drift_results', index=False)

# Only the empty string is treated as missing on re-read. With the default NA list,
# recent pandas versions also parse the literal text 'None' as NaN, which would break
# the `== 'None'` / `!= 'None'` filters on the 'pretrained' column used below
# (presumably that column stores the string 'None' for non-pretrained runs -- verify).
results_df = pd.read_csv('../language_drift_results', keep_default_na=False, na_values=[''])
print(results_df.head())

## Research Questions
### 1. Which vectors' alignment method performs better?

### Null Hypothesis: 
All mapping methods we investigate (‘procrustes’, ‘incremental’, ‘twec’) perform equally across different executions on the same datasets and parameters.


In [None]:
# Research question 1 input: f1 scores per alignment method.
# Keep only runs without pretrained embeddings and without lda2vec
# (boolean indexing returns a new frame, so results_df is left untouched).
baseline_runs = results_df[(results_df['pretrained'] == 'None')
                           & (results_df['w2vec_algorithm'] != 'lda2vec')]

# Long format: one row per (language, algorithm, mapping, metric) with its f1 score.
# Only the f1 columns listed in `metrics` are melted; every other column is ignored.
mapping_df = baseline_runs.melt(id_vars=["language", "w2vec_algorithm", "mapping"],
                                value_vars=metrics,
                                var_name="metric",
                                value_name="f1_score")

# remove rows with NaN values at f1_score
mapping_df = mapping_df[mapping_df['f1_score'].notna()]

# The Friedman and Wilcoxon cells compare the per-mapping score series position by
# position, so order the rows by experiment key: the i-th score of every mapping then
# belongs to the same (language, algorithm, metric) combination.
mapping_df = (mapping_df
              .sort_values(["language", "w2vec_algorithm", "metric", "mapping"], kind="stable")
              .drop('metric', axis=1)
              .reset_index(drop=True))

# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
mapping_df.info()

### 1.1 Shapiro-Wilk Test (checks normality of distribution)

In [None]:
# Shapiro-Wilk normality check, run once per alignment method.
from collections import defaultdict
from scipy import stats

alpha =0.05

print("Shapiro-Wilk test for normal distribution: \n")
# Maps each alignment method to True when normality was not rejected at `alpha`.
mapping_normality_dict = defaultdict()
for mapping in mappings:
    method_scores = mapping_df.loc[mapping_df['mapping'] == str(mapping),'f1_score']
    stat, p = stats.shapiro(method_scores)
    print(mapping)
    looks_normal = bool(p >= alpha)
    if looks_normal:
        print("\t has a normal distribution with pvalue = "+ str(p) + ", stat=",str(stat))
    else:
        print("\t has NOT a normal distribution with pvalue = ", p, "stat=",stat)
    mapping_normality_dict[mapping] = looks_normal

### Normality Results
The **incremental** and **twec** methods can be described by the normal distribution.

However, since the **procrustes** method does not meet the normality criterion, <u>we have to proceed with non-parametric tests.</u>

### 1.2 Friedman test (non parametric - normality is not a prerequisite)
* Prerequisites (non normal distributions, paired samples, more than two groups)
* Samples are paired since all variables except the under investigation variable are shared among the different populations
* H0: Populations have same distributions


In [None]:
# Friedman test (non parametric - normality is not a prerequisite)
# Compares the f1 scores of all alignment methods (incremental, procrustes, twec) at once.

alpha = 0.05

# One sample of f1 scores per alignment method, in the order listed in `mappings`.
# The test pairs the samples position by position, so all of them must have equal length.
samples = [mapping_df.loc[mapping_df['mapping'] == name, 'f1_score'] for name in mappings]

# The header names every compared group (the old label listed only two of the three).
print("Friedman H-test: \n\n " + "-".join(mappings))
stat, p = stats.friedmanchisquare(*samples)

if p >= alpha:
    print("    Same distributions (fail to reject H0) with pvalue = ",p, "stat=",stat)
else:
    print("    Different distributions (reject H0) with pvalue = ", p, "stat=",stat)


In [None]:
### 1.3 Wilcoxon Signed-Rank Test (non parametric - normality is not a prerequisite)
* Prerequisites (non normal distributions, paired samples, two populations)
* Samples are paired since all variables except the under investigation variable are shared among the different populations
* H0: Populations have same distributions


In [None]:
# Wilcoxon Signed-Rank Test (non parametric - normality is not a prerequisite)
# Pairwise comparison of the alignment methods: incremental, procrustes and twec.

alpha =0.05

# (header printed before the test, first method, second method)
comparisons = (
    ("Wilcoxon Signed-Rank H-test: \n\n incremental-procrustes", 'incremental', 'procrustes'),
    ("\n incremental-twec", 'incremental', 'twec'),
    ("\n procrustes-twec", 'procrustes', 'twec'),
)

for header, first_method, second_method in comparisons:
    print(header)
    stat, p = stats.wilcoxon(
        mapping_df.loc[mapping_df['mapping'] == first_method,'f1_score'],
        mapping_df.loc[mapping_df['mapping'] == second_method,'f1_score'])

    # Same wording as before: reject H0 only when p falls below alpha.
    verdict = ("    Null Hypothesis REJECTED with pvalue = " if p < alpha
               else "    Null hypothesis was ACCEPTED with pvalue = ")
    print(verdict, p, "stat=",stat)

### 1.4 Results
After the execution on the following combinations:
- **cbow-sgns** (algorithm)
- **incremental-procrustes-twec** (alignment)
- **en-de-swe-lat** (languages)
- **cd-ln** (metrics of cosine distance and local_neighborhood measure)

On the following **Word2Vec** parameter setting:
- **embeddings_dimension = 100**
- **window_size = 10**
- **min_count = 3** (number of occurrences)
- **s = 0.001** (threshold for configuring which higher-frequency words are randomly downsampled)
- **k = 5** number of negative samples parameter 
- **epochs = 5**

We investigated the f1_scores of **48** executions (algorithms * alignments * languages * metrics).

The result was that **there are no significant differences between the embeddings' alignment methods we used**.
Note that the above analysis did not include any executions with pretrained embeddings.

In [None]:
# Box plot of the f1 scores, one box per alignment method.
sns.set()
fig, ax = plt.subplots(figsize=(12,8))
sns.boxplot(x=mapping_df['mapping'], y=mapping_df['f1_score'],
            whis=[5, 95], saturation=1, palette='BuGn', ax=ax)
# Axis labels are deliberately left blank; the title carries the description.
ax.set_xlabel('', size=14, family='monospace')
ax.set_ylabel('', size=14, family='monospace')
ax.set_title('F1 Scores per Alignment Method')
plt.show()

## Research Questions
### 2. Do pretrained embeddings improve performance for the procrustes and incremental alignment methods?

### Null Hypothesis: 
Executions with pretrained embeddings perform equally to those whose weights were not initialized beforehand.



In [None]:
# Research question 2 input: f1 scores with vs. without pretrained embeddings.
# Keep every run except the "twec" and "lda2vec" ones
# (boolean indexing returns a new frame, so results_df is left untouched).
comparable_runs = results_df[(results_df['mapping'] != 'twec')
                             & (results_df['w2vec_algorithm'] != 'lda2vec')]

# Long format: one row per (language, algorithm, mapping, pretrained, metric).
# 'mapping' is carried along only so the rows can be ordered by the full experiment key.
pretrained_df = comparable_runs.melt(
    id_vars=["language", "w2vec_algorithm", "mapping", "pretrained"],
    value_vars=metrics,
    var_name="metric",
    value_name="f1_score")

# remove rows with NaN values at f1_score
pretrained_df = pretrained_df[pretrained_df['f1_score'].notna()]

# The paired T-test compares the pretrained and non-pretrained score series position by
# position, so order the rows by experiment key: the i-th score of each group then
# belongs to the same (language, algorithm, mapping, metric) combination.
pretrained_df = (pretrained_df
                 .sort_values(["language", "w2vec_algorithm", "mapping", "metric"], kind="stable")
                 .drop(['mapping', 'metric'], axis=1)
                 .reset_index(drop=True))

# Show a preview and the size rather than dumping the whole frame.
print(pretrained_df.head())
print(pretrained_df.shape)

### 2.1 Shapiro-Wilk Test (checks normality of distribution)

In [None]:
# Shapiro-Wilk normality check for the pretrained and the non-pretrained runs.
from collections import defaultdict
from scipy import stats

alpha =0.05

print("Shapiro-Wilk test for normal distribution: \n")
# Maps each embedding type to True when normality was not rejected at `alpha`.
embedding_normality_dict = defaultdict()
for embedding in embeddings:
    # 'None' selects the runs without pretrained embeddings; any other label selects the rest.
    if embedding == 'None':
        group_rows = pretrained_df['pretrained'] == 'None'
    else:
        group_rows = pretrained_df['pretrained'] != 'None'
    stat, p = stats.shapiro(pretrained_df.loc[group_rows,'f1_score'])
    print(embedding)
    looks_normal = bool(p >= alpha)
    if looks_normal:
        print("\t has a normal distribution with pvalue = "+ str(p) + ", stat=",str(stat))
    else:
        print("\t has NOT a normal distribution with pvalue = ", p, "stat=",stat)
    embedding_normality_dict[embedding] = looks_normal
        

### Normality Results
Executions using **Pretrained** embeddings and **Non Pretrained** embeddings can both be described by the normal distribution.

The next step is to conduct a **paired T-test**.

### 2.2 Paired T-Test (parametric - normality is  a prerequisite)
- Prerequisites:

    - normal distribution of dependent variable
    - continuous dependent variable
    - independent observations 
    - same subject for each group 
    - dependent variable does not contain outliers. 
  
- H0: the mean of the paired differences is zero, i.e. $\mu_1 - \mu_2 = 0$
* H1: the mean of the paired differences is not zero, i.e. $\mu_1 - \mu_2 \neq 0$

**Samples are paired** since all variables except the under investigation variable are shared among the different populations



In [None]:
# Ensure we don't have any outliers (Tukey fences: 1.5 * IQR beyond the quartiles).
# The fences are computed separately for each group entering the paired T-test:
# quartiles taken from the pretrained runs alone say nothing about the spread of the
# non-pretrained runs.
is_pretrained = pretrained_df['pretrained'] != 'None'

# Boolean masks over the rows of pretrained_df; True marks an outlier.
upper = pd.Series(False, index=pretrained_df.index)
lower = pd.Series(False, index=pretrained_df.index)

for group_name, in_group in (('pretrained', is_pretrained), ('None', ~is_pretrained)):
    group_scores = pretrained_df.loc[in_group, 'f1_score']
    # Series.quantile keeps the 'midpoint' rule without relying on np.percentile's
    # deprecated `interpolation` keyword.
    Q1 = group_scores.quantile(0.25, interpolation='midpoint')
    Q3 = group_scores.quantile(0.75, interpolation='midpoint')
    IQR = Q3 - Q1
    print(group_name, "-> Q1 =", Q1, "Q3 =", Q3, "IQR =", IQR)
    # Strict comparisons: with IQR == 0 an inclusive test would flag every value.
    upper |= in_group & (pretrained_df['f1_score'] > Q3 + 1.5 * IQR)
    lower |= in_group & (pretrained_df['f1_score'] < Q1 - 1.5 * IQR)

# Report the offending rows (if any) instead of printing the full boolean masks.
print("Above upper bound:", int(upper.sum()))
print(pretrained_df[upper])

print("Below lower bound:", int(lower.sum()))
print(pretrained_df[lower])

In [None]:
# Paired T-Test (parametric - normality is a prerequisite)
# Compares the runs initialized with pretrained embeddings against the runs without them.

alpha =0.05

# The two samples are paired by position and therefore must have the same length.
with_pretraining = pretrained_df.loc[pretrained_df['pretrained'] != 'None','f1_score']
without_pretraining = pretrained_df.loc[pretrained_df['pretrained'] == 'None','f1_score']

print("Paired T-Test H-test: \n\n pretrained - NOT pretrained")
stat, p = stats.ttest_rel(with_pretraining, without_pretraining)

# Same wording as before: reject H0 only when p falls below alpha.
verdict = ("    Null Hypothesis REJECTED with pvalue = " if p < alpha
           else "    Null hypothesis was ACCEPTED with pvalue = ")
print(verdict, p, "stat=",stat)
    

### 2.3 Results
After the execution on the following combinations:
- **cbow-sgns** (algorithm)
- **incremental-procrustes** (alignment)
- **en-de-swe-lat** (languages)
- **cd-ln** (metrics of cosine distance and local_neighborhood measure)

On the following **Word2Vec** parameter setting:
- **embeddings_dimension = 100**
- **window_size = 10**
- **min_count = 3** (number of occurrences)
- **s = 0.001** (threshold for configuring which higher-frequency words are randomly downsampled)
- **k = 5** number of negative samples parameter 
- **epochs = 5**

For half of the samples vector weights were prior initialized with pretrained embeddings.

We investigated the f1_scores of **64 executions** (algorithms * alignments * languages * metrics * embeddings).

The result was that **there are significant differences** between the models which were initialized with pretrained embeddings and those which were not.
Note that the above analysis did not include any executions with the twec alignment method.

In [None]:
# Box plot of the f1 scores: pretrained vs. non-pretrained embeddings.
sns.set()
fig, ax = plt.subplots(figsize=(12,8))

# Collapse every pretrained variant into the single label "pretrained" on a plot-only
# copy, so pretrained_df itself is not mutated and re-running earlier cells still sees
# the original 'pretrained' values.
plot_df = pretrained_df.assign(
    pretrained=np.where(pretrained_df["pretrained"] != "None", "pretrained", "None"))
sns.boxplot( saturation=1, palette='BuGn',ax=ax, whis=[5, 95],x=plot_df['pretrained'],y=plot_df['f1_score'])
# specify axis labels (deliberately blank; the title carries the description)
plt.xlabel('', size=14, family='monospace')
plt.ylabel('', size=14, family='monospace')
plt.title('F1 Scores per type of embeddings')
plt.show()

## Research Questions
### 3. Do Lda2Vec and word2vec models perform the same?

### Null Hypothesis: 
Executions with lda2vec perform equally to those with word2vec across different executions on the same datasets and parameters.


In [None]:
# Research question 3 input: f1 scores of lda2vec vs. word2vec runs.
# Keep only runs that
#   - did not use pretrained embeddings,
#   - were aligned with procrustes (the mapping kept for both model families), and
#   - belong to the languages 'en' and 'lat'.
# lda2vec runs are kept here: they are one of the two groups being compared.
comparable_runs = results_df[(results_df['pretrained'] == 'None')
                             & (results_df['mapping'] == 'procrustes')
                             & (results_df['language'].isin(['en','lat']))]

# Long format: one row per (language, algorithm, mapping, metric) with its f1 score.
model_df = comparable_runs.melt(id_vars=["language", "w2vec_algorithm", "mapping"],
                                value_vars=metrics,
                                var_name="metric",
                                value_name="f1_score")

# remove rows with NaN values at f1_score
model_df = model_df[model_df['f1_score'].notna()]

# The Wilcoxon test compares the two score series position by position, so order the
# rows by experiment key before the individual algorithm names are collapsed.
model_df = model_df.sort_values(["language", "metric", "w2vec_algorithm"], kind="stable")

# transform cbow/sgns (anything that is not lda2vec) to word2vec
model_df["w2vec_algorithm"] = np.where(model_df["w2vec_algorithm"] == "lda2vec", 'lda2vec', 'word2vec')

# remove unnecessary columns
model_df = model_df.drop('metric',axis=1).reset_index(drop=True)

# Show a preview and the per-model sample sizes rather than dumping the whole frame;
# the paired test needs both groups to have the same number of rows.
print(model_df.head())
print(model_df['w2vec_algorithm'].value_counts())

### 3.1 Shapiro-Wilk Test (checks normality of distribution)

In [None]:
# Shapiro-Wilk normality check, run once per representation model (word2vec, lda2vec).
from collections import defaultdict
from scipy import stats

alpha = 0.05

print("Shapiro-Wilk test for normal distribution: \n")
# Maps each model name to True when normality was not rejected at `alpha`.
model_normality_dict = defaultdict()
for w2vec_algorithm in w2vec_algorithms:
    # model_df labels every row either 'lda2vec' or 'word2vec', so an exact match
    # selects the group (the previous substring test `in 'lda2vec'` was not an equality check).
    model_scores = model_df.loc[model_df['w2vec_algorithm'] == w2vec_algorithm,'f1_score']
    stat, p = stats.shapiro(model_scores)
    # Label each result with the model it belongs to.
    print(w2vec_algorithm)
    looks_normal = bool(p >= alpha)
    if looks_normal:
        print("\t has a normal distribution with pvalue = "+ str(p) + ", stat=",str(stat))
    else:
        print("\t has NOT a normal distribution with pvalue = ", p, "stat=",stat)
    # Key the result by the model under test, not by the loop variable left over from
    # the alignment-method cell.
    model_normality_dict[w2vec_algorithm] = looks_normal

### Normality Results
**SGNS** and **CBOW** models can be described by the normal distribution.

However, since the **Lda2Vec** method does not meet the normality criterion, <u>we have to proceed with non-parametric tests.</u>

### 3.2 Wilcoxon Signed-Rank Test (non parametric - normality is not a prerequisite)
* Prerequisites (non normal distributions, paired samples, two populations)
* Samples are paired since all variables except the under investigation variable are shared among the different populations
* H0: Populations have same distributions


In [None]:
# Wilcoxon Signed-Rank Test (non parametric - normality is not a prerequisite)
# Compares the lda2vec runs against the word2vec runs.

alpha =0.05

# The two samples are paired by position and therefore must have the same length.
lda2vec_scores = model_df.loc[model_df['w2vec_algorithm'] == 'lda2vec','f1_score']
word2vec_scores = model_df.loc[model_df['w2vec_algorithm'] != 'lda2vec','f1_score']

print("Wilcoxon Signed-Rank H-test: \n\n lda2vec-word2vec")
stat, p = stats.wilcoxon(lda2vec_scores, word2vec_scores)

# Same wording as before: reject H0 only when p falls below alpha.
verdict = ("    Null Hypothesis REJECTED with pvalue = " if p < alpha
           else "    Null hypothesis was ACCEPTED with pvalue = ")
print(verdict, p, "stat=",stat)
    


### 3.3 Results
The usage of pretrained embeddings improves F1 scores. 

In [None]:
# Box plot of the f1 scores, one box per representation model (lda2vec / word2vec).
sns.set()
fig, ax = plt.subplots(figsize=(12,8))
sns.boxplot( saturation=1, palette='BuGn',ax=ax, whis=[5, 95],x=model_df['w2vec_algorithm'],y=model_df['f1_score'])
# specify axis labels (deliberately blank; the title carries the description)
plt.xlabel('', size=14, family='monospace')
plt.ylabel('', size=14, family='monospace')
# Title spelling corrected ("Represenation" -> "Representation").
plt.title('F1 Scores per Representation model')
plt.show()

In [None]:
### 3.4 Results
After the execution on the following combinations:
- **lda2vec-word2vec** (model)
- **procrustes** (alignment)
- **en-lat** (languages)
- **cd-ln** (metrics of cosine distance and local_neighborhood measure)

On the following **Word2Vec** parameter setting:
- **embeddings_dimension = 100**
- **window_size = 10**
- **min_count = 3** (number of occurrences)
- **s = 0.001** (threshold for configuring which higher-frequency words are randomly downsampled)
- **k = 5** number of negative samples parameter 
- **epochs = 5**

No pretrained embeddings were used for these samples: executions initialized with pretrained embeddings are excluded from this analysis.

We investigated the f1_scores of **16 executions** (models * languages * metrics).

The result was that **there are significant differences** between the models which were initialized beforehand and those which were not.
Note that the above analysis did not include any executions with the twec alignment method.