# IHLT Project


In [14]:
# Autoreload
%load_ext autoreload
%autoreload 2

import pandas as pd
import csv
import nltk

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

from features.statistical_features import FeatureExtractor

import matplotlib.pyplot as plt

# Fetch the NLTK resources used by this notebook; nltk.download() reports
# "already up-to-date" and returns quickly when a package is present locally.
# NOTE(review): presumably these are required by features.statistical_features — confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet_ic')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already u

True

## Load data

In [15]:
# Directory prefixes for the raw sentence-pair files and their gold-standard scores.
# load_data() appends the file name directly to these strings, so the trailing '/' is required.
TRAIN_PATH = '../data/train/01_raw/'
TRAIN_GS_PATH = '../data/train/scores/'
TEST_PATH = '../data/test/01_raw/'
TEST_GS_PATH = '../data/test/scores/'
# CSV files where the feature-augmented train/test frames are written and later re-read.
TRAIN_SAVE_PATH = '../data/train/02_preprocessed/preprocessed_train_data.csv'
TEST_SAVE_PATH = '../data/test/02_preprocessed/preprocessed_test_data.csv'
# Directory prefix for the timestamped prediction exports (CSV and XLSX).
PREDICTED_SAVE_PATH = '../data/test/03_predicted/'

def load_data(path_f, path_gs, files):
    """
    Load sentence pairs and gold-standard scores for several datasets.

    For every name in ``files`` this reads ``path_f + 'STS.input.<name>.txt'``
    (tab-separated sentence pairs, quotes taken literally) and
    ``path_gs + 'STS.gs.<name>.txt'`` (one score per row), then stacks all
    datasets into a single frame.

    Parameters
    ----------
    path_f : str
        Prefix of the input files; the file name is appended as-is, so a
        directory must include its trailing separator.
    path_gs : str
        Prefix of the gold-standard score files (same convention).
    files : sequence of str
        Dataset names, e.g. ``['MSRvid', 'MSRpar']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``s1``, ``s2``, ``gs`` and ``file`` (the dataset name each row
        came from), with a fresh 0..n-1 index, in the order given by ``files``.

    Raises
    ------
    ValueError
        If ``files`` is empty, or if a score file does not contain exactly one
        row per sentence pair (which would otherwise mis-align scores silently).
    """
    if len(files) == 0:
        raise ValueError("load_data() needs at least one dataset name in 'files'")

    def _read_one(name):
        # One dataset: sentence pairs plus their scores, tagged with the dataset name.
        pairs = pd.read_csv(path_f + 'STS.input.' + name + '.txt', sep='\t',
                            quoting=csv.QUOTE_NONE, header=None, names=['s1', 's2'])
        scores = pd.read_csv(path_gs + 'STS.gs.' + name + '.txt', sep='\t',
                             header=None, names=['gs'])
        if len(scores) != len(pairs):
            raise ValueError(
                f"Dataset '{name}': {len(pairs)} sentence pairs but {len(scores)} scores"
            )
        # Assign the Series (not the one-column DataFrame) so the intent is explicit.
        pairs['gs'] = scores['gs']
        pairs['file'] = name
        return pairs

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat([_read_one(name) for name in files], ignore_index=True)

## Create models

In [16]:
import numpy as np
from scikeras.wrappers import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as st
from sklearn.metrics import make_scorer
from scipy.stats import pearsonr

def create_model(input_dim, learning_rate=0.001, neurons=10, hidden_layers=2):
    """
    Build and compile a fully connected Keras regression network.

    The network is a stack of ReLU ``Dense`` layers of width ``neurons``
    followed by a single linear output unit. Note that the first ``Dense``
    layer (the one that declares ``input_dim``) comes in addition to the
    ``hidden_layers`` layers, so the model has ``hidden_layers + 1`` ReLU
    layers in total.

    Parameters:
        input_dim: number of input features.
        learning_rate: learning rate passed to the Adam optimizer.
        neurons: width of every ReLU layer.
        hidden_layers: number of ReLU layers added after the first one.

    Returns:
        The compiled ``Sequential`` model (MSE loss, MAE metric).
    """
    # First ReLU layer: also fixes the expected number of input features.
    stack = [Dense(neurons, input_dim=input_dim, activation='relu')]
    # Additional ReLU layers, all of the same width.
    stack.extend(Dense(neurons, activation='relu') for _ in range(hidden_layers))
    # Single linear unit for the regression target.
    stack.append(Dense(1))

    network = Sequential(stack)
    network.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'])
    return network

def train_NN(df, input, output):
    """
    Grid-search a Keras regressor and return the best fitted estimator.

    Parameters:
        df: frame holding both the feature columns and the target column.
        input: list of feature column names.
        output: name of the target column.

    Returns:
        The ``best_estimator_`` of the grid search (a fitted ``KerasRegressor``).

    Candidates are ranked by Pearson correlation over 3 CV folds. The search
    only sees the 80% training part of a fixed split (``random_state=42``);
    the remaining 20% is not used by this function.
    """
    features, target = df[input], df[output]

    # Only the training part of the split is used below.
    X_fit, _, y_fit, _ = train_test_split(features, target, test_size=0.2, random_state=42)

    # scikeras wrapper so the Keras model can be driven by GridSearchCV;
    # `model__` parameters are routed to create_model().
    regressor = KerasRegressor(
        model=create_model,
        model__input_dim=features.shape[1],
        verbose=0,
    )

    # TODO: a wider grid (batch_size 16/32/64, epochs 50/100, learning_rate
    # 0.001/0.01, neurons 5/10/20, hidden_layers 2/3/4) was planned but is too
    # slow on the current hardware; this reduced grid is used instead.
    search_space = {
        "model__neurons": [5, 10],
        "model__hidden_layers": [2],
        "model__learning_rate": [0.001],
        "batch_size": [16, 32],
        "epochs": [50, 100],
    }

    search = GridSearchCV(
        estimator=regressor,
        param_grid=search_space,
        scoring=make_scorer(pearson_scorer, greater_is_better=True),
        cv=3,
        verbose=2,
        n_jobs=-1,
    )
    search.fit(X_fit, y_fit)

    print("Best Parameters:", search.best_params_)
    return search.best_estimator_

def train_MLP(df, input, output):
    """
    Grid-search a scikit-learn ``MLPRegressor`` and return the best refit model.

    Parameters:
        df: frame holding both the feature columns and the target column.
        input: list of feature column names.
        output: name of the target column.

    Returns:
        The ``best_estimator_`` of the grid search, refit on all rows of ``df``.

    Candidates are ranked by Pearson correlation over 5 CV folds. Parameter
    combinations whose mean score is NaN (e.g. when a fold yields constant
    predictions, for which Pearson's r is undefined) are reported on stdout.
    """
    X = df[input]
    y = df[output]

    # Base model; `max_iter` is overridden by the grid below.
    model = MLPRegressor(max_iter=1000, early_stopping=True, validation_fraction=0.1, verbose=False)

    pearson_score = make_scorer(pearson_scorer, greater_is_better=True)

    param_grid = {
        'hidden_layer_sizes': [(10,), (50,), (100,), (50, 100)],  # number and size of hidden layers
        'activation': ['relu', 'tanh'],                           # activation functions
        'solver': ['adam'],                                       # optimizer
        'alpha': [0.0001, 0.001, 0.01],                           # L2 regularization strength
        # The learning-rate schedule is only used by solver='sgd'. With 'adam'
        # searching over several schedules just repeats identical candidates,
        # so a single value is kept (the key stays so best_params_ is unchanged in shape).
        'learning_rate': ['constant'],
        'max_iter': [200, 500, 1000],                             # maximum number of epochs
    }

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=pearson_score,
        cv=5,        # 5-fold cross-validation
        verbose=2,   # display progress
        n_jobs=-1,   # use all available cores
    )

    grid_search.fit(X, y)

    # Surface the combinations for which no valid score could be computed.
    results = grid_search.cv_results_
    for mean_score, params in zip(results['mean_test_score'], results['params']):
        if np.isnan(mean_score):
            print("Failed combination:", params)

    best_model = grid_search.best_estimator_
    print("Best Hyperparameters:", grid_search.best_params_)

    return best_model

def pearson_scorer(y_true, y_pred):
    """Return Pearson's correlation coefficient between targets and predictions."""
    # pearsonr yields (correlation, p-value); only the coefficient is needed.
    correlation, _p_value = pearsonr(y_true, y_pred)
    return correlation

def train_RF(df, input, output, random_state=None):
    """
    Grid-search a ``RandomForestRegressor`` and return the best model and its parameters.

    Parameters:
        df: frame holding both the feature columns and the target column.
        input: list of feature column names.
        output: name of the target column.
        random_state: optional seed forwarded to the forest so the search can be
            reproduced. The default (``None``) keeps the previous unseeded behaviour.

    Returns:
        A tuple ``(best_model, best_params)``: the estimator refit on all rows of
        ``df`` with the winning combination, and that combination as a dict
        (``random_state`` is not part of it).

    Candidates are ranked by Pearson correlation over 5 CV folds.
    """
    X = df[input]
    y = df[output]

    model = RandomForestRegressor(random_state=random_state)

    # TODO: a wider grid (max_depth None/10/20/30, min_samples_split 2/5/10,
    # min_samples_leaf 1/2/4) was planned but is too slow on the current
    # hardware; this reduced grid is used instead.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'max_features': ['sqrt', 'log2'],
        'bootstrap': [True, False],
    }

    pearson_score = make_scorer(pearson_scorer, greater_is_better=True)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=pearson_score,
        cv=5,
        verbose=2,
        n_jobs=-1,
    )

    grid_search.fit(X, y)

    best_model = grid_search.best_estimator_
    print("Best Hyperparameters:", grid_search.best_params_)

    return best_model, grid_search.best_params_

# TODO: re-evaluate. The poor results seen earlier are consistent with the old
# version fitting the SVR on standardized features while callers predicted on
# raw ones; scaling is now part of the returned estimator.
def train_SVR(df, input, output):
    """
    Search an RBF support-vector regressor and return the best fitted estimator.

    Parameters:
        df: frame holding both the feature columns and the target column.
        input: list of feature column names.
        output: name of the target column.

    Returns:
        A fitted scikit-learn ``Pipeline`` (``StandardScaler`` followed by ``SVR``).
        It exposes ``predict`` like the bare SVR did, but applies the scaling
        learned during training, so callers pass raw (unscaled) features.

    Candidates are ranked by Pearson correlation over 3 CV folds. Because the
    scaler is inside the pipeline it is re-fit on each training fold, so no
    statistics from the validation fold leak into the scaling.
    """
    # Local import: Pipeline is not among the notebook's top-level imports.
    from sklearn.pipeline import Pipeline

    X = df[input]
    y = df[output]

    pearson_score = make_scorer(pearson_scorer, greater_is_better=True)

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svr', SVR(cache_size=2000)),  # larger kernel cache
    ])

    # Only 8 combinations, so an exhaustive search is used; the previous
    # randomized search with n_iter=20 could not sample more than these anyway.
    param_grid = {
        'svr__kernel': ['rbf'],
        'svr__C': [0.1, 1, 10, 100],
        'svr__gamma': ['scale', 'auto'],
    }

    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=3,
        scoring=pearson_score,
        verbose=1,
        n_jobs=10,
    )

    search.fit(X, y)

    # Report the SVR parameters without the pipeline step prefix.
    best_params = {name.split('__', 1)[1]: value for name, value in search.best_params_.items()}
    print("Best params:", best_params)

    return search.best_estimator_

def train_single_RF(df, input, output, params):
    """
    Fit one ``RandomForestRegressor`` with fixed hyperparameters (no search).

    Parameters:
        df: frame holding both the feature columns and the target column.
        input: list of feature column names.
        output: name of the target column.
        params: keyword arguments for ``RandomForestRegressor``, e.g. the
            ``best_params`` dict returned by ``train_RF``.

    Returns:
        The fitted regressor.
    """
    model = RandomForestRegressor(**params)
    model.fit(df[input], df[output])
    return model



In [17]:
def plot_history(history):
    """
    Plot the training and validation MAE curves of a training run.

    ``history`` must expose a ``history`` mapping with the entries ``'mae'`` and
    ``'val_mae'`` (presumably a Keras ``History`` object — confirm at the call site).
    The figure is shown immediately; nothing is returned.
    """
    import matplotlib.pyplot as plt

    # Training curve first, validation second, matching the legend order below.
    for series in ('mae', 'val_mae'):
        plt.plot(history.history[series])
    plt.title('model mae')
    plt.ylabel('mae')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

## Basic pipeline

### Compute features

In [34]:
# Single extractor instance shared by add_features() below.
feature_extractor = FeatureExtractor()

# Create the desired features
def add_features(dt):
    """
    Run the POS, synset and lemma feature extractors on ``dt``.

    The extractor methods are called for their effect on ``dt`` (their return
    values are ignored) — presumably they add the feature columns in place.
    """
    feature_extractor.add_POS_statistics(dt)
    feature_extractor.add_synset_statistics_ext(dt)
    feature_extractor.add_lemma_statistics(dt)

# Load train data
print('Loading train data')
all_train_files = ['SMTeuroparl', 'MSRvid', 'MSRpar']
df_train = load_data(TRAIN_PATH, TRAIN_GS_PATH, all_train_files)

# Add features to the train data
add_features(df_train)

# Cache the feature-augmented train frame so later cells can reload it from disk
df_train.to_csv(TRAIN_SAVE_PATH, index=False)

# Load test data (the two 'surprise.*' datasets have no counterpart in the train files)
print('Loading test data')
all_test_files = ['SMTeuroparl', 'MSRvid', 'MSRpar', 'surprise.OnWN', 'surprise.SMTnews']
df_test = load_data(TEST_PATH, TEST_GS_PATH, all_test_files)

# Add the features to the test data
add_features(df_test)

# Cache the feature-augmented test frame as well
df_test.to_csv(TEST_SAVE_PATH, index=False)

print('Train and test datasets ready')

Loading train data
Adding POS based features...
Adding synset based features...
Processed 2234/2234 rows (100.0%)    
Adding lemma based features...
Loading test data
Adding POS based features...
Adding synset based features...
Processed 3108/3108 rows (100.0%)    
Adding lemma based features...
Train and test datasets ready


In [83]:
# Rebinds `feature_extractor` to a fresh FeatureExtractor instance; nothing else happens
# in this cell because the call below is commented out.
feature_extractor = FeatureExtractor()
#feature_extractor.add_Word_statistics()

In [117]:
# Load the cached, feature-augmented frames written by the feature-computation cell.
# This replaces any in-memory df_train/df_test, so the features need not be recomputed.
df_train = pd.read_csv(TRAIN_SAVE_PATH)
df_test = pd.read_csv(TEST_SAVE_PATH)


In [None]:
# Features to use in the model
# NOTE(review): this list contains 'lemma_jackard_similarity', which is absent from the
# `all_features` list defined further down — confirm which of the two is intended.
features = [
            # Per-sentence POS counts (s1_*, s2_*) and their 'dif_' counterparts
            's1_n_words', 's1_n_verbs_tot', 's1_n_verbs_pres', 's1_n_verbs_past', 's1_n_nouns', 's1_n_adjectives', 's1_n_adverbs', 
            's2_n_words', 's2_n_verbs_tot', 's2_n_verbs_pres', 's2_n_verbs_past', 's2_n_nouns', 's2_n_adjectives', 's2_n_adverbs', 
            'dif_n_words', 'dif_n_verbs_tot', 'dif_n_verbs_pres', 'dif_n_verbs_past', 'dif_n_nouns', 'dif_n_adjectives', 'dif_n_adverbs', 
            
            # Synset statistics, 'all_' variant, overall and per part of speech
            'all_all_shared_synsets_count', 'all_all_shared_synsets_ratio', 'all_all_avg_synset_similarity', 'all_all_max_synset_similarity',
            'all_verb_shared_synsets_count', 'all_verb_shared_synsets_ratio', 'all_verb_avg_synset_similarity', 'all_verb_max_synset_similarity',
            'all_noun_shared_synsets_count', 'all_noun_shared_synsets_ratio', 'all_noun_avg_synset_similarity', 'all_noun_max_synset_similarity',
            'all_adj_shared_synsets_count', 'all_adj_shared_synsets_ratio', 'all_adj_avg_synset_similarity', 'all_adj_max_synset_similarity',
            'all_adv_shared_synsets_count', 'all_adv_shared_synsets_ratio', 'all_adv_avg_synset_similarity', 'all_adv_max_synset_similarity',

            # Synset statistics, 'best_' variant, overall and per part of speech
            'best_all_shared_synsets_count', 'best_all_shared_synsets_ratio', 'best_all_avg_synset_similarity', 'best_all_max_synset_similarity',
            'best_verb_shared_synsets_count', 'best_verb_shared_synsets_ratio', 'best_verb_avg_synset_similarity', 'best_verb_max_synset_similarity',
            'best_noun_shared_synsets_count', 'best_noun_shared_synsets_ratio', 'best_noun_avg_synset_similarity', 'best_noun_max_synset_similarity',
            'best_adj_shared_synsets_count', 'best_adj_shared_synsets_ratio', 'best_adj_avg_synset_similarity', 'best_adj_max_synset_similarity',
            'best_adv_shared_synsets_count', 'best_adv_shared_synsets_ratio', 'best_adv_avg_synset_similarity', 'best_adv_max_synset_similarity',

            # Lemma-based features
            'lemma_diversity', 'shared_lemmas_ratio', 'lemma_jackard_similarity', 'avg_lemma_similarity', 'max_lemma_similarity', 'shared_lemma_count', 'dice_coefficient',
            'lemma_bigram_overlap', 'lemma_lcs_length', 'lemma_edit_distance', 'proportion_s1_in_s2', 'proportion_s2_in_s1', 'lemma_position_similarity'
            ]


# Alternative models are kept below, commented out; only the Random Forest search runs.

# Train a NN
# model = train_NN(df_train, features, 'gs')
# plot_history(hist)   # stale: train_NN returns only the fitted estimator, no `hist`

# Train a MLP
#best_mlp_model = train_MLP(df_train, features, 'gs')

# Train a Random Forest (grid search); `params` is reused by later cells
# through train_single_RF, and `model` by the correlation cell.
model, params = train_RF(df_train, features, 'gs')
# Best params found: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
# model = train_single_RF(df_train, features, 'gs', params)

# Train an SVR
# model = train_SVR(df_train, features, 'gs')

Fitting 5 folds for each of 96 candidates, totalling 480 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


## Compute correlation of the model

In [40]:
from scipy.stats import pearsonr

# Fill column of the dataset with the predictions of the model
# (`model` and `features` come from the training cell; the commented lines are
# alternatives for models trained under other names)
# df_test['predicted'] = best_rf_model.predict(df_test[features])
#df_test['predicted'] = best_mlp_model.predict(df_test[features])
df_test['predicted'] = model.predict(df_test[features])

# Compute the Pearson correlation between the predictions and the gold standard
# over all rows of df_test; pearsonr returns (correlation, p-value), only the first is kept
corr = pearsonr(df_test['gs'], df_test['predicted'])[0]
print('Pearson correlation:', corr)


Pearson correlation: 0.7540496030246525


In [37]:
# Save the predicted dataset
# Add timestamp to the name of the file so earlier exports are not overwritten
import re
import datetime
now = datetime.datetime.now()
# Local time, formatted without characters that are problematic in file names
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
# Writes the whole test frame, including the 'predicted' column when it has been added
df_test.to_csv(PREDICTED_SAVE_PATH + timestamp + '_predicted_test_data.csv', index=False)


def clean_illegal_characters(df):
    """
    Return a copy of ``df`` with control characters removed from every string cell.

    The characters removed are the ASCII control codes 0-8, 11-12 and 14-31;
    tab, line feed and carriage return are kept. Non-string cells are returned
    unchanged and ``df`` itself is not modified.

    NOTE(review): presumably these are the characters the Excel writer rejects,
    since the result is passed to ``to_excel`` — confirm.

    Parameters:
        df: the frame to clean.

    Returns:
        A new DataFrame of the same shape with the cleaned values.
    """
    illegal_characters_re = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')

    def clean_value(value):
        if isinstance(value, str):
            return illegal_characters_re.sub('', value)
        return value

    # DataFrame.applymap is deprecated in favour of DataFrame.map; fall back to
    # applymap only on pandas versions that do not have DataFrame.map yet.
    # Looked up on the class so a column named 'map' cannot shadow the method.
    elementwise = getattr(pd.DataFrame, 'map', None) or pd.DataFrame.applymap
    return elementwise(df, clean_value)

# Export the same frame as XLSX, after stripping control characters from the string cells.
# Reuses `timestamp`, so the CSV and XLSX exports of one run share the same file-name prefix.
df_clean = clean_illegal_characters(df_test)
df_clean.to_excel(PREDICTED_SAVE_PATH + timestamp + '_predicted_test_data.xlsx', index=False)


  return df.applymap(clean_value)


In [41]:
# Re-train the Random Forest N_ITERS times with the hyperparameters in `params` and
# average the test correlation, to smooth out the run-to-run variation of the forest.
N_ITERS = 100
rf_p = 0  # running sum of the per-iteration correlations

for i in range(N_ITERS):
    # Disabled NN variant, kept for reference. It is stale: it expects train_NN to
    # return (hist, model) and uses `df` and `nn_p`, none of which match this notebook.
    # print('Iteration:', i)
    # hist, model = train_NN(df, features, 'gs')
    # df_test['predicted'] = model.predict(df_test[features])
    # corr = pearsonr(df_test['gs'], df_test['predicted'])[0]
    # print('NN Pearson correlation:', corr)
    # nn_p += corr

    # Note: this rebinds the notebook-level `model` and overwrites df_test['predicted'].
    model = train_single_RF(df_train, features, 'gs', params)
    df_test['predicted'] = model.predict(df_test[features])
    corr = pearsonr(df_test['gs'], df_test['predicted'])[0]
    rf_p += corr
    # Prints the correlation of this iteration and the running mean so far
    print(i, 'RF Pearson correlation:', corr, "=>", rf_p/(i+1))

# print('NN Pearson correlation:', nn_p/N_ITERS)
print('Mean RF Pearson correlation:', rf_p/N_ITERS)

0 RF Pearson correlation: 0.7551433485286597 => 0.7551433485286597
1 RF Pearson correlation: 0.755104949541115 => 0.7551241490348873
2 RF Pearson correlation: 0.7554078851901957 => 0.7552187277533235
3 RF Pearson correlation: 0.7532854549570337 => 0.7547354095542511
4 RF Pearson correlation: 0.7533938134124563 => 0.7544670903258922
5 RF Pearson correlation: 0.7534126050655634 => 0.754291342782504
6 RF Pearson correlation: 0.7510654619861754 => 0.7538305026687427
7 RF Pearson correlation: 0.7550385904860417 => 0.7539815136459052
8 RF Pearson correlation: 0.753789450172195 => 0.7539601732599374
9 RF Pearson correlation: 0.7552045902018234 => 0.7540846149541259
10 RF Pearson correlation: 0.755361727297691 => 0.7542007160762683
11 RF Pearson correlation: 0.7536979282713231 => 0.7541588170925229
12 RF Pearson correlation: 0.7536678058128037 => 0.7541210469940829
13 RF Pearson correlation: 0.753927025512459 => 0.754107188316824
14 RF Pearson correlation: 0.7551580325298214 => 0.7541772445976

## Compare correlation based on file and features

In [42]:
# The feature-name lists are composed from shared building blocks instead of being
# copied by hand into every list, so the groups cannot drift apart.
# NOTE(review): unlike the `features` list used for the main model, none of these
# sets contains 'lemma_jackard_similarity' — confirm that this is intended.

# POS-count statistics, computed for each sentence ('s1', 's2') and as 'dif'.
_POS_STATS = ['n_words', 'n_verbs_tot', 'n_verbs_pres', 'n_verbs_past', 'n_nouns', 'n_adjectives', 'n_adverbs']
# Synset statistics exist overall ('all') and per part of speech.
_SYNSET_POS = ['all', 'verb', 'noun', 'adj', 'adv']
_SYNSET_STATS = ['shared_synsets_count', 'shared_synsets_ratio', 'avg_synset_similarity', 'max_synset_similarity']


def _synset_feature_names(scope):
    """Return the 20 synset feature names for one scope prefix ('all' or 'best')."""
    return [f'{scope}_{pos}_{stat}' for pos in _SYNSET_POS for stat in _SYNSET_STATS]


word_features = [f'{prefix}_{stat}' for prefix in ('s1', 's2', 'dif') for stat in _POS_STATS]

# NOTE(review): the 'Synsets' set holds only the 'all_' scope; the 'best_' scope is
# only part of the lexical and full sets — confirm that this is intended.
synset_features = _synset_feature_names('all')
_best_synset_features = _synset_feature_names('best')

lemma_features = [
    'lemma_diversity', 'shared_lemmas_ratio', 'avg_lemma_similarity', 'max_lemma_similarity', 'shared_lemma_count', 'dice_coefficient',
    'lemma_bigram_overlap', 'lemma_lcs_length', 'lemma_edit_distance', 'proportion_s1_in_s2', 'proportion_s2_in_s1', 'lemma_position_similarity',
]

# Lexical = both synset scopes plus the lemma features; All = POS counts plus lexical.
lexical_features = synset_features + _best_synset_features + lemma_features
all_features = word_features + lexical_features

# (label, feature names) pairs, in the order they are evaluated.
features_sets = [('All', all_features), ('Synsets', synset_features), ('Lemmas', lemma_features), ('PoS (Syntactical)', word_features), ('Lexical', lexical_features), ]

# (label, train files, extended test files): the extended test set adds the two
# 'surprise' datasets to the files the model was trained on.
files_sets = [
    ('SMTeuroparl', ['SMTeuroparl'], ['SMTeuroparl', 'surprise.OnWN', 'surprise.SMTnews']),
    ('MSRvid', ['MSRvid'], ['MSRvid', 'surprise.OnWN', 'surprise.SMTnews']),
    ('MSRpar', ['MSRpar'], ['MSRpar', 'surprise.OnWN', 'surprise.SMTnews']),
    ('All', ['SMTeuroparl', 'MSRvid', 'MSRpar'], ['SMTeuroparl', 'MSRvid', 'MSRpar', 'surprise.OnWN', 'surprise.SMTnews'])
]

N_ITERS = 10


def _mean_rf_correlation(train, test, f_set, n_iters):
    """
    Average test-set Pearson correlation over `n_iters` independently trained forests.

    Each iteration trains a fresh Random Forest on `train` (using the notebook-level
    `params`) with the feature columns `f_set` and scores it against the 'gs' column
    of `test`. Predictions are kept local: nothing is written into `test`, which
    avoids assigning into a filtered slice of df_test.
    """
    total = 0
    for _ in range(n_iters):
        forest = train_single_RF(train, f_set, 'gs', params)
        total += pearsonr(test['gs'], forest.predict(test[f_set]))[0]
    return total / n_iters


# Train a Random Forest for each feature set and each file set
for t_name, tr_set, vl_set in files_sets:
    print()
    # The row filters depend only on the file set, so compute them once per file set.
    train = df_train[df_train['file'].isin(tr_set)]
    in_domain_test = df_test[df_test['file'].isin(tr_set)]   # same source files as the training data
    extended_test = df_test[df_test['file'].isin(vl_set)]    # additionally contains the 'surprise' files

    for f_name, f_set in features_sets:
        print(f_name, t_name, _mean_rf_correlation(train, in_domain_test, f_set, N_ITERS))
        print(f_name, t_name, _mean_rf_correlation(train, extended_test, f_set, N_ITERS), '(with surprise files)')



All SMTeuroparl 0.5323692675481492
All SMTeuroparl 0.6311187599991405 (with surprise files)
Synsets SMTeuroparl 0.48866057960531
Synsets SMTeuroparl 0.5603342595419987 (with surprise files)
Lemmas SMTeuroparl 0.48391137589586364
Lemmas SMTeuroparl 0.58081638594915 (with surprise files)
PoS (Syntactical) SMTeuroparl 0.2788364964179575
PoS (Syntactical) SMTeuroparl 0.3112440728739055 (with surprise files)
Lexical SMTeuroparl 0.5328963938746243
Lexical SMTeuroparl 0.6266142158276385 (with surprise files)

All MSRvid 0.8613777484336452
All MSRvid 0.7915211726047531 (with surprise files)
Synsets MSRvid 0.8324145109864043
Synsets MSRvid 0.7352174235135138 (with surprise files)
Lemmas MSRvid 0.6575259648502403
Lemmas MSRvid 0.6414286178310427 (with surprise files)
PoS (Syntactical) MSRvid 0.20795505045582274
PoS (Syntactical) MSRvid 0.3528823063515935 (with surprise files)
Lexical MSRvid 0.8615321031746392
Lexical MSRvid 0.7774928297940386 (with surprise files)

All MSRpar 0.59860392673409
A

## Do predictions improve when each pair is routed to a model trained per source file?

In [None]:
from keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Experiment: predict which source file a sentence pair comes from, then score the
# pair with a regressor trained only on the training rows assigned to that file class.

n_file_classes = len(all_train_files)

# Test rows whose source file also exists in the training data (used for the accuracy check)
filt_df_test = df_test[df_test['file'].isin(all_train_files)].copy()

# Encode the file column to numerical labels
label_encoder = LabelEncoder()
df_train['file_encoded'] = label_encoder.fit_transform(df_train['file'])
filt_df_test.loc[:, 'file_encoded'] = label_encoder.transform(filt_df_test['file'])

# One-hot targets for the classifier.
# NOTE(review): with one-hot targets the forest predicts each class column separately,
# and argmax maps an all-zero prediction to class 0 — consider fitting on the integer
# labels directly.
y_train = to_categorical(df_train['file_encoded'], num_classes=n_file_classes)
filt_y_test = to_categorical(filt_df_test['file_encoded'], num_classes=n_file_classes)

# Random forest classifier: features -> source file
clf = RandomForestClassifier(n_estimators=100)
clf.fit(df_train[all_features], y_train)

# Accuracy of the file prediction on the filtered test set
y_pred = clf.predict(filt_df_test[all_features])
accuracy = np.mean(np.argmax(filt_y_test, axis=1) == np.argmax(y_pred, axis=1))
print('Accuracy in predicting file:', accuracy)

# In the global train set, assign the predicted file class (computed once, then reused)
train_pred_file = np.argmax(clf.predict(df_train[all_features]), axis=1)
print(train_pred_file)
df_train['pred_file'] = train_pred_file

# Train a regression random forest for each partition of df_train based on pred_file.
# The models are keyed by class id: an empty partition then cannot shift the models
# of the following classes, as it would in a list that is appended to.
partitioned_models = {}
for file_class in range(n_file_classes):
    partition = df_train[df_train['pred_file'] == file_class]
    print("Number of rows in partition ", file_class, ":", partition.shape[0])
    if not partition.empty:
        print("One model")
        # Local names on purpose: the notebook-level `model` and `params` used by
        # other cells are left untouched.
        partition_model, _ = train_RF(partition, all_features, 'gs')
        partitioned_models[file_class] = partition_model

# Predict the file class for each row in df_test (including the 'surprise' files)
df_test['pred_file'] = np.argmax(clf.predict(df_test[all_features]), axis=1)

# For each partition of df_test based on pred_file, use the corresponding model to predict the gs.
# Start from NaN so a re-run never keeps predictions from a previous execution.
df_test['gs_predicted'] = np.nan
for fcls in range(n_file_classes):
    in_class = df_test['pred_file'] == fcls
    if not in_class.any():
        # Nothing to predict; calling predict() on an empty frame would raise.
        print("Predicted len: ", 0)
        continue
    if fcls not in partitioned_models:
        print("No model for partition ", fcls, ":", int(in_class.sum()), "rows left unpredicted")
        continue
    pred_gs = partitioned_models[fcls].predict(df_test.loc[in_class, all_features])
    print("Predicted len: ", len(pred_gs))
    df_test.loc[in_class, 'gs_predicted'] = pred_gs

# Compute the pearson correlation over the rows that received a prediction
scored = df_test.dropna(subset=['gs_predicted'])
final_corr = pearsonr(scored['gs'], scored['gs_predicted'])[0]
print("Final correlation: ", final_corr)

Accuracy in predicting file: 0.8249106687085248
[2 2 2 ... 0 0 0]
Number of rows in partition  0 : 750
One model
Fitting 5 folds for each of 96 candidates, totalling 480 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Hyperparameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Number of rows in partition  1 : 750
One model
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Hyperparameters: {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Number of rows in partition  2 : 734
One model
Fitting 5 folds for each of 96 candidates, totalling 480 fits
Best Hyperparameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Predicted len:  1454
Predicted len:  1354
Predicted len:  300
Final correlation:  0.7283554075731684
