# IHLT Project


In [40]:
# Autoreload
%load_ext autoreload
%autoreload 2

import pandas as pd
import csv
import nltk

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

from features.statistical_features import FeatureExtractor

import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet_ic')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\paubl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet_ic is already u

True

## Load data

In [41]:
TRAIN_PATH = '../data/train/01_raw/'
TRAIN_GS_PATH = '../data/train/scores/'
TEST_PATH = '../data/test/01_raw/'
TEST_GS_PATH = '../data/test/scores/'
TRAIN_SAVE_PATH = '../data/train/02_preprocessed/preprocessed_train_data.csv'
TEST_SAVE_PATH = '../data/test/02_preprocessed/preprocessed_test_data.csv'
PREDICTED_SAVE_PATH = '../data/test/03_predicted/'

def load_data(path_f, path_gs, files):
    # Read first file
    dt = pd.read_csv(path_f + 'STS.input.' + files[0] + '.txt', sep='\t', quoting=csv.QUOTE_NONE, header=None, names=['s1', 's2'])
    dt['gs'] = pd.read_csv(path_gs + 'STS.gs.' + files[0] + '.txt', sep='\t', header=None, names=['gs'])
    dt['file']=files[0]
    # Concatenate the rest of files
    for f in files[1:]:
        dt2 = pd.read_csv(path_f + 'STS.input.' + f + '.txt', sep='\t', quoting=csv.QUOTE_NONE, header=None, names=['s1', 's2'])
        dt2['gs'] = pd.read_csv(path_gs + 'STS.gs.' + f + '.txt', sep='\t', header=None, names=['gs'])
        dt2['file']=f
        dt = pd.concat([dt, dt2], ignore_index=True)
    return dt

## Create models

In [42]:
def train_NN(df, input, output):
    X = df[input]
    y = df[output]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)
    model = Sequential()
    # model.add(Dense(10, input_dim=len(input), activation='relu'))
    model.add(Dense(5, activation='relu'))
    model.add(Dense(5, activation='relu'))               
    model.add(Dense(5, activation='relu'))               
    model.add(Dense(1))                                   
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32, verbose=0)
    return history, model

def train_MLP(df, input, output):
    X = df[input]
    y = df[output]
    model = MLPRegressor(
        hidden_layer_sizes=(5, 5, 5), 
        max_iter=500, 
        random_state=42, 
        early_stopping=True, 
        validation_fraction=0.1,  
        verbose=True
    )
    history = model.fit(X, y)
    return history, model

def train_random_forest(df, input, output):
    X = df[input]
    y = df[output]

    # Model with additional parameters - TODO: Grid search for hyperparameters
    model = RandomForestRegressor(
        n_estimators=200,          
        max_depth=10,              
        min_samples_split=5,      
        min_samples_leaf=2,        
        max_features='sqrt',      
        bootstrap=True,           
    )
    
    model.fit(X, y)
    return model

def train_svr(df, input, output, kernel='rbf', C=1.0, epsilon=0.1):
    X = df[input]
    y = df[output]
    
    model = SVR(kernel=kernel, C=C, epsilon=epsilon)
    model.fit(X, y)
    
    return model

def train_svr_with_grid_search(df, input, output):
    X = df[input]
    y = df[output]
    
    model = SVR()
    
    param_grid = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'degree': [2, 3, 4] 
    }
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=10)
    grid_search.fit(X, y)
    
    best_params = grid_search.best_params_
    print("Best params:", best_params)
    
    best_model = grid_search.best_estimator_
    
    return best_model

In [43]:
def plot_history(history):
    import matplotlib.pyplot as plt
    plt.plot(history.history['mae'])
    plt.plot(history.history['val_mae'])
    plt.title('model mae')
    plt.ylabel('mae')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

## Basic pipeline

### Compute features

In [44]:
feature_extractor = FeatureExtractor()

# Create the desired features
def add_features(dt):
    feature_extractor.add_POS_statistics(dt)
    feature_extractor.add_synset_statistics_ext(dt)
    feature_extractor.add_lemma_statistics(dt)

# Load train data
print('Loading train data')
all_train_files = ['SMTeuroparl', 'MSRvid', 'MSRpar']
df_train = load_data(TRAIN_PATH, TRAIN_GS_PATH, all_train_files)

# Add features to the train data
add_features(df_train)

# Save df to a file
df_train.to_csv(TRAIN_SAVE_PATH, index=False)

# Load test data
print('Loading test data')
all_test_files = ['SMTeuroparl', 'MSRvid', 'MSRpar', 'surprise.OnWN', 'surprise.SMTnews']
df_test = load_data(TEST_PATH, TEST_GS_PATH, all_test_files)

# Add the features to the test data
add_features(df_test)

# Save df_test to a file
df_test.to_csv(TEST_SAVE_PATH, index=False)

print('Train and test datasets ready')

Loading train data
Adding POS based features...
Adding synset based features...
Processed 2234/2234 rows (100.0%)    
Loading test data
Adding POS based features...
Adding synset based features...
Processed 3108/3108 rows (100.0%)    
Train and test datasets ready


In [39]:
feature_extractor = FeatureExtractor()
feature_extractor.add_Word_statistics()

Adding Word based features...
[('but', 'CC'), ('they', 'PRP'), ('were', 'VBD'), ('necessary', 'JJ'), ('.', '.')]
[('necessary', 'JJ')]
[('but', 'CC'), ('they', 'PRP'), ('were', 'VBD'), ('needed', 'VBN'), ('.', '.')]
[('needed', 'VBN')]


In [11]:
# Load dataframes from files
df_train = pd.read_csv(TRAIN_SAVE_PATH)
df_test = pd.read_csv(TEST_SAVE_PATH)


Unnamed: 0,s1,s2,gs,file,s1_n_words,s2_n_words,s1_n_verbs_tot,s2_n_verbs_tot,s1_n_verbs_pres,s2_n_verbs_pres,...,avg_lemma_similarity,max_lemma_similarity,shared_lemma_count,dice_coefficient,lemma_bigram_overlap,lemma_lcs_length,lemma_edit_distance,proportion_s1_in_s2,proportion_s2_in_s1,lemma_position_similarity
0,The leaders have now been given a new chance a...,The leaders benefit aujourd' hui of a new luck...,4.5,SMTeuroparl,8.0,10.0,4.0,3.0,0.0,1.0,...,0.203041,1.0,4,0.470588,0.0,4,6,0.5,0.444444,0.85
1,Amendment No 7 proposes certain changes in the...,Amendment No 7 is proposing certain changes in...,5.0,SMTeuroparl,6.0,6.0,2.0,2.0,2.0,2.0,...,0.299639,1.0,6,0.857143,0.5,6,1,0.857143,0.857143,1.0
2,Let me remind you that our allies include ferv...,I would like to remind you that among our alli...,4.25,SMTeuroparl,7.0,7.0,3.0,1.0,1.0,0.0,...,0.197031,1.0,3,0.428571,0.0,3,6,0.428571,0.428571,0.857143
3,The vote will take place today at 5.30 p.m.,The vote will take place at 5.30pm,4.5,SMTeuroparl,5.0,3.0,1.0,1.0,0.0,0.0,...,0.268684,1.0,3,0.6,0.333333,3,3,0.5,0.75,1.0
4,"The fishermen are inactive, tired and disappoi...","The fishermen are inactive, tired and disappoi...",5.0,SMTeuroparl,4.0,4.0,2.0,2.0,1.0,1.0,...,0.522319,1.0,4,1.0,1.0,4,0,1.0,1.0,1.0


In [63]:
# Features to use in the model
features = [
            # 's1_n_words', 's1_n_verbs_tot', 's1_n_verbs_pres', 's1_n_verbs_past', 's1_n_nouns', 's1_n_adjectives', 's1_n_adverbs', 
            # 's2_n_words', 's2_n_verbs_tot', 's2_n_verbs_pres', 's2_n_verbs_past', 's2_n_nouns', 's2_n_adjectives', 's2_n_adverbs', 
            # 'dif_n_words', 'dif_n_verbs_tot', 'dif_n_verbs_pres', 'dif_n_verbs_past', 'dif_n_nouns', 'dif_n_adjectives', 'dif_n_adverbs', 
            
            'all_all_shared_synsets_count', 'all_all_shared_synsets_ratio', 'all_all_avg_synset_similarity', 'all_all_max_synset_similarity',
            'all_verb_shared_synsets_count', 'all_verb_shared_synsets_ratio', 'all_verb_avg_synset_similarity', 'all_verb_max_synset_similarity',
            'all_noun_shared_synsets_count', 'all_noun_shared_synsets_ratio', 'all_noun_avg_synset_similarity', 'all_noun_max_synset_similarity',
            'all_adj_shared_synsets_count', 'all_adj_shared_synsets_ratio', 'all_adj_avg_synset_similarity', 'all_adj_max_synset_similarity',
            'all_adv_shared_synsets_count', 'all_adv_shared_synsets_ratio', 'all_adv_avg_synset_similarity', 'all_adv_max_synset_similarity',

            'best_all_shared_synsets_count', 'best_all_shared_synsets_ratio', 'best_all_avg_synset_similarity', 'best_all_max_synset_similarity',
            'best_verb_shared_synsets_count', 'best_verb_shared_synsets_ratio', 'best_verb_avg_synset_similarity', 'best_verb_max_synset_similarity',
            'best_noun_shared_synsets_count', 'best_noun_shared_synsets_ratio', 'best_noun_avg_synset_similarity', 'best_noun_max_synset_similarity',
            'best_adj_shared_synsets_count', 'best_adj_shared_synsets_ratio', 'best_adj_avg_synset_similarity', 'best_adj_max_synset_similarity',
            'best_adv_shared_synsets_count', 'best_adv_shared_synsets_ratio', 'best_adv_avg_synset_similarity', 'best_adv_max_synset_similarity',

            # 'lemma_diversity', 'shared_lemmas_ratio', 'avg_lemma_similarity', 'max_lemma_similarity', 'shared_lemma_count', 'dice_coefficient',
            # 'lemma_bigram_overlap', 'lemma_lcs_length', 'lemma_edit_distance', 'proportion_s1_in_s2', 'proportion_s2_in_s1', 'lemma_position_similarity'
            ]


# Train a NN
# hist, model = train_NN(df, features, 'gs')
# plot_history(hist)

# Train a MLP
# hist, model = train_MLP(df, features, 'gs')

# Train a Random Forest
model = train_random_forest(df_train, features, 'gs')

# Train a Support vector regression
# model = train_svr(df, features, 'gs')

# Find best svr
# model = train_svr_with_grid_search(df, features, 'gs')

## Compute correlation of the model

In [64]:
from scipy.stats import pearsonr

# Fill column of the dataset with the predictions of the model
df_test['predicted'] = model.predict(df_test[features])

# Compute the Pearson correlation between the predictions and the gold standard
corr = pearsonr(df_test['gs'], df_test['predicted'])[0]
print('Pearson correlation:', corr)


Pearson correlation: 0.7120224907857646


In [65]:
# Save the predicted dataset
# Add timestamp to the name of the file
import re
import datetime
now = datetime.datetime.now()
timestamp = now.strftime("%Y-%m-%d_%H-%M-%S")
df_test.to_csv(PREDICTED_SAVE_PATH + timestamp + '_predicted_test_data.csv', index=False)


def clean_illegal_characters(df):
    # Definir una expressió regular per trobar caràcters il·legals
    illegal_characters_re = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    
    # Funció per netejar un valor individual
    def clean_value(value):
        if isinstance(value, str):
            return illegal_characters_re.sub('', value)
        return value
    
    # Aplicar la funció de neteja a cada cel·la del DataFrame
    return df.applymap(clean_value)

df_clean = clean_illegal_characters(df_test)
df_clean.to_excel(PREDICTED_SAVE_PATH + timestamp + '_predicted_test_data.xlsx', index=False)


  return df.applymap(clean_value)


In [66]:
N_ITERS = 10
nn_p = 0
rf_p = 0

for i in range(N_ITERS):
    # print('Iteration:', i)
    # hist, model = train_NN(df, features, 'gs')
    # df_test['predicted'] = model.predict(df_test[features])
    # corr = pearsonr(df_test['gs'], df_test['predicted'])[0]
    # print('NN Pearson correlation:', corr)
    # nn_p += corr

    model = train_random_forest(df_train, features, 'gs')
    df_test['predicted'] = model.predict(df_test[features])
    corr = pearsonr(df_test['gs'], df_test['predicted'])[0]
    print('RF Pearson correlation:', corr)
    rf_p += corr

# print('NN Pearson correlation:', nn_p/N_ITERS)
print('RF Pearson correlation:', rf_p/N_ITERS)

RF Pearson correlation: 0.7114133972203263
RF Pearson correlation: 0.7089882260352499
RF Pearson correlation: 0.7128500076042451
RF Pearson correlation: 0.7142873108528178
RF Pearson correlation: 0.7123363771115678
RF Pearson correlation: 0.7155416859979122
RF Pearson correlation: 0.7129274303810641
RF Pearson correlation: 0.7142129979242231
RF Pearson correlation: 0.7130897149177514
RF Pearson correlation: 0.7141304458031449
RF Pearson correlation: 0.7129777593848302
