In [1]:
# Imports

import utils
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from utils import train_model
from utils import mean_of_all
import matplotlib.pyplot as plt

In [None]:
# Read the tab-separated dataset file into a pandas DataFrame.
df = pd.read_csv('iphi2802.csv', sep='\t')

# Overview of the raw data: dimensions first.
print("Original shape of Dataset: {}".format(df.shape))

# Column summary (df.info() prints its report directly).
print('\nDataset Information:')
df.info()

# How many NULL entries each column holds.
print('\nNumber of NULL values per column:')
print(df.isna().sum())

# How many distinct values each column holds.
print('\nNumber of unique values per column:')
print(df.nunique())

# Add a 'mean_date' column holding the row-wise mean of 'date_min' and 'date_max'.
date_bounds = df[['date_min', 'date_max']]
df['mean_date'] = date_bounds.mean(axis='columns')


# Initialize the TF-IDF vectorizer with the Greek stopword list from NLTK and
# transform the 'text' column into a TF-IDF matrix. The vocabulary is capped
# with max_features=8000, so the matrix has at most 8000 columns.
try:
    stopwords = nltk.corpus.stopwords.words('greek')
except LookupError:
    # The stopword corpus is missing on a fresh environment; fetch it once so
    # the notebook survives Restart & Run All, then retry the lookup.
    nltk.download('stopwords', quiet=True)
    stopwords = nltk.corpus.stopwords.words('greek')
vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=8000)
index_matrix = vectorizer.fit_transform(df['text'].to_list())

# Keep the vectorizer output available for inspection: the matrix shape, the
# learned IDF weights and the alphabetically sorted vocabulary terms.
# NOTE(review): sorted terms are presumably aligned with the order of
# idf_values (scikit-learn assigns feature indices in sorted order) - confirm.
shape = index_matrix.shape
idf_values = vectorizer.idf_
vocab = sorted(vectorizer.vocabulary_)

# Convert the input/target matrices to dense arrays for normalization.
texts = index_matrix.toarray()
dates = df['mean_date'].values.reshape(-1, 1)  # single-column 2-D array

# Scale the TF-IDF matrix (input) and the mean dates (target) with separate
# MinMaxScaler instances. Refitting one shared instance would overwrite the
# input-scaling parameters as soon as the targets are fitted.
# NOTE(review): both scalers (and the TF-IDF vectorizer) are fitted on the
# full dataset before the K-fold split, so the test folds influence the
# scaling - consider fitting per fold on the training part only.
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
X = x_scaler.fit_transform(texts)
y = y_scaler.fit_transform(dates)

# Backward compatibility: `scaler` previously ended up fitted on the dates,
# so keep that name bound to the target scaler.
scaler = y_scaler

# 5-fold cross validation: every fold is described by a dictionary holding its
# 1-based index together with the train/test slices of X and y; the
# dictionaries are gathered in `fold_dataset`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_dataset = []
for fold_index, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Slice inputs and targets with the row indices produced for this fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Bundle the fold's data under the same keys used elsewhere in the notebook.
    fold_data = dict(
        fold_index=fold_index,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )
    fold_dataset.append(fold_data)

In [None]:
# Train and evaluate several hidden-layer configurations with otherwise
# identical settings, collecting the train/test losses returned for each.
#
# train_model is called positionally; judging from the other calls in this
# notebook the order is
#   (hidden layers, learning rate, momentum, fold_dataset, 32, epochs,
#    have_callback, dropout, dropout_in, dropout_h)
# NOTE(review): 32 is presumably the batch size and 50 the epoch count -
# confirm against utils.train_model.
#
# Four of the five original calls omitted the momentum argument, which shifted
# every following positional argument by one slot. All calls now pass
# momentum=0.9, the value already used by the first call.
mean_losses_tr = []
mean_losses_ts = []

# Hidden-layer configurations, in the order they were originally evaluated.
# The first entry is a bare int, exactly as in the original call.
hidden_layer_configs = [
    250,
    [500, 300],
    [400, 200, 100],
    [500, 300, 100],
    [800, 400, 200],
]

for hidden_layers in hidden_layer_configs:
    mean_train_loss, mean_test_loss = train_model(
        hidden_layers, 0.001, 0.9, fold_dataset, 32, 50, False, False, 0.0, 0.0
    )
    mean_losses_tr.append(mean_train_loss)
    mean_losses_ts.append(mean_test_loss)



In [None]:
# One shared figure with the train curve (solid) and test curve (dashed) of
# every network trained above; networks are numbered from 1 in the legend.
plt.figure(figsize=(12, 4))

for network_no, (train_curve, test_curve) in enumerate(zip(mean_losses_tr, mean_losses_ts), start=1):
    plt.plot(train_curve, label='Mean Train Loss Network {}'.format(network_no))
    plt.plot(test_curve, label='Mean Test Loss Network {}'.format(network_no), linestyle='--')

plt.title('Mean Loss Over Epochs')
plt.legend(fontsize=6)
plt.xlabel('Epochs')
plt.ylabel('Loss')

In [None]:
# Run the best model of the above using the early stopping criterion and plot
# the loss per fold for all the epochs.
#
# The original call omitted the momentum argument, shifting every following
# positional argument by one slot; momentum=0.9 (the value used for the
# architecture comparison) is now passed explicitly.
# NOTE(review): the seventh argument (True) is presumably have_callback, i.e.
# the early-stopping switch - confirm against utils.train_model.
train_loss, test_loss = train_model([500, 300, 100], 0.001, 0.9, fold_dataset, 32, 150, True, False, 0.0, 0.0)

plt.figure(figsize=(12, 4))

# One solid (train) and one dashed (test) curve per fold.
for i, (tr_loss, ts_loss) in enumerate(zip(train_loss, test_loss)):
    plt.plot(tr_loss, label=f'Train Loss Fold {i + 1}')
    plt.plot(ts_loss, linestyle='--', label=f'Test Loss Fold {i + 1}')

plt.legend(fontsize=6)
plt.title('Loss Over Epochs for every Fold with Early Stopping')
plt.xlabel('Epochs')
plt.ylabel('Loss')

In [None]:
# Test the effects of combinations of various learning rate and momentum
# values. Every combination is trained with the same architecture and early
# stopping, its mean training loss is recorded in `final_means`, and the
# per-fold loss curves are drawn on a figure of their own.
#
# The plot title is built from the values actually passed to train_model; the
# first run (momentum 0.2) was previously mislabelled as m=0.6.
final_means = []

# (learning rate, momentum) pairs, in the order they were originally run.
lr_momentum_grid = [
    (0.001, 0.2),
    (0.001, 0.6),
    (0.05, 0.6),
    (0.1, 0.6),
]

for lr, momentum in lr_momentum_grid:
    train_loss, test_loss = train_model([500, 300, 100], lr, momentum, fold_dataset, 32, 150, True, False, 0.0, 0.0)
    final_means.append(mean_of_all(train_loss))

    plt.figure(figsize=(12, 4))

    # One solid (train) and one dashed (test) curve per fold.
    for i, (tr_loss, ts_loss) in enumerate(zip(train_loss, test_loss)):
        plt.plot(tr_loss, label=f'Train Loss Fold {i + 1}')
        plt.plot(ts_loss, linestyle='--', label=f'Test Loss Fold {i + 1}')

    plt.legend(fontsize=6)
    plt.title(f'Loss Over Epochs per Fold with Early Stopping, η={lr}, m={momentum}')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')

In [None]:
# Test the network using dropout regularization with various values for the
# dropout rates of the input and hidden layers. Every configuration is trained
# with early stopping (have_callback=True), its mean training loss is recorded
# in `final_means`, and the per-fold loss curves get a figure of their own.
final_means = []

# (learning rate, input dropout rate, hidden dropout rate), in the order the
# runs were originally performed.
# NOTE(review): the third run uses learning rate 0.05 while the others use
# 0.001, which confounds the dropout comparison - confirm this is intended.
dropout_grid = [
    (0.001, 0.8, 0.5),
    (0.001, 0.5, 0.5),
    (0.05, 0.8, 0.2),
]

for lr, rate_in, rate_h in dropout_grid:
    train_loss, test_loss = train_model(
        [500, 300, 100], lr, 0.9, fold_dataset, 32, 150,
        have_callback=True, dropout=True, dropout_in=rate_in, dropout_h=rate_h,
    )
    final_means.append(mean_of_all(train_loss))

    plt.figure(figsize=(12, 4))

    # One solid (train) and one dashed (test) curve per fold.
    for i, (tr_loss, ts_loss) in enumerate(zip(train_loss, test_loss)):
        plt.plot(tr_loss, label=f'Train Loss Fold {i + 1}')
        plt.plot(ts_loss, linestyle='--', label=f'Test Loss Fold {i + 1}')

    plt.legend(fontsize=6)
    plt.title(f'Loss Over Epochs per Fold with Dropout regularization, r_in={rate_in}, r_h= {rate_h}')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')