In [1]:
# Imports

import neural_network 
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from neural_network import train_model
import matplotlib.pyplot as plt

In [None]:
# Read the tab-separated dataset from disk into a pandas DataFrame.
df = pd.read_csv('iphi2802.csv', sep='\t')

# Report the raw (rows, columns) dimensions before any processing.
print(f"Original shape of Dataset: {df.shape}")

# Summarise the columns (dtypes, non-null counts) via DataFrame.info().
print('\nDataset Information:')
df.info()

# Count the missing entries in every column.
null_counts = df.isna().sum()
print('\nNumber of NULL values per column:')
print(null_counts)

# Count the distinct values in every column.
unique_counts = df.nunique()
print('\nNumber of unique values per column:')
print(unique_counts)

# Add the regression target 'mean_date': the row-wise mean of 'date_min'
# and 'date_max'.
df['mean_date'] = df.loc[:, ['date_min', 'date_max']].mean(axis='columns')


# Initialize the TF-IDF vectorizer with the 'greek' stop-word list from nltk
# and transform the 'text' column into a TF-IDF matrix of at most 8000 columns
# (max_features caps the size of the learned vocabulary).
try:
    stopwords = nltk.corpus.stopwords.words('greek')
except LookupError:
    # The stop-word corpus is a separate download; fetch it once so the
    # notebook also runs on a fresh environment, then retry the lookup.
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('greek')

vectorizer = TfidfVectorizer(stop_words=stopwords, max_features=8000)
index_matrix = vectorizer.fit_transform(df['text'].to_list())

# Keep the vectorizer's outputs available for inspection: the matrix shape,
# the learned idf weight of every term, and the vocabulary terms in sorted order.
shape = index_matrix.shape
idf_values = vectorizer.idf_
vocab = sorted(vectorizer.vocabulary_)

# Convert the input/target matrices for normalization: densify the TF-IDF
# matrix and reshape the mean dates into a single-column 2-D array, which is
# the layout the scaler expects.
texts = index_matrix.toarray()
dates = df['mean_date'].values.reshape(-1, 1)

# Scale the TF-IDF matrix (input) and the mean dates (output) with two
# dedicated MinMaxScaler instances. Refitting one shared scaler on the dates
# would discard the feature-scaling parameters; separate instances keep both
# fits, so predictions can later be mapped back with y_scaler.inverse_transform.
# NOTE(review): both scalers are fitted on the full dataset before the
# cross-validation split, so test-fold statistics influence the scaling.
# Consider fitting per training fold if a leakage-free estimate is required.
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
X = x_scaler.fit_transform(texts)
y = y_scaler.fit_transform(dates)

# Backward-compatible alias: `scaler` has always ended up fitted on the dates.
scaler = y_scaler

# Build the 5-fold cross-validation splits. Each fold is described by a
# dictionary holding its 1-based index plus its train/test arrays, and all
# dictionaries are collected in the fold_dataset list.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_dataset = []

for fold_index, (train_index, test_index) in enumerate(kf.split(X), start=1):

    # Training portion of this fold
    X_train, y_train = X[train_index], y[train_index]

    # Held-out portion of this fold
    X_test, y_test = X[test_index], y[test_index]

    # Bundle the fold index with its train/test data and keep it
    fold_data = dict(
        fold_index=fold_index,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )
    fold_dataset.append(fold_data)

In [None]:
# Train and evaluate models of 1 hidden layer

# One network is trained per entry; the value is passed as the first argument
# of train_model (presumably the hidden-layer width; confirm in neural_network).
hidden_layer_sizes = [1000, 750, 500, 250]

# All runs share the remaining arguments. train_model is unpacked into three
# values per run: train losses, test losses and mean train loss.
single_layer_results = [
    train_model(size, 0.001, True, fold_dataset, 32, 50)
    for size in hidden_layer_sizes
]

# Keep the original per-network names so later cells can still refer to them.
(
    (tr_loss1, ts_loss1, mean_tr_loss1),
    (tr_loss2, ts_loss2, mean_tr_loss2),
    (tr_loss3, ts_loss3, mean_tr_loss3),
    (tr_loss4, ts_loss4, mean_tr_loss4),
) = single_layer_results

mean_train_losses = [mean_tr_loss1, mean_tr_loss2, mean_tr_loss3, mean_tr_loss4]
test_losses = [ts_loss1, ts_loss2, ts_loss3, ts_loss4]

# Plot the mean train losses over epochs and test losses over folds for the models
plt.figure(figsize=(12, 4))

for i, loss in enumerate(mean_train_losses):
    plt.plot(loss, label='Mean Train Loss Network {}'.format(i + 1))

plt.xlabel('Epoch')
plt.ylabel('Mean train loss')
plt.legend()
plt.title('Mean Loss Over Epochs')

plt.figure(figsize=(12, 4))

for network_number, fold_losses in enumerate(test_losses, start=1):
    plt.plot(fold_losses, label='Test Loss Network {}'.format(network_number))

plt.xlabel('Fold')
plt.ylabel('Test loss')
plt.legend()
# This figure shows the test losses, so the title says "Test" (it used to say "Train").
plt.title('Test Loss Over Folds per Model')

In [None]:
# Train and evaluate models for two hidden layers

# One network is trained per entry; each pair is passed as the first argument
# of train_model (presumably the widths of the two hidden layers; confirm in
# neural_network).
hidden_layer_configs = [[250, 50], [400, 200], [500, 300]]

# All runs share the remaining arguments. train_model is unpacked into three
# values per run: train losses, test losses and mean train loss.
multi_layer_results = [
    train_model(layers, 0.001, True, fold_dataset, 32, 50)
    for layers in hidden_layer_configs
]

# Keep the original per-network names so later cells can still refer to them.
(
    (tr_loss_ml_1, ts_loss_ml_1, mean_tr_loss_ml_1),
    (tr_loss_ml_2, ts_loss_ml_2, mean_tr_loss_ml_2),
    (tr_loss_ml_3, ts_loss_ml_3, mean_tr_loss_ml_3),
) = multi_layer_results

mean_loss_ml = [mean_tr_loss_ml_1, mean_tr_loss_ml_2, mean_tr_loss_ml_3]
test_loss_ml = [ts_loss_ml_1, ts_loss_ml_2, ts_loss_ml_3]

# Mean train loss per epoch, one curve per network
plt.figure(figsize=(12, 4))

for i, loss in enumerate(mean_loss_ml):
    plt.plot(loss, label='Mean Train Loss Multilayered Network {}'.format(i + 1))

plt.xlabel('Epoch')
plt.ylabel('Mean train loss')
plt.legend()
plt.title('Mean Loss Over Epochs')

# Test loss per fold, one curve per network
plt.figure(figsize=(12, 4))

for network_number, fold_losses in enumerate(test_loss_ml, start=1):
    plt.plot(fold_losses, label='Test Loss Network {}'.format(network_number))

plt.xlabel('Fold')
plt.ylabel('Test loss')
plt.legend()
# This figure shows the test losses, so the title says "Test" (it used to say "Train").
plt.title('Test Loss Over Folds per Model')