# Fourth Task: Prediction

### Import of the needed libraries and the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

import pydotplus
import os

from IPython.display import Image  
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from joblib import dump

import matplotlib.pyplot as plt

# Load the two input tables from CSV (presumably the transformed outputs of the earlier tasks — confirm)
cyclists = pd.read_csv('./dataset/cyclists_trasformed.csv')
races = pd.read_csv('./dataset/races_trasformed.csv')

# Make sure the folder that receives the serialized models exists (no error if it already does)
os.makedirs('./models', exist_ok=True)

# Data Preparation

Before employing the learning algorithms, we need to perform a few adjustments to our data. 

First, we merge the two datasets and drop the columns that are not useful for the prediction task.

Then, we add an attribute specifying whether a cyclist finished in the first 20 positions of a race or not.

We delete:

• *name* (coming from cyclist), since it is a repetition for *_url* in cyclist

• *name* (coming from races), since it is a repetition for *_url* in races

• *weight* and *height* (coming from cyclist), since these characteristics are combined in the *bmi* feature we created 

• *avg_position* and *avg_delta* (coming from cyclist), since it is better to consider more precise columns like *position* (see next) and *delta* from races

• *position* (coming from races) since we consider only the first 20 we will get from the new column 

• *birth_year* (coming from cyclist) because it's redundant having *cyclist_age* from races

• *cyclist_team* (coming from races) since we consider single cyclists

• *date* (coming from races) whose format is: "YYYY-MM-DD HH-MM-SS". From it, we can extract the information we need: we exclude the "HH-MM-SS" and the "MM-DD" part because we can easily group races based on the *season* of the year (attribute that we already have). At the end, we only care about the year for splitting the races for the training set.

We add:

• *top_20*, having value =1 if the corresponding cyclist was in between these positions or, on the opposite, =0.

**NB**: we drop *position* after creating the column *top_20*, since it is needed to fill the new one correctly.

In [None]:
# Outer-merge `races` with `cyclists`, matching `races.cyclist` against `cyclists._url`.
# `indicator=True` adds a `_merge` column telling whether each row was found in both
# frames, only in the left one (`races`) or only in the right one (`cyclists`).
data_merged = pd.merge(races, cyclists, left_on='cyclist', right_on='_url', how='outer', indicator=True)

# Rows present on one side only: per the original note these are cyclists that never
# took part in a competition, so they are reported and then excluded from the prediction.
mismatched = data_merged[data_merged['_merge'] != 'both']
print(f"Number of mismatched entries: {len(mismatched)}")
print(mismatched)

# Keep only the rows matched on both sides and drop the helper `_merge` column.
# The drop is chained (not `inplace=True`) so the result is a fresh DataFrame: an
# in-place drop on a boolean-filtered slice can raise pandas' SettingWithCopyWarning.
data_merged = data_merged[data_merged['_merge'] == 'both'].drop(columns='_merge')


Following the specification, we create the new attribute *top_20* where the value is '1' if in the row the attribute *position* ranges from 0 to 19, '0' otherwise. 

As stated before, we drop *position* since is useless from now on and modify *date* so that it contains only the year. 


In [None]:
# Single pipeline over the merged table:
#   1. give the two `_url` columns produced by the merge explicit names
#   2. remove the columns listed as not useful (position is still needed here)
#   3. derive the binary target `top_20` from `position`
#   4. remove `position`, which has now served its purpose
data_merged = (
    data_merged
    .rename(columns={'_url_x': 'race_url', '_url_y':'cyclist_url'})
    .drop(columns=['name_x', 'name_y', 'cyclist', 'weight', 'height', 'birth_year', 'avg_position', 'avg_delta', 'cyclist_team'])
    .assign(top_20=lambda frame: (frame['position'] < 20).astype(int))
    .drop(columns=['position'])
)

data_merged.head()

In [None]:
# Print the column names, dtypes and non-null counts of the merged table
data_merged.info()

The learning algorithms require the categorical data to be transformed into numerical ones.

In order to do this, we define the following function, which assigns an integer code (starting from 0, following the sorted order of the values) to each distinct value of a categorical attribute and replaces the original values with these codes.

We cast the boolean values for *is_tarmac* to int. 

At the end, we are ready to define our 'train_set' and 'test_set' variables based on the year we get from *date*:

- Training set: needed to train models.
- Test set: needed to test the models on never-seen data.

In [117]:
# Function to discretize the variables
# Input: the dataset and the list of variables' names to discretize
def discretize_data(dataset, variables):
    """Replace categorical columns with integer codes.

    Each listed column gets its distinct values sorted, and every value is
    replaced by its rank in that order (0 for the smallest value, 1 for the
    next one, and so on).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the columns to encode. It is not modified.
    variables : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` in which the listed columns hold integer codes.

    Raises
    ------
    ValueError
        If one of the listed columns contains missing values, which have no
        integer code.
    """
    # Work on a copy so the caller's frame is left untouched and the cell can be re-run
    encoded = dataset.copy()
    for variable in variables:
        # Distinct values in sorted order; NaN is excluded because it cannot be
        # ordered against strings and has no meaningful code
        categories = sorted(encoded[variable].dropna().unique())
        # value -> position in the sorted list (codes start at 0)
        mapping = {value: code for code, value in enumerate(categories)}
        codes = encoded[variable].map(mapping)
        if codes.isna().any():
            raise ValueError(
                f"Column '{variable}' contains missing values and cannot be discretized"
            )
        encoded[variable] = codes.astype(int)
    return encoded

In [118]:
# Columns whose values must be replaced by integer codes
categorical_variables = ['race_url', 'season', 'cyclist_url', 'nationality', 'continent']
data_merged = discretize_data(dataset=data_merged, variables=categorical_variables)

# Remaining casts: `is_tarmac` flag -> integer, full timestamp -> calendar year only
data_merged = data_merged.assign(
    is_tarmac=data_merged['is_tarmac'].astype(int),
    date=pd.DatetimeIndex(data_merged['date']).year,
)

# Persist the fully numeric table
data_merged.to_csv('./dataset/data_merged.csv', index=False)

# Temporal hold-out: races before 2022 are used for training, races from 2022 onwards for testing
train_data = data_merged.loc[data_merged['date'] < 2022]
test_data = data_merged.loc[data_merged['date'] >= 2022]


In [None]:
# Preview the first rows of the training split
train_data.head()

In [None]:
# Column dtypes and non-null counts of the training split
train_data.info()

In [None]:
# Preview the first rows of the test split
test_data.head()

In [None]:
# Column dtypes and non-null counts of the test split
test_data.info()

For our machine learning purpose, the dataset has to be divided into two parts:

- Features: The input data containing the information needed by the model to make predictions (every attribute except *top_20*)
- Target: The output data you want the model to predict (precisely *top_20*).

So, we create two variables for both *train_data* and *test_data*.

In [123]:
# Targets: the binary `top_20` column of each split
train_target = train_data['top_20']
test_target = test_data['top_20']

# Features: every remaining column of each split
# NOTE(review): this includes the encoded identifiers and any race-result column still
# present in the table — confirm none of them leaks the outcome being predicted.
train_feature = train_data.drop(columns='top_20')
test_feature = test_data.drop(columns='top_20')

# Learning Algorithms

The models we choose can be divided in different categories:

1) Tree-Based Models (Decision Tree, Random Forest)

2) AdaBoost

3) Naïve Bayes

4) K-Nearest Neighbors (KNN)

5) Neural Network


`NB`: We tried to implement XGBoost, Rule-Based and SVM but after 15 minutes the methods were still running. 

In [124]:
def report_scores(test_label, test_pred):
    """Print the scikit-learn classification report for a binary top-20 prediction.

    Parameters
    ----------
    test_label : array-like
        Ground-truth labels (0 = 'Non-Top 20', 1 = 'Top 20').
    test_pred : array-like
        Labels predicted by the model, in the same order as ``test_label``.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    # zero_division=0 matches the report call used in the model-comparison loop:
    # a class that is never predicted is reported with precision 0 instead of
    # triggering an UndefinedMetricWarning.
    print(classification_report(test_label,
                                test_pred,
                                target_names=['Non-Top 20', 'Top 20'],
                                zero_division=0))

## Tree-Based Models (Decision Tree, Random Forest)


### Decision Tree Model

In [None]:
# Shallow, class-balanced decision tree
dt = tree.DecisionTreeClassifier(
    random_state=42,                # fixed seed so the fitted tree is repeatable
    class_weight='balanced',        # re-weight classes inversely to their frequency (class 1 has less support)
    criterion='gini',               # split quality measured with the Gini impurity
    splitter='best',                # always take the best split at each node
    max_depth=5,                    # at most 5 levels
    min_samples_leaf=4,             # every leaf keeps at least 4 samples
    min_samples_split=3,            # a node needs at least 3 samples to be split
)

# Fit on the training split (fit() returns the estimator itself, so `dt` is the trained model)
dt.fit(train_feature, train_target)

# Serialize the trained tree
dump(dt, './models/decision_tree.joblib')

Visualizing the actual Decision Tree obtained: 

In [None]:
# Rendering the tree requires the GraphViz binaries to be installed:
#   macOS:   brew install graphviz
#   Linux:   sudo apt-get install graphviz
#   Windows: install from https://graphviz.org/download/
#            and add its bin folder to PATH (the path can change), e.g.
#            import os
#            os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz2.38/bin/'

# Export the fitted tree to DOT source, labelling nodes with feature and class names
dot_data = tree.export_graphviz(
    dt,
    out_file=None,
    feature_names=train_feature.columns.tolist(),
    class_names=['Non-Top 20', 'Top 20'],
    rounded=True,
    filled=True,
)
# Parse the DOT source and show the rendered PNG as the cell output
graph = pydotplus.graph_from_dot_data(dot_data)
Image(data=graph.create_png())


In [None]:
# Decision Tree: predicted labels for the held-out test split
test_pred_dt = dt.predict(X=test_feature)

# Per-class precision / recall / F1 of those predictions
report_scores(test_label=test_target, test_pred=test_pred_dt)

How to read the result:
- **Non-Top 20**: 
    - Precision: 0.95 - Of all the predictions that the model classified as ‘Non-Top 20’, 95% were correct.
    - Recall: 0.75: 75% of the riders actually ‘Non-Top 20’ were correctly identified by the model.
    - F1-Score: 0.84 - Represents the balance between precision and recall, and is very high for this class, indicating that the model is excellent at correctly distinguishing ‘Non-Top 20’ cyclists.
    - Support: 30.466 - Indicates the total number of true samples belonging to the ‘Non-Top 20’ class.
- **Top 20**: 
    - Precision: 0.33 - Of all predictions classified as ‘Top 20’, only 33% are correct. This indicates that the model tends to include false positives.
    - Recall: 0.77 - 77% of the cyclists actually in the ‘Top 20’ were recognised correctly. 
    - F1-Score: 0.46 - Being the balance between precision and recall, the mid-low value suggests that the model has difficulty with the ‘Top 20’ class.
    - Support: 4.940 - Indicates the total number of true samples belonging to the ‘Top 20’ class.
- **Accuracy** 
    - Accuracy: 0.75 - Percentage of correct predictions out of the total data. Although the value is high, it is influenced by the strong dominance of the Non-Top 20 class (majority class).
- **Macro Average**
    - Precision: 0.64 - Arithmetic mean of the precision of the two classes.
    - Recall: 0.76 - Arithmetic mean of the recall of the two classes. Both classes have a similar recall (0.75 and 0.77), so the mean is close to each of them.
    - F1-Score: 0.65 - Arithmetic mean of the F1-Score of the two classes. Reflects the difficulty of the model in handling the ‘Top 20’ class.
- **Weighted Average**.
    - Precision: 0.87 - Weighted average of the precision, considering the support (size) of each class.
    - Recall: 0.75 - Weighted average of recall, strongly influenced by the high recall of the ‘Non-Top 20’ class.
    - F1-Score: 0.79 - Weighted average of the F1-Score.

In [None]:
# Decision Tree confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true=test_target, y_pred=test_pred_dt)
print("Confusion Matrix:")
print(cm)

# Draw it, labelling the axes with the classes seen by the tree
disp = ConfusionMatrixDisplay(cm, display_labels=dt.classes_).plot()
plt.show()

How to read the results:

- True Negatives (TN): 22794 - Correct predictions for class 0.

- False Positives (FP): 7672 - Incorrect predictions that indicated 1 instead of 0.

- False Negatives (FN): 1132 - Incorrect predictions that indicated 0 instead of 1.

- True Positives (TP): 3808 - Correct predictions for class 1.

### Random Forest Model

In [None]:
# Class-balanced random forest of 100 depth-limited trees
rf = RandomForestClassifier(
    random_state=42,          # fixed seed so the forest is repeatable
    class_weight='balanced',  # re-weight classes inversely to their frequency (class 1 has less support)
    n_estimators=100,         # number of trees
    criterion='gini',         # split quality measured with the Gini impurity
    max_depth=10,             # at most 10 levels per tree
    min_samples_split=5,      # a node needs at least 5 samples to be split
)

# Fit on the training split (fit() returns the estimator itself)
rf.fit(train_feature, train_target)

# Serialize the trained forest
dump(rf, './models/random_forest.joblib')

In [None]:
# Random Forest: predicted labels for the held-out test split
test_pred_rf = rf.predict(X=test_feature)

# Per-class precision / recall / F1 of those predictions
report_scores(test_label=test_target, test_pred=test_pred_rf)

How to read the result:
- **Non-Top 20**: 
    - Precision: 0.96 - Of all the predictions that the model classified as ‘Non-Top 20’, 96% were correct.
    - Recall: 0.81: 81% of the riders actually ‘Non-Top 20’ were correctly identified by the model.
    - F1-Score: 0.88 - Represents the balance between precision and recall, and is very high for this class, indicating that the model is excellent at correctly distinguishing ‘Non-Top 20’ cyclists.
    - Support: 30.466 - Indicates the total number of true samples belonging to the ‘Non-Top 20’ class.
- **Top 20**: 
    - Precision: 0.40 - Of all predictions classified as ‘Top 20’, only 40% are correct. This indicates that the model tends to include false positives.
    - Recall: 0.78 - 78% of the cyclists actually in the ‘Top 20’ were recognised correctly. 
    - F1-Score: 0.53 - Being the balance between precision and recall, the mid-low value suggests that the model has difficulty with the ‘Top 20’ class.
    - Support: 4.940 - Indicates the total number of true samples belonging to the ‘Top 20’ class.
- **Accuracy** 
    - Accuracy: 0.81 - Percentage of correct predictions out of the total data. Although the value is high, it is influenced by the strong dominance of the Non-Top 20 class (majority class).
- **Macro Average**
    - Precision: 0.68 - Arithmetic mean of the precision of the two classes.
    - Recall: 0.80 - Arithmetic mean of the recall of the two classes. 
    - F1-Score: 0.70 - Arithmetic mean of the F1-Score of the two classes. 
- **Weighted Average**.
    - Precision: 0.88 - Weighted average of the precision, considering the support (size) of each class.
    - Recall: 0.81 - Weighted average of recall, strongly influenced by the high recall of the ‘Non-Top 20’ class.
    - F1-Score: 0.83 - Weighted average of the F1-Score.

In [None]:
# Random Forest confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true=test_target, y_pred=test_pred_rf)
print("Confusion Matrix:")
print(cm)

# Draw it, labelling the axes with the classes seen by the forest
disp = ConfusionMatrixDisplay(cm, display_labels=rf.classes_).plot()
plt.show()

How to read the results:

- True Negatives (TN): 24635 - Correct predictions for class 0.

- False Positives (FP): 5831 - Incorrect predictions that indicated 1 instead of 0.

- False Negatives (FN): 1065 - Incorrect predictions that indicated 0 instead of 1.

- True Positives (TP): 3875 - Correct predictions for class 1.

### Final comparisons


In [None]:
# Trained tree-based models, keyed by the name shown in the output
models = {
    "Decision Tree": dt,
    "Random Forest": rf
}

# For each model: predict on the test split, then print accuracy,
# confusion matrix and classification report under a header
for model_name, model in models.items():
    test_pred = model.predict(test_feature)
    acc = accuracy_score(test_target, test_pred)
    conf = confusion_matrix(test_target, test_pred)
    report = classification_report(
        test_target,
        test_pred,
        target_names=["Non Top 20", "Top 20"],
        zero_division=0,
    )
    print(
        f"\n--- {model_name} ---",
        f"Accuracy: {acc:.4f}",
        f"Confusion Matrix:\n{conf}",
        f"Classification Report:\n{report}",
        sep="\n",
    )


## AdaBoost

Two versions of the AdaBoost have been implemented. 

The first one considers its basics features, so:
- *base*: Decision Tree with height = 1 
- *n_estimators*: 50
- *learning rate*: 1

Notice that this configuration is simple and fast to train, but the low number of estimators and the high learning rate could lead to imprecise results.

In [None]:
# AdaBoost with scikit-learn's default configuration.
# random_state is fixed, as for the other models in this notebook, so that
# re-running the cell reproduces the same ensemble.
clf = AdaBoostClassifier(random_state=42)
clf.fit(train_feature, train_target)

# Save the model
dump(clf, './models/adaboost.joblib')

# Predict on the test split and report the scores with the shared helper
test_pred_clf = clf.predict(test_feature)
report_scores(test_target, test_pred_clf)

How to read the result:
- **Non-Top 20**: 
    - Precision: 0.87 - Of all the predictions that the model classified as ‘Non-Top 20’, 87% were correct.
    - Recall: 1.00: 100% of the riders actually ‘Non-Top 20’ were correctly identified by the model.
    - F1-Score: 0.93 - Represents the balance between precision and recall, and is very high for this class, indicating that the model is excellent at correctly distinguishing ‘Non-Top 20’ cyclists.
    - Support: 30.466 - Indicates the total number of true samples belonging to the ‘Non-Top 20’ class.
- **Top 20**: 
    - Precision: 0.85 - Of all predictions classified as ‘Top 20’, 85% are correct. 
    - Recall: 0.05 - Only 5% of the cyclists actually in the ‘Top 20’ were recognised correctly. This indicates that the model is not effective in capturing true positives in this class.
    - F1-Score: 0.10 - Being the balance between precision and recall, the low value suggests that the model has difficulty with the ‘Top 20’ class.
    - Support: 4.940 - Indicates the total number of true samples belonging to the ‘Top 20’ class.
- **Accuracy** 
    - Accuracy: 0.87 - Percentage of correct predictions out of the total data. Although the value is high, it is influenced by the strong dominance of the Non-Top 20 class (majority class).
- **Macro Average**
    - Precision: 0.86 - Arithmetic mean of the precision of the two classes.
    - Recall: 0.53 - Arithmetic mean of the recall of the two classes. Low due to extremely low recall for the ‘Top 20’ class.
    - F1-Score: 0.51 - Arithmetic mean of the F1-Score of the two classes. Reflects the difficulty of the model in handling the ‘Top 20’ class.
- **Weighted Average**.
    - Precision: 0.86 - Weighted average of the precision, considering the support (size) of each class.
    - Recall: 0.87 - Weighted average of recall, strongly influenced by the high recall of the ‘Non-Top 20’ class.
    - F1-Score: 0.81 - Weighted average of the F1-Score.

In [None]:
# Compute and visualize the confusion matrix of the default AdaBoost model
cm = confusion_matrix(test_target, test_pred_clf)
print("Confusion Matrix:")
print(cm)
# Label the axes with the classes of the model being evaluated (clf), not those of
# the Random Forest, so this cell does not depend on an unrelated estimator
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.show()

How to read the results:

- True Negatives (TN): 30421 - Correct predictions for class 0.

- False Positives (FP): 45 - Incorrect predictions that indicated 1 instead of 0.

- False Negatives (FN): 4677 - Incorrect predictions that indicated 0 instead of 1.

- True Positives (TP): 263 - Correct predictions for class 1.

The second version, instead, employs the previously computed decision tree as a base, has a higher number of estimators (200) and a lower learning rate.

In [None]:
# AdaBoost built on the decision tree configured above (the ensemble uses it as the
# template for its weak learners), with more estimators and a smaller learning rate.
# random_state is fixed, as for the other models in this notebook, so that the
# boosted trees are reproducible across runs.
clf2 = AdaBoostClassifier(estimator=dt, n_estimators=200, learning_rate=0.1, random_state=42)
clf2.fit(train_feature, train_target)

# Save the model
dump(clf2, './models/adaboost2.joblib')

# Predict on the test split and report the scores with the shared helper
test_pred_clf2 = clf2.predict(test_feature)
report_scores(test_target, test_pred_clf2)

How to read the result:
- **Non-Top 20**: 
    - Precision: 0.96 - Of all the predictions that the model classified as ‘Non-Top 20’, 96% were correct.
    - Recall: 0.75: 75% of the riders actually ‘Non-Top 20’ were correctly identified by the model.
    - F1-Score: 0.84 - Represents the balance between precision and recall, and is very high for this class, indicating that the model is excellent at correctly distinguishing ‘Non-Top 20’ cyclists.
    - Support: 30.466 - Indicates the total number of true samples belonging to the ‘Non-Top 20’ class.
- **Top 20**: 
    - Precision: 0.34 - Of all predictions classified as ‘Top 20’, only 34% are correct. This indicates that the model tends to include false positives.
    - Recall: 0.80 - 80% of the cyclists actually in the ‘Top 20’ were recognised correctly. 
    - F1-Score: 0.47 - Being the balance between precision and recall, the low value suggests that the model has difficulty with the ‘Top 20’ class.
    - Support: 4.940 - Indicates the total number of true samples belonging to the ‘Top 20’ class.
- **Accuracy** 
    - Accuracy: 0.75 - Percentage of correct predictions out of the total data. Although the value is high, it is influenced by the strong dominance of the Non-Top 20 class (majority class).
- **Macro Average**
    - Precision: 0.65 - Arithmetic mean of the precision of the two classes.
    - Recall: 0.77 - Arithmetic mean of the recall of the two classes. Both classes have a fairly high recall (0.75 and 0.80).
    - F1-Score: 0.66 - Arithmetic mean of the F1-Score of the two classes. Reflects the difficulty of the model in handling the ‘Top 20’ class.
- **Weighted Average**.
    - Precision: 0.87 - Weighted average of the precision, considering the support (size) of each class.
    - Recall: 0.75 - Weighted average of recall, strongly influenced by the high recall of the ‘Non-Top 20’ class.
    - F1-Score: 0.79 - Weighted average of the F1-Score.

In [None]:
# Compute and visualize the confusion matrix of the enhanced AdaBoost model
cm = confusion_matrix(test_target, test_pred_clf2)
print("Confusion Matrix:")
print(cm)
# Label the axes with the classes of the model being evaluated (clf2), not those of
# the Random Forest, so this cell does not depend on an unrelated estimator
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf2.classes_)
disp.plot()
plt.show()

How to read the results:

- True Negatives (TN): 22710 - Correct predictions for class 0.

- False Positives (FP): 7756 - Incorrect predictions that indicated 1 instead of 0.

- False Negatives (FN): 1001 - Incorrect predictions that indicated 0 instead of 1.

- True Positives (TP): 3939 - Correct predictions for class 1.

## Naïve Bayes

In [None]:
# Naïve Bayes training
gnb = GaussianNB()
gnb.fit(train_feature, train_target)

# Save the model
dump(gnb, './models/naive_bayes.joblib')

# Prediction employing Naive Bayes
test_pred_gnb = gnb.predict(test_feature)  

# Compute the performance of the model
report_scores(test_target, test_pred_gnb)


How to read the result:
- **Non-Top 20**: 
    - Precision: 0.88 - Of all the predictions that the model classified as ‘Non-Top 20’, 88% were correct.
    - Recall: 0.88: 88% of the riders actually ‘Non-Top 20’ were correctly identified by the model.
    - F1-Score: 0.88 - Represents the balance between precision and recall, and is very high for this class, indicating that the model is excellent at correctly distinguishing ‘Non-Top 20’ cyclists.
    - Support: 30.466 - Indicates the total number of true samples belonging to the ‘Non-Top 20’ class.
- **Top 20**: 
    - Precision: 0.24 - Of all predictions classified as ‘Top 20’, only 24% are correct. This indicates that the model tends to include false positives.
    - Recall: 0.23 - Only 23% of the cyclists actually in the ‘Top 20’ were recognised correctly. This indicates that the model is not effective in capturing true positives in this class.
    - F1-Score: 0.23 - Being the balance between precision and recall, the mid-low value suggests that the model has difficulty with the ‘Top 20’ class.
    - Support: 4.940 - Indicates the total number of true samples belonging to the ‘Top 20’ class.
- **Accuracy** 
    - Accuracy: 0.79 - Percentage of correct predictions out of the total data. Although the value is high, it is influenced by the strong dominance of the Non-Top 20 class (majority class).
- **Macro Average**
    - Precision: 0.56 - Arithmetic mean of the precision of the two classes.
    - Recall: 0.56 - Arithmetic mean of the recall of the two classes. Low due to extremely low recall for the ‘Top 20’ class.
    - F1-Score: 0.56 - Arithmetic mean of the F1-Score of the two classes. Reflects the difficulty of the model in handling the ‘Top 20’ class.
- **Weighted Average**.
    - Precision: 0.79 - Weighted average of the precision, considering the support (size) of each class.
    - Recall: 0.79 - Weighted average of recall, strongly influenced by the high recall of the ‘Non-Top 20’ class.
    - F1-Score: 0.79 - Weighted average of the F1-Score.

In [None]:
# Compute and visualize the confusion matrix of the Naïve Bayes model
cm = confusion_matrix(test_target, test_pred_gnb)
print("Confusion Matrix:")
print(cm)
# Label the axes with the classes of the model being evaluated (gnb), not those of
# the Decision Tree, so this cell does not depend on an unrelated estimator
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=gnb.classes_)
disp.plot()
plt.show()

How to read the results:

- True Negatives (TN): 26782 - Correct predictions for class 0.

- False Positives (FP): 3684 - Incorrect predictions that indicated 1 instead of 0.

- False Negatives (FN): 3795 - Incorrect predictions that indicated 0 instead of 1.

- True Positives (TP): 1145 - Correct predictions for class 1.

## K-Nearest Neighbors (KNN)

In [None]:
# Definition of KNN model with 3 neighbours and ‘ball_tree’ algorithm
knn = KNeighborsClassifier(n_neighbors=3, algorithm='ball_tree', metric='minkowski')

# Train KNN on the training data
knn.fit(train_feature, train_target) 

# Save the model
dump(knn, './models/knn.joblib')

# Prediction
test_pred_knn = knn.predict(test_feature)    # Predictions on the test set

# Compute the performance of the model
report_scores(test_target, test_pred_knn)


How to read the result:
- **Non-Top 20**: 
    - Precision: 0.90 - Of all the predictions that the model classified as ‘Non-Top 20’, 90% were correct.
    - Recall: 0.86: 86% of the riders actually ‘Non-Top 20’ were correctly identified by the model.
    - F1-Score: 0.88 - Represents the balance between precision and recall, and is very high for this class, indicating that the model is excellent at correctly distinguishing ‘Non-Top 20’ cyclists.
    - Support: 30.466 - Indicates the total number of true samples belonging to the ‘Non-Top 20’ class.
- **Top 20**: 
    - Precision: 0.30 - Of all predictions classified as ‘Top 20’, only 30% are correct. This indicates that the model tends to include false positives.
    - Recall: 0.38 - Only 38% of the cyclists actually in the ‘Top 20’ were recognised correctly. This indicates that the model is not effective in capturing true positives in this class.
    - F1-Score: 0.34 - Being the balance between precision and recall, the low value suggests that the model has difficulty with the ‘Top 20’ class.
    - Support: 4.940 - Indicates the total number of true samples belonging to the ‘Top 20’ class.
- **Accuracy** 
    - Accuracy: 0.79 - Percentage of correct predictions out of the total data. Although the value is high, it is influenced by the strong dominance of the Non-Top 20 class (majority class).
- **Macro Average**
    - Precision: 0.60 - Arithmetic mean of the precision of the two classes.
    - Recall: 0.62 - Arithmetic mean of the recall of the two classes. Low due to extremely low recall for the ‘Top 20’ class.
    - F1-Score: 0.61 - Arithmetic mean of the F1-Score of the two classes. Reflects the difficulty of the model in handling the ‘Top 20’ class.
- **Weighted Average**.
    - Precision: 0.81 - Weighted average of the precision, considering the support (size) of each class.
    - Recall: 0.79 - Weighted average of recall, strongly influenced by the high recall of the ‘Non-Top 20’ class.
    - F1-Score: 0.80 - Weighted average of the F1-Score.

In [None]:
# Compute and visualize the confusion matrix of the KNN model
cm = confusion_matrix(test_target, test_pred_knn)
print("Confusion Matrix:")
print(cm)
# Label the axes with the classes of the model being evaluated (knn), not those of
# the Decision Tree, so this cell does not depend on an unrelated estimator
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=knn.classes_)
disp.plot()
plt.show()

How to read the results:

- True Negatives (TN): 26211 - Correct predictions for class 0.

- False Positives (FP): 4255 - Incorrect predictions that indicated 1 instead of 0.

- False Negatives (FN): 3075 - Incorrect predictions that indicated 0 instead of 1.

- True Positives (TP): 1865 - Correct predictions for class 1.

## Neural Network

In [None]:
# Network inputs:
#   y_train - training labels as a float32 column vector, shape (samples, 1)
#   x_train - training features with an extra length-1 axis, shape (samples, 1, features)

y_train = np.asarray(train_target, dtype='float32')[:, np.newaxis]
x_train = np.expand_dims(train_feature.values, axis=1)

# Show both shapes as a sanity check
for array in (y_train, x_train):
    print(array.shape)

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout and batch shuffling are repeatable across runs,
# like the scikit-learn models above that use random_state=42.
tf.keras.utils.set_random_seed(42)

# This block defines a Sequential neural network model using TensorFlow:
#   - The input layer takes its shape, (1, n_features), from `x_train`, so the
#     model keeps working if the number of feature columns changes.
#   - A `Flatten` layer reduces each sample to a single vector.
#   - Two dense (fully connected) layers with 256 units and sigmoid activation,
#     each followed by a 10% dropout layer to limit overfitting.
#   - The final dense layer outputs a single sigmoid value, suitable for
#     binary classification.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=x_train.shape[1:]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='sigmoid'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(256, activation='sigmoid'),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Model compilation
model.compile(optimizer='adamax',  # Adamax optimizer
              loss='binary_crossentropy',  # Binary cross-entropy, appropriate for a binary target
              metrics=['accuracy'])  # Accuracy as the evaluation metric

history = model.fit(
    x_train, y_train,
    epochs=20,            # 20 full passes over the training data
    batch_size=256,       # Mini-batches of 256 samples
    validation_split=0.2  # Keras holds out the last 20% of the training data for validation
)

# Save the model
model.save('./models/neural_network.keras')

In [None]:
# Per-epoch metrics recorded by Keras during training
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)

# Training accuracy as blue dots, validation accuracy as a blue line
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training Acc')
ax.plot(epochs, val_acc, 'b', label='Validation Acc')
ax.set_title('Training and validation Acc')
ax.set_xlabel('Epochs')
ax.set_ylabel('Acc')
ax.legend()
plt.show()

In [None]:
# Give the test features the same (samples, 1, features) layout used for training
x_test = np.expand_dims(test_feature.values, axis=1)

# The network outputs a sigmoid score; scores above 0.5 are labelled 1, the rest 0
test_pred = (model.predict(x_test) > 0.5).astype('int32')

# Per-class precision / recall / F1 of those predictions
report_scores(test_label=test_target, test_pred=test_pred)

How to read the result:
- **Non-Top 20**: 
    - Precision: 0.87 - Of all the predictions that the model classified as ‘Non-Top 20’, 87% were correct.
    - Recall: 1.00: 100% of the riders actually ‘Non-Top 20’ were correctly identified by the model.
    - F1-Score: 0.93 - Represents the balance between precision and recall, and is very high for this class, indicating that the model is excellent at correctly distinguishing ‘Non-Top 20’ cyclists.
    - Support: 30.466 - Indicates the total number of true samples belonging to the ‘Non-Top 20’ class.
- **Top 20**: 
    - Precision: 0.76 - Of all predictions classified as ‘Top 20’, 76% are correct. 
    - Recall: 0.06 - 6% of the cyclists actually in the ‘Top 20’ were recognised correctly. The model is not effective in capturing true positives in this class.
    - F1-Score: 0.11 - Being the balance between precision and recall, the low value suggests that the model has difficulty with the ‘Top 20’ class.
    - Support: 4.940 - Indicates the total number of true samples belonging to the ‘Top 20’ class.
- **Accuracy** 
    - Accuracy: 0.87 - Percentage of correct predictions out of the total data. Although the value is high, it is influenced by the strong dominance of the Non-Top 20 class (majority class).
- **Macro Average**
    - Precision: 0.81 - Arithmetic mean of the precision of the two classes.
    - Recall: 0.53 - Arithmetic mean of the recall of the two classes. Low due to extremely low recall for the ‘Top 20’ class.
    - F1-Score: 0.52 - Arithmetic mean of the F1-Score of the two classes. Reflects the difficulty of the model in handling the ‘Top 20’ class.
- **Weighted Average**.
    - Precision: 0.85 - Weighted average of the precision, considering the support (size) of each class.
    - Recall: 0.87 - Weighted average of recall, strongly influenced by the high recall of the ‘Non-Top 20’ class.
    - F1-Score: 0.81 - Weighted average of the F1-Score.

In [None]:
# Compute and visualize the confusion matrix of the neural network
cm = confusion_matrix(test_target, test_pred)
print("Confusion Matrix:")
print(cm)
# The Keras model has no `classes_` attribute; the two labels of the binary
# `top_20` target are given explicitly instead of borrowing them from the
# Decision Tree, which made this cell depend on an unrelated estimator
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot()
plt.show()

How to read the results:

- True Negatives (TN): 30378 - Correct predictions for class 0.

- False Positives (FP): 88 - Incorrect predictions that indicated 1 instead of 0.

- False Negatives (FN): 4660 - Incorrect predictions that indicated 0 instead of 1.

- True Positives (TP): 280 - Correct predictions for class 1.

## Comparison between all models

- **ROC curve**: Displays the relationship between the True Positive Rate and the False Positive Rate. A curve closer to the upper left corner indicates better performance.

- **AUC (Area Under Curve)**: A higher AUC value indicates a better predictive ability of the model. The maximum value is 1 (perfect classifier), while 0.5 indicates a random model.

In [None]:
plt.figure(0).clf()  # Clear the current figure

# A ROC curve is traced by sweeping a threshold over continuous scores. Scoring
# hard 0/1 predictions collapses it to a single operating point and makes the AUC
# incomparable with the neural network, which is scored on probabilities.
# Every model is therefore scored on its estimated probability of the positive class.
# Column 1 of predict_proba is class 1: the labels are 0/1 and `classes_` is sorted.
test_pred_nn = model.predict(x_test).ravel()  # Sigmoid outputs of the Neural Network
roc_scores = {
    "DecisionTree": dt.predict_proba(test_feature)[:, 1],
    "RandomForest": rf.predict_proba(test_feature)[:, 1],
    "AdaBoost": clf.predict_proba(test_feature)[:, 1],
    "Enhanced AdaBoost": clf2.predict_proba(test_feature)[:, 1],
    "Naive Bayes": gnb.predict_proba(test_feature)[:, 1],
    "KNN": knn.predict_proba(test_feature)[:, 1],
    "Neural Network": test_pred_nn,
}

# One curve per model, with the AUC (rounded to 3 decimals for all of them) in the legend
for curve_name, scores in roc_scores.items():
    fpr, tpr, thresh = metrics.roc_curve(test_target, scores)
    auc = metrics.roc_auc_score(test_target, scores)
    plt.plot(fpr, tpr, label=curve_name + ", auc=" + str(round(auc, 3)))

# Layout
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Comparison of ROC Curves')
plt.legend(loc=0)
plt.show()
