In [1]:
import numpy as numpy
import numpy as np
import pandas as pd
from model import KNN
from metrics import *

# Preprocessing
The preprocessing step is divided into multiple parts, one is importing the data from a csv to a compatible format for further processing.<br>
This can be done using the `pandas.read_csv()` function, which returns a `DataFrame` object which can then be used to create the model.

In [34]:
# Read the raw weather observations from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer="./weather_forecast_data.csv")
# Preview the first five rows to confirm the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure,Rain
0,23.720338,89.592641,7.335604,50.501694,1032.378759,rain
1,27.879734,46.489704,5.952484,4.990053,992.61419,no rain
2,25.069084,83.072843,1.371992,14.855784,1007.23162,no rain
3,23.62208,74.367758,7.050551,67.255282,982.632013,rain
4,20.59137,96.858822,4.643921,47.676444,980.825142,no rain


## Data Cleaning
The dataset can contain invalid values, which could lead to errors when trying to process the dataset. In order to maintain reliability of the dataset,<br> missing values are filled using the `DataFrame.fillna()` method, with the filling value being the mean of the respective column. Furthermore, the classification<br>
column's datatype is converted to an integer for easier comparison.

In [35]:
# Normalise the column labels: strip surrounding whitespace, then lower-case.
df = df.rename(columns=lambda name: name.strip().lower())

# Report how many missing values each column has before anything is imputed.
print("Null columns:\n")
display(df.isnull().sum().to_frame(name="null_values"))

# By convention in this notebook the last column is the class label and every
# other column is a numeric feature.
feature_columns = df.columns[:-1]
label_column = df.columns[-1]

# Cast the features to float and impute missing values with the column mean.
# Only the feature columns are imputed: a mean is undefined for the textual
# label column and would raise if it were attempted there.
df[feature_columns] = df[feature_columns].astype(np.float64)
df[feature_columns] = df[feature_columns].fillna(df[feature_columns].mean())

# Encode the label as an integer: values starting with "no" become 0, anything
# else becomes 1. The guard keeps the cell re-runnable, since once the column
# is numeric there is nothing left to encode.
if not pd.api.types.is_numeric_dtype(df[label_column]):
  # Rows without a label cannot be used for training or evaluation.
  df = df.dropna(subset=[label_column]).reset_index(drop=True)
  is_negative = df[label_column].astype(str).str.startswith("no")
  df[label_column] = np.where(is_negative, 0, 1).astype(np.int64)
df.head()

Null columns:



Unnamed: 0,null_values
temperature,0
humidity,0
wind_speed,0
cloud_cover,0
pressure,0
rain,0


Unnamed: 0,temperature,humidity,wind_speed,cloud_cover,pressure,rain
0,23.720338,89.592641,7.335604,50.501694,1032.378759,1
1,27.879734,46.489704,5.952484,4.990053,992.61419,0
2,25.069084,83.072843,1.371992,14.855784,1007.23162,0
3,23.62208,74.367758,7.050551,67.255282,982.632013,1
4,20.59137,96.858822,4.643921,47.676444,980.825142,0


# Further Processing
The dataset contains the "day" column which is not needed as the datatype for the column is not numeric, and conversion to a numeric value<br> 
is not feasible within the current context. As a result, the column is dropped from the dataset to reduce processing complexity. The dataset<br>
also contains an unbalanced number of points since there was no rain on the majority of the days. To balance the dataset, the dataset is reduced<br>
so that the number of instances per class is equal. The correlation of each column with the target is also needed to determine whether the<br> 
columns are required in classifying new data. This can be done using the `DataFrame.corrwith()` method.

In [36]:
# Correlate every feature column with the class label (the last column),
# once per method, and show the coefficients side by side.
corr: pd.DataFrame = pd.DataFrame({
    method: df[df.columns[:-1]].corrwith(df[df.columns[-1]], method=method)
    for method in ("spearman", "pearson")
})
display(corr.head(len(corr)))
# The table is only needed for display; drop it from the namespace afterwards.
del corr

Unnamed: 0,spearman,pearson
temperature,-0.264626,-0.265882
humidity,0.381074,0.382464
wind_speed,-0.001575,-0.001389
cloud_cover,0.328231,0.326216
pressure,0.008334,0.008273


In [37]:
# Fixed seed so that the under-sampling and the shuffle below give the same
# rows on every run (Restart & Run All reproduces the reported numbers).
RANDOM_STATE = 42

# The class label is the last column of the frame.
label_column = df.columns[-1]
# Size of the smallest class; every class is down-sampled to this many rows.
minority_size = int(df[label_column].value_counts().min())

# Draw the same number of rows from each class, then shuffle so the classes
# are interleaved, and renumber the index from 0.
df = (
	df.groupby(label_column)
	.sample(n=minority_size, random_state=RANDOM_STATE)
	.sample(frac=1, random_state=RANDOM_STATE)
	.reset_index(drop=True)
)
df.head()

Unnamed: 0,temperature,humidity,wind_speed,cloud_cover,pressure,rain
0,10.615363,49.352053,7.765742,52.595381,1008.575658,0
1,16.259955,54.575162,6.642316,46.526024,1039.019194,0
2,13.736208,97.481427,11.235005,36.834326,1031.676071,0
3,24.616572,52.178983,9.527765,48.561755,1032.042841,0
4,32.979275,45.09729,13.067886,0.352408,1028.162688,0


# Training and Evaluation
When training the model, the accuracy and precision of the model might be lower than previously expected, even after processing the data. This<br>
is because of potential imbalances in the data indexes, which could lead to the model being biased towards a specific class. By splitting the dataset<br>
into multiple subsets, one for training and another for testing, the bias towards a specific class can be minimized.

In [41]:
from typing import NamedTuple

# Number of train/test splits requested from split_data.
folds: int = 5

# Metric names, distance measures and neighbour counts swept by the experiment.
model_evaluations: list[str] = ['accuracy', 'macro_precision', 'macro_recall']
distance_measures: list[str] = ['euclidian', 'mahalanobis', 'cosine', 'centroid']
k_values: list[int] = [3, 5, 7, 9, 11]

# One list of per-fold scores for every evaluation metric.
Metrics = NamedTuple("Metrics", [(m, list[np.float64]) for m in model_evaluations])

# Result containers, all keyed by k:
#   models        - the fitted model of every fold
#   best_models   - per distance measure, the model picked later (None for now)
#   model_metrics - per distance measure, per-fold scores as a Metrics tuple
#   benchmarks    - the same scores as plain dicts, used to build a table
models: dict[int, list[KNN]] = {}
best_models: dict[int, dict[str, KNN]] = {}
model_metrics: dict[int, dict[str, Metrics]] = {}
benchmarks: dict[int, dict[str, dict[str, list[np.float64]]]] = {}
for k in k_values:
	models[k] = []
	best_models[k] = dict.fromkeys(distance_measures)
	model_metrics[k] = {measure: Metrics([], [], []) for measure in distance_measures}
	benchmarks[k] = {measure: {m: [] for m in model_evaluations} for measure in distance_measures}

for k in k_values:
	print(f"using k value of {k}")
	for train, test in split_data(df, folds):
		# Fit one model per fold, then score it with every distance measure.
		model = KNN(k=k)
		model.fit(train)
		# The last column holds the true labels; the rest are the features.
		actual = test[test.columns[-1]].to_numpy()
		for distance_measure in distance_measures:
			print(f"distance measure: {distance_measure}")
			pred = model.predict(test[test.columns[:-1]], distance_measure)
			acc, macro_prec, macro_rec = calculate_macro_metrics(pred, actual)
			print(f"accuracy: {acc:.2%}, macro precision: {macro_prec:.2%}, macro recall: {macro_rec:.2%}")
			# Record each score in both containers under its metric name.
			curr_measure = model_metrics[k][distance_measure]
			for name, score in zip(model_evaluations, (acc, macro_prec, macro_rec)):
				getattr(curr_measure, name).append(score)
				benchmarks[k][distance_measure][name].append(score)
		models[k].append(model)
		

using k value of 3
distance measure: euclidian
accuracy: 92.64%, macro precision: 92.70%, macro recall: 93.18%
distance measure: mahalanobis
accuracy: 93.24%, macro precision: 93.30%, macro recall: 93.82%
distance measure: cosine
accuracy: 92.64%, macro precision: 92.70%, macro recall: 93.18%
distance measure: centroid
accuracy: 86.88%, macro precision: 86.91%, macro recall: 86.98%
distance measure: euclidian
accuracy: 92.64%, macro precision: 92.73%, macro recall: 93.32%
distance measure: mahalanobis
accuracy: 91.85%, macro precision: 91.94%, macro recall: 92.60%
distance measure: cosine
accuracy: 92.64%, macro precision: 92.73%, macro recall: 93.32%
distance measure: centroid
accuracy: 83.10%, macro precision: 83.23%, macro recall: 84.25%
distance measure: euclidian
accuracy: 93.64%, macro precision: 93.37%, macro recall: 94.37%
distance measure: mahalanobis
accuracy: 92.45%, macro precision: 92.23%, macro recall: 92.90%
distance measure: cosine
accuracy: 93.64%, macro precision: 93.

In [None]:
# Build one table row per (k, metric, fold) with one column per distance measure.
# Metric names such as "macro_precision" are turned into "Macro Precision" so the
# index labels contain no underscore (avoids the _ error in LaTeX).
# The metric names are read from the first distance measure of each k; every
# measure is initialised with the same set of metrics.
dk = pd.DataFrame.from_dict(
    {
        (k_value, ' '.join(map(str.capitalize, metric.split('_'))), fold): {
            measure: scores[metric][fold] for measure, scores in per_measure.items()
        }
        for k_value, per_measure in benchmarks.items()
        for metric in next(iter(per_measure.values()), {})
        for fold in range(folds)
    },
    orient='index',
)
# Lift the row limit temporarily so the complete table is rendered.
with pd.option_context('display.max_rows', None):
	display(dk)

# delete unnecessary variables
benchmarks.clear()
del benchmarks
del dk

Unnamed: 0,Unnamed: 1,Unnamed: 2,euclidian,mahalanobis,cosine,centroid
3,Accuracy,0,0.926441,0.932406,0.926441,0.868787
3,Accuracy,1,0.926441,0.918489,0.926441,0.831014
3,Accuracy,2,0.936382,0.924453,0.936382,0.88668
3,Accuracy,3,0.934394,0.920477,0.934394,0.84493
3,Accuracy,4,0.912525,0.910537,0.912525,0.854871
3,Macro Precision,0,0.927007,0.932992,0.927007,0.869051
3,Macro Precision,1,0.92734,0.919442,0.92734,0.832282
3,Macro Precision,2,0.933698,0.922291,0.933698,0.883619
3,Macro Precision,3,0.934796,0.921237,0.934796,0.845509
3,Macro Precision,4,0.913386,0.911417,0.913386,0.856062


                     euclidian  mahalanobis    cosine  centroid
3 Accuracy        0   0.926441     0.932406  0.926441  0.868787
                  1   0.926441     0.918489  0.926441  0.831014
                  2   0.936382     0.924453  0.936382  0.886680
                  3   0.934394     0.920477  0.934394  0.844930
                  4   0.912525     0.910537  0.912525  0.854871
  Macro Precision 0   0.927007     0.932992  0.927007  0.869051
                  1   0.927340     0.919442  0.927340  0.832282
                  2   0.933698     0.922291  0.933698  0.883619
                  3   0.934796     0.921237  0.934796  0.845509
                  4   0.913386     0.911417  0.913386  0.856062
  Macro Recall    0   0.931759     0.938188  0.931759  0.869788
                  1   0.933218     0.926044  0.933218  0.842463
                  2   0.943737     0.929020  0.943737  0.894070
                  3   0.935623     0.925190  0.935623  0.847322
                  4   0.924915     0.923

In [None]:
# For every k and distance measure, report which fold's model scored best on
# each metric, then keep the model whose mean over the three metrics is highest.
for k in k_values:
	print(f"using k value of {k}")
	for distance_measure in distance_measures:
		curr_model_metrics = model_metrics[k][distance_measure]
		print(f"model {np.argmax(curr_model_metrics.accuracy)} achieved best accuracy of {max(curr_model_metrics.accuracy):.2%}")
		print(f"model {np.argmax(curr_model_metrics.macro_precision)} achieved best macro precision of {max(curr_model_metrics.macro_precision):.2%}")
		print(f"model {np.argmax(curr_model_metrics.macro_recall)} achieved best macro recall of {max(curr_model_metrics.macro_recall):.2%}")

		# Rows are metrics, columns are folds; average down the rows to get one
		# combined score per fold and pick the fold with the highest one.
		mean_scores = np.mean(np.array([curr_model_metrics.accuracy, curr_model_metrics.macro_precision, curr_model_metrics.macro_recall]), axis=0)
		best_model_index = int(np.argmax(mean_scores))
		best_models[k][distance_measure] = models[k][best_model_index]
		# Computed outside the f-string: reusing the enclosing quote character
		# inside an f-string is a SyntaxError before Python 3.12.
		measure_kind = "similarity" if distance_measure == "cosine" else "distance"
		print(f"overall best trained model using {distance_measure} {measure_kind}: model {best_model_index}")

# delete unnecessary variables
del models
del k_values
del model_metrics
del distance_measures
del model_evaluations

using k value of 3
model 1 achieved best accuracy of 94.23%
model 1 achieved best macro precision of 94.21%
model 1 achieved best macro recall of 94.34%
overall best trained model using euclidian distance: model 1
model 1 achieved best accuracy of 90.85%
model 1 achieved best macro precision of 90.82%
model 3 achieved best macro recall of 91.92%
overall best trained model using mahalanobis distance: model 3
model 1 achieved best accuracy of 94.23%
model 1 achieved best macro precision of 94.21%
model 1 achieved best macro recall of 94.34%
overall best trained model using cosine similarity: model 1
model 0 achieved best accuracy of 86.08%
model 0 achieved best macro precision of 86.20%
model 4 achieved best macro recall of 88.00%
overall best trained model using centroid distance: model 4
using k value of 5
model 1 achieved best accuracy of 92.64%
model 1 achieved best macro precision of 92.60%
model 1 achieved best macro recall of 92.97%
overall best trained model using euclidian dista

# Prediction
After calculating the overall best model, it can be used to classify future data
based on its training dataset. 

In [32]:
# Single observation to classify with every selected model.
# NOTE(review): presumably ordered like the training feature columns
# (temperature, humidity, wind_speed, cloud_cover, pressure) - confirm.
arr = [33, 50, 210, 20, 12]

for k, models in best_models.items():
	print(f"using k value of {k}")
	for distance_measure, model in models.items():
		# Computed outside the f-string: reusing the enclosing quote character
		# inside an f-string is a SyntaxError before Python 3.12.
		measure_kind = "similarity" if distance_measure == "cosine" else "distance"
		print(f"{distance_measure} {measure_kind} model result for {arr}: {model.predict(arr, distance_measure)}")

using k value of 3
euclidian distance model result for [33, 50, 210, 20, 12]: 1
mahalanobis distance model result for [33, 50, 210, 20, 12]: 0
cosine similarity model result for [33, 50, 210, 20, 12]: 1
centroid distance model result for [33, 50, 210, 20, 12]: 1
using k value of 5
euclidian distance model result for [33, 50, 210, 20, 12]: 1
mahalanobis distance model result for [33, 50, 210, 20, 12]: 0
cosine similarity model result for [33, 50, 210, 20, 12]: 1
centroid distance model result for [33, 50, 210, 20, 12]: 1
using k value of 7
euclidian distance model result for [33, 50, 210, 20, 12]: 1
mahalanobis distance model result for [33, 50, 210, 20, 12]: 0
cosine similarity model result for [33, 50, 210, 20, 12]: 1
centroid distance model result for [33, 50, 210, 20, 12]: 1
using k value of 9
euclidian distance model result for [33, 50, 210, 20, 12]: 1
mahalanobis distance model result for [33, 50, 210, 20, 12]: 0
cosine similarity model result for [33, 50, 210, 20, 12]: 1
centroid