In [None]:
#Load the required modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

import time
import numpy as np

#Set the input directory containing the parsed data.
#The path is relative to the notebook's working directory; data.csv is read from this folder.
input_directory = './parsed_data'

In [None]:
#Read the parsed sensor observations from disk
df = pd.read_csv(f'{input_directory}/data.csv')

#Turn the epoch seconds into a timestamp and order the rows newest first
df = df.assign(date = pd.to_datetime(df['unix_time'], unit = 's'))
df = df.sort_values(by = 'date', ascending = False)

#Final enrichment: a single location feature (lon * lat) plus calendar features taken from the timestamp
df = df.assign(
    position = df['lon'] * df['lat'],
    month = df['date'].dt.month,
    day_of_week = df['date'].dt.dayofweek,
)

#Keep only the identifier, the target, the model features and the timestamp
df = df[['sensor_id','count', 'unix_time','hour','month','day_of_week','position','date']]

Test and train data will be split based on sensor ID. We're going to simulate the real-world data by removing one sensor's worth of data for each fold and then measuring how well the KNN algorithm performs on this data. This is important as it means that our algorithm won't be able to rely on historical readings from the same location, which is something that will be present in the real-world data supplied to the algorithm.

From the output below, we can observe some 87 folds with varying observation counts per sensor location

In [None]:
#Count the observations per sensor once; every sensor id becomes one fold
sensor_counts = df.groupby(['sensor_id']).count().reset_index()
sensor_folds = sensor_counts['sensor_id'].tolist()

#Display the per-sensor observation counts
sensor_counts

To simulate our data environment we need to do the following:
- Generate a test set from a single sensor_id location and keep only the current month. This is because under real conditions we won't have training data from the prior month. Likewise, we won't be predicting congestion for the past.
- Filter the training dataset so that it doesn't include the current month, only historical time periods.

In [None]:
#Define a function to convert regression outputs into a classification output
def convert_to_classification(input_predictions, threshold = 180):
    """Map each numeric value to a class label.

    Parameters
    ----------
    input_predictions : iterable of numbers
        Values to classify (observed or predicted counts).
    threshold : number, optional
        Cut-off between the two classes. Defaults to 180, the value that was
        previously hard-coded.

    Returns
    -------
    list of str
        'low' for every value strictly below ``threshold``, otherwise 'high',
        in the same order as the input. Values that do not compare as smaller
        than the threshold (including NaN) are labelled 'high'.
    """
    return ['low' if item < threshold else 'high' for item in input_predictions]

#Define a function to calculate precision and recall for a given classification label
def precision_recall(true, prediction, label):
    """Compute precision and recall of ``prediction`` for a single class label.

    Parameters
    ----------
    true : sequence
        Ground-truth class labels.
    prediction : sequence
        Predicted class labels, aligned position by position with ``true``.
    label : hashable
        The class treated as "positive".

    Returns
    -------
    tuple
        ``(precision, recall)`` where
        precision = TP / (TP + FP) and recall = TP / (TP + FN).
        A ratio whose denominator is zero is reported as 0.

    Raises
    ------
    ValueError
        If ``true`` and ``prediction`` do not have the same length.
    """
    if len(true) != len(prediction):
        raise ValueError('true and prediction must contain the same number of labels')

    true_positive = 0
    false_positive = 0
    false_negative = 0

    #Pairs in which neither side equals the label are true negatives and are not counted
    for actual, predicted in zip(true, prediction):
        if actual == label and predicted == label:
            true_positive += 1
        elif actual == label:
            false_negative += 1
        elif predicted == label:
            false_positive += 1

    #Precision looks at everything that was predicted as the label
    predicted_positive = true_positive + false_positive
    precision = true_positive / predicted_positive if predicted_positive != 0 else 0

    #Recall looks at everything that truly carries the label
    actual_positive = true_positive + false_negative
    recall = true_positive / actual_positive if actual_positive != 0 else 0

    return (precision, recall)

In [None]:
t0 = time.time()

#Keep test samples after this date, keep training samples before this date
sample_break_date = '2022/07/01'

results_dict = {}

#The date split depends on neither k nor the fold, so build it once instead of on every iteration.
#NOTE: both comparisons are strict, so rows stamped exactly at the break date fall in neither set.
break_timestamp = pd.to_datetime(sample_break_date)
train_df = df[df.date < break_timestamp]
test_df = df[df.date > break_timestamp]

#Iterate over each potential value of K
for k in range(1,31):

    #For each fold for the given value of K
    for fold in sensor_folds:

        #Train on the history of every other sensor, test on the recent data of the held-out sensor
        train_rows = train_df[train_df.sensor_id != fold]
        test_rows = test_df[test_df.sensor_id == fold]

        #Skip any folds which are new and have fewer than 30 test samples.
        if len(test_rows) < 30:
            continue

        #Separate the features from the target; the identifier and timestamp are not model inputs
        X_train = train_rows.drop(['count', 'sensor_id','date'], axis = 1)
        X_test = test_rows.drop(['count', 'sensor_id','date'], axis = 1)
        y_train = train_rows['count']
        y_test = test_rows['count']

        #Define a scaler to ensure the magnitude differences in the attributes are reduced
        ss = StandardScaler()

        #Fit the scaler on the training data only, then apply the same transform to the test data
        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)

        #Create a model and fit on the train data
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X_train, y_train)

        #Add a new k entry to the results dict if it doesn't already exist
        if k not in results_dict:
            results_dict[k] = {}

        #Predict on both sets so the training and test errors can be compared
        X_train_prediction = knn.predict(X_train)
        X_test_prediction = knn.predict(X_test)

        #Convert targets and predictions to class labels once and reuse them for every metric
        y_train_classes = convert_to_classification(input_predictions = y_train)
        y_test_classes = convert_to_classification(input_predictions = y_test)
        train_prediction_classes = convert_to_classification(input_predictions = X_train_prediction)
        test_prediction_classes = convert_to_classification(input_predictions = X_test_prediction)

        #Add the results and the relevant metrics to the results dict.
        #Order: train RMSE, test RMSE, train accuracy, test accuracy, then one result tuple per class label.
        #RMSE is taken as the square root of the MSE rather than through the `squared` keyword.
        #NOTE: convert_to_classification is called here with its default arguments and emits only
        #'low' and 'high', so the 'moderate' entry is always (0, 0); it is kept so the result layout stays unchanged.
        results_dict[k][fold] = [np.sqrt(mean_squared_error(y_train, X_train_prediction)),
                                 np.sqrt(mean_squared_error(y_test, X_test_prediction)),
                                 accuracy_score(y_train_classes, train_prediction_classes),
                                 accuracy_score(y_test_classes, test_prediction_classes),
                                 precision_recall(y_test_classes, test_prediction_classes, label = 'low'),
                                 precision_recall(y_test_classes, test_prediction_classes, label = 'moderate'),
                                 precision_recall(y_test_classes, test_prediction_classes, label = 'high'),
                                ]

t1 = time.time()

#Return how long the training event took to the user
print('Code Segment took',t1 - t0,'seconds to execute.')

In [None]:
#Start the results table with the number of observations recorded by each sensor
df_results = df.groupby('sensor_id')['count'].count().reset_index()

#Repeat the total number of observations across all sensors on every row
df_results = df_results.assign(**{'sum': df_results['count'].sum()})

In [None]:
def unpack_results(input_dict):
    """Flatten the nested results dictionary into a DataFrame.

    ``input_dict`` maps each k value to a dictionary of folds, and each fold to
    a sequence laid out as::

        [train_error, test_error, train_accuracy, test_accuracy,
         low_pair, moderate_pair, high_pair]

    where the first element of each ``*_pair`` goes to the ``*_precision``
    column and the second to the ``*_recall`` column.

    Returns a DataFrame with one row per (k, fold) combination and the columns
    fold, k, train_error, test_error, train_accuracy, test_accuracy and a
    precision/recall column pair for each of low, moderate and high.
    """
    column_names = ['fold', 'k','train_error','test_error','train_accuracy','test_accuracy','low_precision','low_recall','moderate_precision','moderate_recall','high_precision','high_recall']

    rows = []
    for k_value, folds in input_dict.items():
        for sensor_id in folds:
            metrics = folds[sensor_id]
            low, moderate, high = metrics[4], metrics[5], metrics[6]
            rows.append((
                sensor_id, k_value,
                metrics[0], metrics[1],   #train error, test error
                metrics[2], metrics[3],   #train accuracy, test accuracy
                low[0], low[1],
                moderate[0], moderate[1],
                high[0], high[1],
            ))

    return pd.DataFrame(rows, columns = column_names)

In [None]:
#Flatten the nested per-k, per-fold results into a dataframe with one row per (k, fold) pair
results = unpack_results(input_dict = results_dict)

In [None]:
#Attach each sensor's observation count and the overall total to its result rows.
#NOTE: this cell is not re-runnable on its own - after it runs, df_results no longer has the
#sensor_id column that the merge uses as its right-hand key.
result_columns = ['fold','k','train_error','test_error','train_accuracy','test_accuracy','low_precision','low_recall','moderate_precision','moderate_recall','high_precision','high_recall','count','sum']
df_results = pd.merge(results, df_results, how = 'left', left_on = 'fold', right_on = 'sensor_id')[result_columns]

In [None]:
#Average every column across the folds so that one summary row remains per value of k.
#NOTE(review): the fold identifier is averaged along with the metrics - confirm it is numeric.
df_results = df_results.groupby('k').mean().reset_index()

In [None]:
#Report the test accuracy averaged over every value of k, then display the per-k summary table
print(df_results['test_accuracy'].mean())
df_results

In [None]:
#Save the results
#NOTE: k is the variable left behind by the training loop, i.e. the last value of K that was evaluated
df_results.to_csv(f'k_{k}results.csv', index = False)