In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

pd.set_option('display.max_rows', 500)

import time
import numpy as np
input_directory = './parsed_data'

In [2]:
df = pd.read_csv(input_directory+'/data.csv')
#Get the currentl max time in the dataset

#Calculate the min time as six months back from the amx time
#min_time = max_time - (86400 * 30)

#Filter the dataframe to only include observations in the last six months
#df = df[df.unix_time > min_time]

#df = df.sample(10000)

df['date'] = pd.to_datetime(df['unix_time'],unit='s')
df = df.sort_values(by = 'date', ascending = False)

df['position'] = df.lon * df.lat
df['month'] = df.date.dt.month
df['day_of_week'] = df.date.dt.dayofweek

df = df[['sensor_id','count', 'unix_time','hour','month','day_of_week','position','date']]

Test and train data will be split based on sensor ID. We're going to simulate the real world data by removing one sensor worth of data for each fold and then predicting how well the KNN algoritmh performs on this data. This is important as it means that our algorithm won't be able to rely on historical readings from the same location, which is somthing that will be present in the real world data supplied to the algortihm.

From the output below, we can observe some 87 folds with varying observation counts per sensor location

In [3]:
sensor_folds = df.groupby(['sensor_id']).count().reset_index().sensor_id.tolist()
df.groupby(['sensor_id']).count().reset_index()

Unnamed: 0,sensor_id,count,unix_time,hour,month,day_of_week,position,date
0,1,2400,2400,2400,2400,2400,2400,2400
1,2,2400,2400,2400,2400,2400,2400,2400
2,3,2400,2400,2400,2400,2400,2400,2400
3,4,192,192,192,192,192,192,192
4,5,2400,2400,2400,2400,2400,2400,2400
5,6,2400,2400,2400,2400,2400,2400,2400
6,8,2400,2400,2400,2400,2400,2400,2400
7,9,2208,2208,2208,2208,2208,2208,2208
8,11,2400,2400,2400,2400,2400,2400,2400
9,12,2400,2400,2400,2400,2400,2400,2400


In [4]:
df.head()

Unnamed: 0,sensor_id,count,unix_time,hour,month,day_of_week,position,date
136440,87,12,1659308400,23,7,6,-5479.739909,2022-07-31 23:00:00
98975,31,79,1659308400,23,7,6,-5479.983048,2022-07-31 23:00:00
75456,14,139,1659308400,23,7,6,-5482.513893,2022-07-31 23:00:00
77856,26,155,1659308400,23,7,6,-5480.767167,2022-07-31 23:00:00
80256,24,180,1659308400,23,7,6,-5482.016483,2022-07-31 23:00:00


To simulate our data environment we need to do the following:
- Generate a test set from a single sensor_id location and filter in teh current month only. This is becasue under real conditions we won't have training data from the prior month. Likewise, we won't be predicting congestion for the past.
- Filter the training dataset so that it doesn't include the current month, only historical time periods.

In [4]:
def convert_to_classification(input_predictions):
    output_classes = []

    for item in input_predictions:
        if item < 180:
            output_classes.append('low')
        #elif item < 1200:
        #    output_classes.append('moderate')            
        else:
            output_classes.append('high')

    return output_classes

def precision_recall(true, prediction, label):
    true_positive = 0
    false_positive = 0
    false_negative = 0
    precision = 0
    recall = 0
    
    for n in range(0, len(true)):
        if true[n] != label and prediction[n] != label:
            continue
        elif true[n] == prediction[n]:
            true_positive += 1
        elif true[n] == label and prediction[n] != label:
            false_negative += 1
        elif prediction[n] == label and true[n] != label:
            false_positive += 1
        
        if true_positive + false_negative != 0:
            precision = true_positive / (true_positive + false_negative)
        else:
            precision = 0
        
        if true_positive + false_positive != 0:
            recall = true_positive / (true_positive + false_positive)
        else:
            recall = 0
            
    return (precision, recall)

In [5]:
t0 = time.time()

#Keep test samples after this date, keep training samples before this date
sample_break_date = '2022/07/01'

results_dict = {}
# {k: { fold:[test, train], fold:[test, train] }, k+1: {fold} }
for k in range(1,31):
    print(k)
    for fold in sensor_folds:
        
        train_df = df[df.date < pd.to_datetime('2022/07/01')]
        test_df = df[df.date > pd.to_datetime('2022/07/01')]
        
        max_time = max(train_df.unix_time)
        #Calculate the min time as six months back from the amx time
        min_time = max_time - (86400 * 100)

        #Filter the dataframe to only include observations in the last six months
        train_df = train_df[train_df.unix_time > min_time]
        
        X_train = train_df[train_df.sensor_id != fold].drop(['count', 'sensor_id','date'], axis = 1)
        X_test = test_df[test_df.sensor_id == fold].drop(['count', 'sensor_id','date'], axis = 1)
        #print(len(X_train), len(X_test))
        if len(X_test) < 30:
            continue
        
        y_train = train_df[train_df.sensor_id != fold]
        y_train = y_train['count']

        y_test = test_df[test_df.sensor_id == fold]
        y_test = y_test['count']

        ss = StandardScaler()

        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)

        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X_train, y_train)
        
        if k not in results_dict:
            results_dict[k] = {}
        
        #Calculate the training error
        X_train_prediction = knn.predict(X_train)
        X_test_prediction = knn.predict(X_test)
        
        results_dict[k][fold] = [mean_squared_error(y_train, X_train_prediction, squared = False),
                                 mean_squared_error(y_test, X_test_prediction, squared = False),
                                accuracy_score(convert_to_classification(input_predictions = y_train), convert_to_classification(input_predictions = X_train_prediction)),
                                 accuracy_score( convert_to_classification(input_predictions = y_test), convert_to_classification(input_predictions = X_test_prediction)),
                                precision_recall(convert_to_classification(input_predictions = y_test), convert_to_classification(input_predictions = X_test_prediction), label = 'low'),
                                precision_recall(convert_to_classification(input_predictions = y_test), convert_to_classification(input_predictions = X_test_prediction), label = 'moderate'),
                                precision_recall(convert_to_classification(input_predictions = y_test), convert_to_classification(input_predictions = X_test_prediction), label = 'high'),
                        ]
        
t1 = time.time()

print('Code Segment took',t1 - t0,'seconds to execute.')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
Code Segment took 3050.2669444084167 seconds to execute.


In [6]:
results_dict

{1: {1: [0.0,
   809.3144723830247,
   1.0,
   0.8546433378196501,
   (0.6913946587537092, 0.9831223628691983),
   (0, 0),
   (0.9901477832512315, 0.7944664031620553)],
  2: [0.0,
   997.2142921905628,
   1.0,
   0.7806191117092867,
   (0.932258064516129, 0.6705336426914154),
   (0, 0),
   (0.6720554272517321, 0.9326923076923077)],
  3: [0.0,
   433.68721478639304,
   1.0,
   0.9313593539703903,
   (0.9190751445086706, 0.8112244897959183),
   (0, 0),
   (0.9350877192982456, 0.9744058500914077)],
  5: [0.0,
   772.9195093719,
   1.0,
   0.9165545087483177,
   (1.0, 0.7596899224806202),
   (0, 0),
   (0.886654478976234, 1.0)],
  6: [0.0,
   407.0251958960436,
   1.0,
   0.9690444145356663,
   (0.9438202247191011, 0.9281767955801105),
   (0, 0),
   (0.9769911504424779, 0.9822064056939501)],
  8: [0.0,
   426.00173449498084,
   1.0,
   0.7160161507402423,
   (0.5207373271889401, 0.9868995633187773),
   (0, 0),
   (0.9902912621359223, 0.5953307392996109)],
  9: [0.0,
   502.5698521185009,
 

In [7]:
df_results = df.groupby(['sensor_id']).count().reset_index()
df_results = df_results[['sensor_id', 'count']]
df_results['sum'] = df_results['count'].sum()

In [8]:
def unpack_results(input_dict):
    #Return a df with fold, k value, test and train error as attribtues
    sensor_id = []
    k_value = []
    test = []
    train = []
    test_accuracy = []
    train_accuracy = []
    
    low_precision = []
    low_recall = []
    moderate_precision = []
    moderate_recall = []
    high_precision = []
    high_recall = []
    
    for key in input_dict:
        for fold in input_dict[key]:
            sensor_id.append(fold)
            k_value.append(key)
            test.append(input_dict[key][fold][1])
            train.append(input_dict[key][fold][0])
            test_accuracy.append(input_dict[key][fold][3])
            train_accuracy.append(input_dict[key][fold][2])
            low_precision.append(input_dict[key][fold][4][0])
            low_recall.append(input_dict[key][fold][4][1])
            moderate_precision.append(input_dict[key][fold][5][0])
            moderate_recall.append(input_dict[key][fold][5][1])
            high_precision.append(input_dict[key][fold][6][0])
            high_recall.append(input_dict[key][fold][6][1])
            
    output_df = pd.DataFrame(list(zip(sensor_id, k_value, train, test, train_accuracy, test_accuracy, low_precision, low_recall, moderate_precision, moderate_recall, high_precision, high_recall)),
                   columns =['fold', 'k','train_error','test_error','train_accuracy','test_accuracy','low_precision','low_recall','moderate_precision','moderate_recall','high_precision','high_recall'])
    
    return output_df

In [9]:
results = unpack_results(results_dict)

In [10]:
df_results = results.merge(df_results, how = 'left', left_on = 'fold', right_on = 'sensor_id')
df_results = df_results[['fold','k','train_error','test_error','train_accuracy','test_accuracy','low_precision','low_recall','moderate_precision','moderate_recall','high_precision','high_recall','count','sum']]

In [11]:
#df_results['train_error'] = df_results['train_error']*(df_results['count'] /df_results['sum'] )
#df_results['test_error'] = df_results['test_error']*(df_results['count'] /df_results['sum'] )

In [12]:
df_results.head(100)

Unnamed: 0,fold,k,train_error,test_error,train_accuracy,test_accuracy,low_precision,low_recall,moderate_precision,moderate_recall,high_precision,high_recall,count,sum
0,1,1,0.0,809.314472,1.0,0.854643,0.691395,0.983122,0,0,0.990148,0.794466,103843,4117125
1,2,1,0.0,997.214292,1.0,0.780619,0.932258,0.670534,0,0,0.672055,0.932692,115915,4117125
2,3,1,0.0,433.687215,1.0,0.931359,0.919075,0.811224,0,0,0.935088,0.974406,110698,4117125
3,5,1,0.0,772.919509,1.0,0.916555,1.0,0.75969,0,0,0.886654,1.0,112796,4117125
4,6,1,0.0,407.025196,1.0,0.969044,0.94382,0.928177,0,0,0.976991,0.982206,115721,4117125
5,8,1,0.0,426.001734,1.0,0.716016,0.520737,0.9869,0,0,0.990291,0.595331,114217,4117125
6,9,1,0.0,502.569852,1.0,0.576043,1.0,0.576043,0,0,0.0,0.0,115889,4117125
7,11,1,0.0,450.480074,1.0,0.7214,0.759227,0.860558,0,0,0.597701,0.431535,114636,4117125
8,12,1,0.0,821.648745,1.0,0.726783,0.855107,0.717131,0,0,0.559006,0.746888,112645,4117125
9,14,1,0.0,754.541886,1.0,0.885599,0.681818,0.994475,0,0,0.997912,0.850534,110034,4117125


In [13]:
df_results = df_results.groupby(['k']).mean().reset_index()

In [14]:
print(df_results.test_accuracy.mean())
df_results#.sort_values('test_accuracy')

0.842630987602422


Unnamed: 0,k,fold,train_error,test_error,train_accuracy,test_accuracy,low_precision,low_recall,moderate_precision,moderate_recall,high_precision,high_recall,count,sum
0,1,42.474576,0.0,427.387923,1.0,0.80882,0.848932,0.813248,0.0,0.0,0.724479,0.739858,55981.423729,4117125.0
1,2,42.474576,279.365789,389.496715,0.913464,0.823184,0.831741,0.840212,0.0,0.0,0.777701,0.768602,55981.423729,4117125.0
2,3,42.474576,328.661217,367.164034,0.896364,0.841546,0.818575,0.874733,0.0,0.0,0.815482,0.760333,55981.423729,4117125.0
3,4,42.474576,328.84956,364.795278,0.895437,0.8443,0.815549,0.88499,0.0,0.0,0.811909,0.755792,55981.423729,4117125.0
4,5,42.474576,341.111621,374.586556,0.890739,0.848958,0.818254,0.888623,0.0,0.0,0.821013,0.759303,55981.423729,4117125.0
5,6,42.474576,352.105704,370.228745,0.887746,0.845347,0.81809,0.883405,0.0,0.0,0.805438,0.745491,55981.423729,4117125.0
6,7,42.474576,350.862517,368.361828,0.882999,0.85102,0.823249,0.885729,0.0,0.0,0.811368,0.755929,55981.423729,4117125.0
7,8,42.474576,353.867206,368.408284,0.88287,0.847938,0.810654,0.891249,0.0,0.0,0.826216,0.748054,55981.423729,4117125.0
8,9,42.474576,353.428285,370.12854,0.882733,0.851428,0.808551,0.898588,0.0,0.0,0.821432,0.733045,55981.423729,4117125.0
9,10,42.474576,352.461671,368.379371,0.881113,0.848865,0.807015,0.895474,0.0,0.0,0.817016,0.748577,55981.423729,4117125.0


In [15]:
df_results.to_csv('k_'+str(k)+'results.csv', index = False)