In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


from sklearn.metrics import accuracy_score
pd.set_option('display.max_rows', 500)

import time
import numpy as np
input_directory = './parsed_data'

In [2]:
df = pd.read_csv(input_directory+'/data.csv')
#Get the currentl max time in the dataset
max_time = max(df.unix_time)
#Calculate the min time as six months back from the amx time
min_time = max_time - (86400 * 55)

#Filter the dataframe to only include observations in the last six months
df = df[df.unix_time > min_time]

#df = df.sample(100000)

df['date'] = pd.to_datetime(df['unix_time'],unit='s')
df = df.sort_values(by = 'date', ascending = False)

df['position'] = df.lon * df.lat
df['month'] = df.date.dt.month
df['class'] = 'low'
df.loc[df['count'] > 180, 'class'] = 'moderate'
#df.loc[df['count'] > 1200, 'class'] = 'high'

df = df[['sensor_id','class', 'unix_time','hour','position','lon','lon','month','date']]

In [3]:
#df = df[df.hour == 10]

In [4]:
df['class'].value_counts()

low         34560
moderate    34465
Name: class, dtype: int64

In [5]:
df.head()

Unnamed: 0,sensor_id,class,unix_time,hour,position,lon,lon.1,month,date
3653975,84,moderate,1659308400,23,-5482.284821,144.965034,144.965034,7,2022-07-31 23:00:00
2732835,21,moderate,1659308400,23,-5481.586789,144.967788,144.967788,7,2022-07-31 23:00:00
3022950,27,low,1659308400,23,-5480.233417,144.956447,144.956447,7,2022-07-31 23:00:00
2658536,23,low,1659308400,23,-5482.048692,144.954527,144.954527,7,2022-07-31 23:00:00
2429783,14,low,1659308400,23,-5482.513893,144.962919,144.962919,7,2022-07-31 23:00:00


In [6]:
df[df.date < pd.to_datetime('2022/07/01')]

Unnamed: 0,sensor_id,class,unix_time,hour,position,lon,lon.1,month,date
3022375,27,low,1656630000,23,-5480.233417,144.956447,144.956447,6,2022-06-30 23:00:00
1997276,11,low,1656630000,23,-5480.989213,144.939707,144.939707,6,2022-06-30 23:00:00
2203284,5,moderate,1656630000,23,-5482.502793,144.967877,144.967877,6,2022-06-30 23:00:00
195035,40,low,1656630000,23,-5481.400796,144.972276,144.972276,6,2022-06-30 23:00:00
258391,36,low,1656630000,23,-5481.929285,144.961211,144.961211,6,2022-06-30 23:00:00
...,...,...,...,...,...,...,...,...,...
898600,59,low,1654560000,0,-5480.800121,144.963049,144.963049,6,2022-06-07 00:00:00
3629088,77,low,1654560000,0,-5480.984966,144.944330,144.944330,6,2022-06-07 00:00:00
3171861,30,low,1654560000,0,-5481.362579,144.966568,144.966568,6,2022-06-07 00:00:00
924137,61,low,1654560000,0,-5480.717372,144.963091,144.963091,6,2022-06-07 00:00:00


Test and train data will be split based on sensor ID. We're going to simulate the real world data by removing one sensor worth of data for each fold and then predicting how well the KNN algoritmh performs on this data. This is important as it means that our algorithm won't be able to rely on historical readings from the same location, which is somthing that will be present in the real world data supplied to the algortihm.

From the output below, we can observe some 87 folds with varying observation counts per sensor location

In [7]:
sensor_folds = df.groupby(['sensor_id']).count().reset_index().sensor_id.tolist()
df.groupby(['sensor_id']).count().reset_index()

Unnamed: 0,sensor_id,class,unix_time,hour,position,lon,lon.1,month,date
0,1,1320,1320,1320,1320,1320,1320,1320,1320
1,2,1320,1320,1320,1320,1320,1320,1320,1320
2,3,1320,1320,1320,1320,1320,1320,1320,1320
3,5,1320,1320,1320,1320,1320,1320,1320,1320
4,6,1320,1320,1320,1320,1320,1320,1320,1320
5,9,1320,1320,1320,1320,1320,1320,1320,1320
6,11,1320,1320,1320,1320,1320,1320,1320,1320
7,14,1320,1320,1320,1320,1320,1320,1320,1320
8,17,937,937,937,937,937,937,937,937
9,19,1032,1032,1032,1032,1032,1032,1032,1032


To simulate our data environment we need to do the following:
- Generate a test set from a single sensor_id location and filter in teh current month only. This is becasue under real conditions we won't have training data from the prior month. Likewise, we won't be predicting congestion for the past.
- Filter the training dataset so that it doesn't include the current month, only historical time periods.

In [8]:
t0 = time.time()

#Keep test samples after this date, keep training samples before this date
sample_break_date = '2022/07/01'

results_dict = {}
# {k: { fold:[test, train], fold:[test, train] }, k+1: {fold} }
for k in range(5, 70):
    print(k)
    for fold in sensor_folds:
        
        train_df = df[df.date < pd.to_datetime('2022/07/01')]
        test_df = df[df.date > pd.to_datetime('2022/07/01')]
        
        X_train = train_df[train_df.sensor_id != fold].drop(['class', 'sensor_id','date'], axis = 1)
        X_test = test_df[test_df.sensor_id == fold].drop(['class', 'sensor_id','date'], axis = 1)
        
        if len(X_test) < 30:
            continue
        
        y_train = train_df[train_df.sensor_id != fold]
        y_train = y_train['class']

        y_test = test_df[test_df.sensor_id == fold]
        y_test = y_test['class']

        ss = StandardScaler()

        X_train = ss.fit_transform(X_train)
        X_test = ss.transform(X_test)

        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        
        if k not in results_dict:
            results_dict[k] = {}
        
        #Calculate the training error
        X_train_prediction = knn.predict(X_train)
        X_test_prediction = knn.predict(X_test)
        
        
        #Calculate the test error
        
        
        results_dict[k][fold] = [accuracy_score(y_train, X_train_prediction), accuracy_score(y_test, X_test_prediction)]
        
t1 = time.time()

print('Code Segment took',t1 - t0,'seconds to execute.')

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50


KeyboardInterrupt: 

In [9]:
results_dict

{5: {1: [0.9731684554363043, 0.8559892328398385],
  2: [0.9732017865475635, 0.8936742934051144],
  3: [0.9735017665488968, 0.8815612382234186],
  5: [0.9734351043263783, 0.8223418573351279],
  6: [0.9730684621025265, 0.9353970390309556],
  9: [0.9733017798813413, 0.7940780619111709],
  11: [0.9738684087727485, 0.6783310901749664],
  14: [0.9737017532164522, 0.8344549125168237],
  17: [0.9737350843277115, 0.44722222222222224],
  19: [0.9723892746785394, 0.7749546279491834],
  20: [0.9737017532164522, 0.800807537012113],
  21: [0.973668422105193, 0.8896366083445492],
  23: [0.9737350843277115, 0.800807537012113],
  24: [0.9735017665488968, 0.8842530282637954],
  26: [0.9732351176588228, 0.5289367429340511],
  27: [0.9741683887740817, 0.7090592334494773],
  28: [0.9733684421038598, 0.8694481830417228],
  29: [0.9737684154389707, 0.7133243606998654],
  30: [0.9735017665488968, 0.8613728129205922],
  35: [0.9732351176588228, 0.9340511440107672],
  36: [0.9730322880989593, 0.7335127860026918

In [10]:
df_results = df.groupby(['sensor_id']).count().reset_index()
#df_results = df_results[['sensor_id', 'count']]
df_results['sum'] = df_results['class'].sum()

In [11]:
def unpack_results(input_dict):
    #Return a df with fold, k value, test and train error as attribtues
    sensor_id = []
    k_value = []
    test = []
    train = []
    for key in input_dict:
        for fold in input_dict[key]:
            sensor_id.append(fold)
            k_value.append(key)
            test.append(input_dict[key][fold][1])
            train.append(input_dict[key][fold][0])
            
    output_df = pd.DataFrame(list(zip(sensor_id, k_value, train, test)),
                   columns =['fold', 'k','train_error','test_error'])
    
    return output_df

In [12]:
results = unpack_results(results_dict)
df_results = results.merge(df_results, how = 'left', left_on = 'fold', right_on = 'sensor_id')
#df_results = df_results[['fold','k','train_error','test_error']]

In [13]:
#df_results['train_error'] = df_results['train_error']*(df_results['class'] /df_results['sum'] )
#df_results['test_error'] = df_results['test_error']*(df_results['class'] /df_results['sum'] )

In [14]:
df_results

Unnamed: 0,fold,k,train_error,test_error,sensor_id,class,unix_time,hour,position,lon,lon.1,month,date,sum
0,1,5,0.973168,0.855989,1,1320,1320,1320,1320,1320,1320,1320,1320,69025
1,2,5,0.973202,0.893674,2,1320,1320,1320,1320,1320,1320,1320,1320,69025
2,3,5,0.973502,0.881561,3,1320,1320,1320,1320,1320,1320,1320,1320,69025
3,5,5,0.973435,0.822342,5,1320,1320,1320,1320,1320,1320,1320,1320,69025
4,6,5,0.973068,0.935397,6,1320,1320,1320,1320,1320,1320,1320,1320,69025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2472,67,50,0.874008,0.843876,67,1320,1320,1320,1320,1320,1320,1320,1320,69025
2473,68,50,0.875708,0.647376,68,1320,1320,1320,1320,1320,1320,1320,1320,69025
2474,69,50,0.877008,0.581427,69,1320,1320,1320,1320,1320,1320,1320,1320,69025
2475,70,50,0.874108,0.576043,70,1320,1320,1320,1320,1320,1320,1320,1320,69025


In [15]:
df_results = df_results.groupby(['k']).mean().reset_index()

In [16]:
df_results.head(100)

Unnamed: 0,k,fold,train_error,test_error,sensor_id,class,unix_time,hour,position,lon,lon.1,month,date,sum
0,5,44.296296,0.973581,0.76154,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
1,6,44.296296,0.95966,0.767942,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
2,7,44.296296,0.958111,0.768323,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
3,8,44.296296,0.94512,0.772344,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
4,9,44.296296,0.945129,0.771635,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
5,10,44.296296,0.935597,0.773831,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
6,11,44.296296,0.937139,0.76879,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
7,12,44.296296,0.933787,0.769945,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
8,13,44.296296,0.933956,0.772676,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0
9,14,44.296296,0.929919,0.773591,44.296296,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,1278.240741,69025.0


In [17]:
df_results.to_csv('k_'+str(k)+'results_classification.csv', index = False)