In [1]:
import json
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Subsystems and TSCs linked to Coronation Drive (mapping of subsystem ID to its TSC IDs)

In [2]:
# Subsystem ID -> list of TSC IDs; the keys are used to filter the "ss" column when loading the data
ss_tsc = {
    2446: [87],
    2447: [88, 278],
    2458: [132],
    2469: [181],
    2473: [188],
    2475: [203],
    2480: [221],
    2519: [364],
}

## Preprocess the data

In [3]:
# Empty dataframe that fixes the column layout: eight record-level columns
# followed by the ds/mf/rf triple for each of the lanes 1 to 4
df = pd.DataFrame(
    columns=["dbid", "recorded", "ct", "link_plan", "married", "ss", "tsc", "lane"]
    + [f"{measure}{lane_no}" for lane_no in range(1, 5) for measure in ("ds", "mf", "rf")]
)

In [4]:
# Read every file in the data folder and keep only the rows whose subsystem ("ss")
# is one of the Coronation Drive subsystems listed in ss_tsc
files = list(os.listdir("R:/data"))
# Capture all errors along the way as [file name, exception] pairs
errors_list = []
# Filtered per-file frames. They are concatenated ONCE after the loop: calling pd.concat
# inside the loop copies every previously collected row again on each iteration,
# which makes the total cost quadratic in the number of files.
filtered_frames = []
rows_collected = 0

for i, file in enumerate(files):
    try:
        # Read json file
        temp_df = pd.read_json(f"R:/data/{file}")
        # Filter on the subsystem only (1:1 assumption); the "tsc" column is not checked here
        temp_df = temp_df[temp_df["ss"].isin(ss_tsc.keys())]
    except Exception as e:
        # Best effort: remember the failing file and carry on with the next one
        errors_list.append([file, e])
        continue

    filtered_frames.append(temp_df)
    rows_collected += len(temp_df)

    # Print progress (skipped for files that failed, because of the `continue` above)
    if i % 100 == 0:
        print(f"Preprocessing file: {i}")
        print(f"Total progress: {i/len(files) * 100}%")
        # Rows collected so far, paired with the column count of the template dataframe
        print(f"Dataframe size: {(rows_collected, df.shape[1])}")
        print("\n")

# Single concatenation onto the (empty) column layout of df. Using df.iloc[:0] means that
# re-running this cell rebuilds the data instead of appending a second copy of every row.
df = pd.concat([df.iloc[:0], *filtered_frames])

Preprocessing file: 0
Total progress: 0.0%
Dataframe size: (40, 20)


Preprocessing file: 100
Total progress: 0.07800494551354556%
Dataframe size: (2241, 20)


Preprocessing file: 200
Total progress: 0.15600989102709112%
Dataframe size: (4559, 20)


Preprocessing file: 300
Total progress: 0.23401483654063668%
Dataframe size: (7310, 20)


Preprocessing file: 400
Total progress: 0.31201978205418224%
Dataframe size: (9710, 20)


Preprocessing file: 500
Total progress: 0.39002472756772777%
Dataframe size: (11842, 20)


Preprocessing file: 600
Total progress: 0.46802967308127336%
Dataframe size: (14037, 20)


Preprocessing file: 700
Total progress: 0.5460346185948188%
Dataframe size: (16464, 20)


Preprocessing file: 800
Total progress: 0.6240395641083645%
Dataframe size: (18845, 20)


Preprocessing file: 900
Total progress: 0.70204450962191%
Dataframe size: (21294, 20)


Preprocessing file: 1000
Total progress: 0.7800494551354555%
Dataframe size: (23629, 20)


Preprocessing file: 1100
Tota

Preprocessing file: 9100
Total progress: 7.0984500417326455%
Dataframe size: (219483, 20)


Preprocessing file: 9200
Total progress: 7.176454987246192%
Dataframe size: (222025, 20)


Preprocessing file: 9300
Total progress: 7.254459932759737%
Dataframe size: (224496, 20)


Preprocessing file: 9400
Total progress: 7.332464878273283%
Dataframe size: (226827, 20)


Preprocessing file: 9500
Total progress: 7.410469823786828%
Dataframe size: (228828, 20)


Preprocessing file: 9600
Total progress: 7.488474769300374%
Dataframe size: (231538, 20)


Preprocessing file: 9700
Total progress: 7.5664797148139185%
Dataframe size: (234104, 20)


Preprocessing file: 9800
Total progress: 7.644484660327465%
Dataframe size: (236264, 20)


Preprocessing file: 9900
Total progress: 7.72248960584101%
Dataframe size: (238486, 20)


Preprocessing file: 10000
Total progress: 7.800494551354556%
Dataframe size: (240826, 20)


Preprocessing file: 10100
Total progress: 7.878499496868101%
Dataframe size: (242995, 20

Preprocessing file: 18000
Total progress: 14.040890192438201%
Dataframe size: (430462, 20)


Preprocessing file: 18100
Total progress: 14.118895137951744%
Dataframe size: (432792, 20)


Preprocessing file: 18200
Total progress: 14.196900083465291%
Dataframe size: (435483, 20)


Preprocessing file: 18300
Total progress: 14.274905028978838%
Dataframe size: (437784, 20)


Preprocessing file: 18400
Total progress: 14.352909974492384%
Dataframe size: (440287, 20)


Preprocessing file: 18500
Total progress: 14.430914920005927%
Dataframe size: (442705, 20)


Preprocessing file: 18600
Total progress: 14.508919865519474%
Dataframe size: (445216, 20)


Preprocessing file: 18700
Total progress: 14.58692481103302%
Dataframe size: (447596, 20)


Preprocessing file: 18800
Total progress: 14.664929756546567%
Dataframe size: (449713, 20)


Preprocessing file: 18900
Total progress: 14.74293470206011%
Dataframe size: (452443, 20)


Preprocessing file: 19000
Total progress: 14.820939647573656%
Dataframe 

Preprocessing file: 26900
Total progress: 20.983330343143756%
Dataframe size: (639823, 20)


Preprocessing file: 27000
Total progress: 21.0613352886573%
Dataframe size: (642273, 20)


Preprocessing file: 27100
Total progress: 21.13934023417085%
Dataframe size: (644572, 20)


Preprocessing file: 27200
Total progress: 21.21734517968439%
Dataframe size: (646631, 20)


Preprocessing file: 27300
Total progress: 21.29535012519794%
Dataframe size: (649285, 20)


Preprocessing file: 27400
Total progress: 21.373355070711483%
Dataframe size: (651710, 20)


Preprocessing file: 27500
Total progress: 21.45136001622503%
Dataframe size: (654008, 20)


Preprocessing file: 27600
Total progress: 21.529364961738573%
Dataframe size: (656252, 20)


Preprocessing file: 27700
Total progress: 21.60736990725212%
Dataframe size: (658531, 20)


Preprocessing file: 27800
Total progress: 21.685374852765666%
Dataframe size: (660760, 20)


Preprocessing file: 27900
Total progress: 21.76337979827921%
Dataframe size: 

Preprocessing file: 35800
Total progress: 27.92577049384931%
Dataframe size: (851299, 20)


Preprocessing file: 35900
Total progress: 28.003775439362855%
Dataframe size: (853711, 20)


Preprocessing file: 36000
Total progress: 28.081780384876403%
Dataframe size: (856140, 20)


Preprocessing file: 36100
Total progress: 28.159785330389948%
Dataframe size: (858861, 20)


Preprocessing file: 36200
Total progress: 28.23779027590349%
Dataframe size: (861231, 20)


Preprocessing file: 36300
Total progress: 28.31579522141704%
Dataframe size: (863689, 20)


Preprocessing file: 36400
Total progress: 28.393800166930582%
Dataframe size: (865959, 20)


Preprocessing file: 36500
Total progress: 28.471805112444127%
Dataframe size: (868284, 20)


Preprocessing file: 36600
Total progress: 28.549810057957675%
Dataframe size: (870690, 20)


Preprocessing file: 36700
Total progress: 28.62781500347122%
Dataframe size: (873249, 20)


Preprocessing file: 36800
Total progress: 28.705819948984768%
Dataframe si

Preprocessing file: 44700
Total progress: 34.868210644554864%
Dataframe size: (1065486, 20)


Preprocessing file: 44800
Total progress: 34.946215590068405%
Dataframe size: (1067975, 20)


Preprocessing file: 44900
Total progress: 35.024220535581954%
Dataframe size: (1070480, 20)


Preprocessing file: 45000
Total progress: 35.1022254810955%
Dataframe size: (1072806, 20)


Preprocessing file: 45100
Total progress: 35.18023042660905%
Dataframe size: (1074861, 20)


Preprocessing file: 45200
Total progress: 35.25823537212259%
Dataframe size: (1076952, 20)


Preprocessing file: 45300
Total progress: 35.33624031763614%
Dataframe size: (1079205, 20)


Preprocessing file: 45400
Total progress: 35.41424526314969%
Dataframe size: (1081627, 20)


Preprocessing file: 45500
Total progress: 35.49225020866323%
Dataframe size: (1083896, 20)


Preprocessing file: 45600
Total progress: 35.57025515417678%
Dataframe size: (1086416, 20)


Preprocessing file: 45700
Total progress: 35.64826009969032%
Datafra

Preprocessing file: 53500
Total progress: 41.73264584974687%
Dataframe size: (1275921, 20)


Preprocessing file: 53600
Total progress: 41.810650795260415%
Dataframe size: (1278514, 20)


Preprocessing file: 53700
Total progress: 41.88865574077397%
Dataframe size: (1281104, 20)


Preprocessing file: 53800
Total progress: 41.96666068628751%
Dataframe size: (1283656, 20)


Preprocessing file: 53900
Total progress: 42.04466563180105%
Dataframe size: (1285891, 20)


Preprocessing file: 54000
Total progress: 42.1226705773146%
Dataframe size: (1288499, 20)


Preprocessing file: 54100
Total progress: 42.20067552282815%
Dataframe size: (1290837, 20)


Preprocessing file: 54200
Total progress: 42.2786804683417%
Dataframe size: (1293437, 20)


Preprocessing file: 54300
Total progress: 42.35668541385524%
Dataframe size: (1295952, 20)


Preprocessing file: 54400
Total progress: 42.43469035936878%
Dataframe size: (1298474, 20)


Preprocessing file: 54500
Total progress: 42.51269530488233%
Dataframe 

Preprocessing file: 62300
Total progress: 48.59708105493888%
Dataframe size: (1487413, 20)


Preprocessing file: 62400
Total progress: 48.675086000452424%
Dataframe size: (1489922, 20)


Preprocessing file: 62500
Total progress: 48.75309094596598%
Dataframe size: (1492533, 20)


Preprocessing file: 62600
Total progress: 48.83109589147952%
Dataframe size: (1494937, 20)


Preprocessing file: 62700
Total progress: 48.90910083699306%
Dataframe size: (1497493, 20)


Preprocessing file: 62800
Total progress: 48.98710578250661%
Dataframe size: (1499210, 20)


Preprocessing file: 62900
Total progress: 49.06511072802016%
Dataframe size: (1501522, 20)


Preprocessing file: 63000
Total progress: 49.1431156735337%
Dataframe size: (1503863, 20)


Preprocessing file: 63100
Total progress: 49.22112061904725%
Dataframe size: (1506219, 20)


Preprocessing file: 63200
Total progress: 49.29912556456079%
Dataframe size: (1508604, 20)


Preprocessing file: 63300
Total progress: 49.37713051007434%
Dataframe

Preprocessing file: 71100
Total progress: 55.46151626013089%
Dataframe size: (1697412, 20)


Preprocessing file: 71200
Total progress: 55.53952120564444%
Dataframe size: (1699869, 20)


Preprocessing file: 71300
Total progress: 55.61752615115798%
Dataframe size: (1702179, 20)


Preprocessing file: 71400
Total progress: 55.69553109667152%
Dataframe size: (1704486, 20)


Preprocessing file: 71500
Total progress: 55.77353604218508%
Dataframe size: (1707034, 20)


Preprocessing file: 71600
Total progress: 55.85154098769862%
Dataframe size: (1709615, 20)


Preprocessing file: 71700
Total progress: 55.92954593321217%
Dataframe size: (1712196, 20)


Preprocessing file: 71800
Total progress: 56.00755087872571%
Dataframe size: (1714439, 20)


Preprocessing file: 71900
Total progress: 56.08555582423925%
Dataframe size: (1716581, 20)


Preprocessing file: 72000
Total progress: 56.163560769752806%
Dataframe size: (1718688, 20)


Preprocessing file: 72100
Total progress: 56.24156571526635%
Datafram

Preprocessing file: 79900
Total progress: 62.3259514653229%
Dataframe size: (1905879, 20)


Preprocessing file: 80000
Total progress: 62.40395641083645%
Dataframe size: (1908206, 20)


Preprocessing file: 80100
Total progress: 62.48196135634999%
Dataframe size: (1910521, 20)


Preprocessing file: 80200
Total progress: 62.55996630186353%
Dataframe size: (1912704, 20)


Preprocessing file: 80300
Total progress: 62.63797124737709%
Dataframe size: (1915357, 20)


Preprocessing file: 80400
Total progress: 62.71597619289063%
Dataframe size: (1917497, 20)


Preprocessing file: 80500
Total progress: 62.79398113840418%
Dataframe size: (1919842, 20)


Preprocessing file: 80600
Total progress: 62.87198608391772%
Dataframe size: (1922100, 20)


Preprocessing file: 80700
Total progress: 62.94999102943126%
Dataframe size: (1924361, 20)


Preprocessing file: 80800
Total progress: 63.02799597494481%
Dataframe size: (1926489, 20)


Preprocessing file: 80900
Total progress: 63.106000920458364%
Dataframe

Preprocessing file: 88800
Total progress: 69.26839161602845%
Dataframe size: (2118014, 20)


Preprocessing file: 88900
Total progress: 69.34639656154201%
Dataframe size: (2120633, 20)


Preprocessing file: 89000
Total progress: 69.42440150705555%
Dataframe size: (2123157, 20)


Preprocessing file: 89100
Total progress: 69.50240645256909%
Dataframe size: (2125648, 20)


Preprocessing file: 89200
Total progress: 69.58041139808265%
Dataframe size: (2127722, 20)


Preprocessing file: 89300
Total progress: 69.65841634359619%
Dataframe size: (2130172, 20)


Preprocessing file: 89400
Total progress: 69.73642128910973%
Dataframe size: (2132522, 20)


Preprocessing file: 89500
Total progress: 69.81442623462327%
Dataframe size: (2135103, 20)


Preprocessing file: 89600
Total progress: 69.89243118013681%
Dataframe size: (2137612, 20)


Preprocessing file: 89700
Total progress: 69.97043612565037%
Dataframe size: (2140175, 20)


Preprocessing file: 89800
Total progress: 70.04844107116391%
Dataframe

Preprocessing file: 97700
Total progress: 76.21083176673402%
Dataframe size: (2332651, 20)


Preprocessing file: 97800
Total progress: 76.28883671224756%
Dataframe size: (2335370, 20)


Preprocessing file: 97900
Total progress: 76.3668416577611%
Dataframe size: (2337566, 20)


Preprocessing file: 98000
Total progress: 76.44484660327466%
Dataframe size: (2339901, 20)


Preprocessing file: 98100
Total progress: 76.5228515487882%
Dataframe size: (2342472, 20)


Preprocessing file: 98200
Total progress: 76.60085649430174%
Dataframe size: (2344828, 20)


Preprocessing file: 98300
Total progress: 76.67886143981528%
Dataframe size: (2347357, 20)


Preprocessing file: 98400
Total progress: 76.75686638532882%
Dataframe size: (2349601, 20)


Preprocessing file: 98500
Total progress: 76.83487133084238%
Dataframe size: (2352082, 20)


Preprocessing file: 98600
Total progress: 76.91287627635592%
Dataframe size: (2354418, 20)


Preprocessing file: 98700
Total progress: 76.99088122186947%
Dataframe s

Preprocessing file: 106500
Total progress: 83.07526697192603%
Dataframe size: (2542492, 20)


Preprocessing file: 106600
Total progress: 83.15327191743957%
Dataframe size: (2544899, 20)


Preprocessing file: 106700
Total progress: 83.23127686295311%
Dataframe size: (2547435, 20)


Preprocessing file: 106800
Total progress: 83.30928180846666%
Dataframe size: (2549857, 20)


Preprocessing file: 106900
Total progress: 83.3872867539802%
Dataframe size: (2552372, 20)


Preprocessing file: 107000
Total progress: 83.46529169949375%
Dataframe size: (2554828, 20)


Preprocessing file: 107100
Total progress: 83.54329664500729%
Dataframe size: (2557170, 20)


Preprocessing file: 107200
Total progress: 83.62130159052083%
Dataframe size: (2559237, 20)


Preprocessing file: 107300
Total progress: 83.69930653603438%
Dataframe size: (2561513, 20)


Preprocessing file: 107400
Total progress: 83.77731148154794%
Dataframe size: (2564063, 20)


Preprocessing file: 107500
Total progress: 83.85531642706148%

Preprocessing file: 115300
Total progress: 89.93970217711804%
Dataframe size: (2748632, 20)


Preprocessing file: 115400
Total progress: 90.01770712263158%
Dataframe size: (2750879, 20)


Preprocessing file: 115500
Total progress: 90.09571206814512%
Dataframe size: (2753264, 20)


Preprocessing file: 115600
Total progress: 90.17371701365866%
Dataframe size: (2755895, 20)


Preprocessing file: 115700
Total progress: 90.25172195917222%
Dataframe size: (2758306, 20)


Preprocessing file: 115800
Total progress: 90.32972690468576%
Dataframe size: (2760661, 20)


Preprocessing file: 115900
Total progress: 90.4077318501993%
Dataframe size: (2762829, 20)


Preprocessing file: 116000
Total progress: 90.48573679571284%
Dataframe size: (2765341, 20)


Preprocessing file: 116100
Total progress: 90.5637417412264%
Dataframe size: (2768144, 20)


Preprocessing file: 116200
Total progress: 90.64174668673995%
Dataframe size: (2770428, 20)


Preprocessing file: 116300
Total progress: 90.71975163225349%


Preprocessing file: 124100
Total progress: 96.80413738231005%
Dataframe size: (2959749, 20)


Preprocessing file: 124200
Total progress: 96.88214232782359%
Dataframe size: (2961770, 20)


Preprocessing file: 124300
Total progress: 96.96014727333713%
Dataframe size: (2964561, 20)


Preprocessing file: 124400
Total progress: 97.03815221885067%
Dataframe size: (2966691, 20)


Preprocessing file: 124500
Total progress: 97.11615716436422%
Dataframe size: (2969010, 20)


Preprocessing file: 124600
Total progress: 97.19416210987777%
Dataframe size: (2971200, 20)


Preprocessing file: 124700
Total progress: 97.27216705539131%
Dataframe size: (2973729, 20)


Preprocessing file: 124800
Total progress: 97.35017200090485%
Dataframe size: (2976053, 20)


Preprocessing file: 124900
Total progress: 97.4281769464184%
Dataframe size: (2978788, 20)


Preprocessing file: 125000
Total progress: 97.50618189193196%
Dataframe size: (2981310, 20)


Preprocessing file: 125100
Total progress: 97.5841868374455%


In [5]:
# Coronation Drive has 3 lanes along most of its length, so drop the 4th-lane columns (assumption).
# The dbid, link_plan and married columns are dropped here as well.
# errors="ignore" keeps the cell re-runnable: once the columns are gone, a second run is a no-op
# instead of raising KeyError. The non-mutating form replaces inplace=True for the same reason.
df = df.drop(columns=["dbid", "link_plan", "married", "ds4", "mf4", "rf4"], errors="ignore")
# Displaying df directly shows the same (truncated) view as df.head(len(df))
df

Unnamed: 0,recorded,ct,ss,tsc,lane,ds1,mf1,rf1,ds2,mf2,rf2,ds3,mf3,rf3
4272,2021-04-17T04:38:00,80,2458,132,SA-217,12.0,3.0,2.0,6.0,2.0,2.0,,,
4273,2021-04-17T04:38:00,80,2458,132,SA-218,0.0,0.0,0.0,8.0,2.0,2.0,,,
4274,2021-04-17T04:38:00,80,2458,132,SA-219,17.0,2.0,1.0,,,,,,
4275,2021-04-17T04:38:00,80,2458,132,SA-220,0.0,0.0,0.0,13.0,2.0,1.0,,,
4276,2021-04-17T04:38:00,80,2458,132,SA-221,6.0,2.0,2.0,11.0,4.0,3.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,2021-06-08T07:19:00,135,2458,132,SA-217,34.0,14.0,15.0,86.0,41.0,44.0,,,
195,2021-06-08T07:19:00,135,2458,132,SA-218,79.0,40.0,39.0,66.0,34.0,34.0,,,
196,2021-06-08T07:19:00,135,2458,132,SA-219,55.0,6.0,4.0,,,,,,
197,2021-06-08T07:19:00,135,2458,132,SA-220,51.0,7.0,5.0,49.0,5.0,5.0,,,


In [6]:
# NaNs only appear in the dsN, mfN, rfN columns with N >= 2 --> these belong to the roads adjacent to
# Coronation Drive at intersections, which may be single lane. Since we are not interested in these
# roads, every row that contains a NaN is simply removed (assumption)
df = df.dropna(axis=0, how="any")
df

Unnamed: 0,recorded,ct,ss,tsc,lane,ds1,mf1,rf1,ds2,mf2,rf2,ds3,mf3,rf3
5042,2021-04-17T04:38:00,80,2473,188,SA-513,6.0,2.0,3.0,5.0,2.0,2.0,5.0,2.0,2.0
5044,2021-04-17T04:38:00,80,2480,221,SA-572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5066,2021-04-17T04:38:00,80,2446,87,SA-517,8.0,4.0,3.0,6.0,3.0,2.0,0.0,0.0,0.0
5086,2021-04-17T04:38:00,80,2475,203,SA-523,9.0,3.0,3.0,6.0,2.0,2.0,9.0,3.0,3.0
5091,2021-04-17T04:38:00,80,2519,364,SA-532,0.0,0.0,0.0,5.0,2.0,2.0,7.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4502,2021-04-13T02:10:00,60,2473,188,SA-513,0.0,0.0,0.0,6.0,2.0,2.0,0.0,0.0,0.0
4506,2021-04-13T02:10:00,60,2446,87,SA-517,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4512,2021-04-13T02:10:00,60,2475,203,SA-523,0.0,0.0,0.0,7.0,2.0,2.0,7.0,2.0,2.0
4517,2021-04-13T02:10:00,60,2469,181,SA-528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Convert the "recorded" column to datetime so the rows can be sorted chronologically.
# The explicit dtype check makes the cell safe to re-run after the column has been converted.
# It replaces a blanket `except AttributeError: pass`, which also silently skipped the conversion
# whenever the .str accessor failed for any other reason and then sorted unconverted values.
if not pd.api.types.is_datetime64_any_dtype(df["recorded"]):
    df["recorded"] = pd.to_datetime(
        # Timestamps look like "2021-04-17T04:38:00"; swap the "T" separator for a space
        df["recorded"].str.replace("T", " ", regex=False),
        format="%Y-%m-%d %H:%M:%S",
    )

# Sort dataframe according to time
df = df.sort_values("recorded", axis=0)
df.head()

Unnamed: 0,recorded,ct,ss,tsc,lane,ds1,mf1,rf1,ds2,mf2,rf2,ds3,mf3,rf3
3694,2021-02-20 20:04:00,110,2475,203,SA-523,38.0,12.0,11.0,38.0,12.0,11.0,19.0,6.0,5.0
3730,2021-02-20 20:04:00,110,2480,221,SA-572,36.0,9.0,10.0,60.0,17.0,18.0,16.0,4.0,5.0
3719,2021-02-20 20:04:00,110,2469,181,SA-528,41.0,12.0,12.0,37.0,10.0,11.0,23.0,7.0,6.0
3716,2021-02-20 20:04:00,110,2519,364,SA-532,27.0,10.0,10.0,33.0,12.0,12.0,10.0,5.0,4.0
3711,2021-02-20 20:04:00,110,2447,278,SA-550,43.0,20.0,19.0,14.0,11.0,10.0,10.0,8.0,7.0


In [8]:
# Write the preprocessed dataframe to a CSV file one directory up, leaving out the index column
df.to_csv("../preprocessed_data.csv", index=False)

In [9]:
# Show the [file name, exception] pairs recorded for files that could not be read or filtered
errors_list

[['Y2021M3D12H22M52S56.json', ValueError('Expected object or value')],
 ['Y2021M3D12H22M50S56.json', ValueError('Expected object or value')],
 ['Y2021M3D23H17M0S27.json', ValueError('Expected object or value')],
 ['Y2021M3D12H22M54S56.json', ValueError('Expected object or value')],
 ['Y2021M3D12H23M20S8.json', ValueError('Expected object or value')],
 ['Y2021M4D14H19M11S3.json',
  ValueError('Unmatched \'\'"\' when when decoding \'string\'')],
 ['Y2021M3D12H23M21S9.json', ValueError('Expected object or value')],
 ['Y2021M4D28H13M28S48.json',
  ValueError("Unexpected character found when decoding 'Infinity'")]]