In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Training Dataset

In [2]:
# Load the four monthly CSV exports; the files carry no header row
March, April, May, June = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('03March.csv', '04April.csv', '05May.csv', '06June.csv')
)

In [3]:
# Keep only the first 17 columns of every monthly frame
March, April, May, June = (
    monthly_frame.iloc[:, :17]
    for monthly_frame in (March, April, May, June)
)

In [4]:
# Header names for the 17 retained columns, listed in file order
columns = [
    'Timestamp',
    'Station',
    'District',
    'Freeway',
    'DoT',
    'Lane Type',
    'Station Length',
    'Samples',
    '% Obs',
    'Total flow',
    'Avg Occ',
    'Ave Speed',
    'Lane N Samples',
    'Lane N Flow',
    'Lane N Avg Occ',
    'Lane N Avg Speed',
    'Lane N obs',
]

In [5]:
# Apply the shared header list to each monthly frame
for monthly_frame in (March, April, May, June):
    monthly_frame.columns = columns

In [6]:
# Preview the first five rows of the May frame to check the header assignment
May.head(n=5)

Unnamed: 0,Timestamp,Station,District,Freeway,DoT,Lane Type,Station Length,Samples,% Obs,Total flow,Avg Occ,Ave Speed,Lane N Samples,Lane N Flow,Lane N Avg Occ,Lane N Avg Speed,Lane N obs
0,05/01/2019 00:00:00,1007410,10,5,N,ML,0.501,36,100,116.0,0.0234,71.1,9.0,38.0,0.0231,78.4,1
1,05/01/2019 00:00:00,1016610,10,5,N,ML,0.612,30,100,102.0,0.0268,63.8,10.0,38.0,0.0215,74.8,1
2,05/01/2019 00:00:00,1021310,10,5,N,ML,0.726,27,67,92.0,0.0277,64.6,9.0,31.0,0.021,72.7,1
3,05/01/2019 00:00:00,1021410,10,5,N,ML,0.487,0,0,94.0,0.0297,63.2,0.0,33.0,0.0226,68.6,0
4,05/01/2019 00:00:00,1021510,10,5,N,ML,0.45,0,0,114.0,0.0278,65.7,0.0,63.0,0.0227,69.2,0


In [7]:
# Preview the last five rows of the March frame
March.tail(n=5)

Unnamed: 0,Timestamp,Station,District,Freeway,DoT,Lane Type,Station Length,Samples,% Obs,Total flow,Avg Occ,Ave Speed,Lane N Samples,Lane N Flow,Lane N Avg Occ,Lane N Avg Speed,Lane N obs
98071,03/31/2019 23:55:00,1022210,10,5,N,ML,1.497,0,0,9.0,0.0036,67.9,0.0,2.0,0.0026,71.9,0
98072,03/31/2019 23:55:00,1027410,10,5,N,ML,0.249,0,0,77.0,0.0219,65.8,0.0,41.0,0.0226,68.5,0
98073,03/31/2019 23:55:00,1096310,10,5,N,ML,0.669,30,100,123.0,0.026,70.0,10.0,43.0,0.0223,71.6,1
98074,03/31/2019 23:55:00,1096710,10,5,N,ML,0.249,30,100,118.0,0.0276,68.0,10.0,41.0,0.0244,71.8,1
98075,03/31/2019 23:55:00,10115310,10,5,N,ML,0.73,24,100,66.0,0.022,59.1,8.0,11.0,0.006,71.6,1


In [8]:
# Stack the four monthly frames row-wise under a fresh 0..n-1 index
monthly_frames = [March, April, May, June]
df = pd.concat(monthly_frames, ignore_index=True)

In [9]:
# Total number of rows in the combined frame
df.shape[0]

385624

In [10]:
# Pass a list of column names (the important variables).
# .copy() makes the subset an independent frame, so the later in-place
# relabelling of 'Station' does not raise SettingWithCopyWarning.
df = df[['Timestamp','Station','Total flow']].copy()

In [11]:
# Display the reduced long-format frame (Timestamp, Station, Total flow)
df

Unnamed: 0,Timestamp,Station,Total flow
0,03/01/2019 00:00:00,1007410,132.0
1,03/01/2019 00:00:00,1016610,128.0
2,03/01/2019 00:00:00,1021310,187.0
3,03/01/2019 00:00:00,1021410,125.0
4,03/01/2019 00:00:00,1021510,125.0
...,...,...,...
385619,06/30/2019 23:55:00,1022210,9.0
385620,06/30/2019 23:55:00,1027410,86.0
385621,06/30/2019 23:55:00,1096310,114.0
385622,06/30/2019 23:55:00,1096710,125.0


In [12]:
# Replace Station ID
# Raw station IDs listed in label order: the k-th entry (1-based) becomes 'f<k>'
station_ids = [
    1096710, 1096310, 1007410, 1021310, 1021410, 1021510,
    1016610, 1027410, 10115310, 1022010, 1022210,
]
mymap = {
    station_id: 'f' + str(rank)
    for rank, station_id in enumerate(station_ids, start=1)
}
# Translate raw station IDs to their short labels; IDs absent from mymap become NaN
station_labels = df['Station'].map(mymap)
df['Station'] = station_labels

In [13]:
# Number of rows recorded per station label
station_counts = df['Station'].value_counts()
station_counts

f4     35083
f8     35083
f7     35083
f9     35083
f1     35083
f6     35083
f5     35083
f3     35083
f2     35083
f10    35082
f11    34795
Name: Station, dtype: int64

In [14]:
# Count of distinct timestamps across all stations
timestamps = df['Timestamp']
timestamps.nunique()

35083

In [15]:
# Re-arrange the Dataframe to spatio-temporal format:
# one row per Timestamp, one column per Station, cells hold 'Total flow'
df = pd.pivot(df, index='Timestamp', columns='Station', values='Total flow')

In [16]:
# List the station labels that became columns after the pivot
df.columns

Index(['f1', 'f10', 'f11', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9'], dtype='object', name='Station')

In [17]:
# f11 is incomplete (it has fewer rows than the other stations), so drop it.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run after the column has already been removed.
df = df.drop(columns='f11', errors='ignore')

In [18]:
# Display the pivoted frame after the f11 column has been dropped
df

Station,f1,f10,f2,f3,f4,f5,f6,f7,f8,f9
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
03/01/2019 00:00:00,104.0,45.0,98.0,132.0,187.0,125.0,125.0,128.0,72.0,38.0
03/01/2019 00:05:00,100.0,77.0,95.0,135.0,121.0,118.0,125.0,119.0,73.0,72.0
03/01/2019 00:10:00,110.0,73.0,124.0,132.0,134.0,128.0,112.0,118.0,74.0,73.0
03/01/2019 00:15:00,110.0,90.0,102.0,103.0,127.0,103.0,100.0,106.0,74.0,88.0
03/01/2019 00:20:00,101.0,62.0,98.0,117.0,132.0,111.0,114.0,109.0,71.0,66.0
...,...,...,...,...,...,...,...,...,...,...
06/30/2019 23:35:00,141.0,84.0,143.0,150.0,134.0,141.0,142.0,153.0,92.0,110.0
06/30/2019 23:40:00,132.0,89.0,128.0,139.0,134.0,117.0,116.0,114.0,82.0,98.0
06/30/2019 23:45:00,106.0,85.0,111.0,123.0,105.0,118.0,127.0,146.0,86.0,94.0
06/30/2019 23:50:00,137.0,95.0,140.0,141.0,137.0,121.0,103.0,124.0,75.0,109.0


In [19]:
# Re-arrange spatial columns so the stations run sequentially f1 ... f10
# (the pivot leaves them in string order, where 'f10' follows 'f1')
station_order = ['f' + str(k) for k in range(1, 11)]
df = df[station_order]
df

Station,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
03/01/2019 00:00:00,104.0,98.0,132.0,187.0,125.0,125.0,128.0,72.0,38.0,45.0
03/01/2019 00:05:00,100.0,95.0,135.0,121.0,118.0,125.0,119.0,73.0,72.0,77.0
03/01/2019 00:10:00,110.0,124.0,132.0,134.0,128.0,112.0,118.0,74.0,73.0,73.0
03/01/2019 00:15:00,110.0,102.0,103.0,127.0,103.0,100.0,106.0,74.0,88.0,90.0
03/01/2019 00:20:00,101.0,98.0,117.0,132.0,111.0,114.0,109.0,71.0,66.0,62.0
...,...,...,...,...,...,...,...,...,...,...
06/30/2019 23:35:00,141.0,143.0,150.0,134.0,141.0,142.0,153.0,92.0,110.0,84.0
06/30/2019 23:40:00,132.0,128.0,139.0,134.0,117.0,116.0,114.0,82.0,98.0,89.0
06/30/2019 23:45:00,106.0,111.0,123.0,105.0,118.0,127.0,146.0,86.0,94.0,85.0
06/30/2019 23:50:00,137.0,140.0,141.0,137.0,121.0,103.0,124.0,75.0,109.0,95.0


In [20]:
# Persist the station-ordered flow table to disk
train_val_path = '000TrainVal.csv'
df.to_csv(train_val_path)

# Test Dataset

In [21]:
# Load the two test CSV exports; the files carry no header row
Weekday, Weekend = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('07Weekday.csv', '08Weekend.csv')
)

In [22]:
# Keep only the first 17 columns of each test frame
Weekday, Weekend = (
    test_frame.iloc[:, :17]
    for test_frame in (Weekday, Weekend)
)

In [23]:
# Give both test frames the same header list used for the training months
for test_frame in (Weekday, Weekend):
    test_frame.columns = columns

In [24]:
# Only these three variables are required.
# .copy() makes each subset an independent frame, so the later in-place
# relabelling of 'Station' does not raise SettingWithCopyWarning.
Weekday = Weekday[['Timestamp','Station','Total flow']].copy()
Weekend = Weekend[['Timestamp','Station','Total flow']].copy()

In [25]:
# Translate raw station IDs to their short labels in both test frames
for test_frame in (Weekday, Weekend):
    test_frame['Station'] = test_frame['Station'].map(mymap)

In [26]:
# Re-arrange the data to time-series format:
# one row per Timestamp, one column per Station, cells hold 'Total flow'
pivot_spec = {'index': 'Timestamp', 'columns': 'Station', 'values': 'Total flow'}
Weekday = Weekday.pivot(**pivot_spec)
Weekend = Weekend.pivot(**pivot_spec)

In [27]:
# Drop the f11 column (not required).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run after the column has already been removed.
Weekday = Weekday.drop(columns='f11', errors='ignore')
Weekend = Weekend.drop(columns='f11', errors='ignore')

In [28]:
# Re-arrange the station header to be sequential (f1 ... f10)
station_order = ['f' + str(k) for k in range(1, 11)]
Weekday = Weekday[station_order]
Weekend = Weekend[station_order]

In [29]:
# Display the station-ordered weekday test table
Weekday

Station,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
08/06/2019 00:00:00,99,98,115,92,97,103,111,66,65,64
08/06/2019 00:05:00,104,120,123,112,117,106,111,65,73,69
08/06/2019 00:10:00,99,104,118,91,88,117,126,66,75,75
08/06/2019 00:15:00,119,119,118,106,99,111,121,77,74,74
08/06/2019 00:20:00,105,106,110,90,94,98,105,72,84,72
...,...,...,...,...,...,...,...,...,...,...
08/06/2019 23:35:00,169,185,173,137,129,141,158,87,87,85
08/06/2019 23:40:00,132,123,165,137,137,132,141,77,64,69
08/06/2019 23:45:00,143,141,135,107,105,142,145,81,86,81
08/06/2019 23:50:00,96,95,114,105,103,104,113,76,80,69


In [30]:
# Display the station-ordered weekend test table
Weekend

Station,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
08/10/2019 00:00:00,117,134,151,135,150,118,123,93,115,91
08/10/2019 00:05:00,149,135,181,159,155,145,157,92,77,72
08/10/2019 00:10:00,139,135,157,130,126,106,119,82,101,91
08/10/2019 00:15:00,120,112,129,108,97,123,141,77,80,77
08/10/2019 00:20:00,129,134,135,126,114,126,135,81,69,64
...,...,...,...,...,...,...,...,...,...,...
08/10/2019 23:35:00,177,188,209,178,163,172,198,118,136,128
08/10/2019 23:40:00,183,196,228,186,174,160,187,105,108,98
08/10/2019 23:45:00,217,198,197,186,174,177,215,119,141,123
08/10/2019 23:50:00,209,219,193,184,155,150,176,108,123,115


In [31]:
# Write each test table to its own CSV file
for test_table, csv_name in ((Weekday, '00Weekday.csv'), (Weekend, '00Weekend.csv')):
    test_table.to_csv(csv_name)