Step 1: Import Libraries and Load Data

The dataset used in this notebook is nyctaxi_clus.RData. 

The path may need to be changed, because the directory layout in Dropbox might not match the one I used locally. 

In [15]:
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tools.eval_measures import rmse, aic

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

# Switch on rpy2's pandas conversion before any R object is accessed.
pandas2ri.activate()

# Location of the R workspace file; adjust it if the local directory layout differs.
RDATA_PATH = "./CrowdFlow/data/nyctaxi_clus.RData"

# Call R's load() on the workspace file. The call is left as the final
# expression so the notebook still displays its return value.
r_load = robjects.r['load']
r_load(RDATA_PATH)

0
'alldata'


Step 2: Process data

The original data consists of several tables. I separated them into individual dataframes so I could use them in Python. 

Data_flow contains the time index, region, inflow and outflow of the data. I added one extra column called traffic_count, which is the sum of inflow and outflow. 

Data_timemap consists of mappings of the 'time' column of the data_flow dataframe into actual time in the day, day in the week, hour in the day, hour in the week and the week number. 

Data_weather consists of weather information including temperature, pressure, humidity, wind speed, wind direction and weather (a categorical code from 0 to 16 for conditions such as sunny, rainy or snowy) for each time index. I focused on just the 'Weather' column of the data_weather dataframe. 

Data_flow and data_weather are merged together for easier processing. 

I also defined good weather as weather type 0 ~ 2 and bad weather is everything else. 

I then separated the data_flow dataset into a training set and a testing set. Since the data is poorly distributed, 75% of the time indices belong to the training set and 25% to the testing set, so that more weather types appear in the testing data. 

In [16]:
# Every table is fetched by position from the R list 'alldata'.
data = robjects.r['alldata']

# Flow table, extended with the combined outflow + inflow of each row.
data_flow = robjects.r['alldata'][0]
traffic_count = data_flow['outflow'] + data_flow['inflow']
data_flow['traffic_count'] = traffic_count

data_timemap = robjects.r['alldata'][2]
data_weather = robjects.r['alldata'][3]
testtime = robjects.r['alldata'][-2]

# Attach the weather columns to each flow row that shares its time index.
data_flow = data_flow.merge(data_weather, how='left', on='time')

# Time indices by weather code: codes up to 2 count as "good", anything
# above 2 counts as "bad".
is_good_code = data_weather['Weather'] <= 2
is_bad_code = data_weather['Weather'] > 2
good_weather = data_weather.loc[is_good_code, 'time'].values.tolist()
bad_weather = data_weather.loc[is_bad_code, 'time'].values.tolist()

# Weather_T flag: 0 = good, 1 = bad; rows whose time index is in neither
# list keep the initial -1.
weather_info = [-1] * len(data_flow)
data_flow['Weather_T'] = weather_info
data_flow.loc[data_flow['time'].isin(good_weather), 'Weather_T'] = 0
data_flow.loc[data_flow['time'].isin(bad_weather), 'Weather_T'] = 1

# Split on the time index: rows before SPLIT_TIME form the training set,
# rows at or after it form the testing set.
SPLIT_TIME = 4410
in_test_period = data_flow['time'] >= SPLIT_TIME
in_train_period = data_flow['time'] < SPLIT_TIME
test_data = data_flow.loc[in_test_period].reset_index()
train_data = data_flow.loc[in_train_period].reset_index()

This function returns all rows of data_timemap that have the same dayinweek and hourinday as a given time index. 

In [17]:
def newGetTimeFromTime(time, data_timemap):
    """Return the timemap rows sharing day-of-week and hour-of-day with ``time``.

    Args:
        time: Time index to look up in ``data_timemap['time']``.
        data_timemap: DataFrame with at least the columns ``'time'``,
            ``'dayinweek'`` and ``'hourinday'``.

    Returns:
        The rows of ``data_timemap`` whose ``'dayinweek'`` and ``'hourinday'``
        equal those of the first row matching ``time``.

    Raises:
        IndexError: If ``time`` does not occur in ``data_timemap['time']``.
    """
    # Locate the reference row once instead of re-filtering the table for
    # every column that is read from it.
    reference_rows = data_timemap.loc[data_timemap['time'] == time]
    if reference_rows.empty:
        raise IndexError(
            'time index {!r} not found in data_timemap'.format(time)
        )

    dayinweek = reference_rows['dayinweek'].values[0]
    hourinday = reference_rows['hourinday'].values[0]

    same_slot = (
        (data_timemap['dayinweek'] == dayinweek)
        & (data_timemap['hourinday'] == hourinday)
    )
    return data_timemap.loc[same_slot]

Step 3: Running the prediction. 

I traverse the entire testing set and, for each row, build a dataframe of all training data with the same region, same weather and same time slot (dayinweek and hourinday). I then take the averages of inflow, outflow and traffic_count and append them to lists. Repeating this for every row of the test data gives the predicted inflow, outflow and traffic_count for all testing data. 

In [18]:
# Historical-average prediction: for every test row, average the training
# rows that share its region, weather code and time slot.
predicted_outflow = []
predicted_inflow = []
predicted_trafficcount = []

# Cache of time index -> list of timemap time indices in the same slot, so
# a time index that appears on several test rows is looked up only once.
comparable_times = {}

test_rows = zip(test_data['time'], test_data['region'], test_data['Weather'])
for time, region, weather in test_rows:
    time = int(time)
    if time not in comparable_times:
        slot_rows = newGetTimeFromTime(time, data_timemap)
        comparable_times[time] = slot_rows['time'].values.tolist()

    # Narrow the training data step by step: region, then weather, then slot.
    dataofsameregion = train_data.loc[train_data['region'] == region]
    sameweathersameregion = dataofsameregion.loc[dataofsameregion['Weather'] == weather]
    df_new = sameweathersameregion[sameweathersameregion['time'].isin(comparable_times[time])]

    # When no training row matches, the mean (and so the prediction) is NaN.
    predicted_outflow.append(round(df_new['outflow'].mean(), 3))
    predicted_inflow.append(round(df_new['inflow'].mean(), 3))
    predicted_trafficcount.append(round(df_new['traffic_count'].mean(), 3))

# The lists are in test_data row order, so they can be assigned directly.
test_data['Pred_Outflow'] = predicted_outflow
test_data['Pred_Inflow'] = predicted_inflow
test_data['Pred_TrafficCount'] = predicted_trafficcount

Step 4. Generate squared error for inflow, outflow and traffic_count. 

In [19]:
# Per-row squared errors (actual - predicted) ** 2, computed with vectorised
# array arithmetic. Despite the 'MSE_' prefix, each column holds individual
# squared errors; the averaging happens where the columns are summarised.
test_data['MSE_Out'] = (test_data['outflow'].values - test_data['Pred_Outflow'].values) ** 2
test_data['MSE_In'] = (test_data['inflow'].values - test_data['Pred_Inflow'].values) ** 2
test_data['MSE_Traffic_Count'] = (test_data['traffic_count'].values - test_data['Pred_TrafficCount'].values) ** 2

Step 5. Print RMSE for inflow, outflow and traffic_count for testing set. 

First I printed RMSE for all 17 weather types. Then I printed RMSE for good weather and bad weather. Lastly I printed the total RMSE, without the weather impact. 

In [20]:
# Every printed figure is the square root of the mean of a squared-error
# column, i.e. an RMSE, even though the labels say "MSE".

# Section header and squared-error column for the per-weather-code report.
per_weather_sections = (
    ('-------- Outflow MSE for all weather type --------', 'MSE_Out'),
    ('-------- Inflow MSE for all weather type --------', 'MSE_In'),
    ('-------- Traffic_Count MSE for all weather type --------', 'MSE_Traffic_Count'),
)

for header, error_col in per_weather_sections:
    print(header)
    for i in range(17):
        rows_for_code = test_data.loc[test_data['Weather'] == i]
        print('Weather = ', i, ' ', math.sqrt(rows_for_code[error_col].mean()))

print('-------- Traffic_Count MSE for good and bad weather type --------')

# Label, Weather_T flag (0 = good, 1 = bad) and squared-error column.
weather_group_rows = (
    ('good weather inflow: ', 0, 'MSE_In'),
    ('bad weather inflow: ', 1, 'MSE_In'),
    ('good weather outflow: ', 0, 'MSE_Out'),
    ('bad weather outflow: ', 1, 'MSE_Out'),
    ('good weather TC: ', 0, 'MSE_Traffic_Count'),
    ('bad weather TC: ', 1, 'MSE_Traffic_Count'),
)

for label, weather_flag, error_col in weather_group_rows:
    rows_for_group = test_data.loc[test_data['Weather_T'] == weather_flag]
    print(label, math.sqrt(rows_for_group[error_col].mean()))

# Overall figures across the whole testing set, regardless of weather.
overall_rows = (
    ('MSE_TC: ', 'MSE_Traffic_Count'),
    ('MSE_Inflow: ', 'MSE_In'),
    ('MSE_Outflow: ', 'MSE_Out'),
)

for label, error_col in overall_rows:
    print(label, math.sqrt(test_data[error_col].mean()))

-------- Outflow MSE for all weather type --------
Weather =  0   289.15902815342514
Weather =  1   436.52818255650817
Weather =  2   343.4048522028582
Weather =  3   nan
Weather =  4   315.0948039730087
Weather =  5   227.61605633594255
Weather =  6   710.3917773080236
Weather =  7   nan
Weather =  8   nan
Weather =  9   nan
Weather =  10   nan
Weather =  11   nan
Weather =  12   nan
Weather =  13   nan
Weather =  14   nan
Weather =  15   nan
Weather =  16   nan
-------- Inflow MSE for all weather type --------
Weather =  0   281.91662738528544
Weather =  1   476.91114854998585
Weather =  2   320.8231936415857
Weather =  3   nan
Weather =  4   304.61809714035553
Weather =  5   250.1936401710479
Weather =  6   724.5771067821647
Weather =  7   nan
Weather =  8   nan
Weather =  9   nan
Weather =  10   nan
Weather =  11   nan
Weather =  12   nan
Weather =  13   nan
Weather =  14   nan
Weather =  15   nan
Weather =  16   nan
-------- Traffic_Count MSE for all weather type --------
Weather 