In [1]:
import pandas as pd
import numpy as np

In [2]:
df_bikeShare2 = pd.read_csv('bikeshare2020cleaner.csv')

In [3]:
df_bikeShare2 = df_bikeShare2.drop(['Trip Id', 'Start Station Id', 'End Station Id', 'End Time', 'Bike Id', 'Start Year', 'End Year', 'Trip End Time'], axis = 1)

In [4]:
#one hot encoding for classes (member type)
from numpy import asarray
from sklearn.preprocessing import OneHotEncoder

In [5]:
data = asarray([['Annual Member'], ['Casual Member']])
encoder = OneHotEncoder(sparse=False)
memType = encoder.fit_transform(data)

In [6]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)

In [7]:
df = encode_and_bind(df_bikeShare2, 'User Type')

In [8]:
# Create the day of week variable
from datetime import datetime
import calendar

In [9]:
df['Date'] = pd.to_datetime(df['Start Time']).dt.date

In [10]:
df['Date'].describe()

count        2788330
unique           366
top       2020-09-06
freq           22191
Name: Date, dtype: object

In [11]:
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d").dt.strftime("%m/%d/%Y")

In [12]:
df['Date'].describe()

count        2788330
unique           366
top       09/06/2020
freq           22191
Name: Date, dtype: object

In [13]:
df

Unnamed: 0.1,Unnamed: 0,Trip Duration,Start Time,Start Station Name,End Station Name,User Type,Start Month,Start Day,End Month,End Day,Trip Start Time,User Type_Annual Member,User Type_Casual Member,Date
0,0,648.0,2020-01-01 00:08:00,Madison Ave / Bloor St W,Yonge St / Alexander St - SMART,Annual Member,1,1,1,1,00:08:00,1,0,01/01/2020
1,1,419.0,2020-01-01 00:10:00,College St / Huron St,Yonge St / Wood St,Annual Member,1,1,1,1,00:10:00,1,0,01/01/2020
2,2,566.0,2020-01-01 00:13:00,Parliament St / Aberdeen Ave,Front St E / Cherry St,Annual Member,1,1,1,1,00:13:00,1,0,01/01/2020
3,3,1274.0,2020-01-01 00:17:00,King St E / Victoria St,Sherbourne St / Isabella St,Annual Member,1,1,1,1,00:17:00,1,0,01/01/2020
4,4,906.0,2020-01-01 00:19:00,King St E / Jarvis St,University Ave / Elm St,Casual Member,1,1,1,1,00:19:00,0,1,01/01/2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2788325,2911303,330.0,2020-12-31 23:52:00,King St W / Spadina Ave,Wellington St W / Stafford St,Annual Member,12,31,12,31,23:52:00,1,0,12/31/2020
2788326,2911304,216.0,2020-12-31 23:54:00,Humber Bay Shores Park West,Humber Bay Shores Park / Marine Parade Dr,Annual Member,12,31,12,31,23:54:00,1,0,12/31/2020
2788327,2911305,204.0,2020-12-31 23:54:00,Humber Bay Shores Park West,Humber Bay Shores Park / Marine Parade Dr,Annual Member,12,31,12,31,23:54:00,1,0,12/31/2020
2788328,2911306,1659.0,2020-12-31 23:56:00,Church St / Dundas St E - SMART,Church St / Dundas St E - SMART,Annual Member,12,31,1,1,23:56:00,1,0,12/31/2020


In [14]:
df['Date'] =  pd.to_datetime(df['Date'], format="%m/%d/%Y")

In [15]:
df['Day of Week'] = df['Date'].dt.dayofweek

In [16]:
print(df['Day of Week'])

0          2
1          2
2          2
3          2
4          2
          ..
2788325    3
2788326    3
2788327    3
2788328    3
2788329    3
Name: Day of Week, Length: 2788330, dtype: int64


In [17]:
print(df.dtypes)

Unnamed: 0                          int64
Trip  Duration                    float64
Start Time                         object
Start Station Name                 object
End Station Name                   object
User Type                          object
Start Month                         int64
Start Day                           int64
End Month                           int64
End Day                             int64
Trip Start Time                    object
User Type_Annual Member             uint8
User Type_Casual Member             uint8
Date                       datetime64[ns]
Day of Week                         int64
dtype: object


In [18]:
#add temperature and precipitation variables
weather = pd.read_csv('Toronto Weather 2020.csv')

In [19]:
weather.rename(columns={'Date/Time': 'Date'}, inplace=True)

In [20]:
weather

Unnamed: 0,Station Name,Date,Year,Month,Day,Mean Temp (°C),Total Precip (mm)
0,TORONTO CITY,1/1/2020,2020,1,1,-0.1,0.2
1,TORONTO CITY,1/2/2020,2020,1,2,3.6,0.0
2,TORONTO CITY,1/3/2020,2020,1,3,5.7,0.0
3,TORONTO CITY,1/4/2020,2020,1,4,2.0,1.5
4,TORONTO CITY,1/5/2020,2020,1,5,0.3,5.6
...,...,...,...,...,...,...,...
361,TORONTO CITY,12/27/2020,2020,12,27,1.1,0.5
362,TORONTO CITY,12/28/2020,2020,12,28,1.7,3.4
363,TORONTO CITY,12/29/2020,2020,12,29,-2.3,0.0
364,TORONTO CITY,12/30/2020,2020,12,30,1.7,5.9


In [21]:
weather['Date'] =  pd.to_datetime(weather['Date'], format="%m/%d/%Y")

In [22]:
bikeShareWeather = pd.merge(df, weather, on = "Date")

In [23]:
bikeShareWeather

Unnamed: 0.1,Unnamed: 0,Trip Duration,Start Time,Start Station Name,End Station Name,User Type,Start Month,Start Day,End Month,End Day,...,User Type_Annual Member,User Type_Casual Member,Date,Day of Week,Station Name,Year,Month,Day,Mean Temp (°C),Total Precip (mm)
0,0,648.0,2020-01-01 00:08:00,Madison Ave / Bloor St W,Yonge St / Alexander St - SMART,Annual Member,1,1,1,1,...,1,0,2020-01-01,2,TORONTO CITY,2020,1,1,-0.1,0.2
1,1,419.0,2020-01-01 00:10:00,College St / Huron St,Yonge St / Wood St,Annual Member,1,1,1,1,...,1,0,2020-01-01,2,TORONTO CITY,2020,1,1,-0.1,0.2
2,2,566.0,2020-01-01 00:13:00,Parliament St / Aberdeen Ave,Front St E / Cherry St,Annual Member,1,1,1,1,...,1,0,2020-01-01,2,TORONTO CITY,2020,1,1,-0.1,0.2
3,3,1274.0,2020-01-01 00:17:00,King St E / Victoria St,Sherbourne St / Isabella St,Annual Member,1,1,1,1,...,1,0,2020-01-01,2,TORONTO CITY,2020,1,1,-0.1,0.2
4,4,906.0,2020-01-01 00:19:00,King St E / Jarvis St,University Ave / Elm St,Casual Member,1,1,1,1,...,0,1,2020-01-01,2,TORONTO CITY,2020,1,1,-0.1,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2788325,2911303,330.0,2020-12-31 23:52:00,King St W / Spadina Ave,Wellington St W / Stafford St,Annual Member,12,31,12,31,...,1,0,2020-12-31,3,TORONTO CITY,2020,12,31,0.7,0.0
2788326,2911304,216.0,2020-12-31 23:54:00,Humber Bay Shores Park West,Humber Bay Shores Park / Marine Parade Dr,Annual Member,12,31,12,31,...,1,0,2020-12-31,3,TORONTO CITY,2020,12,31,0.7,0.0
2788327,2911305,204.0,2020-12-31 23:54:00,Humber Bay Shores Park West,Humber Bay Shores Park / Marine Parade Dr,Annual Member,12,31,12,31,...,1,0,2020-12-31,3,TORONTO CITY,2020,12,31,0.7,0.0
2788328,2911306,1659.0,2020-12-31 23:56:00,Church St / Dundas St E - SMART,Church St / Dundas St E - SMART,Annual Member,12,31,1,1,...,1,0,2020-12-31,3,TORONTO CITY,2020,12,31,0.7,0.0


In [24]:
#drop all columns not using as features this time
bikeShareWeather = bikeShareWeather.drop(['Start Station Name','End Station Name', 'User Type_Annual Member', 'User Type_Casual Member', 'Year', 'Start Month', 'Start Day', 'End Month', 'End Day', 'User Type', 'Station Name'], axis = 1)

In [25]:
df.to_csv('bikeshare2020new.csv')

In [26]:
import pandas as pd
import numpy as np

In [27]:
bikeShareNew = pd.read_csv('bikeshare2020new.csv')

In [28]:
bikeShareNew['Start Time'] = pd.to_datetime(bikeShareNew['Start Time'], format="%Y-%m-%d %H:%M:%S")

In [29]:
#group rides by start date and time (hour) ('Start Time') and create a 'Ride Count' column
s = pd.to_datetime(bikeShareNew['Start Time'])
bikeShareCount = s.groupby(s.dt.floor('H')).size().reset_index(name='Ride Count')

In [30]:
bikeShareCount

Unnamed: 0,Start Time,Ride Count
0,2020-01-01 00:00:00,26
1,2020-01-01 01:00:00,49
2,2020-01-01 02:00:00,58
3,2020-01-01 03:00:00,20
4,2020-01-01 04:00:00,15
...,...,...
8755,2020-12-31 19:00:00,148
8756,2020-12-31 20:00:00,104
8757,2020-12-31 21:00:00,102
8758,2020-12-31 22:00:00,65


In [31]:
bikeShareNew2 = pd.merge(bikeShareNew, bikeShareCount, on = "Start Time")

In [32]:
bikeShareRF = bikeShareNew2

In [33]:
bikeShareRF

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Trip Duration,Start Time,Start Station Name,End Station Name,User Type,Start Month,Start Day,End Month,End Day,Trip Start Time,User Type_Annual Member,User Type_Casual Member,Date,Day of Week,Ride Count
0,75,76,518.0,2020-01-01 02:00:00,Yonge St / Wood St,Sherbourne St / Isabella St,Annual Member,1,1,1,1,02:00:00,1,0,2020-01-01,2,58
1,133,134,1946.0,2020-01-01 03:00:00,Dovercourt Rd / Harrison St (Green P) - SMART,Roncesvalles Ave / Marmaduke St,Annual Member,1,1,1,1,03:00:00,1,0,2020-01-01,2,20
2,134,135,560.0,2020-01-01 03:00:00,Queen St W / Ossington Ave,Vanauley St / Queen St W - SMART,Annual Member,1,1,1,1,03:00:00,1,0,2020-01-01,2,20
3,319,324,364.0,2020-01-01 11:00:00,Claremont St / Dundas St W,Bloor St W / Manning Ave - SMART,Annual Member,1,1,1,1,11:00:00,1,0,2020-01-01,2,68
4,320,325,1326.0,2020-01-01 11:00:00,Mill St / Tannery Rd,Riverdale Park South (Broadview Ave),Annual Member,1,1,1,1,11:00:00,1,0,2020-01-01,2,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45193,2787863,2910816,49.0,2020-12-31 19:00:00,Jarvis St / Dundas St E,Dundas St E / George St,Annual Member,12,31,12,31,19:00:00,1,0,2020-12-31,3,148
45194,2787864,2910817,511.0,2020-12-31 19:00:00,Yonge St / Harbour St,Dundas St W / Yonge St,Annual Member,12,31,12,31,19:00:00,1,0,2020-12-31,3,148
45195,2788011,2910971,924.0,2020-12-31 20:00:00,Claremont St / Dundas St W,Little Norway Park,Annual Member,12,31,12,31,20:00:00,1,0,2020-12-31,3,104
45196,2788115,2911082,908.0,2020-12-31 21:00:00,College St / Huron St,Ted Rogers Way / Bloor St E,Annual Member,12,31,12,31,21:00:00,1,0,2020-12-31,3,102


In [34]:
bikeShareRF['Start Time'] = pd.to_numeric(bikeShareRF['Start Time'])

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)

In [37]:
#set features and targets 
labels = np.array(bikeShareRF['Ride Count'])

In [38]:
features= bikeShareRF.drop('Ride Count', axis = 1, inplace=True)

In [39]:
feature_list = list(bikeShareRF.columns)

In [40]:
features = np.array(bikeShareRF)

In [41]:
print(bikeShareNew2.dtypes)

Unnamed: 0                   int64
Unnamed: 0.1                 int64
Trip  Duration             float64
Start Time                   int64
Start Station Name          object
End Station Name            object
User Type                   object
Start Month                  int64
Start Day                    int64
End Month                    int64
End Day                      int64
Trip Start Time             object
User Type_Annual Member      int64
User Type_Casual Member      int64
Date                        object
Day of Week                  int64
dtype: object


In [42]:
#split the data into train and test sets
from sklearn.model_selection import train_test_split

In [43]:
training_features, testing_features, training_labels, testing_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [44]:
print('Training Features Shape:', training_features.shape)
print('Training Labels Shape:', training_labels.shape)
print('Testing Features Shape:', testing_features.shape)
print('Testing Labels Shape:', testing_labels.shape)

Training Features Shape: (33898, 16)
Training Labels Shape: (33898,)
Testing Features Shape: (11300, 16)
Testing Labels Shape: (11300,)


In [45]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 25)

In [46]:
print(bikeShareRF.dtypes)

Unnamed: 0                   int64
Unnamed: 0.1                 int64
Trip  Duration             float64
Start Time                   int64
Start Station Name          object
End Station Name            object
User Type                   object
Start Month                  int64
Start Day                    int64
End Month                    int64
End Day                      int64
Trip Start Time             object
User Type_Annual Member      int64
User Type_Casual Member      int64
Date                        object
Day of Week                  int64
dtype: object


In [47]:
#drop all columns not using as features this time
bikeShareRF = bikeShareRF.drop(['Start Station Name','End Station Name', 'User Type_Annual Member', 'User Type_Casual Member', 'Start Month', 'Start Day', 'End Month', 'End Day', 'User Type'], axis = 1)

In [48]:
rf.fit(training_features, training_labels)

ValueError: could not convert string to float: 'Euclid Ave / Herrick St - SMART'

In [None]:
# Use the predict method on the test data and calculate the absolute errors
predictions = rf.predict(testing_features)
errors = abs(predictions - test_labels)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'rides.')

In [None]:
# Calculate mean absolute percentage error (MAPE) and accuracy
mape = 100 * (errors / testing_labels)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')