In [167]:
# Mount Google Drive (Colab only); the CSV inputs read below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [168]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import holidays
import itertools
from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, max_error, mean_absolute_error, r2_score

# 1. Import Data

In [169]:
# Station table with one cluster label per station; the first CSV column is used as the index.
df_data_w_clusters = pd.read_csv('/content/drive/My Drive/MIE498 Thesis/Share-Bike-Station-Clustering-and-Usage-Prediction/station_data_w_clusters.csv', index_col=0)

In [170]:
df_data_w_clusters.head()

Unnamed: 0,station_id,station_na,lat,lon,neighborhood_id,FSA_code,da_id,cluster
0,7021,Bay St / Albert St,43.653264,-79.382458,76,M5G,35204616.0,0
1,7160,King St W / Tecumseth St,43.643333,-79.405556,82,M5V,35204142.0,35
2,7012,Elizabeth St / Edward St (Bus Terminal),43.656026,-79.385327,76,M5G,35204607.0,16
3,7041,Edward St / Yonge St,43.656729,-79.382736,76,M5G,35204608.0,16
4,7275,Queen St W / James St,43.652276,-79.380701,76,M5G,35204616.0,34


In [171]:
# Import ridership data
data_dir = '/content/drive/My Drive/MIE498 Thesis/0_Data'
merged_bike_data_2019 = pd.read_csv("{}/ridership_2019_with_bike_stations_info_20200930.csv".format(data_dir), header=0)
print(merged_bike_data_2019.shape)
# Derive the end day-of-year in one vectorised pass; a row-wise apply + strptime
# over millions of trips is far slower and yields the same values.
merged_bike_data_2019['End Day of Year'] = (
    pd.to_datetime(merged_bike_data_2019['End Time'], format="%Y-%m-%d %H:%M:%S")
    .dt.dayofyear
    .astype('int64')  # keep the integer dtype the row-wise version produced
)
print(merged_bike_data_2019.shape)
merged_bike_data_2019.head()

(2438720, 26)
(2438720, 27)


Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,User Type,Bike Id,Start Year,Start Month,Start Hour,Start Day of Week,Start Holiday,End Year,End Month,End Hour,End Day of Week,End Holiday,Start Day of Year,Start Week of Year,Start Lat,Start Lon,End Lat,End Lon,End Day of Year
0,4581278.0,1547.0,7021.0,2019-01-01 00:08:00,Bay St / Albert St,7233.0,2019-01-01 00:33:00,King / Cowan Ave - SMART,Annual Member,1296.0,2019.0,1.0,0.0,1.0,1.0,2019.0,1.0,0.0,1.0,1.0,1.0,1.0,43.653264,-79.382458,43.637922,-79.431734,1
1,4586979.0,1243.0,7021.0,2019-01-03 17:13:00,Bay St / Albert St,7233.0,2019-01-03 17:34:00,King / Cowan Ave - SMART,Annual Member,3168.0,2019.0,1.0,17.0,3.0,0.0,2019.0,1.0,17.0,3.0,0.0,3.0,1.0,43.653264,-79.382458,43.637922,-79.431734,3
2,4591641.0,1112.0,7021.0,2019-01-04 21:42:00,Bay St / Albert St,7233.0,2019-01-04 22:00:00,King / Cowan Ave - SMART,Annual Member,3512.0,2019.0,1.0,21.0,4.0,0.0,2019.0,1.0,22.0,4.0,0.0,4.0,1.0,43.653264,-79.382458,43.637922,-79.431734,4
3,4594328.0,1156.0,7021.0,2019-01-05 23:21:00,Bay St / Albert St,7233.0,2019-01-05 23:40:00,King / Cowan Ave - SMART,Annual Member,212.0,2019.0,1.0,23.0,5.0,0.0,2019.0,1.0,23.0,5.0,0.0,5.0,1.0,43.653264,-79.382458,43.637922,-79.431734,5
4,4596322.0,1068.0,7021.0,2019-01-06 21:42:00,Bay St / Albert St,7233.0,2019-01-06 22:00:00,King / Cowan Ave - SMART,Annual Member,3279.0,2019.0,1.0,21.0,6.0,0.0,2019.0,1.0,22.0,6.0,0.0,6.0,1.0,43.653264,-79.382458,43.637922,-79.431734,6


In [172]:
# Attach the cluster label of the start station and of the end station to every trip.
cluster_lookup = df_data_w_clusters[['station_id', 'cluster']]
merged_data = merged_bike_data_2019.copy()
for trip_end in ('Start', 'End'):
  merged_data = (
      merged_data
      .merge(cluster_lookup, how='left', left_on='{} Station Id'.format(trip_end), right_on='station_id')
      .drop('station_id', axis=1)
      .rename({'cluster': '{} Cluster'.format(trip_end)}, axis=1)
  )

In [173]:
merged_data.isnull().sum()

Trip Id               0
Trip Duration         0
Start Station Id      0
Start Time            0
Start Station Name    0
End Station Id        0
End Time              0
End Station Name      0
User Type             0
Bike Id               0
Start Year            0
Start Month           0
Start Hour            0
Start Day of Week     0
Start Holiday         0
End Year              0
End Month             0
End Hour              0
End Day of Week       0
End Holiday           0
Start Day of Year     0
Start Week of Year    0
Start Lat             0
Start Lon             0
End Lat               0
End Lon               0
End Day of Year       0
Start Cluster         0
End Cluster           0
dtype: int64

In [174]:
merged_data.columns

Index(['Trip Id', 'Trip Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'User Type', 'Bike Id', 'Start Year', 'Start Month', 'Start Hour',
       'Start Day of Week', 'Start Holiday', 'End Year', 'End Month',
       'End Hour', 'End Day of Week', 'End Holiday', 'Start Day of Year',
       'Start Week of Year', 'Start Lat', 'Start Lon', 'End Lat', 'End Lon',
       'End Day of Year', 'Start Cluster', 'End Cluster'],
      dtype='object')

In [175]:
merged_data.head()

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,User Type,Bike Id,Start Year,Start Month,Start Hour,Start Day of Week,Start Holiday,End Year,End Month,End Hour,End Day of Week,End Holiday,Start Day of Year,Start Week of Year,Start Lat,Start Lon,End Lat,End Lon,End Day of Year,Start Cluster,End Cluster
0,4581278.0,1547.0,7021.0,2019-01-01 00:08:00,Bay St / Albert St,7233.0,2019-01-01 00:33:00,King / Cowan Ave - SMART,Annual Member,1296.0,2019.0,1.0,0.0,1.0,1.0,2019.0,1.0,0.0,1.0,1.0,1.0,1.0,43.653264,-79.382458,43.637922,-79.431734,1,0,32
1,4586979.0,1243.0,7021.0,2019-01-03 17:13:00,Bay St / Albert St,7233.0,2019-01-03 17:34:00,King / Cowan Ave - SMART,Annual Member,3168.0,2019.0,1.0,17.0,3.0,0.0,2019.0,1.0,17.0,3.0,0.0,3.0,1.0,43.653264,-79.382458,43.637922,-79.431734,3,0,32
2,4591641.0,1112.0,7021.0,2019-01-04 21:42:00,Bay St / Albert St,7233.0,2019-01-04 22:00:00,King / Cowan Ave - SMART,Annual Member,3512.0,2019.0,1.0,21.0,4.0,0.0,2019.0,1.0,22.0,4.0,0.0,4.0,1.0,43.653264,-79.382458,43.637922,-79.431734,4,0,32
3,4594328.0,1156.0,7021.0,2019-01-05 23:21:00,Bay St / Albert St,7233.0,2019-01-05 23:40:00,King / Cowan Ave - SMART,Annual Member,212.0,2019.0,1.0,23.0,5.0,0.0,2019.0,1.0,23.0,5.0,0.0,5.0,1.0,43.653264,-79.382458,43.637922,-79.431734,5,0,32
4,4596322.0,1068.0,7021.0,2019-01-06 21:42:00,Bay St / Albert St,7233.0,2019-01-06 22:00:00,King / Cowan Ave - SMART,Annual Member,3279.0,2019.0,1.0,21.0,6.0,0.0,2019.0,1.0,22.0,6.0,0.0,6.0,1.0,43.653264,-79.382458,43.637922,-79.431734,6,0,32


In [176]:
# check-out and check-in data split
# .copy() makes each frame independent of merged_data, so adding columns to them
# later does not trigger pandas' SettingWithCopyWarning.
checkout_data = merged_data[['Trip Id', 'Start Station Id', 'Start Time', 'Start Station Name', 'Start Year', 'Start Month', 'Start Hour',
       'Start Day of Week', 'Start Holiday', 'Start Day of Year', 'Start Week of Year', 'Start Lat', 'Start Lon', 'Start Cluster']].copy()
checkin_data = merged_data[['Trip Id', 'End Station Id', 'End Time', 'End Station Name', 'End Year', 'End Month',
       'End Hour', 'End Day of Week', 'End Holiday', 'End Lat', 'End Lon', 'End Day of Year', 'End Cluster']].copy()

In [177]:
# Training: Months 1-9, Testing: Months 10-12
def train_test_split(data, start_end_flag):
  """Split trips by calendar month into a training and a testing frame.

  Args:
    data: DataFrame holding a '<start_end_flag> Month' column.
    start_end_flag: column prefix, e.g. 'Start' or 'End'.

  Returns:
    (training_data, testing_data): rows whose month is in 1..9 and in 10..12
    respectively (both bounds inclusive).
  """
  month = data['{} Month'.format(start_end_flag)]
  training_data = data.loc[month.between(1, 9)]
  testing_data = data.loc[month.between(10, 12)]
  print(data.shape, training_data.shape, testing_data.shape)
  return training_data, testing_data

In [178]:
def check_weekend(dayofweek):
  """Return 'weekend' when the day-of-week number is above 4, otherwise 'weekday'."""
  return 'weekend' if dayofweek > 4 else 'weekday'

# 2. Build prediction models for clusters (check-out numbers)

In [179]:
# Label every check-out as weekday/weekend. Mapping over the single column avoids a
# slow row-wise apply, and .assign builds a new frame instead of writing into a slice
# of merged_data (which is what raised SettingWithCopyWarning).
checkout_data = checkout_data.assign(**{'weekday/weekend': checkout_data['Start Day of Week'].map(check_weekend)})
checkout_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Trip Id,Start Station Id,Start Time,Start Station Name,Start Year,Start Month,Start Hour,Start Day of Week,Start Holiday,Start Day of Year,Start Week of Year,Start Lat,Start Lon,Start Cluster,weekday/weekend
0,4581278.0,7021.0,2019-01-01 00:08:00,Bay St / Albert St,2019.0,1.0,0.0,1.0,1.0,1.0,1.0,43.653264,-79.382458,0,weekday
1,4586979.0,7021.0,2019-01-03 17:13:00,Bay St / Albert St,2019.0,1.0,17.0,3.0,0.0,3.0,1.0,43.653264,-79.382458,0,weekday
2,4591641.0,7021.0,2019-01-04 21:42:00,Bay St / Albert St,2019.0,1.0,21.0,4.0,0.0,4.0,1.0,43.653264,-79.382458,0,weekday
3,4594328.0,7021.0,2019-01-05 23:21:00,Bay St / Albert St,2019.0,1.0,23.0,5.0,0.0,5.0,1.0,43.653264,-79.382458,0,weekend
4,4596322.0,7021.0,2019-01-06 21:42:00,Bay St / Albert St,2019.0,1.0,21.0,6.0,0.0,6.0,1.0,43.653264,-79.382458,0,weekend


In [180]:
def fill_in_missing_combinations(data, start_end_flag, n_clusters=37, fill_value=0.000001):
  """Add a placeholder row for every (hour, weekday/weekend, cluster) combination absent from `data`.

  Args:
    data: DataFrame with '<flag> Hour', 'weekday/weekend', '<flag> Cluster' and
      'Trip Id' columns (here 'Trip Id' holds an aggregated count, not an id).
    start_end_flag: column prefix, 'Start' or 'End'.
    n_clusters: clusters are assumed to be labelled 0..n_clusters-1.
    fill_value: value written to 'Trip Id' for the added rows. It is a tiny
      positive number rather than 0 so that a later np.log of the counts
      stays finite.

  Returns:
    `data` itself when nothing is missing; otherwise a new DataFrame with the
    placeholder rows appended and the index renumbered from 0.
  """
  hour_col = '{} Hour'.format(start_end_flag)
  cluster_col = '{} Cluster'.format(start_end_flag)
  key_cols = [hour_col, 'weekday/weekend', cluster_col]

  hours = range(24)
  weekday_weekend = ['weekday', 'weekend']
  clusters = range(n_clusters)

  # Check missing combinations. Tuples in a set give O(1) lookups, and numeric
  # keys compare by value, so an hour stored as 0.0 still matches 0.
  observed = set(zip(data[hour_col], data['weekday/weekend'], data[cluster_col]))
  comb_list = [comb for comb in itertools.product(hours, weekday_weekend, clusters) if comb not in observed]
  print('number of missing combinations:', len(comb_list))

  if not comb_list:
    return data

  # Fill in missing combinations with a single concat (DataFrame.append was
  # removed in pandas 2.0 and appending row by row is quadratic).
  filler = pd.DataFrame(comb_list, columns=key_cols)
  filler['Trip Id'] = fill_value
  return pd.concat([data, filler], ignore_index=True)

In [181]:
def _average_monthly_counts(data, start_end_flag, n_months, show_head=False):
  """Count rows per (hour, weekday/weekend, cluster) group and divide by `n_months`."""
  hour_col = '{} Hour'.format(start_end_flag)
  key_cols = [hour_col, 'weekday/weekend', '{} Cluster'.format(start_end_flag)]
  counts = data[['Trip Id'] + key_cols].groupby(by=key_cols).count() / n_months
  if show_head:
    print(counts.head())
  counts = counts.reset_index(level=[0, 1, 2])
  counts[hour_col] = counts[hour_col].astype("int64")
  return counts


def _finalise_counts(counts, start_end_flag, name):
  """Sort by the three group keys, rename 'Trip Id' to `name`, and renumber the index."""
  key_cols = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag)]
  return counts.sort_values(by=key_cols).rename({'Trip Id': name}, axis=1).reset_index(drop=True)


def preprocess_data(train_data, test_data, start_end_flag, n_train_months=9, n_test_months=3):
  """Aggregate trip rows into average monthly counts per (hour, weekday/weekend, cluster).

  Args:
    train_data, test_data: trip-level frames with 'Trip Id', '<flag> Hour',
      'weekday/weekend' and '<flag> Cluster' columns.
    start_end_flag: 'Start' names the count column 'Number of Checkouts';
      any other value names it 'Number of Checkins'.
    n_train_months, n_test_months: divisors applied to the raw group counts
      (defaults match the months 1-9 / 10-12 split used in this notebook).

  Returns:
    (train_data, test_data): one row per group, with missing groups added by
    fill_in_missing_combinations, sorted by hour, weekday/weekend, cluster.
  """
  train_data = _average_monthly_counts(train_data, start_end_flag, n_train_months, show_head=True)
  test_data = _average_monthly_counts(test_data, start_end_flag, n_test_months)

  train_data = fill_in_missing_combinations(train_data, start_end_flag)
  test_data = fill_in_missing_combinations(test_data, start_end_flag)

  name = 'Number of Checkouts' if start_end_flag == 'Start' else 'Number of Checkins'

  train_data = _finalise_counts(train_data, start_end_flag, name)
  test_data = _finalise_counts(test_data, start_end_flag, name)

  print('Train Data Shape: ', train_data.shape)
  print('Test Data Shape: ', test_data.shape)

  return train_data, test_data

In [182]:
checkout_train, checkout_test = train_test_split(checkout_data, 'Start')

(2438720, 15) (1970776, 15) (467944, 15)


In [183]:
checkout_train, checkout_test = preprocess_data(checkout_train, checkout_test, 'Start')

                                            Trip Id
Start Hour weekday/weekend Start Cluster           
0.0        weekday         0              75.444444
                           1              11.888889
                           2              33.111111
                           3              28.333333
                           4               8.000000
number of missing combinations: 21
number of missing combinations: 74
Train Data Shape:  (1776, 4)
Test Data Shape:  (1776, 4)


In [184]:
checkout_train.head(10)

Unnamed: 0,Start Hour,weekday/weekend,Start Cluster,Number of Checkouts
0,0,weekday,0,75.444444
1,0,weekday,1,11.888889
2,0,weekday,2,33.111111
3,0,weekday,3,28.333333
4,0,weekday,4,8.0
5,0,weekday,5,26.666667
6,0,weekday,6,131.0
7,0,weekday,7,3.333333
8,0,weekday,8,37.0
9,0,weekday,9,67.111111


In [185]:
checkout_test.head(10)

Unnamed: 0,Start Hour,weekday/weekend,Start Cluster,Number of Checkouts
0,0,weekday,0,61.0
1,0,weekday,1,8.0
2,0,weekday,2,30.0
3,0,weekday,3,13.666667
4,0,weekday,4,4.0
5,0,weekday,5,23.666667
6,0,weekday,6,129.333333
7,0,weekday,7,2.333333
8,0,weekday,8,20.0
9,0,weekday,9,46.333333


In [186]:
# One-hot encode the three categorical features (first level of each dropped as baseline).
checkout_feature_cols = ['Start Hour', 'weekday/weekend', 'Start Cluster']
X_train = pd.get_dummies(
    data=checkout_train[checkout_feature_cols],
    columns=checkout_feature_cols,
    drop_first=True,
).to_numpy()
y_train = checkout_train[['Number of Checkouts']].to_numpy()
X_train.shape, y_train.shape

((1776, 60), (1776, 1))

In [187]:
# Same encoding for the test frame. It is encoded independently of the training frame,
# so the columns only line up when both frames contain the same set of levels.
checkout_test_feature_cols = ['Start Hour', 'weekday/weekend', 'Start Cluster']
X_test = pd.get_dummies(
    data=checkout_test[checkout_test_feature_cols],
    columns=checkout_test_feature_cols,
    drop_first=True,
).to_numpy()
y_test = checkout_test[['Number of Checkouts']].to_numpy()
X_test.shape, y_test.shape

((1776, 60), (1776, 1))

In [188]:
# Model the counts on a log scale.
# NOTE(review): the 0.000001 placeholder rows added by fill_in_missing_combinations
# become log(1e-6) ~= -13.8 here, far below any real count - confirm this is intended.
y_train_log, y_test_log = np.log(y_train), np.log(y_test)

In [189]:
# checkout_linreg = LinearRegression().fit(X_train, y_train)
checkout_linreg = LinearRegression().fit(X_train, y_train_log)

In [190]:
checkout_linreg.coef_

array([[-0.67353447, -1.09743333, -1.95667875, -2.03659565, -1.76969739,
        -0.44154157,  0.83406368,  1.46435112,  1.69075391,  1.5950165 ,
         1.7962745 ,  1.97515004,  2.01707051,  2.03723793,  2.12660841,
         2.25294254,  2.43147278,  2.3152312 ,  2.09769656,  1.80693285,
         1.23131999,  0.71749798,  0.54765107, -0.90498889, -1.54246311,
        -0.45657254, -0.54970801, -1.90591524, -0.46788017,  0.48677847,
        -2.53783807, -0.34983208, -0.12329475, -1.8925121 , -2.69637957,
        -0.30524489, -2.70560325,  0.46909629, -1.38410768,  0.24708786,
        -2.71751286, -9.86084518, -1.39797134, -1.43617912, -1.09972395,
        -2.0538852 , -1.62406228, -2.45440312, -1.37544853, -1.19958664,
        -3.65586546, -2.59778914, -0.91044503,  0.17369711, -0.44137941,
        -0.82441245,  0.08431345,  0.32720626,  0.35109525, -3.00641666]])

In [191]:
# import statsmodels.api as sm
# from scipy import stats
# X_train_3 = sm.add_constant(X_train)
# est = sm.OLS(y_train, X_train_3)
# est2 = est.fit()
# print(est2.summary())

In [192]:
# X_train_2 = checkout_train[['Start Hour', 'weekday/weekend', 'Start Cluster']]
# X_train_2 = pd.get_dummies(data=X_train_2, columns=['Start Hour', 'weekday/weekend', 'Start Cluster'], drop_first=True)
# coeffs = pd.concat([pd.DataFrame(X_train_2.columns),pd.DataFrame(np.transpose(checkout_linreg.coef_))], axis = 1)
# coeffs.columns = ['Feature', 'LinReg Coef']
# print(['Feature', 'LinReg Coef'])
# for i, row in coeffs.iterrows():
#   print(row['Feature'], row['LinReg Coef'])

In [193]:
y_train_pred = np.exp(checkout_linreg.predict(X_train))
y_test_pred = np.exp(checkout_linreg.predict(X_test))

In [194]:
# R-squared values
print(checkout_linreg.score(X_train, y_train_log))
print(checkout_linreg.score(X_test, y_test_log))

0.7874589856164292
0.5414259251352735


In [195]:
max(y_train), max(y_test)

(array([2729.77777778]), array([1998.]))

In [196]:
# MSE
# (the `squared=` keyword of mean_squared_error is deprecated and removed in newer
# scikit-learn releases, so RMSE is taken as the square root of the MSE instead)
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print(mse_train)
print(mse_test)
# RMSE
print(np.sqrt(mse_train))
print(np.sqrt(mse_test))
# maximum residual error
print(max_error(y_train, y_train_pred))
print(max_error(y_test, y_test_pred))
# MAE
print(mean_absolute_error(y_train, y_train_pred))
print(mean_absolute_error(y_test, y_test_pred))
# R-squared
print(r2_score(y_train, y_train_pred))
print(r2_score(y_test, y_test_pred))

11804.966738984998
13040.712652213226
108.6506637760902
114.19593973610982
1552.0166636619024
831.8191762229144
45.74270031531615
59.85097064116688
0.7621603245669744
0.5831161615560605


# 3. Compute a Transition Matrix

In [197]:
data_train, data_test = train_test_split(merged_data, 'Start')

(2438720, 29) (1970776, 29) (467944, 29)


In [198]:
data_train.shape

(1970776, 29)

In [199]:
data_train['Start Cluster'].unique()

array([ 0, 35, 16, 34, 14, 30, 12,  2, 10, 31,  1, 33,  6, 29,  9, 32, 19,
       22, 15, 21,  8, 36,  3, 20, 23, 24,  4, 26, 17, 25,  5,  7, 11, 13,
       28, 18, 27])

In [200]:
from_cluster_list = list(np.arange(0, 37))

# Transition matrix: entry [i, j] is the share of training trips starting in cluster i
# that end in cluster j. crosstab counts every (start, end) pair once and
# normalize='index' divides each row by its own total, so rows sum to 1.
# (The previous loop re-normalised the whole matrix on every iteration and wrote
# floats into an int-typed frame; dividing counts by 9 cancels out under row
# normalisation, so it is not needed.)
df_transition_matrix = (
    pd.crosstab(data_train['Start Cluster'], data_train['End Cluster'], normalize='index')
    # guarantee a full 37x37 grid; cluster pairs with no trips get probability 0
    .reindex(index=from_cluster_list, columns=from_cluster_list, fill_value=0.0)
    .rename_axis(index=None, columns=None)
)

In [201]:
df_transition_matrix.replace(np.nan, 0, inplace=True)

In [202]:
sum(df_transition_matrix.iloc[18])

1.0000000000000002

In [203]:
df_transition_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,0.088817,0.02339,0.025316,0.073562,0.007252,0.044845,0.109464,0.005082,0.098633,0.029017,0.000855,0.000451,0.009816,0.000601,0.031469,0.003307,0.062148,0.000122,0.0,0.002809,0.000958,0.004387,0.000413,0.000892,0.002151,0.005768,0.016345,9.4e-05,0.000751,0.009563,0.0849,0.027382,0.013048,0.060345,0.08366,0.067287,0.005101
1,0.08965,0.082856,0.020639,0.079395,0.04209,0.045979,0.075763,0.006709,0.063499,0.018802,0.000427,0.001667,0.005128,0.002436,0.013332,0.004615,0.027733,0.0,0.0,0.000427,0.000214,0.00047,0.000171,0.000983,0.021537,0.047945,0.096402,0.000897,0.012392,0.002692,0.063413,0.013204,0.001966,0.015469,0.102213,0.037219,0.001667
2,0.069891,0.013799,0.086898,0.041862,0.012747,0.071857,0.131503,0.002949,0.082224,0.084414,0.002794,0.002156,0.014472,0.006175,0.062785,0.010384,0.067304,0.001414,5.2e-05,0.003536,0.0,0.000604,0.000793,0.000949,0.00069,0.022216,0.006692,0.000673,0.005002,0.001794,0.05147,0.01047,0.006175,0.015524,0.063268,0.042621,0.001846
3,0.129491,0.035993,0.025833,0.092258,0.017434,0.049105,0.077483,0.016031,0.103201,0.015575,0.000196,0.000489,0.005121,0.000261,0.019097,0.001174,0.047165,0.000114,8.2e-05,0.00062,0.000424,0.000962,0.000196,0.000522,0.008627,0.012982,0.042452,0.001859,0.002593,0.007078,0.069801,0.015819,0.008073,0.031981,0.120929,0.035276,0.003702
4,0.060706,0.069123,0.040162,0.073033,0.090662,0.078799,0.111339,0.001789,0.063159,0.034727,0.000596,0.000464,0.004109,0.001657,0.033998,0.001591,0.058321,6.6e-05,0.000663,0.002054,0.000133,0.000133,6.6e-05,0.000199,0.00444,0.103652,0.027835,0.001789,0.013188,0.000663,0.053947,0.00338,0.000928,0.00623,0.037179,0.017894,0.001325
5,0.079505,0.019725,0.054262,0.053731,0.020013,0.098259,0.156706,0.002941,0.065981,0.047091,0.0005,0.000197,0.009006,0.001925,0.046181,0.002638,0.099882,0.000136,3e-05,0.001744,3e-05,0.000758,0.000212,0.000591,0.001501,0.019634,0.011386,0.000652,0.003654,0.001455,0.053458,0.007672,0.005352,0.008475,0.088966,0.033855,0.001895
6,0.109071,0.014444,0.045332,0.03907,0.011428,0.077166,0.146378,0.002522,0.047718,0.053686,0.001103,0.000795,0.017833,0.001598,0.072394,0.004779,0.083929,0.000279,0.0,0.004263,0.000294,0.002092,0.000215,0.001354,0.000688,0.011306,0.005761,0.000129,0.001333,0.00268,0.066268,0.013678,0.005288,0.020807,0.082503,0.049223,0.002594
7,0.047853,0.010344,0.012329,0.11702,0.004075,0.015568,0.029986,0.139693,0.128931,0.003761,0.000104,0.0,0.002194,0.000313,0.006164,0.000627,0.007418,0.000104,0.0,0.000627,0.000522,0.000627,0.000104,0.000522,0.022673,0.004806,0.176784,0.000104,0.001254,0.00397,0.02424,0.007836,0.002612,0.066451,0.139484,0.020061,0.000836
8,0.114424,0.021311,0.027679,0.08821,0.010597,0.041666,0.064076,0.019468,0.105021,0.013018,0.000236,8.3e-05,0.005505,0.000461,0.015854,0.001949,0.032723,2.4e-05,2.4e-05,0.00065,0.000839,0.002918,0.000201,0.000543,0.005576,0.007891,0.039031,0.00013,0.000721,0.014034,0.069392,0.020862,0.008139,0.070715,0.152298,0.039917,0.003816
9,0.045938,0.005674,0.055292,0.014675,0.006779,0.034168,0.11154,0.000444,0.023208,0.129599,0.008146,0.005059,0.031035,0.007782,0.140274,0.034225,0.113067,0.003065,5.7e-05,0.010402,0.000194,0.001048,0.001903,0.004295,0.000513,0.014219,0.002632,0.000741,0.002267,0.002563,0.049903,0.012065,0.00556,0.016281,0.056214,0.047077,0.002096


# 4. Make check-in predictions
* check-out predictions x transition matrix = check-in predictions

In [204]:
X_test_checkout = checkout_test[['Start Hour', 'weekday/weekend', 'Start Cluster']]

In [205]:
X_test_checkout.shape

(1776, 3)

In [206]:
X_test_checkout.head()

Unnamed: 0,Start Hour,weekday/weekend,Start Cluster
0,0,weekday,0
1,0,weekday,1
2,0,weekday,2
3,0,weekday,3
4,0,weekday,4


In [207]:
y_test_pred

array([[74.64111655],
       [15.96227942],
       [47.28149403],
       ...,
       [72.42560872],
       [74.17661503],
       [ 2.58296259]])

In [208]:
df_predictions = X_test_checkout.copy()
df_predictions['Check-out Predictions'] = y_test_pred
df_predictions['Check-out True Values'] = y_test

In [209]:
df_predictions.head()

Unnamed: 0,Start Hour,weekday/weekend,Start Cluster,Check-out Predictions,Check-out True Values
0,0,weekday,0,74.641117,61.0
1,0,weekday,1,15.962279,8.0
2,0,weekday,2,47.281494,30.0
3,0,weekday,3,43.076754,13.666667
4,0,weekday,4,11.098126,4.0


In [210]:
min(df_predictions['Check-out Predictions'])

0.00020556706958218846

In [211]:
# Spread each predicted check-out count over destination clusters using the
# transition-matrix row of its start cluster (row picked by position).
transition_values = df_transition_matrix.to_numpy()
origin_clusters = df_predictions['Start Cluster'].to_numpy()
checkout_predictions = df_predictions['Check-out Predictions'].to_numpy()

checkin_prediction_list = [
    transition_values[cluster_n] * checkout_prediction
    for cluster_n, checkout_prediction in zip(origin_clusters, checkout_predictions)
]

df_predictions['Check-in Predictions'] = checkin_prediction_list

In [212]:
df_predictions.head()

Unnamed: 0,Start Hour,weekday/weekend,Start Cluster,Check-out Predictions,Check-out True Values,Check-in Predictions
0,0,weekday,0,74.641117,61.0,"[6.629390418555299, 1.7458680213857953, 1.8896..."
1,0,weekday,1,15.962279,8.0,"[1.4310256482455517, 1.3225732754757509, 0.329..."
2,0,weekday,2,47.281494,30.0,"[3.304550396728519, 0.6524285087321854, 4.1086..."
3,0,weekday,3,43.076754,13.666667,"[5.578052238302557, 1.5504737140974492, 1.1128..."
4,0,weekday,4,11.098126,4.0,"[0.6737281165583376, 0.7671380191816006, 0.445..."


In [213]:
# Checkin data
# Mapping over the single column avoids a slow row-wise apply, and .assign builds a
# new frame instead of writing into a slice of merged_data (SettingWithCopyWarning).
checkin_data = checkin_data.assign(**{'weekday/weekend': checkin_data['End Day of Week'].map(check_weekend)})
checkin_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Trip Id,End Station Id,End Time,End Station Name,End Year,End Month,End Hour,End Day of Week,End Holiday,End Lat,End Lon,End Day of Year,End Cluster,weekday/weekend
0,4581278.0,7233.0,2019-01-01 00:33:00,King / Cowan Ave - SMART,2019.0,1.0,0.0,1.0,1.0,43.637922,-79.431734,1,32,weekday
1,4586979.0,7233.0,2019-01-03 17:34:00,King / Cowan Ave - SMART,2019.0,1.0,17.0,3.0,0.0,43.637922,-79.431734,3,32,weekday
2,4591641.0,7233.0,2019-01-04 22:00:00,King / Cowan Ave - SMART,2019.0,1.0,22.0,4.0,0.0,43.637922,-79.431734,4,32,weekday
3,4594328.0,7233.0,2019-01-05 23:40:00,King / Cowan Ave - SMART,2019.0,1.0,23.0,5.0,0.0,43.637922,-79.431734,5,32,weekend
4,4596322.0,7233.0,2019-01-06 22:00:00,King / Cowan Ave - SMART,2019.0,1.0,22.0,6.0,0.0,43.637922,-79.431734,6,32,weekend


In [214]:
checkin_train, checkin_test = train_test_split(checkin_data, 'End')
checkin_train, checkin_test = preprocess_data(checkin_train, checkin_test, 'End')
y_checkin_test = checkin_test[['Number of Checkins']].to_numpy()
y_checkin_test.shape

(2438720, 14) (1970751, 14) (467969, 14)
                                        Trip Id
End Hour weekday/weekend End Cluster           
0.0      weekday         0            75.888889
                         1            17.666667
                         2            55.111111
                         3            53.666667
                         4            12.000000
number of missing combinations: 16
number of missing combinations: 63
Train Data Shape:  (1776, 4)
Test Data Shape:  (1776, 4)


(1776, 1)

In [215]:
hours = np.arange(24)
weekday_weekend = ['weekday', 'weekend']
clusters = np.arange(37)

checkin_predictions_final = []
for hour, day_type in itertools.product(hours, weekday_weekend):
  # rows of df_predictions for this (hour, weekday/weekend) slot: one per start cluster,
  # each holding a vector of predicted arrivals per destination cluster
  in_slot = (df_predictions['Start Hour'] == hour) & (df_predictions['weekday/weekend'] == day_type)
  slot_rows = df_predictions.loc[in_slot, 'Check-in Predictions'].to_numpy()
  if len(slot_rows) < len(clusters):
    raise ValueError('expected {} start clusters for hour={} {}, found {}'.format(
        len(clusters), hour, day_type, len(slot_rows)))
  # stack into a (start cluster x destination cluster) matrix; np.vstack replaces the
  # row-by-row DataFrame.append, which was removed in pandas 2.0
  per_origin = np.vstack(list(slot_rows[:len(clusters)]))
  # compute column sums: total predicted check-ins per destination cluster
  checkin_predictions_final.extend(per_origin.sum(axis=0))

# Add to dataframe (positional: relies on df_predictions and checkin_test sharing
# the same hour / weekday-weekend / cluster ordering)
df_predictions['Check-in True Values'] = y_checkin_test
df_predictions['Check-in Predictions'] = checkin_predictions_final

In [216]:
df_predictions.head()

Unnamed: 0,Start Hour,weekday/weekend,Start Cluster,Check-out Predictions,Check-out True Values,Check-in Predictions,Check-in True Values
0,0,weekday,0,74.641117,61.0,88.866912,64.0
1,0,weekday,1,15.962279,8.0,18.402619,11.0
2,0,weekday,2,47.281494,30.0,34.930505,42.666667
3,0,weekday,3,43.076754,13.666667,45.421163,41.0
4,0,weekday,4,11.098126,4.0,11.065283,7.666667


# 5. Evaluation

In [217]:
def evaluation(y_true, y_pred):
  """Print MSE, RMSE, maximum residual error and MAE for a set of predictions.

  Args:
    y_true: observed values.
    y_pred: predicted values, same shape as `y_true`.

  Returns:
    None; the four metrics are written to stdout.
  """
  # The `squared=` keyword of mean_squared_error is deprecated and removed in newer
  # scikit-learn releases. Per-output MSEs give the same two numbers without it:
  # MSE is their average, RMSE is the average of their square roots.
  per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
  print('MSE: ', np.average(per_output_mse)) # MSE
  print('RMSE: ', np.average(np.sqrt(per_output_mse))) # RMSE
  print('Maximum residual error: ', max_error(y_true, y_pred)) # maximum residual error
  print('MAE: ', mean_absolute_error(y_true, y_pred)) # MAE

In [218]:
checkout_true = df_predictions['Check-out True Values'].to_numpy()
checkout_pred = df_predictions['Check-out Predictions'].to_numpy()
checkin_true = df_predictions['Check-in True Values'].to_numpy()
checkin_pred = df_predictions['Check-in Predictions'].to_numpy()

In [221]:
print('Checkout evaluation:')
evaluation(checkout_true, checkout_pred)
print(min(checkout_true), max(checkout_true))

Checkout evaluation:
MSE:  13040.712652213226
RMSE:  114.19593973610982
Maximum residual error:  831.8191762229144
MAE:  59.85097064116688
1e-06 1998.0


In [222]:
print('Checkin evaluation:')
evaluation(checkin_true, checkin_pred)
print(min(checkin_true), max(checkin_true))

Checkin evaluation:
MSE:  17632.12128109205
RMSE:  132.78599806113616
Maximum residual error:  1832.1061333111147
MAE:  59.48802911934184
1e-06 2385.0
