In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import holidays
import itertools
from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, max_error, mean_absolute_error, r2_score

# 1. Import Data

In [5]:
df_data_w_clusters = pd.read_csv('/content/drive/My Drive/MIE498 Thesis/Share-Bike-Station-Clustering-and-Usage-Prediction/station_data_w_clusters.csv', index_col=0)

In [6]:
df_data_w_clusters.head()

Unnamed: 0,station_id,station_na,lat,lon,neighborhood_id,FSA_code,da_id,cluster
0,7021,Bay St / Albert St,43.653264,-79.382458,76,M5G,35204616.0,0
1,7160,King St W / Tecumseth St,43.643333,-79.405556,82,M5V,35204142.0,35
2,7012,Elizabeth St / Edward St (Bus Terminal),43.656026,-79.385327,76,M5G,35204607.0,16
3,7041,Edward St / Yonge St,43.656729,-79.382736,76,M5G,35204608.0,16
4,7275,Queen St W / James St,43.652276,-79.380701,76,M5G,35204616.0,34


In [7]:
# Import ridership data
data_dir = '/content/drive/My Drive/MIE498 Thesis/0_Data'
merged_bike_data_2019 = pd.read_csv("{}/ridership_2019_with_bike_stations_info_20200930.csv".format(data_dir), header=0)
print(merged_bike_data_2019.shape)
merged_bike_data_2019['End Day of Year'] = merged_bike_data_2019.apply(lambda row: datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S").timetuple().tm_yday, axis=1)
print(merged_bike_data_2019.shape)
merged_bike_data_2019.head()

(2438720, 26)
(2438720, 27)


Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,User Type,Bike Id,Start Year,Start Month,Start Hour,Start Day of Week,Start Holiday,End Year,End Month,End Hour,End Day of Week,End Holiday,Start Day of Year,Start Week of Year,Start Lat,Start Lon,End Lat,End Lon,End Day of Year
0,4581278.0,1547.0,7021.0,2019-01-01 00:08:00,Bay St / Albert St,7233.0,2019-01-01 00:33:00,King / Cowan Ave - SMART,Annual Member,1296.0,2019.0,1.0,0.0,1.0,1.0,2019.0,1.0,0.0,1.0,1.0,1.0,1.0,43.653264,-79.382458,43.637922,-79.431734,1
1,4586979.0,1243.0,7021.0,2019-01-03 17:13:00,Bay St / Albert St,7233.0,2019-01-03 17:34:00,King / Cowan Ave - SMART,Annual Member,3168.0,2019.0,1.0,17.0,3.0,0.0,2019.0,1.0,17.0,3.0,0.0,3.0,1.0,43.653264,-79.382458,43.637922,-79.431734,3
2,4591641.0,1112.0,7021.0,2019-01-04 21:42:00,Bay St / Albert St,7233.0,2019-01-04 22:00:00,King / Cowan Ave - SMART,Annual Member,3512.0,2019.0,1.0,21.0,4.0,0.0,2019.0,1.0,22.0,4.0,0.0,4.0,1.0,43.653264,-79.382458,43.637922,-79.431734,4
3,4594328.0,1156.0,7021.0,2019-01-05 23:21:00,Bay St / Albert St,7233.0,2019-01-05 23:40:00,King / Cowan Ave - SMART,Annual Member,212.0,2019.0,1.0,23.0,5.0,0.0,2019.0,1.0,23.0,5.0,0.0,5.0,1.0,43.653264,-79.382458,43.637922,-79.431734,5
4,4596322.0,1068.0,7021.0,2019-01-06 21:42:00,Bay St / Albert St,7233.0,2019-01-06 22:00:00,King / Cowan Ave - SMART,Annual Member,3279.0,2019.0,1.0,21.0,6.0,0.0,2019.0,1.0,22.0,6.0,0.0,6.0,1.0,43.653264,-79.382458,43.637922,-79.431734,6


In [8]:
merged_data = merged_bike_data_2019.copy()
merged_data = merged_data.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='Start Station Id', right_on='station_id').drop('station_id', axis=1)
merged_data.rename({'cluster': 'Start Cluster'}, axis=1, inplace=True)
merged_data = merged_data.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='End Station Id', right_on='station_id').drop('station_id', axis=1)
merged_data.rename({'cluster': 'End Cluster'}, axis=1, inplace=True)

In [9]:
merged_data.isnull().sum()

Trip Id               0
Trip Duration         0
Start Station Id      0
Start Time            0
Start Station Name    0
End Station Id        0
End Time              0
End Station Name      0
User Type             0
Bike Id               0
Start Year            0
Start Month           0
Start Hour            0
Start Day of Week     0
Start Holiday         0
End Year              0
End Month             0
End Hour              0
End Day of Week       0
End Holiday           0
Start Day of Year     0
Start Week of Year    0
Start Lat             0
Start Lon             0
End Lat               0
End Lon               0
End Day of Year       0
Start Cluster         0
End Cluster           0
dtype: int64

In [10]:
merged_data.columns

Index(['Trip Id', 'Trip Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'User Type', 'Bike Id', 'Start Year', 'Start Month', 'Start Hour',
       'Start Day of Week', 'Start Holiday', 'End Year', 'End Month',
       'End Hour', 'End Day of Week', 'End Holiday', 'Start Day of Year',
       'Start Week of Year', 'Start Lat', 'Start Lon', 'End Lat', 'End Lon',
       'End Day of Year', 'Start Cluster', 'End Cluster'],
      dtype='object')

In [11]:
merged_data['Start Time'] = merged_data.apply(lambda row: datetime.strptime(row['Start Time'], "%Y-%m-%d %H:%M:%S"), axis=1)
merged_data['Start Day'] = merged_data.apply(lambda row: row['Start Time'].day, axis=1)

In [12]:
merged_data['End Time'] = merged_data.apply(lambda row: datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S"), axis=1)
merged_data['End Day'] = merged_data.apply(lambda row: row['End Time'].day, axis=1)

In [13]:
merged_data.head()

Unnamed: 0,Trip Id,Trip Duration,Start Station Id,Start Time,Start Station Name,End Station Id,End Time,End Station Name,User Type,Bike Id,Start Year,Start Month,Start Hour,Start Day of Week,Start Holiday,End Year,End Month,End Hour,End Day of Week,End Holiday,Start Day of Year,Start Week of Year,Start Lat,Start Lon,End Lat,End Lon,End Day of Year,Start Cluster,End Cluster,Start Day,End Day
0,4581278.0,1547.0,7021.0,2019-01-01 00:08:00,Bay St / Albert St,7233.0,2019-01-01 00:33:00,King / Cowan Ave - SMART,Annual Member,1296.0,2019.0,1.0,0.0,1.0,1.0,2019.0,1.0,0.0,1.0,1.0,1.0,1.0,43.653264,-79.382458,43.637922,-79.431734,1,0,32,1,1
1,4586979.0,1243.0,7021.0,2019-01-03 17:13:00,Bay St / Albert St,7233.0,2019-01-03 17:34:00,King / Cowan Ave - SMART,Annual Member,3168.0,2019.0,1.0,17.0,3.0,0.0,2019.0,1.0,17.0,3.0,0.0,3.0,1.0,43.653264,-79.382458,43.637922,-79.431734,3,0,32,3,3
2,4591641.0,1112.0,7021.0,2019-01-04 21:42:00,Bay St / Albert St,7233.0,2019-01-04 22:00:00,King / Cowan Ave - SMART,Annual Member,3512.0,2019.0,1.0,21.0,4.0,0.0,2019.0,1.0,22.0,4.0,0.0,4.0,1.0,43.653264,-79.382458,43.637922,-79.431734,4,0,32,4,4
3,4594328.0,1156.0,7021.0,2019-01-05 23:21:00,Bay St / Albert St,7233.0,2019-01-05 23:40:00,King / Cowan Ave - SMART,Annual Member,212.0,2019.0,1.0,23.0,5.0,0.0,2019.0,1.0,23.0,5.0,0.0,5.0,1.0,43.653264,-79.382458,43.637922,-79.431734,5,0,32,5,5
4,4596322.0,1068.0,7021.0,2019-01-06 21:42:00,Bay St / Albert St,7233.0,2019-01-06 22:00:00,King / Cowan Ave - SMART,Annual Member,3279.0,2019.0,1.0,21.0,6.0,0.0,2019.0,1.0,22.0,6.0,0.0,6.0,1.0,43.653264,-79.382458,43.637922,-79.431734,6,0,32,6,6


In [14]:
merged_data.shape

(2438720, 31)

In [15]:
# Import weather data
df_weather = pd.read_csv('/content/drive/My Drive/MIE498 Thesis/Share-Bike-Station-Clustering-and-Usage-Prediction/toronto_weather_2019.csv', index_col=None)
df_weather['Month'] = df_weather['Month'].astype("float64")
df_weather['Day'] = df_weather['Day'].astype("float64")
df_weather['Hour'] = df_weather['Hour'].astype("float64")
df_weather['Day of Week'] = df_weather['Day of Week'].astype("float64")
df_weather.head()

Unnamed: 0,Month,Day,Hour,Day of Week,Temperature (Celsius),Precipitation Amount (mm)
0,1.0,1.0,0.0,1.0,5.1,0.5
1,1.0,1.0,1.0,1.0,6.2,0.5
2,1.0,1.0,2.0,1.0,5.6,0.0
3,1.0,1.0,3.0,1.0,3.2,0.4
4,1.0,1.0,4.0,1.0,2.7,0.0


In [16]:
merged_data = merged_data.merge(df_weather[['Month', 'Day', 'Hour', 'Temperature (Celsius)']], how='left', left_on=['Start Month', 'Start Day', 'Start Hour'], right_on=['Month', 'Day', 'Hour']).drop(['Month', 'Day', 'Hour'], axis=1)
merged_data.rename({'Temperature (Celsius)': 'Start Temp'}, axis=1, inplace=True)

In [17]:
merged_data = merged_data.merge(df_weather[['Month', 'Day', 'Hour', 'Temperature (Celsius)']], how='left', left_on=['End Month', 'End Day', 'End Hour'], right_on=['Month', 'Day', 'Hour']).drop(['Month', 'Day', 'Hour'], axis=1)
merged_data.rename({'Temperature (Celsius)': 'End Temp'}, axis=1, inplace=True)

In [18]:
# check-out and check-in data split
checkout_data = merged_data[['Trip Id', 'Start Station Id', 'Start Time', 'Start Station Name', 'Start Year', 'Start Month', 'Start Hour',
       'Start Day of Week', 'Start Holiday', 'Start Day of Year', 'Start Week of Year', 'Start Lat', 'Start Lon', 'Start Cluster',
       'Start Temp']]
checkin_data = merged_data[['Trip Id', 'End Station Id', 'End Time', 'End Station Name', 'End Year', 'End Month',
       'End Hour', 'End Day of Week', 'End Holiday', 'End Lat', 'End Lon', 'End Day of Year', 'End Cluster', 
       'End Temp']]

# 2. Function and Class Definitions

In [19]:
def check_weekend(dayofweek):
    if dayofweek > 4:
      return 'weekend'
    else:
      return 'weekday'

In [233]:
class BikeDemand:
  def __init__(self, nTransitions, tempflag):
    self.nTransitionMatrix = nTransitions
    self.tempFlag = tempflag

  # Training: Months 1-9, Testing: Months 10-12
  def train_test_split(self, data, start_end_flag):
    training_data = data.loc[(data['{} Month'.format(start_end_flag)] >= 1) & (data['{} Month'.format(start_end_flag)] <= 9)]
    testing_data = data.loc[(data['{} Month'.format(start_end_flag)] >= 10) & (data['{} Month'.format(start_end_flag)] <= 12)]
    print(data.shape, training_data.shape, testing_data.shape)
    return training_data, testing_data
  
  def fill_in_missing_combinations(self, data, start_end_flag):
    # Check missing combinations
    
    hours = np.arange(24)
    # dayofyear = data['{} Day of Year'.format(start_end_flag)].unique()
    weekday_weekend = ['weekday', 'weekend']
    clusters = np.arange(37)

    combinations = itertools.product(hours, weekday_weekend, clusters)
    df_combinations = data[['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag)]].to_numpy().astype('str')
    df_combinations = df_combinations.tolist()
    comb_list = []
    for comb in combinations:
      comb = [str(comb[0]), comb[1], str(comb[2])]
      if comb not in df_combinations:
        # print(comb)
        comb_list.append(comb)
    print('number of missing combinations:', len(comb_list))

    # Fill in missing combinations
    for comb in comb_list:
      hour = int(comb[0])
      weekday_weekend = comb[1]
      cluster = int(comb[2])
      df_select = data[(data['{} Hour'.format(start_end_flag)] == hour) & (data['weekday/weekend'] == weekday_weekend)]
      
      clusters_list = data['{} Cluster'.format(start_end_flag)].unique()
      for c in clusters:
        if c not in clusters_list:
          for temperature in df_select['{} Temp'.format(start_end_flag)].unique():
            for doy in df_select[df_select['{} Temp'.format(start_end_flag)] == temperature]['{} Day of Year'.format(start_end_flag)].unique():
              new_row = {'{} Hour'.format(start_end_flag) : hour, 'weekday/weekend' : weekday_weekend, 
                        '{} Cluster'.format(start_end_flag) : cluster, '{} Day of Year'.format(start_end_flag) : doy,
                        '{} Temp'.format(start_end_flag) : temperature, 'Trip Id': 0.000001}
              data = data.append(new_row, ignore_index = True)
    return data
  
  def preprocess_data(self, train_data, test_data, start_end_flag):
    train_data = train_data[['Trip Id', '{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag), '{} Temp'.format(start_end_flag)]]
    train_data = train_data.groupby(by=['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag), '{} Temp'.format(start_end_flag)]).count()
    # print(train_data.head())
    train_data = train_data.reset_index(level=[0,1,2,3,4])
    train_data['{} Hour'.format(start_end_flag)] = train_data['{} Hour'.format(start_end_flag)].astype("int64")

    test_data = test_data[['Trip Id', '{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag), '{} Temp'.format(start_end_flag)]]
    test_data = test_data.groupby(by=['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag), '{} Temp'.format(start_end_flag)]).count()
    test_data = test_data.reset_index(level=[0,1,2,3,4])
    test_data['{} Hour'.format(start_end_flag)] = test_data['{} Hour'.format(start_end_flag)].astype("int64")

    train_data = self.fill_in_missing_combinations(train_data, start_end_flag)
    test_data = self.fill_in_missing_combinations(test_data, start_end_flag)

    if start_end_flag == 'Start':
      name = 'Number of Checkouts'
    else:
      name = 'Number of Checkins'

    train_data = train_data.sort_values(by=['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag), '{} Temp'.format(start_end_flag)])
    train_data.rename({'Trip Id': name}, axis=1, inplace=True)
    train_data.reset_index(drop=True, inplace=True)

    test_data = test_data.sort_values(by=['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag), '{} Temp'.format(start_end_flag)])
    test_data.rename({'Trip Id': name}, axis=1, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    print('Train Data Shape: ', train_data.shape)
    print('Test Data Shape: ', test_data.shape)

    return train_data, test_data

  def predict_checkout(self, checkout_train, checkout_test):
    X_train = checkout_train[['Start Hour', 'weekday/weekend', 'Start Cluster', 'Start Temp']]
    X_train = pd.get_dummies(data=X_train, columns=['Start Hour', 'weekday/weekend', 'Start Cluster'], drop_first=True).to_numpy()
    y_train = checkout_train[['Number of Checkouts']].to_numpy()

    X_test = checkout_test[['Start Hour', 'weekday/weekend', 'Start Cluster', 'Start Temp']]
    X_test = pd.get_dummies(data=X_test, columns=['Start Hour', 'weekday/weekend', 'Start Cluster'], drop_first=True).to_numpy()
    y_test = checkout_test[['Number of Checkouts']].to_numpy()

    y_train_log, y_test_log = np.log(y_train), np.log(y_test)
    checkout_linreg = LinearRegression().fit(X_train, y_train_log)

    y_train_pred = np.exp(checkout_linreg.predict(X_train))
    y_test_pred = np.exp(checkout_linreg.predict(X_test))

    return X_train, y_train, y_train_pred, X_test, y_test, y_test_pred
  
  def evaluate(self, y_true, y_pred):
    mse = round(mean_squared_error(y_true, y_pred, squared=True),3) # MSE
    rmse = round(mean_squared_error(y_true, y_pred, squared=False),3) # RMSE
    mre = round(max_error(y_true, y_pred),3) # maximum residual error
    mae = round(mean_absolute_error(y_true, y_pred),3) # MAE
    r2 = round(r2_score(y_true, y_pred),3)
    return mse, rmse, mre, mae, r2

  def generate_transition_matrix(self, df_train_select):
    from_cluster_list = list(np.arange(0, 37))
    df_transition_matrix = pd.DataFrame(data=np.zeros((37, 37)), columns = from_cluster_list, index=from_cluster_list)
    df_transition_matrix = df_transition_matrix.astype("int")

    for from_cluster in from_cluster_list:
      data_train_temp = df_train_select[df_train_select['Start Cluster'] == from_cluster]
      df_counts = pd.DataFrame(data_train_temp['End Cluster'].value_counts())
      
      # print(df_counts.head())
      to_cluster_list = list(df_counts.index)
      
      for to_cluster in to_cluster_list:
        cnt = df_counts.loc[to_cluster]['End Cluster']
        df_transition_matrix.loc[from_cluster, to_cluster] = cnt
      
      df_transition_matrix = df_transition_matrix.div(df_transition_matrix.sum(axis=1), axis=0)

    df_transition_matrix.replace(np.nan, 0, inplace=True)
    return df_transition_matrix
  
  def generate_transition_matrix_dict(self, data_train):
    n = self.nTransitionMatrix
    transition_matrix_dict = {}
    if n == 1:
      return self.generate_transition_matrix(data_train)
    
    elif n == 24:
      for hr in data_train['Start Hour'].unique():
        df_train_select = data_train[data_train['Start Hour'] == hr]
        transition_matrix = self.generate_transition_matrix(df_train_select)
        transition_matrix_dict[hr] = transition_matrix
      return transition_matrix_dict
    
    elif n == 48:
      for hr in data_train['Start Hour'].unique():
        for wd in data_train['weekday/weekend'].unique():
          df_train_select = data_train[(data_train['Start Hour'] == hr) & (data_train['weekday/weekend'] == wd)]
          transition_matrix = self.generate_transition_matrix(df_train_select)
          transition_matrix_dict[(hr, wd)] = transition_matrix
      return transition_matrix_dict
  
  def predict_checkin_by_cluster(self, df_predictions, df_transition_matrix):
    checkin_prediction_list = []
    columns_list = ['End Cluster {}'.format(c) for c in np.arange(37)]
    df_checkin = pd.DataFrame(columns = columns_list)

    if self.nTransitionMatrix == 1:
      for idx, row in df_predictions.iterrows():
        if idx % 5000 == 0:
          print(idx)
        checkout_prediction = row['Check-out Predictions']
        cluster_n = row['Start Cluster']
        transition_list = np.array(df_transition_matrix.iloc[cluster_n])
        # check-out predictions x transition matrix = check-in predictions
        checkin_prediction = np.round(transition_list * checkout_prediction, 6)
        checkin_prediction = pd.Series(checkin_prediction, index = df_checkin.columns)
        df_checkin = df_checkin.append(checkin_prediction, ignore_index=True)
    elif self.nTransitionMatrix == 24:
      for idx, row in df_predictions.iterrows():
        if idx % 5000 == 0:
          print(idx)
        checkout_prediction = row['Check-out Predictions']
        cluster_n = row['Start Cluster']
        hour = row['Start Hour']
        transition_list = np.array(df_transition_matrix[hour].iloc[cluster_n])
        # check-out predictions x transition matrix = check-in predictions
        checkin_prediction = np.round(transition_list * checkout_prediction, 6)
        checkin_prediction = pd.Series(checkin_prediction, index = df_checkin.columns)
        df_checkin = df_checkin.append(checkin_prediction, ignore_index=True)
    elif self.nTransitionMatrix == 48:
      for idx, row in df_predictions.iterrows():
        if idx % 5000 == 0:
          print(idx)
        checkout_prediction = row['Check-out Predictions']
        cluster_n = row['Start Cluster']
        hour, weekday = row['Start Hour'], row['weekday/weekend']
        transition_list = np.array(df_transition_matrix[(hour, weekday)].iloc[cluster_n])
        # check-out predictions x transition matrix = check-in predictions
        checkin_prediction = np.round(transition_list * checkout_prediction, 6)
        checkin_prediction = pd.Series(checkin_prediction, index = df_checkin.columns)
        df_checkin = df_checkin.append(checkin_prediction, ignore_index=True)
    
    return pd.concat([df_predictions, df_checkin], axis=1)
  
  def compute_true_checkin(self, checkin_test):
    df = checkin_test[['End Hour', 'weekday/weekend', 'End Temp', 'End Day of Year']].drop_duplicates()
    clusters = np.arange(37)
    df_checkin = pd.DataFrame(columns = ['End Hour','weekday/weekend','End Cluster','End Temp',	'Number of Checkins'])
    for idx, row in df.iterrows():
      hr, wd, temp, edoy = row[0], row[1], row[2], row[3]
      # print(hr, wd, temp)
      df_temp = checkin_test[(checkin_test['End Hour'] == hr) & (checkin_test['weekday/weekend'] == wd) & 
                             (checkin_test['End Day of Year'] == edoy) & (checkin_test['End Temp'] == temp)]
      clusters_list = df_temp['End Cluster'].unique()
      for c in clusters:
        if c not in clusters_list:
          new_row = {'End Hour' : hr, 'weekday/weekend' : wd, 'End Cluster' : c, 'End Temp' : temp, 'End Day of Year' : edoy, 'Number of Checkins': 0.000001}
          df_temp = df_temp.append(new_row, ignore_index = True)
      df_temp.sort_values(by=['End Hour', 'weekday/weekend', 'End Cluster', 'End Day of Year', 'End Temp'], inplace=True)
      df_temp.reset_index(drop=True, inplace=True)
      if len(df_temp) != 37:
        print(df_temp)
      df_checkin = df_checkin.append(df_temp, ignore_index = True)
        
    return df_checkin
  
  def run_prediction_pipeline(self, merged_data, checkout_data, checkin_data):
    print('Checkout predictions')
    checkout_train, checkout_test = self.train_test_split(checkout_data, 'Start')
    checkout_train, checkout_test = self.preprocess_data(checkout_train, checkout_test, 'Start')
    X_train, y_train, y_train_pred, X_test, y_test, y_test_pred = self.predict_checkout(checkout_train, checkout_test)

    print('Checkout training and testing errors')
    mse_train, rmse_train, mre_train, mae_train, r2_train = self.evaluate(y_train, y_train_pred)
    mse_test, rmse_test, mre_test, mae_test, r2_test = self.evaluate(y_test, y_test_pred)
    print('training errors: ', mse_train, rmse_train, mre_train, mae_train, r2_train)
    print('testing errors: ', mse_test, rmse_test, mre_test, mae_test, r2_test)

    print('Transition Matrix Computation')
    data_train, data_test = self.train_test_split(merged_data, 'Start')
    data_train['weekday/weekend'] = data_train.apply(lambda row: check_weekend(row['Start Day of Week']), axis=1)
    df_transition_matrix = self.generate_transition_matrix_dict(data_train)

    print('Summarize checkout predictions')
    df_predictions = checkout_test[['Start Hour', 'weekday/weekend', 'Start Cluster', 'Start Temp']]
    df_predictions['Check-out Predictions'], df_predictions['Check-out True Values'] = y_test_pred, y_test
    df_predictions = self.predict_checkin_by_cluster(df_predictions, df_transition_matrix)

    print('Checkin predictions')
    checkin_train, checkin_test = self.train_test_split(checkin_data, 'End')
    checkin_train, checkin_test = self.preprocess_data(checkin_train, checkin_test, 'End')
    df_checkin = self.compute_true_checkin(checkin_test)
    checkin_pred = []
    df_checkin_row = df_checkin[['End Hour', 'weekday/weekend', 'End Temp', 'End Day of Year']].drop_duplicates()

    for idx, row in df_checkin_row.iterrows():
      hr, wd, temp = row[0], row[1], row[2]
      temp_pred_list = df_predictions[(df_predictions['Start Hour'] == hr) & (df_predictions['weekday/weekend'] == wd) & (df_predictions['Start Temp'] == temp)].iloc[:, 6:].sum(axis=0)
      if len(temp_pred_list) != 37:
        print(len(temp_pred_list))
      checkin_pred += temp_pred_list.to_list()
    df_checkin['Predicted Number of Checkins'] = checkin_pred

    print('Evaluate the predictions')
    checkout_true = df_predictions['Check-out True Values'].to_numpy()
    checkout_pred = df_predictions['Check-out Predictions'].to_numpy()
    checkin_true = df_checkin[['Number of Checkins']].to_numpy()
    checkin_pred = df_checkin[['Predicted Number of Checkins']].to_numpy()

    print('Checkout evaluation:')
    print('mse, rmse, mre, mae, r2: ', self.evaluate(checkout_true, checkout_pred))
    print('Checkin evaluation:')
    print('mse, rmse, mre, mae, r2: ', self.evaluate(checkin_true, checkin_pred))

# 3. Run Pipeline

In [21]:
# Checkout data
checkout_data['weekday/weekend'] = checkout_data.apply(lambda row: check_weekend(row['Start Day of Week']), axis=1)
# Checkin data
checkin_data['weekday/weekend'] = checkin_data.apply(lambda row: check_weekend(row['End Day of Week']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [234]:
bikedemand = BikeDemand(nTransitions=1, tempflag=True)
bikedemand.run_prediction_pipeline(merged_data, checkout_data, checkin_data)

Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 21
number of missing combinations: 74
Train Data Shape:  (154045, 6)
Test Data Shape:  (51015, 6)
Checkout training and testing errors
training errors:  158.251 12.58 299.041 6.02 0.557
testing errors:  109.957 10.486 182.519 4.953 0.414
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 16
number of missing combinations: 63
Train Data Shape:  (152450, 6)
Test Data Shape:  (50390, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae, r2:  (109.957, 10.486, 182.519, 4.953, 0.414)
Checkin evaluation:
mse, rmse, mre, mae, r2:  (73.802, 8.591, 210.806, 3.466, 0.507)


In [235]:
bikedemand = BikeDemand(nTransitions=24, tempflag=True)
bikedemand.run_prediction_pipeline(merged_data, checkout_data, checkin_data)

Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 21
number of missing combinations: 74
Train Data Shape:  (154045, 6)
Test Data Shape:  (51015, 6)
Checkout training and testing errors
training errors:  158.251 12.58 299.041 6.02 0.557
testing errors:  109.957 10.486 182.519 4.953 0.414
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 16
number of missing combinations: 63
Train Data Shape:  (152450, 6)
Test Data Shape:  (50390, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae, r2:  (109.957, 10.486, 182.519, 4.953, 0.414)
Checkin evaluation:
mse, rmse, mre, mae, r2:  (62.403, 7.9, 164.608, 3.261, 0.584)


In [236]:
bikedemand = BikeDemand(nTransitions=48, tempflag=True)
bikedemand.run_prediction_pipeline(merged_data, checkout_data, checkin_data)

Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 21
number of missing combinations: 74
Train Data Shape:  (154045, 6)
Test Data Shape:  (51015, 6)
Checkout training and testing errors
training errors:  158.251 12.58 299.041 6.02 0.557
testing errors:  109.957 10.486 182.519 4.953 0.414
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 16
number of missing combinations: 63
Train Data Shape:  (152450, 6)
Test Data Shape:  (50390, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae, r2:  (109.957, 10.486, 182.519, 4.953, 0.414)
Checkin evaluation:
mse, rmse, mre, mae, r2:  (60.273, 7.764, 161.275, 3.226, 0.598)


In [252]:
df_checkout_errs = pd.DataFrame(columns = ['n Transition Matrix', 'mse', 'rmse', 'mre', 'mae', 'r2'])
df_temp = {'n Transition Matrix' : '1', 'mse':109.957, 'rmse':10.486, 'mre':182.519, 'mae':4.953, 'r2':0.414}
df_checkout_errs = df_checkout_errs.append(df_temp, ignore_index=True)
df_temp = {'n Transition Matrix' : '24', 'mse':109.957, 'rmse':10.486, 'mre':182.519, 'mae':4.953, 'r2':0.414}
df_checkout_errs = df_checkout_errs.append(df_temp, ignore_index=True)
df_temp = {'n Transition Matrix' : '48', 'mse':109.957, 'rmse':10.486, 'mre':182.519, 'mae':4.953, 'r2':0.414}
df_checkout_errs = df_checkout_errs.append(df_temp, ignore_index=True)
df_checkout_errs

Unnamed: 0,n Transition Matrix,mse,rmse,mre,mae,r2
0,1,109.957,10.486,182.519,4.953,0.414
1,24,109.957,10.486,182.519,4.953,0.414
2,48,109.957,10.486,182.519,4.953,0.414


In [253]:
df_checkin_errs = pd.DataFrame(columns = ['n Transition Matrix', 'mse', 'rmse', 'mre', 'mae', 'r2'])
df_temp = {'n Transition Matrix' : '1', 'mse':73.802, 'rmse':8.591, 'mre':210.806, 'mae':3.466, 'r2':0.507}
df_checkin_errs = df_checkin_errs.append(df_temp, ignore_index=True)
df_temp = {'n Transition Matrix' : '24', 'mse':62.403, 'rmse':7.9, 'mre':164.608, 'mae':3.261, 'r2':0.584}
df_checkin_errs = df_checkin_errs.append(df_temp, ignore_index=True)
df_temp = {'n Transition Matrix' : '48', 'mse':60.273, 'rmse':7.764, 'mre':161.275, 'mae':3.226, 'r2':0.598}
df_checkin_errs = df_checkin_errs.append(df_temp, ignore_index=True)
df_checkin_errs

Unnamed: 0,n Transition Matrix,mse,rmse,mre,mae,r2
0,1,73.802,8.591,210.806,3.466,0.507
1,24,62.403,7.9,164.608,3.261,0.584
2,48,60.273,7.764,161.275,3.226,0.598
