In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import holidays
import itertools
import time

In [3]:
def import_ridership_data():
  # Import ridership data
  data_dir = '/content/drive/My Drive/MIE498 Thesis/0_Data'
  merged_bike_data_2019 = pd.read_csv("{}/ridership_2019_with_bike_stations_info_20200930.csv".format(data_dir), header=0)
  print(merged_bike_data_2019.shape)
  merged_bike_data_2019['End Day of Year'] = merged_bike_data_2019.apply(lambda row: datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S").timetuple().tm_yday, axis=1)
  print(merged_bike_data_2019.shape)

  merged_bike_data_2019['Start Time'] = merged_bike_data_2019.apply(lambda row: datetime.strptime(row['Start Time'], "%Y-%m-%d %H:%M:%S"), axis=1)
  merged_bike_data_2019['Start Day'] = merged_bike_data_2019.apply(lambda row: row['Start Time'].day, axis=1)
  merged_bike_data_2019['End Time'] = merged_bike_data_2019.apply(lambda row: datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S"), axis=1)
  merged_bike_data_2019['End Day'] = merged_bike_data_2019.apply(lambda row: row['End Time'].day, axis=1)

  return merged_bike_data_2019

In [26]:
def merge_clustering_data(nClusteringIterations, ridership_data, k2):
  merged_data = merged_bike_data_2019.copy()
  # Import cluster-station-assignment data
  df_data_w_clusters = pd.read_csv('/content/drive/My Drive/MIE498 Thesis/Share-Bike-Station-Clustering-and-Usage-Prediction/clustering_results/station_data_w_clusters_{}iterations_k2={}.csv'.format(nClusteringIterations, k2), index_col=0)
  merged_data = merged_data.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='Start Station Id', right_on='station_id').drop('station_id', axis=1)
  merged_data.rename({'cluster': 'Start Cluster'}, axis=1, inplace=True)
  merged_data = merged_data.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='End Station Id', right_on='station_id').drop('station_id', axis=1)
  merged_data.rename({'cluster': 'End Cluster'}, axis=1, inplace=True)

  return df_data_w_clusters, merged_data

In [5]:
def check_weekend(dayofweek):
    if dayofweek > 4:
      return 'weekend'
    else:
      return 'weekday'

In [6]:
def filter_checkout_checkin_data(merged_data):
# check-out and check-in data split
  checkout_data = merged_data[['Trip Id', 'Start Station Id', 'Start Time', 'Start Station Name', 'Start Year', 'Start Month', 'Start Hour',
        'Start Day of Week', 'Start Holiday', 'Start Day of Year', 'Start Week of Year', 'Start Lat', 'Start Lon', 'Start Cluster']]
  checkout_data['weekday/weekend'] = checkout_data.apply(lambda row: check_weekend(row['Start Day of Week']), axis=1)

  checkin_data = merged_data[['Trip Id', 'End Station Id', 'End Time', 'End Station Name', 'End Year', 'End Month',
        'End Hour', 'End Day of Week', 'End Holiday', 'End Lat', 'End Lon', 'End Day of Year', 'End Cluster']]
  checkin_data['weekday/weekend'] = checkin_data.apply(lambda row: check_weekend(row['End Day of Week']), axis=1)

  return checkout_data, checkin_data

## Computing Objective Function Values For Testing Data Only:

In [15]:
def select_data(checkout_data, checkin_data):
    checkout_test = checkout_data.loc[(checkout_data['Start Month'] >= 10) & (checkout_data['Start Month'] <= 12)]
    checkin_test = checkin_data.loc[(checkin_data['End Month'] >= 10) & (checkin_data['End Month'] <= 12)]
    print(checkout_test.shape, checkin_test.shape)
    return checkout_test, checkin_test

In [19]:
def fill_in_missing_combinations(nClusters, data, start_end_flag):
    # Check missing combinations
    hours = np.arange(24)
    # dayofyear = data['{} Day of Year'.format(start_end_flag)].unique()
    weekday_weekend = ['weekday', 'weekend']
    clusters = np.arange(nClusters)

    combinations = itertools.product(hours, weekday_weekend, clusters)
    features_list = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag)]
    df_combinations = data[features_list].to_numpy().astype('str')
    df_combinations = df_combinations.tolist()
    comb_list = []
    for comb in combinations:
      comb = [str(comb[0]), comb[1], str(comb[2])]
      if comb not in df_combinations:
        # print(comb)
        comb_list.append(comb)
    print('number of missing combinations:', len(comb_list))

    # Fill in missing combinations
    for comb in comb_list:
      hour = int(comb[0])
      weekday_weekend = comb[1]
      cluster = int(comb[2])
      df_select = data[(data['{} Hour'.format(start_end_flag)] == hour) & (data['weekday/weekend'] == weekday_weekend)]
      
      clusters_list = data['{} Cluster'.format(start_end_flag)].unique()
      for c in clusters:
        if c not in clusters_list:
          for doy in df_select['{} Day of Year'.format(start_end_flag)].unique():
              new_row = {'{} Hour'.format(start_end_flag) : hour, 'weekday/weekend' : weekday_weekend, 
                          '{} Cluster'.format(start_end_flag) : cluster, '{} Day of Year'.format(start_end_flag) : doy,
                          'Trip Id': 0.000001}
              data = data.append(new_row, ignore_index = True)
    return data

In [111]:
def generate_transition_matrix_helper(station_id_list, mClusters, data_select, station_cluster_flag):
    if station_cluster_flag == 'cluster':
      from_list = list(np.arange(0, mClusters))
      df_transition_matrix = pd.DataFrame(data=np.zeros((mClusters, mClusters)), columns = from_list, index=from_list)
      column_name = 'Start Cluster'
    else:
      from_list = station_id_list
      nStations = len(station_id_list)
      df_transition_matrix = pd.DataFrame(data=np.zeros((nStations, mClusters)), columns = list(np.arange(0, mClusters)), index=from_list)
      column_name = 'Start Station Id'
    
    df_transition_matrix = df_transition_matrix.astype("int")

    for f in from_list:
      data_select[column_name] = data_select[column_name].astype("int")
      data_temp = data_select[data_select[column_name] == f]

      df_counts = pd.DataFrame(data_temp['End Cluster'].value_counts())
      
      # print(df_counts.head())
      to_cluster_list = list(df_counts.index)
      # print(to_cluster_list)
      for t in to_cluster_list:
        cnt = df_counts.loc[t]['End Cluster']
        df_transition_matrix.loc[f, t] = cnt
      
      df_transition_matrix = df_transition_matrix.div(df_transition_matrix.sum(axis=1), axis=0)

    df_transition_matrix.replace(np.nan, 0, inplace=True)
    return df_transition_matrix

In [112]:
def generate_transition_matrix_dict(station_id_list, mClusters, nTransitionMatrix, data, station_cluster_flag):
    transition_matrix_dict = {}
    if nTransitionMatrix == 1:
      return generate_transition_matrix_helper(station_id_list, mClusters, data, station_cluster_flag)
    
    elif nTransitionMatrix == 24:
      for hr in data['Start Hour'].unique():
        df_train_select = data[data['Start Hour'] == hr]
        transition_matrix = generate_transition_matrix_helper(station_id_list, mClusters, df_train_select, station_cluster_flag)
        transition_matrix_dict[hr] = transition_matrix
      return transition_matrix_dict
    
    elif nTransitionMatrix == 48:
      for hr in data['Start Hour'].unique():
        for wd in data['weekday/weekend'].unique():
          df_train_select = data[(data['Start Hour'] == hr) & (data['weekday/weekend'] == wd)]
          transition_matrix = generate_transition_matrix_helper(station_id_list, mClusters, df_train_select, station_cluster_flag)
          transition_matrix_dict[(hr, wd)] = transition_matrix
      return transition_matrix_dict

In [10]:
merged_bike_data_2019 = import_ridership_data()

(2438720, 26)
(2438720, 27)


## k2=8

In [27]:
df_data_w_clusters, merged_data = merge_clustering_data(nClusteringIterations=1, ridership_data=merged_bike_data_2019, k2=8)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [48]:
merged_data_test = merged_data.loc[(merged_data['Start Month'] >= 10) & (merged_data['Start Month'] <= 12)]
merged_data_test['weekday/weekend'] = merged_data_test.apply(lambda row: check_weekend(row['Start Day of Week']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Index(['Trip Id', 'Trip Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'User Type', 'Bike Id', 'Start Year', 'Start Month', 'Start Hour',
       'Start Day of Week', 'Start Holiday', 'End Year', 'End Month',
       'End Hour', 'End Day of Week', 'End Holiday', 'Start Day of Year',
       'Start Week of Year', 'Start Lat', 'Start Lon', 'End Lat', 'End Lon',
       'End Day of Year', 'Start Day', 'End Day', 'Start Cluster',
       'End Cluster'],
      dtype='object')

In [113]:
clusters_list = list(merged_data['Start Cluster'].unique())
mClusters = len(clusters_list)
station_id_list = list(df_data_w_clusters['station_id'].unique())
nStations = len(station_id_list)

(467944, 15) (467969, 14)


In [96]:
merged_data_test.columns

Index(['Trip Id', 'Trip Duration', 'Start Station Id', 'Start Time',
       'Start Station Name', 'End Station Id', 'End Time', 'End Station Name',
       'User Type', 'Bike Id', 'Start Year', 'Start Month', 'Start Hour',
       'Start Day of Week', 'Start Holiday', 'End Year', 'End Month',
       'End Hour', 'End Day of Week', 'End Holiday', 'Start Day of Year',
       'Start Week of Year', 'Start Lat', 'Start Lon', 'End Lat', 'End Lon',
       'End Day of Year', 'Start Day', 'End Day', 'Start Cluster',
       'End Cluster', 'weekday/weekend'],
      dtype='object')

## Station-Cluster Dictionaries

In [135]:
stationID_cluster = df_data_w_clusters[['station_id', 'cluster']]
cluster_stationID_dict = {k: [] for k in stationID_cluster['cluster'].unique()}

for idx, row in stationID_cluster.iterrows():
  station, cluster = row[0], row[1]
  cluster_stationID_dict[cluster].append(station)

stationID_cluster_dict = dict(zip(stationID_cluster['station_id'], stationID_cluster['cluster']))

## Metric 1

In [107]:
station_cluster_transitions1 = generate_transition_matrix_dict(station_id_list, mClusters, nTransitionMatrix=1, data=merged_data_test, station_cluster_flag='station')
cluster_cluster_transitions1 = generate_transition_matrix_dict(station_id_list, mClusters, nTransitionMatrix=1, data=merged_data_test, station_cluster_flag='cluster')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [108]:
station_cluster_transitions24 = generate_transition_matrix_dict(station_id_list, mClusters, nTransitionMatrix=24, data=merged_data_test, station_cluster_flag='station')
cluster_cluster_transitions24 = generate_transition_matrix_dict(station_id_list, mClusters, nTransitionMatrix=24, data=merged_data_test, station_cluster_flag='cluster')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [170]:
station_cluster_transitions48 = generate_transition_matrix_dict(station_id_list, mClusters, nTransitionMatrix=48, data=merged_data_test, station_cluster_flag='station')
cluster_cluster_transitions48 = generate_transition_matrix_dict(station_id_list, mClusters, nTransitionMatrix=48, data=merged_data_test, station_cluster_flag='cluster')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [110]:
print(len(station_cluster_transitions24), len(cluster_cluster_transitions24))

24 24


In [161]:
# Metric 1: 1 transition matrix
obj_val = 0
for from_station_i in station_id_list:
  from_c = stationID_cluster_dict.get(from_station_i)
  for cluster_j in clusters_list:
    obj_val += abs(station_cluster_transitions1.at[from_station_i, cluster_j] - cluster_cluster_transitions1.at[from_c, cluster_j])
obj_val / (mClusters * nStations)

0.012199692362150415

In [162]:
# Metric 1: 24 transition matrices
nTransitionMatrix = len(station_cluster_transitions24)
obj_val = 0
for tm in range(nTransitionMatrix):
  for from_station_i in station_id_list:    
    from_c = stationID_cluster_dict.get(from_station_i)
    for cluster_j in clusters_list:
      obj_val += abs(station_cluster_transitions24[tm].at[from_station_i, cluster_j] - cluster_cluster_transitions24[tm].at[from_c, cluster_j])
obj_val / (mClusters * nStations)

0.5999016595143108

In [163]:
0.5999016595143108/24

0.024995902479762952

In [178]:
# Metric 1: 48 transition matrices
hours, weekday_weekend = np.arange(24), ['weekday', 'weekend']
combinations = itertools.product(hours, weekday_weekend)
obj_val = 0
for comb in combinations:
  for from_station_i in station_id_list:    
    from_c = stationID_cluster_dict.get(from_station_i)
    for cluster_j in clusters_list:
      obj_val += abs(station_cluster_transitions48[comb].at[from_station_i, cluster_j] - cluster_cluster_transitions48[comb].at[from_c, cluster_j])
obj_val / (mClusters * nStations)

1.3266949672559214

In [180]:
1.3266949672559214 / 48

0.027639478484498364

## Metric 2

In [200]:
def fill_in_missing_combinations(mClusters, station_id_list, data, start_end_flag, station_cluster_flag):
    # Check missing combinations
    hours = np.arange(24)
    # dayofyear = data['{} Day of Year'.format(start_end_flag)].unique()
    weekday_weekend = ['weekday', 'weekend']
    if station_cluster_flag == 'Cluster':
      grouping_feature = np.arange(mClusters)
    else:
      grouping_feature = station_id_list

    combinations = itertools.product(hours, weekday_weekend, grouping_feature)
    features_list = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} {}'.format(start_end_flag, station_cluster_flag)]
    df_combinations = data[features_list].to_numpy().astype('str')
    df_combinations = df_combinations.tolist()
    comb_list = []
    for comb in combinations:
      comb = [str(comb[0]), comb[1], str(comb[2])]
      if comb not in df_combinations:
        # print(comb)
        comb_list.append(comb)
    print('number of missing combinations:', len(comb_list))

    # Fill in missing combinations
    for comb in comb_list:
      hour = int(comb[0])
      weekday_weekend = comb[1]
      grouping_feat = int(comb[2])
      df_select = data[(data['{} Hour'.format(start_end_flag)] == hour) & (data['weekday/weekend'] == weekday_weekend)]
      
      grouping_feature_list = data['{} {}'.format(start_end_flag, station_cluster_flag)].unique()
      for c in grouping_feature:
        if c not in grouping_feature_list:
              new_row = {'{} Hour'.format(start_end_flag) : hour, 'weekday/weekend' : weekday_weekend, 
                          '{} {}'.format(start_end_flag, station_cluster_flag) : grouping_feat,
                          'Trip Id': 0.000001}
              data = data.append(new_row, ignore_index = True)
    return data

In [201]:
def checkout_checkin_volume(mClusters, station_id_list, test_data, start_end_flag, station_cluster_flag):
  features_list = ['Trip Id', '{} Hour'.format(start_end_flag), 'weekday/weekend', '{} {}'.format(start_end_flag, station_cluster_flag)]
  grouping_features_list = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} {}'.format(start_end_flag, station_cluster_flag)]
  reset_levels_list = [0,1,2]

  test_data = test_data[features_list]
  test_data = test_data.groupby(by=grouping_features_list).count()
  test_data = test_data.reset_index(level=reset_levels_list)
  test_data['{} Hour'.format(start_end_flag)] = test_data['{} Hour'.format(start_end_flag)].astype("int64")

  test_data = fill_in_missing_combinations(mClusters, station_id_list, test_data, start_end_flag, station_cluster_flag)

  if start_end_flag == 'Start':
    name = 'Number of Checkouts ({})'.format(station_cluster_flag)
  else:
    name = 'Number of Checkins ({})'.format(station_cluster_flag)

  test_data = test_data.sort_values(by=grouping_features_list)
  test_data.rename({'Trip Id': name}, axis=1, inplace=True)
  test_data.reset_index(drop=True, inplace=True)

  print('Test Data Shape: ', test_data.shape)

  return test_data

In [184]:
checkout_data, checkin_data = filter_checkout_checkin_data(merged_data)
checkout_test, checkin_test = select_data(checkout_data, checkin_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


(467944, 15) (467969, 14)


In [190]:
checkout_test.columns

Index(['Trip Id', 'Start Station Id', 'Start Time', 'Start Station Name',
       'Start Year', 'Start Month', 'Start Hour', 'Start Day of Week',
       'Start Holiday', 'Start Day of Year', 'Start Week of Year', 'Start Lat',
       'Start Lon', 'Start Cluster', 'weekday/weekend'],
      dtype='object')

### Checkout

In [205]:
checkout_volume = checkout_checkin_volume(mClusters=mClusters, station_id_list=station_id_list, test_data=checkout_test, start_end_flag='Start', station_cluster_flag='Station Id')  # 'Station Id', 'Cluster'

number of missing combinations: 22272
Test Data Shape:  (18363, 4)


In [209]:
checkout_volume_with_cluster = checkout_volume.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='Start Station Id', right_on='station_id').drop('station_id', axis=1)
checkout_volume_with_cluster.rename({'cluster': 'Start Cluster'}, axis=1, inplace=True)
checkout_volume_with_cluster

Unnamed: 0,Start Hour,weekday/weekend,Start Station Id,Number of Checkouts (Station Id),Start Cluster
0,0,weekday,7000.0,8,18
1,0,weekday,7002.0,19,6
2,0,weekday,7003.0,10,6
3,0,weekday,7004.0,5,1
4,0,weekday,7005.0,4,20
...,...,...,...,...,...
18358,23,weekend,7502.0,11,1
18359,23,weekend,7505.0,1,13
18360,23,weekend,7508.0,1,11
18361,23,weekend,7509.0,1,13


In [233]:
def generate_checkout_checkin_dict_helper(station_id_list, mClusters, grouping_features, df_select, start_end_flag, station_cluster_flag):
  if start_end_flag == 'Start':
    checkout_checkin_flag = 'Checkouts'
  else:
    checkout_checkin_flag = 'Checkins'

  features_list = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} {}'.format(start_end_flag, station_cluster_flag), 'Number of {} ({})'.format(checkout_checkin_flag, station_cluster_flag)]
  reset_levels_list = list(np.arange(len(grouping_features)))
  # print(reset_levels_list)

  df_select = df_select[features_list]
  df_select = df_select.groupby(by=grouping_features).sum()
  df_select = df_select.reset_index(level=reset_levels_list)
  df_select['{} Hour'.format(start_end_flag)] = df_select['{} Hour'.format(start_end_flag)].astype("int64")

  if start_end_flag == 'Start':
    name = 'Number of Checkouts ({})'.format(station_cluster_flag)
  else:
    name = 'Number of Checkins ({})'.format(station_cluster_flag)

  df_select = df_select.sort_values(by=grouping_features)
  df_select.rename({'Trip Id': name}, axis=1, inplace=True)
  df_select.reset_index(drop=True, inplace=True)
  # print('Data Shape: ', df_select.shape)

  return df_select

In [225]:
def generate_checkout_checkin_dict(station_id_list, mClusters, grouping_features, data, start_end_flag, station_cluster_flag):
    return_dict = {}
    for c in range(mClusters):
      df_select = data[data['{} Cluster'.format(start_end_flag)] == c]
      checkout_checkin = generate_checkout_checkin_dict_helper(station_id_list, mClusters, grouping_features, df_select, start_end_flag, station_cluster_flag)
      return_dict[c] = checkout_checkin
    return return_dict

In [264]:
start_end_flag, station_cluster_flag = 'Start', 'Station Id'
# grouping_features = ['{} Hour'.format(start_end_flag), '{} {}'.format(start_end_flag, station_cluster_flag)]
grouping_features = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} {}'.format(start_end_flag, station_cluster_flag)]
return_dict = generate_checkout_checkin_dict(station_id_list=station_id_list, mClusters=mClusters, grouping_features=grouping_features, data=checkout_volume_with_cluster, start_end_flag=start_end_flag, station_cluster_flag=station_cluster_flag)

In [260]:
df_test1 = return_dict[0].copy()
df_test1['Number of Checkouts (Station Id)'] = return_dict[0]['Number of Checkouts (Station Id)'].div(return_dict[0]['Number of Checkouts (Station Id)'].sum(), axis=0)
df_test1

Unnamed: 0,Start Hour,weekday/weekend,Start Station Id,Number of Checkouts (Station Id)
0,0,weekday,7279.0,0.003636
1,0,weekday,7376.0,0.000519
2,0,weekday,7378.0,0.003636
3,0,weekday,7379.0,0.000519
4,0,weekday,7423.0,0.000519
...,...,...,...,...
244,23,weekday,7378.0,0.006753
245,23,weekday,7379.0,0.003636
246,23,weekday,7423.0,0.005195
247,23,weekend,7378.0,0.000519


In [259]:
df_test2 = return_dict[0].copy()
avg_checkout = df_test2['Number of Checkouts (Station Id)'].sum() / len(df_test2)
avg_checkout

7.730923694779117

In [265]:
# R^48
obj_val = 0
for k in clusters_list:
  stations_list = cluster_stationID_dict.get(k)
  for i in stations_list:
    df_temp = return_dict[k]
    df_temp['Number of Checkouts (Station Id)'] = return_dict[k]['Number of Checkouts (Station Id)'].div(return_dict[k]['Number of Checkouts (Station Id)'].sum(), axis=0)
    avg_checkouts = df_temp['Number of Checkouts (Station Id)'].sum() / len(df_temp)
    for idx, row in df_temp.iterrows():
      n_checkouts = row[-1]  # of station i
      obj_val += abs(n_checkouts - avg_checkouts)
obj_val / (mClusters * nStations)

0.022876067280582536

In [266]:
# R^24
start_end_flag, station_cluster_flag = 'Start', 'Station Id'
grouping_features = ['{} Hour'.format(start_end_flag), '{} {}'.format(start_end_flag, station_cluster_flag)]
return_dict = generate_checkout_checkin_dict(station_id_list=station_id_list, mClusters=mClusters, grouping_features=grouping_features, data=test1, start_end_flag=start_end_flag, station_cluster_flag=station_cluster_flag)

obj_val = 0
for k in clusters_list:
  stations_list = cluster_stationID_dict.get(k)
  for i in stations_list:
    df_temp = return_dict[k]
    df_temp['Number of Checkouts (Station Id)'] = return_dict[k]['Number of Checkouts (Station Id)'].div(return_dict[k]['Number of Checkouts (Station Id)'].sum(), axis=0)
    avg_checkouts = df_temp['Number of Checkouts (Station Id)'].sum() / len(df_temp)
    for idx, row in df_temp.iterrows():
      n_checkouts = row[-1]  # of station i
      obj_val += abs(n_checkouts - avg_checkouts)
obj_val / (mClusters * nStations)

0.02029390999224126

### Checkin

In [211]:
checkin_volume = checkout_checkin_volume(mClusters=mClusters, station_id_list=station_id_list, test_data=checkin_test, start_end_flag='End', station_cluster_flag='Station Id')  # 'Station Id', 'Cluster'
checkin_volume_with_cluster = checkin_volume.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='End Station Id', right_on='station_id').drop('station_id', axis=1)
checkin_volume_with_cluster.rename({'cluster': 'End Cluster'}, axis=1, inplace=True)
checkin_volume_with_cluster

number of missing combinations: 22272
Test Data Shape:  (18338, 4)


Unnamed: 0,End Hour,weekday/weekend,End Station Id,Number of Checkins (Station Id),End Cluster
0,0,weekday,7000.0,20,18
1,0,weekday,7001.0,7,9
2,0,weekday,7002.0,14,6
3,0,weekday,7003.0,6,6
4,0,weekday,7004.0,3,1
...,...,...,...,...,...
18333,23,weekend,7502.0,3,1
18334,23,weekend,7505.0,4,13
18335,23,weekend,7508.0,5,11
18336,23,weekend,7509.0,1,13


In [271]:
checkin_volume = checkout_checkin_volume(mClusters=mClusters, station_id_list=station_id_list, test_data=checkin_test, start_end_flag='End', station_cluster_flag='Station Id')

number of missing combinations: 22272
Test Data Shape:  (18338, 4)


In [275]:
checkin_volume_with_cluster = checkin_volume.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='End Station Id', right_on='station_id').drop('station_id', axis=1)
checkin_volume_with_cluster.rename({'cluster': 'End Cluster'}, axis=1, inplace=True)
checkin_volume_with_cluster

Unnamed: 0,End Hour,weekday/weekend,End Station Id,Number of Checkins (Station Id),End Cluster
0,0,weekday,7000.0,20,18
1,0,weekday,7001.0,7,9
2,0,weekday,7002.0,14,6
3,0,weekday,7003.0,6,6
4,0,weekday,7004.0,3,1
...,...,...,...,...,...
18333,23,weekend,7502.0,3,1
18334,23,weekend,7505.0,4,13
18335,23,weekend,7508.0,5,11
18336,23,weekend,7509.0,1,13


In [276]:
# R^48
start_end_flag, station_cluster_flag = 'End', 'Station Id'
grouping_features = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} {}'.format(start_end_flag, station_cluster_flag)]
return_dict = generate_checkout_checkin_dict(station_id_list=station_id_list, mClusters=mClusters, grouping_features=grouping_features, data=checkin_volume_with_cluster, start_end_flag=start_end_flag, station_cluster_flag=station_cluster_flag)

obj_val = 0
for k in clusters_list:
  stations_list = cluster_stationID_dict.get(k)
  for i in stations_list:
    df_temp = return_dict[k]
    df_temp['Number of Checkins (Station Id)'] = return_dict[k]['Number of Checkins (Station Id)'].div(return_dict[k]['Number of Checkins (Station Id)'].sum(), axis=0)
    avg_checkoins = df_temp['Number of Checkins (Station Id)'].sum() / len(df_temp)
    for idx, row in df_temp.iterrows():
      n_checkins = row[-1]  # of station i
      obj_val += abs(n_checkins - avg_checkoins)
obj_val / (mClusters * nStations)

0.023338219480092355

In [277]:
# R^24
start_end_flag, station_cluster_flag = 'End', 'Station Id'
grouping_features = ['{} Hour'.format(start_end_flag), '{} {}'.format(start_end_flag, station_cluster_flag)]
return_dict = generate_checkout_checkin_dict(station_id_list=station_id_list, mClusters=mClusters, grouping_features=grouping_features, data=checkin_volume_with_cluster, start_end_flag=start_end_flag, station_cluster_flag=station_cluster_flag)

obj_val = 0
for k in clusters_list:
  stations_list = cluster_stationID_dict.get(k)
  for i in stations_list:
    df_temp = return_dict[k]
    df_temp['Number of Checkins (Station Id)'] = return_dict[k]['Number of Checkins (Station Id)'].div(return_dict[k]['Number of Checkins (Station Id)'].sum(), axis=0)
    avg_checkoins = df_temp['Number of Checkins (Station Id)'].sum() / len(df_temp)
    for idx, row in df_temp.iterrows():
      n_checkins = row[-1]  # of station i
      obj_val += abs(n_checkins - avg_checkoins)
obj_val / (mClusters * nStations)

0.02114700677277065