In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import holidays
import itertools
from sklearn.linear_model import LinearRegression, Ridge
# from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, max_error, mean_absolute_error, r2_score
import time

# 1. Import Data

In [3]:
def import_ridership_weather_data():
  # Import ridership data
  data_dir = '/content/drive/My Drive/MIE498 Thesis/0_Data'
  merged_bike_data_2019 = pd.read_csv("{}/ridership_2019_with_bike_stations_info_20200930.csv".format(data_dir), header=0)
  print(merged_bike_data_2019.shape)
  merged_bike_data_2019['End Day of Year'] = merged_bike_data_2019.apply(lambda row: datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S").timetuple().tm_yday, axis=1)
  print(merged_bike_data_2019.shape)

  merged_bike_data_2019['Start Time'] = merged_bike_data_2019.apply(lambda row: datetime.strptime(row['Start Time'], "%Y-%m-%d %H:%M:%S"), axis=1)
  merged_bike_data_2019['Start Day'] = merged_bike_data_2019.apply(lambda row: row['Start Time'].day, axis=1)
  merged_bike_data_2019['End Time'] = merged_bike_data_2019.apply(lambda row: datetime.strptime(row['End Time'], "%Y-%m-%d %H:%M:%S"), axis=1)
  merged_bike_data_2019['End Day'] = merged_bike_data_2019.apply(lambda row: row['End Time'].day, axis=1)

  # Import weather data
  df_weather = pd.read_csv('/content/drive/My Drive/MIE498 Thesis/Share-Bike-Station-Clustering-and-Usage-Prediction/toronto_weather_2019.csv', index_col=None)
  df_weather['Month'] = df_weather['Month'].astype("float64")
  df_weather['Day'] = df_weather['Day'].astype("float64")
  df_weather['Hour'] = df_weather['Hour'].astype("float64")
  df_weather['Day of Week'] = df_weather['Day of Week'].astype("float64")

  return merged_bike_data_2019, df_weather

In [4]:
def merge_clustering_data(nClusteringIterations, ridership_data, weather_data):
  merged_data = merged_bike_data_2019.copy()
  # Import cluster-station-assignment data
  df_data_w_clusters = pd.read_csv('/content/drive/My Drive/MIE498 Thesis/Share-Bike-Station-Clustering-and-Usage-Prediction/station_data_w_clusters_{}iterations.csv'.format(nClusteringIterations), index_col=0)
  merged_data = merged_data.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='Start Station Id', right_on='station_id').drop('station_id', axis=1)
  merged_data.rename({'cluster': 'Start Cluster'}, axis=1, inplace=True)
  merged_data = merged_data.merge(df_data_w_clusters[['station_id', 'cluster']], how='left', left_on='End Station Id', right_on='station_id').drop('station_id', axis=1)
  merged_data.rename({'cluster': 'End Cluster'}, axis=1, inplace=True)

  merged_data = merged_data.merge(df_weather[['Month', 'Day', 'Hour', 'Temperature (Celsius)']], how='left', left_on=['Start Month', 'Start Day', 'Start Hour'], right_on=['Month', 'Day', 'Hour']).drop(['Month', 'Day', 'Hour'], axis=1)
  merged_data.rename({'Temperature (Celsius)': 'Start Temp'}, axis=1, inplace=True)
  merged_data = merged_data.merge(df_weather[['Month', 'Day', 'Hour', 'Temperature (Celsius)']], how='left', left_on=['End Month', 'End Day', 'End Hour'], right_on=['Month', 'Day', 'Hour']).drop(['Month', 'Day', 'Hour'], axis=1)
  merged_data.rename({'Temperature (Celsius)': 'End Temp'}, axis=1, inplace=True)

  return merged_data

In [5]:
def check_weekend(dayofweek):
    if dayofweek > 4:
      return 'weekend'
    else:
      return 'weekday'

In [6]:
def filter_checkout_checkin_data(merged_data):
# check-out and check-in data split
  checkout_data = merged_data[['Trip Id', 'Start Station Id', 'Start Time', 'Start Station Name', 'Start Year', 'Start Month', 'Start Hour',
        'Start Day of Week', 'Start Holiday', 'Start Day of Year', 'Start Week of Year', 'Start Lat', 'Start Lon', 'Start Cluster', 'Start Temp']]
  checkout_data['weekday/weekend'] = checkout_data.apply(lambda row: check_weekend(row['Start Day of Week']), axis=1)

  checkin_data = merged_data[['Trip Id', 'End Station Id', 'End Time', 'End Station Name', 'End Year', 'End Month',
        'End Hour', 'End Day of Week', 'End Holiday', 'End Lat', 'End Lon', 'End Day of Year', 'End Cluster', 'End Temp']]
  checkin_data['weekday/weekend'] = checkin_data.apply(lambda row: check_weekend(row['End Day of Week']), axis=1)

  return checkout_data, checkin_data

# 2. Class Definition

In [11]:
class BikeDemand:
  def __init__(self, nTransitions, tempflag):
    self.nTransitionMatrix = nTransitions
    self.tempFlag = tempflag
    self.nClusters = None

  # Training: Months 1-9, Testing: Months 10-12
  def train_test_split(self, data, start_end_flag):
    training_data = data.loc[(data['{} Month'.format(start_end_flag)] >= 1) & (data['{} Month'.format(start_end_flag)] <= 9)]
    testing_data = data.loc[(data['{} Month'.format(start_end_flag)] >= 10) & (data['{} Month'.format(start_end_flag)] <= 12)]
    print(data.shape, training_data.shape, testing_data.shape)
    return training_data, testing_data
  
  def fill_in_missing_combinations(self, data, start_end_flag):
    # Check missing combinations
    
    hours = np.arange(24)
    # dayofyear = data['{} Day of Year'.format(start_end_flag)].unique()
    weekday_weekend = ['weekday', 'weekend']
    clusters = np.arange(self.nClusters)

    combinations = itertools.product(hours, weekday_weekend, clusters)
    features_list = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag)]
    df_combinations = data[features_list].to_numpy().astype('str')
    df_combinations = df_combinations.tolist()
    comb_list = []
    for comb in combinations:
      comb = [str(comb[0]), comb[1], str(comb[2])]
      if comb not in df_combinations:
        # print(comb)
        comb_list.append(comb)
    print('number of missing combinations:', len(comb_list))

    # Fill in missing combinations
    for comb in comb_list:
      hour = int(comb[0])
      weekday_weekend = comb[1]
      cluster = int(comb[2])
      df_select = data[(data['{} Hour'.format(start_end_flag)] == hour) & (data['weekday/weekend'] == weekday_weekend)]
      
      clusters_list = data['{} Cluster'.format(start_end_flag)].unique()
      for c in clusters:
        if c not in clusters_list:
          if self.tempFlag == True:
            for temperature in df_select['{} Temp'.format(start_end_flag)].unique():
              for doy in df_select[df_select['{} Temp'.format(start_end_flag)] == temperature]['{} Day of Year'.format(start_end_flag)].unique():
                new_row = {'{} Hour'.format(start_end_flag) : hour, 'weekday/weekend' : weekday_weekend, 
                            '{} Cluster'.format(start_end_flag) : cluster, '{} Day of Year'.format(start_end_flag) : doy,
                            '{} Temp'.format(start_end_flag) : temperature, 'Trip Id': 0.000001}
                data = data.append(new_row, ignore_index = True)
          else:
            for doy in df_select['{} Day of Year'.format(start_end_flag)].unique():
                new_row = {'{} Hour'.format(start_end_flag) : hour, 'weekday/weekend' : weekday_weekend, 
                            '{} Cluster'.format(start_end_flag) : cluster, '{} Day of Year'.format(start_end_flag) : doy,
                            'Trip Id': 0.000001}
                data = data.append(new_row, ignore_index = True)
    return data
  
  def preprocess_data(self, train_data, test_data, start_end_flag):
    features_list = ['Trip Id', '{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag)]
    grouping_features_list = ['{} Hour'.format(start_end_flag), 'weekday/weekend', '{} Cluster'.format(start_end_flag), '{} Day of Year'.format(start_end_flag)]
    reset_levels_list = [0,1,2,3]
    if self.tempFlag == True:
      features_list += ['{} Temp'.format(start_end_flag)]
      grouping_features_list += ['{} Temp'.format(start_end_flag)]
      reset_levels_list += [4]
    train_data = train_data[features_list]
    train_data = train_data.groupby(by=grouping_features_list).count()
    # print(train_data.head())
    train_data = train_data.reset_index(level=reset_levels_list)
    train_data['{} Hour'.format(start_end_flag)] = train_data['{} Hour'.format(start_end_flag)].astype("int64")

    test_data = test_data[features_list]
    test_data = test_data.groupby(by=grouping_features_list).count()
    test_data = test_data.reset_index(level=reset_levels_list)
    test_data['{} Hour'.format(start_end_flag)] = test_data['{} Hour'.format(start_end_flag)].astype("int64")

    train_data = self.fill_in_missing_combinations(train_data, start_end_flag)
    test_data = self.fill_in_missing_combinations(test_data, start_end_flag)

    if start_end_flag == 'Start':
      name = 'Number of Checkouts'
    else:
      name = 'Number of Checkins'

    train_data = train_data.sort_values(by=grouping_features_list)
    train_data.rename({'Trip Id': name}, axis=1, inplace=True)
    train_data.reset_index(drop=True, inplace=True)

    test_data = test_data.sort_values(by=grouping_features_list)
    test_data.rename({'Trip Id': name}, axis=1, inplace=True)
    test_data.reset_index(drop=True, inplace=True)

    print('Train Data Shape: ', train_data.shape)
    print('Test Data Shape: ', test_data.shape)

    return train_data, test_data

  def predict_checkout(self, checkout_train, checkout_test):
    features_list = ['Start Hour', 'weekday/weekend', 'Start Cluster']
    if self.tempFlag == True:
      features_list += ['Start Temp']
    X_train = checkout_train[features_list]
    X_train = pd.get_dummies(data=X_train, columns=['Start Hour', 'weekday/weekend', 'Start Cluster'], drop_first=True).to_numpy()
    y_train = checkout_train[['Number of Checkouts']].to_numpy()

    X_test = checkout_test[features_list]
    X_test = pd.get_dummies(data=X_test, columns=['Start Hour', 'weekday/weekend', 'Start Cluster'], drop_first=True).to_numpy()
    y_test = checkout_test[['Number of Checkouts']].to_numpy()

    y_train_log, y_test_log = np.log(y_train), np.log(y_test)
    checkout_linreg = LinearRegression().fit(X_train, y_train_log)

    y_train_pred = np.exp(checkout_linreg.predict(X_train))
    y_test_pred = np.exp(checkout_linreg.predict(X_test))

    return X_train, y_train, y_train_pred, X_test, y_test, y_test_pred
  
  def evaluate(self, y_true, y_pred):
    mse = round(mean_squared_error(y_true, y_pred, squared=True),3) # MSE
    rmse = round(mean_squared_error(y_true, y_pred, squared=False),3) # RMSE
    mre = round(max_error(y_true, y_pred),3) # maximum residual error
    mae = round(mean_absolute_error(y_true, y_pred),3) # MAE
    # r2 = round(r2_score(y_true, y_pred),3)
    # return mse, rmse, mre, mae, r2
    return mse, rmse, mre, mae

  def generate_transition_matrix(self, df_train_select):
    from_cluster_list = list(np.arange(0, self.nClusters))
    df_transition_matrix = pd.DataFrame(data=np.zeros((self.nClusters, self.nClusters)), columns = from_cluster_list, index=from_cluster_list)
    df_transition_matrix = df_transition_matrix.astype("int")

    for from_cluster in from_cluster_list:
      data_train_temp = df_train_select[df_train_select['Start Cluster'] == from_cluster]
      df_counts = pd.DataFrame(data_train_temp['End Cluster'].value_counts())
      
      # print(df_counts.head())
      to_cluster_list = list(df_counts.index)
      
      for to_cluster in to_cluster_list:
        cnt = df_counts.loc[to_cluster]['End Cluster']
        df_transition_matrix.loc[from_cluster, to_cluster] = cnt
      
      df_transition_matrix = df_transition_matrix.div(df_transition_matrix.sum(axis=1), axis=0)

    df_transition_matrix.replace(np.nan, 0, inplace=True)
    return df_transition_matrix
  
  def generate_transition_matrix_dict(self, data_train):
    n = self.nTransitionMatrix
    transition_matrix_dict = {}
    if n == 1:
      return self.generate_transition_matrix(data_train)
    
    elif n == 24:
      for hr in data_train['Start Hour'].unique():
        df_train_select = data_train[data_train['Start Hour'] == hr]
        transition_matrix = self.generate_transition_matrix(df_train_select)
        transition_matrix_dict[hr] = transition_matrix
      return transition_matrix_dict
    
    elif n == 48:
      for hr in data_train['Start Hour'].unique():
        for wd in data_train['weekday/weekend'].unique():
          df_train_select = data_train[(data_train['Start Hour'] == hr) & (data_train['weekday/weekend'] == wd)]
          transition_matrix = self.generate_transition_matrix(df_train_select)
          transition_matrix_dict[(hr, wd)] = transition_matrix
      return transition_matrix_dict
  
  def predict_checkin_by_cluster(self, df_predictions, df_transition_matrix):
    checkin_prediction_list = []
    columns_list = ['End Cluster {}'.format(c) for c in np.arange(self.nClusters)]
    df_checkin = pd.DataFrame(columns = columns_list)

    if self.nTransitionMatrix == 1:
      for idx, row in df_predictions.iterrows():
        if idx % 5000 == 0:
          print(idx)
        checkout_prediction = row['Check-out Predictions']
        cluster_n = row['Start Cluster']
        transition_list = np.array(df_transition_matrix.iloc[cluster_n])
        # check-out predictions x transition matrix = check-in predictions
        checkin_prediction = np.round(transition_list * checkout_prediction, 6)
        checkin_prediction = pd.Series(checkin_prediction, index = df_checkin.columns)
        df_checkin = df_checkin.append(checkin_prediction, ignore_index=True)
    elif self.nTransitionMatrix == 24:
      for idx, row in df_predictions.iterrows():
        if idx % 5000 == 0:
          print(idx)
        checkout_prediction = row['Check-out Predictions']
        cluster_n = row['Start Cluster']
        hour = row['Start Hour']
        transition_list = np.array(df_transition_matrix[hour].iloc[cluster_n])
        # check-out predictions x transition matrix = check-in predictions
        checkin_prediction = np.round(transition_list * checkout_prediction, 6)
        checkin_prediction = pd.Series(checkin_prediction, index = df_checkin.columns)
        df_checkin = df_checkin.append(checkin_prediction, ignore_index=True)
    elif self.nTransitionMatrix == 48:
      for idx, row in df_predictions.iterrows():
        if idx % 5000 == 0:
          print(idx)
        checkout_prediction = row['Check-out Predictions']
        cluster_n = row['Start Cluster']
        hour, weekday = row['Start Hour'], row['weekday/weekend']
        transition_list = np.array(df_transition_matrix[(hour, weekday)].iloc[cluster_n])
        # check-out predictions x transition matrix = check-in predictions
        checkin_prediction = np.round(transition_list * checkout_prediction, 6)
        checkin_prediction = pd.Series(checkin_prediction, index = df_checkin.columns)
        df_checkin = df_checkin.append(checkin_prediction, ignore_index=True)
    
    return pd.concat([df_predictions, df_checkin], axis=1)
  
  def compute_true_checkin(self, checkin_test):
    features_list = ['End Hour', 'weekday/weekend', 'End Day of Year']
    data_columns = ['End Hour','weekday/weekend', 'End Cluster', 'Number of Checkins']
    if self.tempFlag == True:
      features_list += ['End Temp']
      data_columns += ['End Temp']
    df = checkin_test[features_list].drop_duplicates()
    clusters = np.arange(self.nClusters)
    df_checkin = pd.DataFrame(columns = data_columns)
    for idx, row in df.iterrows():
      if self.tempFlag == True:
        hr, wd, edoy, temp = row[0], row[1], row[2], row[3]
        df_temp = checkin_test[(checkin_test['End Hour'] == hr) & (checkin_test['weekday/weekend'] == wd) & 
                             (checkin_test['End Day of Year'] == edoy) & (checkin_test['End Temp'] == temp)]
      else: 
        hr, wd, edoy = row[0], row[1], row[2]
        df_temp = checkin_test[(checkin_test['End Hour'] == hr) & (checkin_test['weekday/weekend'] == wd) & 
                             (checkin_test['End Day of Year'] == edoy)]
      
      clusters_list = df_temp['End Cluster'].unique()
      for c in clusters:
        if c not in clusters_list:
          if self.tempFlag == True:
            new_row = {'End Hour' : hr, 'weekday/weekend' : wd, 'End Cluster' : c, 'End Day of Year' : edoy, 'End Temp' : temp, 'Number of Checkins': 0.000001}
          else:
            new_row = {'End Hour' : hr, 'weekday/weekend' : wd, 'End Cluster' : c, 'End Day of Year' : edoy, 'Number of Checkins': 0.000001}
          df_temp = df_temp.append(new_row, ignore_index = True)
      if self.tempFlag == True:
        df_temp.sort_values(by=['End Hour', 'weekday/weekend', 'End Cluster', 'End Day of Year', 'End Temp'], inplace=True)
      else:
        df_temp.sort_values(by=['End Hour', 'weekday/weekend', 'End Cluster', 'End Day of Year'], inplace=True)
      df_temp.reset_index(drop=True, inplace=True)
      if len(df_temp) != self.nClusters:
        print(df_temp)
      df_checkin = df_checkin.append(df_temp, ignore_index = True)
        
    return df_checkin
  
  def run_prediction_pipeline(self, merged_data, checkout_data, checkin_data):
    self.nClusters = len(merged_data['Start Cluster'].unique())
    print(self.nClusters)

    print('Checkout predictions')
    checkout_train, checkout_test = self.train_test_split(checkout_data, 'Start')
    checkout_train, checkout_test = self.preprocess_data(checkout_train, checkout_test, 'Start')
    X_train, y_train, y_train_pred, X_test, y_test, y_test_pred = self.predict_checkout(checkout_train, checkout_test)

    print('Checkout training and testing errors')
    mse_train, rmse_train, mre_train, mae_train = self.evaluate(y_train, y_train_pred)
    mse_test, rmse_test, mre_test, mae_test = self.evaluate(y_test, y_test_pred)
    print('training errors: ', mse_train, rmse_train, mre_train, mae_train)
    print('testing errors: ', mse_test, rmse_test, mre_test, mae_test)

    print('Transition Matrix Computation')
    data_train, data_test = self.train_test_split(merged_data, 'Start')
    data_train['weekday/weekend'] = data_train.apply(lambda row: check_weekend(row['Start Day of Week']), axis=1)
    df_transition_matrix = self.generate_transition_matrix_dict(data_train)

    print('Summarize checkout predictions')
    if self.tempFlag == True:
      df_predictions = checkout_test[['Start Hour', 'weekday/weekend', 'Start Cluster', 'Start Temp']]
    else:
      df_predictions = checkout_test[['Start Hour', 'weekday/weekend', 'Start Cluster']]
    df_predictions['Check-out Predictions'], df_predictions['Check-out True Values'] = y_test_pred, y_test
    df_predictions = self.predict_checkin_by_cluster(df_predictions, df_transition_matrix)

    print('Checkin predictions')
    checkin_train, checkin_test = self.train_test_split(checkin_data, 'End')
    checkin_train, checkin_test = self.preprocess_data(checkin_train, checkin_test, 'End')
    df_checkin = self.compute_true_checkin(checkin_test)
    checkin_pred = []
    
    if self.tempFlag == True:
      df_checkin_row = df_checkin[['End Hour', 'weekday/weekend', 'End Temp', 'End Day of Year']].drop_duplicates()
    else:
      df_checkin_row = df_checkin[['End Hour', 'weekday/weekend', 'End Day of Year']].drop_duplicates()
    
    for idx, row in df_checkin_row.iterrows():
      if self.tempFlag == True:
        hr, wd, temp = row[0], row[1], row[2]
        temp_pred_list = df_predictions[(df_predictions['Start Hour'] == hr) & (df_predictions['weekday/weekend'] == wd) & (df_predictions['Start Temp'] == temp)].iloc[:, 6:].sum(axis=0)
      else:
        hr, wd = row[0], row[1]
        temp_pred_list = df_predictions[(df_predictions['Start Hour'] == hr) & (df_predictions['weekday/weekend'] == wd)].iloc[:, 5:].sum(axis=0)
      
      if len(temp_pred_list) != self.nClusters:
        print(len(temp_pred_list))
      checkin_pred += temp_pred_list.to_list()
    df_checkin['Predicted Number of Checkins'] = checkin_pred

    print('Evaluate the predictions')
    checkout_true = df_predictions['Check-out True Values'].to_numpy()
    checkout_pred = df_predictions['Check-out Predictions'].to_numpy()
    checkin_true = df_checkin[['Number of Checkins']].to_numpy()
    checkin_pred = df_checkin[['Predicted Number of Checkins']].to_numpy()

    print('Checkout evaluation:')
    checkout_mse, checkout_rmse, checkout_mre, checkout_mae = self.evaluate(checkout_true, checkout_pred)
    print('mse, rmse, mre, mae: ', checkout_mse, checkout_rmse, checkout_mre, checkout_mae)
    print('Checkin evaluation:')
    checkin_mse, checkin_rmse, checkin_mre, checkin_mae = self.evaluate(checkin_true, checkin_pred)
    print('mse, rmse, mre, mae: ', checkin_mse, checkin_rmse, checkin_mre, checkin_mae)
    return checkout_mse, checkout_rmse, checkout_mre, checkout_mae, checkin_mse, checkin_rmse, checkin_mre, checkin_mae

# 3. Run Pipeline

In [8]:
merged_bike_data_2019, df_weather = import_ridership_weather_data()

(2438720, 26)
(2438720, 27)


## Clustering Iterations = 1

In [12]:
n_clustering_iterations_list = [1]
n_transition_matrix_list = [1, 24, 48]
temperatureFlag_list = [True, False]
df_checkout_errs = pd.DataFrame(columns = ['n Clustering Iterations', 'n Transition Matrix', 'Temperature', 'mse', 'rmse', 'mre', 'mae'])
df_checkin_errs = pd.DataFrame(columns = ['n Clustering Iterations', 'n Transition Matrix', 'Temperature', 'mse', 'rmse', 'mre', 'mae'])

for n_clustering_iterations in n_clustering_iterations_list:
  merged_data = merge_clustering_data(nClusteringIterations=n_clustering_iterations, ridership_data=merged_bike_data_2019, weather_data=df_weather)
  checkout_data, checkin_data = filter_checkout_checkin_data(merged_data)
  for temperatureFlag in temperatureFlag_list:
    for n_transition_matrix in n_transition_matrix_list:
      print(n_clustering_iterations, temperatureFlag, n_transition_matrix)
      start_time = time.time()
      bikedemand = BikeDemand(nTransitions=n_transition_matrix, tempflag=temperatureFlag)
      checkout_mse, checkout_rmse, checkout_mre, checkout_mae, checkin_mse, checkin_rmse, checkin_mre, checkin_mae = bikedemand.run_prediction_pipeline(merged_data, checkout_data, checkin_data)
      print("Total Time Elapsed: ", time.time() - start_time)

      # Save results
      df_temp = {'n Clustering Iterations' : n_clustering_iterations, 'n Transition Matrix' : bikedemand.nTransitionMatrix, 'Temperature': bikedemand.tempFlag,'mse': checkout_mse, 'rmse': checkout_rmse, 'mre': checkout_mre, 'mae': checkout_mae}
      df_checkout_errs = df_checkout_errs.append(df_temp, ignore_index=True)
      df_temp = {'n Clustering Iterations' : n_clustering_iterations, 'n Transition Matrix' : bikedemand.nTransitionMatrix, 'Temperature': bikedemand.tempFlag,'mse': checkin_mse, 'rmse': checkin_rmse, 'mre': checkin_mre, 'mae': checkin_mae}
      df_checkin_errs = df_checkin_errs.append(df_temp, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


1 True 1
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 14
number of missing combinations: 60
Train Data Shape:  (158529, 6)
Test Data Shape:  (52809, 6)
Checkout training and testing errors
training errors:  148.101 12.17 251.137 5.852
testing errors:  95.145 9.754 154.416 4.758
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 4
number of missing combinations: 55
Train Data Shape:  (157061, 6)
Test Data Shape:  (52197, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  95.145 9.754 154.416 4.758
Checkin evaluation:
mse, rmse, mre, mae:  63.115 7.944 159.067 3.359
Total Time Elapsed:  373.8611693382263
1 True 24
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 14
number of missing combinations: 60
Train Data Shape:  (158529, 6)
Test Data Shape:  (52809, 6)
Checkout training and testing errors
training errors:  148.101 12.17 251.137 5.852
testing errors:  95.145 9.754 154.416 4.758
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 4
number of missing combinations: 55
Train Data Shape:  (157061, 6)
Test Data Shape:  (52197, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  95.145 9.754 154.416 4.758
Checkin evaluation:
mse, rmse, mre, mae:  54.113 7.356 143.145 3.182
Total Time Elapsed:  388.19877791404724
1 True 48
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 14
number of missing combinations: 60
Train Data Shape:  (158529, 6)
Test Data Shape:  (52809, 6)
Checkout training and testing errors
training errors:  148.101 12.17 251.137 5.852
testing errors:  95.145 9.754 154.416 4.758
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 4
number of missing combinations: 55
Train Data Shape:  (157061, 6)
Test Data Shape:  (52197, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  95.145 9.754 154.416 4.758
Checkin evaluation:
mse, rmse, mre, mae:  52.536 7.248 142.612 3.146
Total Time Elapsed:  411.69413447380066
1 False 1
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 14
number of missing combinations: 60
Train Data Shape:  (158529, 5)
Test Data Shape:  (52809, 5)
Checkout training and testing errors
training errors:  242.428 15.57 278.634 8.028
testing errors:  97.134 9.856 159.194 5.284
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 4
number of missing combinations: 55
Train Data Shape:  (157059, 5)
Test Data Shape:  (52197, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  97.134 9.856 159.194 5.284
Checkin evaluation:
mse, rmse, mre, mae:  211352.836 459.731 2428.513 275.041
Total Time Elapsed:  354.89255237579346
1 False 24
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 14
number of missing combinations: 60
Train Data Shape:  (158529, 5)
Test Data Shape:  (52809, 5)
Checkout training and testing errors
training errors:  242.428 15.57 278.634 8.028
testing errors:  97.134 9.856 159.194 5.284
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 4
number of missing combinations: 55
Train Data Shape:  (157059, 5)
Test Data Shape:  (52197, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  97.134 9.856 159.194 5.284
Checkin evaluation:
mse, rmse, mre, mae:  222877.442 472.099 2902.128 275.04
Total Time Elapsed:  367.36137652397156
1 False 48
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 14
number of missing combinations: 60
Train Data Shape:  (158529, 5)
Test Data Shape:  (52809, 5)
Checkout training and testing errors
training errors:  242.428 15.57 278.634 8.028
testing errors:  97.134 9.856 159.194 5.284
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 4
number of missing combinations: 55
Train Data Shape:  (157059, 5)
Test Data Shape:  (52197, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  97.134 9.856 159.194 5.284
Checkin evaluation:
mse, rmse, mre, mae:  227156.16 476.609 2997.88 275.04
Total Time Elapsed:  379.59584498405457


In [13]:
df_checkout_errs

Unnamed: 0,n Clustering Iterations,n Transition Matrix,Temperature,mse,rmse,mre,mae
0,1,1,True,95.145,9.754,154.416,4.758
1,1,24,True,95.145,9.754,154.416,4.758
2,1,48,True,95.145,9.754,154.416,4.758
3,1,1,False,97.134,9.856,159.194,5.284
4,1,24,False,97.134,9.856,159.194,5.284
5,1,48,False,97.134,9.856,159.194,5.284


In [14]:
df_checkin_errs

Unnamed: 0,n Clustering Iterations,n Transition Matrix,Temperature,mse,rmse,mre,mae
0,1,1,True,63.115,7.944,159.067,3.359
1,1,24,True,54.113,7.356,143.145,3.182
2,1,48,True,52.536,7.248,142.612,3.146
3,1,1,False,211352.836,459.731,2428.513,275.041
4,1,24,False,222877.442,472.099,2902.128,275.04
5,1,48,False,227156.16,476.609,2997.88,275.04


## Clustering Iterations = 5, 10

In [15]:
n_clustering_iterations_list = [5, 10]
n_transition_matrix_list = [1, 24, 48]
temperatureFlag_list = [True, False]

for n_clustering_iterations in n_clustering_iterations_list:
  merged_data = merge_clustering_data(nClusteringIterations=n_clustering_iterations, ridership_data=merged_bike_data_2019, weather_data=df_weather)
  checkout_data, checkin_data = filter_checkout_checkin_data(merged_data)
  for temperatureFlag in temperatureFlag_list:
    for n_transition_matrix in n_transition_matrix_list:
      print(n_clustering_iterations, temperatureFlag, n_transition_matrix)
      start_time = time.time()
      bikedemand = BikeDemand(nTransitions=n_transition_matrix, tempflag=temperatureFlag)
      checkout_mse, checkout_rmse, checkout_mre, checkout_mae, checkin_mse, checkin_rmse, checkin_mre, checkin_mae = bikedemand.run_prediction_pipeline(merged_data, checkout_data, checkin_data)
      print("Total Time Elapsed: ", time.time() - start_time)

      # Save results
      df_temp = {'n Clustering Iterations' : n_clustering_iterations, 'n Transition Matrix' : bikedemand.nTransitionMatrix, 'Temperature': bikedemand.tempFlag,'mse': checkout_mse, 'rmse': checkout_rmse, 'mre': checkout_mre, 'mae': checkout_mae}
      df_checkout_errs = df_checkout_errs.append(df_temp, ignore_index=True)
      df_temp = {'n Clustering Iterations' : n_clustering_iterations, 'n Transition Matrix' : bikedemand.nTransitionMatrix, 'Temperature': bikedemand.tempFlag,'mse': checkin_mse, 'rmse': checkin_rmse, 'mre': checkin_mre, 'mae': checkin_mae}
      df_checkin_errs = df_checkin_errs.append(df_temp, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


5 True 1
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 10
number of missing combinations: 53
Train Data Shape:  (159074, 6)
Test Data Shape:  (52957, 6)
Checkout training and testing errors
training errors:  161.483 12.708 226.451 5.92
testing errors:  103.092 10.153 164.487 4.785
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 6
number of missing combinations: 50
Train Data Shape:  (158246, 6)
Test Data Shape:  (52639, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  103.092 10.153 164.487 4.785
Checkin evaluation:
mse, rmse, mre, mae:  68.821 8.296 185.482 3.405
Total Time Elapsed:  369.52971172332764
5 True 24
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 10
number of missing combinations: 53
Train Data Shape:  (159074, 6)
Test Data Shape:  (52957, 6)
Checkout training and testing errors
training errors:  161.483 12.708 226.451 5.92
testing errors:  103.092 10.153 164.487 4.785
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 6
number of missing combinations: 50
Train Data Shape:  (158246, 6)
Test Data Shape:  (52639, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  103.092 10.153 164.487 4.785
Checkin evaluation:
mse, rmse, mre, mae:  56.899 7.543 149.575 3.196
Total Time Elapsed:  407.6066164970398
5 True 48
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 10
number of missing combinations: 53
Train Data Shape:  (159074, 6)
Test Data Shape:  (52957, 6)
Checkout training and testing errors
training errors:  161.483 12.708 226.451 5.92
testing errors:  103.092 10.153 164.487 4.785
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 6
number of missing combinations: 50
Train Data Shape:  (158246, 6)
Test Data Shape:  (52639, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  103.092 10.153 164.487 4.785
Checkin evaluation:
mse, rmse, mre, mae:  54.8 7.403 147.423 3.159
Total Time Elapsed:  402.9452919960022
5 False 1
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 10
number of missing combinations: 53
Train Data Shape:  (159074, 5)
Test Data Shape:  (52957, 5)
Checkout training and testing errors
training errors:  258.22 16.069 257.234 8.043
testing errors:  105.024 10.248 169.438 5.31
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 6
number of missing combinations: 50
Train Data Shape:  (158244, 5)
Test Data Shape:  (52639, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  105.024 10.248 169.438 5.31
Checkin evaluation:
mse, rmse, mre, mae:  211576.858 459.975 2327.125 271.783
Total Time Elapsed:  381.5623435974121
5 False 24
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 10
number of missing combinations: 53
Train Data Shape:  (159074, 5)
Test Data Shape:  (52957, 5)
Checkout training and testing errors
training errors:  258.22 16.069 257.234 8.043
testing errors:  105.024 10.248 169.438 5.31
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 6
number of missing combinations: 50
Train Data Shape:  (158244, 5)
Test Data Shape:  (52639, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  105.024 10.248 169.438 5.31
Checkin evaluation:
mse, rmse, mre, mae:  229196.784 478.745 3629.734 271.783
Total Time Elapsed:  398.45514011383057
5 False 48
38
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 10
number of missing combinations: 53
Train Data Shape:  (159074, 5)
Test Data Shape:  (52957, 5)
Checkout training and testing errors
training errors:  258.22 16.069 257.234 8.043
testing errors:  105.024 10.248 169.438 5.31
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 6
number of missing combinations: 50
Train Data Shape:  (158244, 5)
Test Data Shape:  (52639, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  105.024 10.248 169.438 5.31
Checkin evaluation:
mse, rmse, mre, mae:  233758.549 483.486 3749.187 271.783
Total Time Elapsed:  409.477822303772


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


10 True 1
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 29
number of missing combinations: 81
Train Data Shape:  (152874, 6)
Test Data Shape:  (50563, 6)
Checkout training and testing errors
training errors:  172.293 13.126 272.843 6.122
testing errors:  116.434 10.79 212.083 4.998
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 19
number of missing combinations: 88
Train Data Shape:  (151463, 6)
Test Data Shape:  (49855, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  116.434 10.79 212.083 4.998
Checkin evaluation:
mse, rmse, mre, mae:  76.784 8.763 218.844 3.467
Total Time Elapsed:  375.7968554496765
10 True 24
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 29
number of missing combinations: 81
Train Data Shape:  (152874, 6)
Test Data Shape:  (50563, 6)
Checkout training and testing errors
training errors:  172.293 13.126 272.843 6.122
testing errors:  116.434 10.79 212.083 4.998
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 19
number of missing combinations: 88
Train Data Shape:  (151463, 6)
Test Data Shape:  (49855, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  116.434 10.79 212.083 4.998
Checkin evaluation:
mse, rmse, mre, mae:  64.003 8.0 155.526 3.26
Total Time Elapsed:  376.2665832042694
10 True 48
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 29
number of missing combinations: 81
Train Data Shape:  (152874, 6)
Test Data Shape:  (50563, 6)
Checkout training and testing errors
training errors:  172.293 13.126 272.843 6.122
testing errors:  116.434 10.79 212.083 4.998
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 19
number of missing combinations: 88
Train Data Shape:  (151463, 6)
Test Data Shape:  (49855, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  116.434 10.79 212.083 4.998
Checkin evaluation:
mse, rmse, mre, mae:  61.548 7.845 152.718 3.225
Total Time Elapsed:  383.3414270877838
10 False 1
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 29
number of missing combinations: 81
Train Data Shape:  (152874, 5)
Test Data Shape:  (50563, 5)
Checkout training and testing errors
training errors:  282.37 16.804 305.787 8.364
testing errors:  118.008 10.863 221.787 5.531
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 19
number of missing combinations: 88
Train Data Shape:  (151461, 5)
Test Data Shape:  (49855, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  118.008 10.863 221.787 5.531
Checkin evaluation:
mse, rmse, mre, mae:  234168.925 483.91 2947.519 278.221
Total Time Elapsed:  358.7567901611328
10 False 24
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 29
number of missing combinations: 81
Train Data Shape:  (152874, 5)
Test Data Shape:  (50563, 5)
Checkout training and testing errors
training errors:  282.37 16.804 305.787 8.364
testing errors:  118.008 10.863 221.787 5.531
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 19
number of missing combinations: 88
Train Data Shape:  (151461, 5)
Test Data Shape:  (49855, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  118.008 10.863 221.787 5.531
Checkin evaluation:
mse, rmse, mre, mae:  252019.443 502.015 4802.325 278.221
Total Time Elapsed:  346.6141850948334
10 False 48
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 29
number of missing combinations: 81
Train Data Shape:  (152874, 5)
Test Data Shape:  (50563, 5)
Checkout training and testing errors
training errors:  282.37 16.804 305.787 8.364
testing errors:  118.008 10.863 221.787 5.531
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 19
number of missing combinations: 88
Train Data Shape:  (151461, 5)
Test Data Shape:  (49855, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  118.008 10.863 221.787 5.531
Checkin evaluation:
mse, rmse, mre, mae:  258537.817 508.466 4967.282 278.22
Total Time Elapsed:  378.2541489601135


In [16]:
df_checkout_errs

Unnamed: 0,n Clustering Iterations,n Transition Matrix,Temperature,mse,rmse,mre,mae
0,1,1,True,95.145,9.754,154.416,4.758
1,1,24,True,95.145,9.754,154.416,4.758
2,1,48,True,95.145,9.754,154.416,4.758
3,1,1,False,97.134,9.856,159.194,5.284
4,1,24,False,97.134,9.856,159.194,5.284
5,1,48,False,97.134,9.856,159.194,5.284
6,5,1,True,103.092,10.153,164.487,4.785
7,5,24,True,103.092,10.153,164.487,4.785
8,5,48,True,103.092,10.153,164.487,4.785
9,5,1,False,105.024,10.248,169.438,5.31


In [17]:
df_checkin_errs

Unnamed: 0,n Clustering Iterations,n Transition Matrix,Temperature,mse,rmse,mre,mae
0,1,1,True,63.115,7.944,159.067,3.359
1,1,24,True,54.113,7.356,143.145,3.182
2,1,48,True,52.536,7.248,142.612,3.146
3,1,1,False,211352.836,459.731,2428.513,275.041
4,1,24,False,222877.442,472.099,2902.128,275.04
5,1,48,False,227156.16,476.609,2997.88,275.04
6,5,1,True,68.821,8.296,185.482,3.405
7,5,24,True,56.899,7.543,149.575,3.196
8,5,48,True,54.8,7.403,147.423,3.159
9,5,1,False,211576.858,459.975,2327.125,271.783


## Clustering Iterations = 15, 20

In [19]:
n_clustering_iterations_list = [15, 20]
n_transition_matrix_list = [1, 24, 48]
temperatureFlag_list = [True, False]

for n_clustering_iterations in n_clustering_iterations_list:
  merged_data = merge_clustering_data(nClusteringIterations=n_clustering_iterations, ridership_data=merged_bike_data_2019, weather_data=df_weather)
  checkout_data, checkin_data = filter_checkout_checkin_data(merged_data)
  for temperatureFlag in temperatureFlag_list:
    for n_transition_matrix in n_transition_matrix_list:
      print(n_clustering_iterations, temperatureFlag, n_transition_matrix)
      start_time = time.time()
      bikedemand = BikeDemand(nTransitions=n_transition_matrix, tempflag=temperatureFlag)
      checkout_mse, checkout_rmse, checkout_mre, checkout_mae, checkin_mse, checkin_rmse, checkin_mre, checkin_mae = bikedemand.run_prediction_pipeline(merged_data, checkout_data, checkin_data)
      print("Total Time Elapsed: ", time.time() - start_time)

      # Save results
      df_temp = {'n Clustering Iterations' : n_clustering_iterations, 'n Transition Matrix' : bikedemand.nTransitionMatrix, 'Temperature': bikedemand.tempFlag,'mse': checkout_mse, 'rmse': checkout_rmse, 'mre': checkout_mre, 'mae': checkout_mae}
      df_checkout_errs = df_checkout_errs.append(df_temp, ignore_index=True)
      df_temp = {'n Clustering Iterations' : n_clustering_iterations, 'n Transition Matrix' : bikedemand.nTransitionMatrix, 'Temperature': bikedemand.tempFlag,'mse': checkin_mse, 'rmse': checkin_rmse, 'mre': checkin_mre, 'mae': checkin_mae}
      df_checkin_errs = df_checkin_errs.append(df_temp, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


15 True 1
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 5
number of missing combinations: 49
Train Data Shape:  (154974, 6)
Test Data Shape:  (51515, 6)
Checkout training and testing errors
training errors:  189.229 13.756 332.595 6.086
testing errors:  116.662 10.801 207.699 4.909
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 2
number of missing combinations: 52
Train Data Shape:  (153936, 6)
Test Data Shape:  (51193, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  116.662 10.801 207.699 4.909
Checkin evaluation:
mse, rmse, mre, mae:  79.887 8.938 247.06 3.479
Total Time Elapsed:  356.97666120529175
15 True 24
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 5
number of missing combinations: 49
Train Data Shape:  (154974, 6)
Test Data Shape:  (51515, 6)
Checkout training and testing errors
training errors:  189.229 13.756 332.595 6.086
testing errors:  116.662 10.801 207.699 4.909
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 2
number of missing combinations: 52
Train Data Shape:  (153936, 6)
Test Data Shape:  (51193, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  116.662 10.801 207.699 4.909
Checkin evaluation:
mse, rmse, mre, mae:  65.96 8.122 186.825 3.283
Total Time Elapsed:  346.74535155296326
15 True 48
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 5
number of missing combinations: 49
Train Data Shape:  (154974, 6)
Test Data Shape:  (51515, 6)
Checkout training and testing errors
training errors:  189.229 13.756 332.595 6.086
testing errors:  116.662 10.801 207.699 4.909
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 2
number of missing combinations: 52
Train Data Shape:  (153936, 6)
Test Data Shape:  (51193, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  116.662 10.801 207.699 4.909
Checkin evaluation:
mse, rmse, mre, mae:  63.399 7.962 183.357 3.246
Total Time Elapsed:  376.43824458122253
15 False 1
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 5
number of missing combinations: 49
Train Data Shape:  (154974, 5)
Test Data Shape:  (51515, 5)
Checkout training and testing errors
training errors:  302.581 17.395 373.957 8.264
testing errors:  118.851 10.902 209.011 5.428
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 2
number of missing combinations: 52
Train Data Shape:  (153934, 5)
Test Data Shape:  (51193, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  118.851 10.902 209.011 5.428
Checkin evaluation:
mse, rmse, mre, mae:  240560.536 490.47 3228.748 277.161
Total Time Elapsed:  363.2732548713684
15 False 24
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 5
number of missing combinations: 49
Train Data Shape:  (154974, 5)
Test Data Shape:  (51515, 5)
Checkout training and testing errors
training errors:  302.581 17.395 373.957 8.264
testing errors:  118.851 10.902 209.011 5.428
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 2
number of missing combinations: 52
Train Data Shape:  (153934, 5)
Test Data Shape:  (51193, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  118.851 10.902 209.011 5.428
Checkin evaluation:
mse, rmse, mre, mae:  260352.168 510.247 4843.031 277.161
Total Time Elapsed:  359.9021751880646
15 False 48
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 5
number of missing combinations: 49
Train Data Shape:  (154974, 5)
Test Data Shape:  (51515, 5)
Checkout training and testing errors
training errors:  302.581 17.395 373.957 8.264
testing errors:  118.851 10.902 209.011 5.428
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 2
number of missing combinations: 52
Train Data Shape:  (153934, 5)
Test Data Shape:  (51193, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  118.851 10.902 209.011 5.428
Checkin evaluation:
mse, rmse, mre, mae:  265899.99 515.655 5028.826 277.161
Total Time Elapsed:  353.48617672920227


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


20 True 1
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 4
number of missing combinations: 37
Train Data Shape:  (163889, 6)
Test Data Shape:  (54684, 6)
Checkout training and testing errors
training errors:  133.458 11.552 181.861 5.716
testing errors:  87.172 9.337 144.8 4.614
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 1
number of missing combinations: 32
Train Data Shape:  (162701, 6)
Test Data Shape:  (54111, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  87.172 9.337 144.8 4.614
Checkin evaluation:
mse, rmse, mre, mae:  64.527 8.033 167.594 3.503
Total Time Elapsed:  394.2254157066345
20 True 24
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 4
number of missing combinations: 37
Train Data Shape:  (163889, 6)
Test Data Shape:  (54684, 6)
Checkout training and testing errors
training errors:  133.458 11.552 181.861 5.716
testing errors:  87.172 9.337 144.8 4.614
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 1
number of missing combinations: 32
Train Data Shape:  (162701, 6)
Test Data Shape:  (54111, 6)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  87.172 9.337 144.8 4.614
Checkin evaluation:
mse, rmse, mre, mae:  52.998 7.28 134.344 3.271
Total Time Elapsed:  363.3079426288605
20 True 48
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 4
number of missing combinations: 37
Train Data Shape:  (163889, 6)
Test Data Shape:  (54684, 6)
Checkout training and testing errors
training errors:  133.458 11.552 181.861 5.716
testing errors:  87.172 9.337 144.8 4.614
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 1
number of missing combinations: 32


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Train Data Shape:  (162701, 6)
Test Data Shape:  (54111, 6)
Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  87.172 9.337 144.8 4.614
Checkin evaluation:
mse, rmse, mre, mae:  51.182 7.154 133.798 3.232
Total Time Elapsed:  383.9528148174286
20 False 1
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 4
number of missing combinations: 37
Train Data Shape:  (163889, 5)
Test Data Shape:  (54684, 5)
Checkout training and testing errors
training errors:  217.396 14.744 210.111 7.77
testing errors:  89.02 9.435 149.49 5.14
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 1
number of missing combinations: 32
Train Data Shape:  (162699, 5)
Test Data Shape:  (54111, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  89.02 9.435 149.49 5.14
Checkin evaluation:
mse, rmse, mre, mae:  206011.91 453.885 2144.49 283.961
Total Time Elapsed:  371.59585309028625
20 False 24
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 4
number of missing combinations: 37
Train Data Shape:  (163889, 5)
Test Data Shape:  (54684, 5)
Checkout training and testing errors
training errors:  217.396 14.744 210.111 7.77
testing errors:  89.02 9.435 149.49 5.14
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 1
number of missing combinations: 32
Train Data Shape:  (162699, 5)
Test Data Shape:  (54111, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  89.02 9.435 149.49 5.14
Checkin evaluation:
mse, rmse, mre, mae:  223135.424 472.372 3066.987 283.961
Total Time Elapsed:  384.61481738090515
20 False 48
37
Checkout predictions
(2438720, 16) (1970776, 16) (467944, 16)
number of missing combinations: 4
number of missing combinations: 37
Train Data Shape:  (163889, 5)
Test Data Shape:  (54684, 5)
Checkout training and testing errors
training errors:  217.396 14.744 210.111 7.77
testing errors:  89.02 9.435 149.49 5.14
Transition Matrix Computation
(2438720, 33) (1970776, 33) (467944, 33)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Summarize checkout predictions
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
Checkin predictions
(2438720, 15) (1970751, 15) (467969, 15)
number of missing combinations: 1
number of missing combinations: 32
Train Data Shape:  (162699, 5)
Test Data Shape:  (54111, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Evaluate the predictions
Checkout evaluation:
mse, rmse, mre, mae:  89.02 9.435 149.49 5.14
Checkin evaluation:
mse, rmse, mre, mae:  227390.242 476.855 3160.562 283.961
Total Time Elapsed:  367.3748152256012


In [20]:
df_checkout_errs

Unnamed: 0,n Clustering Iterations,n Transition Matrix,Temperature,mse,rmse,mre,mae
0,1,1,True,95.145,9.754,154.416,4.758
1,1,24,True,95.145,9.754,154.416,4.758
2,1,48,True,95.145,9.754,154.416,4.758
3,1,1,False,97.134,9.856,159.194,5.284
4,1,24,False,97.134,9.856,159.194,5.284
5,1,48,False,97.134,9.856,159.194,5.284
6,5,1,True,103.092,10.153,164.487,4.785
7,5,24,True,103.092,10.153,164.487,4.785
8,5,48,True,103.092,10.153,164.487,4.785
9,5,1,False,105.024,10.248,169.438,5.31


In [21]:
df_checkin_errs

Unnamed: 0,n Clustering Iterations,n Transition Matrix,Temperature,mse,rmse,mre,mae
0,1,1,True,63.115,7.944,159.067,3.359
1,1,24,True,54.113,7.356,143.145,3.182
2,1,48,True,52.536,7.248,142.612,3.146
3,1,1,False,211352.836,459.731,2428.513,275.041
4,1,24,False,222877.442,472.099,2902.128,275.04
5,1,48,False,227156.16,476.609,2997.88,275.04
6,5,1,True,68.821,8.296,185.482,3.405
7,5,24,True,56.899,7.543,149.575,3.196
8,5,48,True,54.8,7.403,147.423,3.159
9,5,1,False,211576.858,459.975,2327.125,271.783


* adding temperature feature helps both checkout and checkin numbers, but much more obvious in checkin numbers
* increasing number of transition matrices improve checkin results
* more iterations for clustering algorithm actually makes both checkout and checkin predictions worse first, then better