In [1]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from torch import nn
import torch
import numpy as np
from torch.autograd import Variable 
from sys import *
from subprocess import *
import os

In [None]:
!pip install torch

Collecting torch
  Downloading torch-1.11.0-cp38-none-macosx_10_9_x86_64.whl (129.9 MB)
[K     |████████████████████████████████| 129.9 MB 29 kB/s s eta 0:00:01
Installing collected packages: torch
Successfully installed torch-1.11.0


In [None]:
mobi_data = pd.read_csv("../data/Mobi_System_Data_2020.csv", compression='zip').dropna()
geo_data = pd.read_csv('../data/geocodings.csv',index_col=0)
data = mobi_data.merge(geo_data.rename({'lat':'Departure lat','long':'Departure long'},axis=1),left_on='Departure station',right_on='address').drop('address',axis=1)
data = data.merge(geo_data.rename({'lat':'Return lat','long':'Return long'},axis=1),left_on='Return station',right_on='address').drop('address',axis=1)
data.shape
# data = data.loc[data["Departure postal code"].str.startswith("V6")]
print(data.columns)
data["time"] = pd.to_datetime(data["Departure"], format="%Y-%m-%d %H:%M:%S")
data["hour"] = data["time"].dt.hour
data["day"] = data["time"].dt.dayofweek
data["month"] = data["time"].dt.month
data["year"] = data["time"].dt.year
stations = pd.read_csv("../data/stations.csv")
max_lat, min_lat = stations["lat"].max(), stations["lat"].min()
max_long, min_long = stations["long"].max(), stations["long"].min()

Index(['Unnamed: 0', 'Departure', 'Return', 'Bike', 'Departure station',
       'Return station', 'Membership type', 'Covered distance (m)',
       'Duration (sec.)', 'Departure battery voltage (mV)',
       'Return battery voltage (mV)', 'Departure temperature (C)',
       'Return temperature (C)', 'Stopover duration (sec.)',
       'Number of stopovers', 'postal_code_x', 'Departure lat',
       'Departure long', 'postal_code_y', 'Return lat', 'Return long'],
      dtype='object')


In [None]:
import datetime
def day_of_week(x):
    times = str(x).split('-')
    y = int(times[0])
    m = int(times[1])
    d = int(times[2].split(' ')[0])
    return datetime.datetime(y, m, d).weekday()+1

In [None]:
data['day_of_week'] = data['Departure'].apply(day_of_week)

In [None]:
def generate_time_df(start_time, peroid):
    predict_data = pd.DataFrame(data={
        "month": [], 
        "day": [], 
        "hour": [], 
        "lat": [],
        "long": [],
        "parks": [],
        "stops": [],
        "population": [],
        "station": []})
    for row in stations.iterrows():
        time = pd.date_range(start_time, periods=peroid, freq='H')

        station_data = pd.DataFrame(data={"time": time})
        station_data["hour"] = station_data["time"].dt.hour
        station_data["day"] = station_data["time"].dt.dayofweek
        station_data["month"] = station_data["time"].dt.month
        station_data["lat"] = (row[1]["lat"] - min_lat) / (max_lat - min_lat)
        station_data["long"] = (row[1]["long"] - min_long) / (max_long - min_long)
        station_data["station"] = row[1]["station"]
        station_data["parks"] = row[1]["parks"]
        station_data["stops"] = row[1]["stops"]
        station_data["population"] = row[1]["population"]
        station_data["bike_ways"] = row[1]["bike_ways"]
        station_data = station_data.drop(columns=["time"])
        
        predict_data = pd.concat([predict_data, station_data])
    
    return predict_data


In [None]:
def hour_transf(x):
    if x >= 6 and x < 12:
        return 'Morning'
    elif x >= 12 and x < 18:
        return 'Afternoon'
    elif x >= 18 and x < 24:
        return 'Evening'
    else:
        return 'Wee'

In [None]:
data['period'] = data['hour'].apply(hour_transf)

In [None]:
data['period'].value_counts()

Afternoon    265529
Evening      181508
Morning      111384
Wee           13151
Name: period, dtype: int64

In [None]:
#features_departure = ["month", "day", "hour", "lat", "long", "parks", "stops", "population", "bike_ways"]
counts_data_departure = data.groupby(["month", "day_of_week", "period", "Departure lat", "Departure long", "Departure station"]).size().reset_index(name='counts_departure')
counts_data_departure = counts_data_departure.rename(columns={"Departure lat": "lat", "Departure long": "long", "Departure station": "station"})

counts_data_return = data.groupby(["month", "day_of_week", "period", "Return lat", "Return long", "Return station"]).size().reset_index(name='counts_return')
counts_data_return = counts_data_return.rename(columns={"Return lat": "lat", "Return long": "long", "Return station": "station"})


In [None]:
counts_data_return.shape

(51832, 7)

In [None]:
counts_data_departure.shape

(52510, 7)

In [None]:
merge_data = counts_data_return.merge(counts_data_departure[["month", "day_of_week", "period", "station", "lat", "long", "counts_departure"]], how="right", on=["month", "day_of_week", "period", "station", "lat", "long"])
merge_data.shape
merge_data["counts_return"] = merge_data["counts_return"].fillna(0)

In [None]:
merge_data.head()

Unnamed: 0,month,day_of_week,period,lat,long,station,counts_return,counts_departure
0,1,1,Afternoon,43.390667,-79.763537,0981 Workshop - Service Complete,0.0,2
1,1,1,Afternoon,44.231878,-76.485435,0215 Princess & Union,5.0,2
2,1,1,Afternoon,44.821573,-64.237719,0281 Windsor & 14th,2.0,1
3,1,1,Afternoon,45.514375,-73.81142,0177 Quebec & 1st,17.0,7
4,1,1,Afternoon,49.140196,-122.313343,0192 7th & Alder,4.0,5


In [None]:
# Label of y definition:
# We design the rule for operation transfer bikes out or in or nothing to do:
# 1: out: Return counts much more than depature counts, which means operator needs to transfer bikes from this station
# 2: in: Departure counts much more than return counts, which means operator needs to put bikes to this station
# 0: No need to transfer bikes to this station

In [None]:
def label_cal(x):
    if x['counts_return'] == 0 and x['counts_departure'] == 0:
        return 0
    elif x['counts_return'] == 0 and x['counts_departure'] >= 5:
        return 2
    elif x['counts_return'] == 0 and x['counts_departure'] < 5:
        return 0
    elif x['counts_departure'] == 0 and x['counts_return'] >= 5:
        return 1
    elif x['counts_departure'] == 0 and x['counts_return'] < 5:
        return 0
    elif x['counts_return'] / x['counts_departure'] > 2 and x['counts_return'] - x['counts_departure'] >= 5:
        return 1
    elif x['counts_departure'] / x['counts_return'] > 2 and x['counts_departure'] - x['counts_return'] >= 5:
        return 2
    else:
        return 0

In [None]:
merge_data.isnull().sum()

month               0
day_of_week         0
period              0
lat                 0
long                0
station             0
counts_return       0
counts_departure    0
dtype: int64

In [None]:
merge_data['label'] = merge_data.apply(label_cal,axis=1)

In [None]:
merge_data['label'].value_counts()

0    43510
2     5230
1     3770
Name: label, dtype: int64

In [None]:
merge_data.head()

Unnamed: 0,month,day_of_week,period,lat,long,station,counts_return,counts_departure,label
0,1,1,Afternoon,43.390667,-79.763537,0981 Workshop - Service Complete,0.0,2,0
1,1,1,Afternoon,44.231878,-76.485435,0215 Princess & Union,5.0,2,0
2,1,1,Afternoon,44.821573,-64.237719,0281 Windsor & 14th,2.0,1,0
3,1,1,Afternoon,45.514375,-73.81142,0177 Quebec & 1st,17.0,7,1
4,1,1,Afternoon,49.140196,-122.313343,0192 7th & Alder,4.0,5,0


In [None]:
departure_time_df = generate_time_df('2020-01-01', 8760)

In [None]:
departure_time_df['period'] = departure_time_df['hour'].apply(hour_transf)

In [None]:
departure_time_df['day_of_week'] = departure_time_df['day'] + 1

In [None]:
train_data = departure_time_df[["month", "day_of_week", "period", "station", "bike_ways","parks","stops","population"]].drop_duplicates().merge(merge_data, how="right", on=["month", "day_of_week", "period", "station"])
#train_data["counts"] = train_data["counts"].fillna(0)
train_data.shape

(52510, 13)

In [None]:
train_data.isnull().sum()

month                  0
day_of_week            0
period                 0
station                0
bike_ways           1995
parks               1995
stops               1995
population          1995
lat                    0
long                   0
counts_return          0
counts_departure       0
label                  0
dtype: int64

In [None]:
train_data.dropna().to_csv('data_2020_merged_xgbClassify.csv')

In [25]:
model_data = pd.read_csv('/content/data_2020_merged_xgbClassify.csv')
model_data.shape

(50515, 14)

In [11]:
model_data.head()

Unnamed: 0.1,Unnamed: 0,month,day_of_week,period,station,bike_ways,parks,stops,population,lat,long,counts_return,counts_departure,label
0,4,1.0,1.0,Afternoon,0192 7th & Alder,0.0,0.0,0.0,0.597601,49.140196,-122.313343,4.0,5,0
1,5,1.0,1.0,Afternoon,0282 14th & Fraser,0.16129,0.0,0.16129,0.701333,49.202455,-122.966572,5.0,2,0
2,6,1.0,1.0,Afternoon,0179 11th & Kingsway,0.16129,0.0,0.16129,0.701333,49.214147,-122.938867,0.0,2,0
3,7,1.0,1.0,Afternoon,0198 6th & Main,0.247312,0.0,0.247312,0.701333,49.218888,-122.928148,5.0,4,0
4,8,1.0,1.0,Afternoon,0283 15th & Knight,0.139785,0.5,0.139785,0.624532,49.256496,-123.075426,3.0,1,0


In [29]:
dummy_periods = pd.get_dummies(model_data['period'])
df = pd.merge(
    left=model_data,
    right=dummy_periods,
    left_index=True,
    right_index=True,
)

In [30]:
df = df.drop(['station','period','counts_return','counts_departure','Unnamed: 0'], axis=1)

In [3]:
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,recall_score,precision_score
import pandas as pd

In [21]:
xgbclf = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, num_class=5, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=0.2,
              seed=None, silent=None, subsample=1, verbosity=1)

In [31]:
data_y = df['label']
data_x = df.drop(['label'], axis=1)

In [32]:
X_train, X_validate, y_train, y_validate = train_test_split(data_x, data_y, test_size=0.2, random_state=100)

In [33]:
X_train.shape, X_validate.shape, y_train.shape, y_validate.shape

((40412, 12), (10103, 12), (40412,), (10103,))

In [34]:
#xgbclf_temp = XGBClassifier(**conf.xgbclf_params) 
eval_set_clf = [(X_train,y_train.values),(X_validate, y_validate)]
xgbclf.fit(X_train, y_train.values, eval_set=eval_set_clf, verbose=True, early_stopping_rounds=100)

[0]	validation_0-merror:0.139958	validation_1-merror:0.142433
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 100 rounds.
[1]	validation_0-merror:0.138672	validation_1-merror:0.142235
[2]	validation_0-merror:0.138548	validation_1-merror:0.142433
[3]	validation_0-merror:0.138746	validation_1-merror:0.142136
[4]	validation_0-merror:0.138721	validation_1-merror:0.142136
[5]	validation_0-merror:0.138127	validation_1-merror:0.141938
[6]	validation_0-merror:0.13835	validation_1-merror:0.142037
[7]	validation_0-merror:0.138202	validation_1-merror:0.141245
[8]	validation_0-merror:0.137236	validation_1-merror:0.141245
[9]	validation_0-merror:0.136568	validation_1-merror:0.140156
[10]	validation_0-merror:0.136148	validation_1-merror:0.140156
[11]	validation_0-merror:0.136197	validation_1-merror:0.140057
[12]	validation_0-merror:0.135331	validation_1-merror:0.138969
[13]	validation_0-merror:0.1

XGBClassifier(max_depth=8, num_class=5, objective='multi:softprob',
              scale_pos_weight=0.2)

In [35]:
# feature importance
pd.DataFrame(xgbclf.feature_importances_,index=X_train.columns).sort_values(0,ascending=False)

Unnamed: 0,0
Wee,0.189065
Morning,0.185449
long,0.104864
lat,0.096827
bike_ways,0.092169
Evening,0.070841
population,0.068734
parks,0.062008
Afternoon,0.056901
month,0.037184


In [41]:
test = pd.read_csv('/content/202101_test.csv',index_col=0)
test.shape

(3701, 13)

In [47]:
from sklearn.metrics import classification_report

In [48]:
def print_precison_recall_f1(y_true, y_pre):
    print(classification_report(y_true, y_pre))
    f1 = round(f1_score(y_true, y_pre, average='macro'), 2)
    p = round(precision_score(y_true, y_pre, average='macro'), 2)
    r = round(recall_score(y_true, y_pre, average='macro'), 2)
    print("Precision: {}, Recall: {}, F1: {} ".format(p, r, f1))

In [49]:
from sklearn.metrics import f1_score, precision_score, recall_score
pre_y_test = xgbclf.predict(test.drop(['label'], axis=1))
print_precison_recall_f1(test.label, pre_y_test)

              precision    recall  f1-score   support

           0       0.86      0.94      0.90      3049
           1       0.53      0.22      0.31       297
           2       0.52      0.36      0.43       355

    accuracy                           0.83      3701
   macro avg       0.64      0.51      0.55      3701
weighted avg       0.80      0.83      0.81      3701

Precision: 0.64, Recall: 0.51, F1: 0.55 
