In [133]:
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from torch import nn
import torch
import numpy as np
from torch.autograd import Variable 
from sys import *
from subprocess import *
import os

In [134]:
mobi_data = pd.read_csv("../data/Mobi_System_Data_2020.csv", compression='zip').dropna()
geo_data = pd.read_csv('../data/geocodings.csv',index_col=0)
data = mobi_data.merge(geo_data.rename({'lat':'Departure lat','long':'Departure long'},axis=1),left_on='Departure station',right_on='address').drop('address',axis=1)
data = data.merge(geo_data.rename({'lat':'Return lat','long':'Return long'},axis=1),left_on='Return station',right_on='address').drop('address',axis=1)
data.shape
# data = data.loc[data["Departure postal code"].str.startswith("V6")]
print(data.columns)
data["time"] = pd.to_datetime(data["Departure"], format="%Y-%m-%d %H:%M:%S")
data["hour"] = data["time"].dt.hour
data["day"] = data["time"].dt.day
data["month"] = data["time"].dt.month
data["year"] = data["time"].dt.year
data["temperature"] = data["Departure temperature (C)"] + data['Return temperature (C)'] / 2
stations = pd.read_csv("../data/stations.csv")
max_lat, min_lat = stations["lat"].max(), stations["lat"].min()
max_long, min_long = stations["long"].max(), stations["long"].min()

Index(['Unnamed: 0', 'Departure', 'Return', 'Bike', 'Departure station',
       'Return station', 'Membership type', 'Covered distance (m)',
       'Duration (sec.)', 'Departure battery voltage (mV)',
       'Return battery voltage (mV)', 'Departure temperature (C)',
       'Return temperature (C)', 'Stopover duration (sec.)',
       'Number of stopovers', 'postal_code_x', 'Departure lat',
       'Departure long', 'postal_code_y', 'Return lat', 'Return long'],
      dtype='object')


In [135]:
import datetime
def day_of_week(x):
    times = str(x).split('-')
    y = int(times[0])
    m = int(times[1])
    d = int(times[2].split(' ')[0])
    return datetime.datetime(y, m, d).weekday()+1

In [136]:
data['day_of_week'] = data['Departure'].apply(day_of_week)

In [137]:
def generate_time_df(start_time, peroid):
    predict_data = pd.DataFrame(data={
        "month": [], 
        "day": [], 
        "hour": [], 
        "lat": [],
        "long": [],
        "parks": [],
        "stops": [],
        "population": [],
        "station": []})
    for row in stations.iterrows():
        time = pd.date_range(start_time, periods=peroid, freq='H')

        station_data = pd.DataFrame(data={"time": time})
        station_data["hour"] = station_data["time"].dt.hour
        station_data["day"] = station_data["time"].dt.dayofweek
        station_data["month"] = station_data["time"].dt.month
        station_data["lat"] = (row[1]["lat"] - min_lat) / (max_lat - min_lat)
        station_data["long"] = (row[1]["long"] - min_long) / (max_long - min_long)
        station_data["station"] = row[1]["station"]
        station_data["parks"] = row[1]["parks"]
        station_data["stops"] = row[1]["stops"]
        station_data["population"] = row[1]["population"]
        station_data["bike_ways"] = row[1]["bike_ways"]
        station_data = station_data.drop(columns=["time"])
        
        predict_data = pd.concat([predict_data, station_data])
    
    return predict_data

In [138]:
def hour_transf(x):
    if x >= 6 and x < 12:
        return 'Morning'
    elif x >= 12 and x < 18:
        return 'Afternoon'
    elif x >= 18 and x < 24:
        return 'Evening'
    else:
        return 'Wee'

In [139]:
data['period'] = data['hour'].apply(hour_transf)

In [140]:
data['period'].value_counts()

Afternoon    265529
Evening      181508
Morning      111384
Wee           13151
Name: period, dtype: int64

In [141]:
#features_departure = ["month", "day", "hour", "lat", "long", "parks", "stops", "population", "bike_ways"]
counts_data_departure = data.groupby(["month", "day_of_week", "period", "Departure lat", "Departure long", "Departure station"]).size().reset_index(name='counts_departure')
counts_data_departure = counts_data_departure.rename(columns={"Departure lat": "lat", "Departure long": "long", "Departure station": "station"})
temp = data.groupby(["month", "day_of_week", "period", "Departure lat", "Departure long", "Departure station"])["temperature"].mean().reset_index(name="temperature")
counts_data_departure["temperature"] = temp["temperature"]

counts_data_return = data.groupby(["month", "day_of_week", "period", "Return lat", "Return long", "Return station"]).size().reset_index(name='counts_return')
counts_data_return = counts_data_return.rename(columns={"Return lat": "lat", "Return long": "long", "Return station": "station"})

In [142]:
print(counts_data_departure.shape, counts_data_return.shape)

(52510, 8) (51832, 7)


In [143]:
merge_data = counts_data_return.merge(counts_data_departure[["month", "day_of_week", "period", "temperature", "station", "lat", "long", "counts_departure"]], how="right", on=["month", "day_of_week", "period", "station", "lat", "long"])
merge_data.shape
merge_data["counts_return"] = merge_data["counts_return"].fillna(0)

In [144]:
merge_data.head()

Unnamed: 0,month,day_of_week,period,lat,long,station,counts_return,temperature,counts_departure
0,1,1,Afternoon,43.390667,-79.763537,0981 Workshop - Service Complete,0.0,30.0,2
1,1,1,Afternoon,44.231878,-76.485435,0215 Princess & Union,5.0,14.75,2
2,1,1,Afternoon,44.821573,-64.237719,0281 Windsor & 14th,2.0,15.0,1
3,1,1,Afternoon,45.514375,-73.81142,0177 Quebec & 1st,17.0,12.071429,7
4,1,1,Afternoon,49.140196,-122.313343,0192 7th & Alder,4.0,15.3,5


In [145]:
def label_cal(x):
    if x['counts_return'] == 0 and x['counts_departure'] == 0:
        return 0
    elif x['counts_return'] == 0 and x['counts_departure'] >= 5:
        return 2
    elif x['counts_return'] == 0 and x['counts_departure'] < 5:
        return 0
    elif x['counts_departure'] == 0 and x['counts_return'] >= 5:
        return 1
    elif x['counts_departure'] == 0 and x['counts_return'] < 5:
        return 0
    elif x['counts_return'] / x['counts_departure'] > 2 and x['counts_return'] - x['counts_departure'] >= 5:
        return 1
    elif x['counts_departure'] / x['counts_return'] > 2 and x['counts_departure'] - x['counts_return'] >= 5:
        return 2
    else:
        return 0

In [146]:
merge_data.isnull().sum()

month               0
day_of_week         0
period              0
lat                 0
long                0
station             0
counts_return       0
temperature         0
counts_departure    0
dtype: int64

In [147]:
merge_data['label'] = merge_data.apply(label_cal,axis=1)

In [148]:
merge_data['label'].value_counts()

0    43510
2     5230
1     3770
Name: label, dtype: int64

In [149]:
merge_data.head()

Unnamed: 0,month,day_of_week,period,lat,long,station,counts_return,temperature,counts_departure,label
0,1,1,Afternoon,43.390667,-79.763537,0981 Workshop - Service Complete,0.0,30.0,2,0
1,1,1,Afternoon,44.231878,-76.485435,0215 Princess & Union,5.0,14.75,2,0
2,1,1,Afternoon,44.821573,-64.237719,0281 Windsor & 14th,2.0,15.0,1,0
3,1,1,Afternoon,45.514375,-73.81142,0177 Quebec & 1st,17.0,12.071429,7,1
4,1,1,Afternoon,49.140196,-122.313343,0192 7th & Alder,4.0,15.3,5,0


In [150]:
departure_time_df = generate_time_df('2020-01-01', 8760)

In [151]:
departure_time_df['period'] = departure_time_df['hour'].apply(hour_transf)

In [152]:
departure_time_df['day_of_week'] = departure_time_df['day'] + 1

In [153]:
train_data = departure_time_df[["month", "day_of_week", "period", "station", "bike_ways","parks","stops","population"]].drop_duplicates().merge(merge_data, how="right", on=["month", "day_of_week", "period", "station"])
#train_data["counts"] = train_data["counts"].fillna(0)
train_data.shape

(52510, 14)

In [154]:
train_data.isnull().sum()

month                  0
day_of_week            0
period                 0
station                0
bike_ways           1995
parks               1995
stops               1995
population          1995
lat                    0
long                   0
counts_return          0
temperature            0
counts_departure       0
label                  0
dtype: int64

In [155]:
train_data.dropna().to_csv('data_2020_merged_xgbClassify.csv')

In [156]:
mobi_data = pd.read_csv("../data/Mobi_System_Data_2021.csv", compression='zip').dropna()
geo_data = pd.read_csv('../data/geocodings.csv',index_col=0)
data = mobi_data.merge(geo_data.rename({'lat':'Departure lat','long':'Departure long'},axis=1),left_on='Departure station',right_on='address').drop('address',axis=1)
data = data.merge(geo_data.rename({'lat':'Return lat','long':'Return long'},axis=1),left_on='Return station',right_on='address').drop('address',axis=1)
data.shape
# data = data.loc[data["Departure postal code"].str.startswith("V6")]
print(data.columns)
data["time"] = pd.to_datetime(data["Departure"], format="%Y-%m-%d %H:%M:%S")
data["hour"] = data["time"].dt.hour
data["day"] = data["time"].dt.dayofweek
data["month"] = data["time"].dt.month
data["year"] = data["time"].dt.year
data["temperature"] = data["Departure temperature (C)"] + data['Return temperature (C)'] / 2
stations = pd.read_csv("../data/stations.csv")
max_lat, min_lat = stations["lat"].max(), stations["lat"].min()
max_long, min_long = stations["long"].max(), stations["long"].min()

data['day_of_week'] = data['Departure'].apply(day_of_week)

data['period'] = data['hour'].apply(hour_transf)

#features_departure = ["month", "day", "hour", "lat", "long", "parks", "stops", "population", "bike_ways"]
counts_data_departure = data.groupby(["month", "day_of_week", "period", "Departure lat", "Departure long", "Departure station"]).size().reset_index(name='counts_departure')
counts_data_departure = counts_data_departure.rename(columns={"Departure lat": "lat", "Departure long": "long", "Departure station": "station"})
temp = data.groupby(["month", "day_of_week", "period", "Departure lat", "Departure long", "Departure station"])["temperature"].mean().reset_index(name="temperature")
counts_data_departure["temperature"] = temp["temperature"]

counts_data_return = data.groupby(["month", "day_of_week", "period", "Return lat", "Return long", "Return station"]).size().reset_index(name='counts_return')
counts_data_return = counts_data_return.rename(columns={"Return lat": "lat", "Return long": "long", "Return station": "station"})

data_2021 = counts_data_return.merge(counts_data_departure[["month", "day_of_week", "period", "temperature", "station", "lat", "long", "counts_departure"]], how="right", on=["month", "day_of_week", "period", "station", "lat", "long"])
data_2021.shape
data_2021["counts_return"] = data_2021["counts_return"].fillna(0)
data_2021.shape

Index(['Unnamed: 0', 'Departure', 'Return', 'Bike', 'Departure station',
       'Return station', 'Membership type', 'Covered distance (m)',
       'Duration (sec.)', 'Departure battery voltage (mV)',
       'Return battery voltage (mV)', 'Departure temperature (C)',
       'Return temperature (C)', 'Stopover duration (sec.)',
       'Number of stopovers', 'postal_code_x', 'Departure lat',
       'Departure long', 'postal_code_y', 'Return lat', 'Return long'],
      dtype='object')


(30439, 9)

In [157]:
data_2021['label'] = data_2021.apply(label_cal,axis=1)

In [158]:
data_2021 = departure_time_df[["month", "day_of_week", "period", "station", "bike_ways","parks","stops","population"]].drop_duplicates().merge(data_2021, how="right", on=["month", "day_of_week", "period", "station"])
#train_data["counts"] = train_data["counts"].fillna(0)
data_2021.shape

(30439, 14)

In [159]:
data_2021.isnull().sum()

month                  0
day_of_week            0
period                 0
station                0
bike_ways           1638
parks               1638
stops               1638
population          1638
lat                    0
long                   0
counts_return          0
temperature            0
counts_departure       0
label                  0
dtype: int64

In [160]:
data_2021 = data_2021.dropna()

In [161]:
dummy_periods = pd.get_dummies(data_2021['period'])
df = pd.merge(
    left=data_2021,
    right=dummy_periods,
    left_index=True,
    right_index=True,
)
df = df.drop(['station','period','counts_return','counts_departure'], axis=1)

In [163]:
test_data_jan = df[df.month==1]
print(test_data_jan.columns)
test_data_jan.to_csv('202101_test.csv')

Index(['month', 'day_of_week', 'bike_ways', 'parks', 'stops', 'population',
       'lat', 'long', 'temperature', 'label', 'Afternoon', 'Evening',
       'Morning', 'Wee'],
      dtype='object')
