In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import xgboost as xgb
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestRegressor
from sklearn.externals.joblib import dump, load
from sklearn.preprocessing import StandardScaler





In [7]:
# Load the trip records that the per-neighborhood models below are built from.
neighborhood_model_df = pd.read_csv('data/trips/neighborhood_model.csv')

In [None]:
# Display the loaded frame for a visual check.
neighborhood_model_df

In [None]:
# Distinct start_neighborhood values; get_neighborhood_model is later called once per entry.
neighborhood_list = list(neighborhood_model_df.start_neighborhood.unique())

In [None]:
def get_neighborhood_model(neighborhood_name):
    """Train and plot an hourly ride-count model for a single neighborhood.

    Rows of the module-level ``neighborhood_model_df`` whose
    ``start_neighborhood`` equals ``neighborhood_name`` are resampled into
    hourly bins. An ``XGBRegressor`` is fit on the 2018 hours and evaluated on
    the 2019 hours; the actual and predicted 2019 counts are then shown in an
    interactive plotly figure.

    Parameters
    ----------
    neighborhood_name : str
        Value of ``start_neighborhood`` to select.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` as reported by ``XGBRegressor.score`` on
        the 2018 and the 2019 data respectively.
    """
    # Take an explicit copy: assigning a column into a filtered slice of the
    # global frame triggers pandas' SettingWithCopyWarning.
    in_neighborhood = neighborhood_model_df['start_neighborhood'] == neighborhood_name
    df = neighborhood_model_df.loc[in_neighborhood].copy()
    df['start_date'] = pd.to_datetime(df['start_date'])
    df = df.set_index('start_date')
    df = pd.get_dummies(df, drop_first=True, columns=['precip_cat', 'weekday_weekend'])
    # Text columns such as start_neighborhood cannot be averaged; older pandas
    # dropped them silently in mean(), newer releases raise. Drop them up front.
    df = df.select_dtypes(include=['number', 'bool'])

    trips_2018 = df.loc['2018-01-01':'2018-12-31']
    trips_2019 = df.loc['2019-01-01':'2019-12-31']

    # Features: hourly mean of every column. Target: number of rows per hour
    # (counted on hour_x, as before).
    hourly_2018 = trips_2018.resample('H').mean()
    hourly_2019 = trips_2019.resample('H').mean()
    hourly_2018['ride_count'] = trips_2018['hour_x'].resample('H').count()
    hourly_2019['ride_count'] = trips_2019['hour_x'].resample('H').count()
    # Hours without any ride have NaN means; treat them as zeros.
    hourly_2018 = hourly_2018.fillna(0)
    hourly_2019 = hourly_2019.fillna(0)

    y_train = hourly_2018[['ride_count']]
    x_train = hourly_2018.drop(columns=['ride_count'])
    y_test = hourly_2019[['ride_count']]
    x_test = hourly_2019.drop(columns=['ride_count'])

    # The scaler is fit on 2018 only and then applied unchanged to 2019.
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    bst = xgb.XGBRegressor(max_depth=6)
    bst.fit(x_train_scaled, y_train)
    hourly_2019['y_pred'] = bst.predict(x_test_scaled)
    train_score = bst.score(x_train_scaled, y_train)
    test_score = bst.score(x_test_scaled, y_test)

    # Both traces are drawn against the primary y-axis.
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=hourly_2019.index, y=hourly_2019['ride_count'],
                             name="Actual Number of Rides Each Hour",
                             line_color='red'))
    fig.add_trace(go.Scatter(x=hourly_2019.index, y=hourly_2019['y_pred'],
                             name="Predicted Rides Each Hour",
                             line_color='black'), secondary_y=False)

    fig.update_layout(
        title_text='Predicted and Actual Rides Each Hour, {}'.format(neighborhood_name),
        xaxis_rangeslider_visible=True,
    )
    fig.update_xaxes(title_text="Hour of Day")
    # Only one title can apply to the primary y-axis; this is the one that was
    # effectively shown before (an earlier, overwritten title call was removed).
    fig.update_yaxes(title_text="<b>Rides</b>", secondary_y=False)

    fig.show()

    return train_score, test_score

In [None]:
# Show the neighborhood names available for modelling.
neighborhood_list

In [None]:
# Fit and plot the model for one neighborhood.
get_neighborhood_model('Near Southeast, Navy Yard')

In [None]:
# Fit and plot a separate model for every neighborhood in the list.
for neighborhood in neighborhood_list:
    get_neighborhood_model(neighborhood)

In [2]:
# Load data/trips/full.csv.
# NOTE(review): no cell in this part of the notebook reads full_df — confirm where it is meant to be used.
full_df = pd.read_csv('data/trips/full.csv')

In [3]:
def get_full_model(df):
    """Train, save and plot the hourly ride-count model with neighborhood features.

    The trips in ``df`` are one-hot encoded (``precip_cat``,
    ``weekday_weekend`` and ``start_neighborhood``), resampled into hourly
    bins, and used to fit an ``XGBRegressor`` on 2018 that is evaluated on
    2019.

    Side effects:
      * writes ``models/neighborhood_list.csv``, ``models/20191126_xgb.model``
        and ``models/20191126_scaler.bin``;
      * prints the test-feature shape, the train and test scores and the
        feature importances;
      * shows a plotly figure of actual vs. predicted 2019 counts and draws a
        matplotlib bar chart of the feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records. The code reads the columns ``start_date``,
        ``start_neighborhood``, ``precip_cat``, ``weekday_weekend`` and
        ``hour_x``; every remaining column is used as a feature. The frame
        passed in is not modified.

    Returns
    -------
    None
    """
    # Build the datetime index on a new frame so the caller's DataFrame is
    # left untouched (the argument used to be modified in place).
    df = df.assign(start_date=pd.to_datetime(df['start_date'])).set_index('start_date')

    # Write the start_neighborhood column, indexed by start_date, to CSV.
    df[['start_neighborhood']].to_csv('models/neighborhood_list.csv')

    df = pd.get_dummies(
        df, drop_first=True,
        columns=['precip_cat', 'weekday_weekend', 'start_neighborhood'])

    trips_2018 = df.loc['2018-01-01':'2018-12-31']
    trips_2019 = df.loc['2019-01-01':'2019-12-31']

    # Features: hourly mean of every column. Target: number of rows per hour
    # (counted on hour_x, as before).
    hourly_2018 = trips_2018.resample('H').mean()
    hourly_2019 = trips_2019.resample('H').mean()
    hourly_2018['ride_count'] = trips_2018['hour_x'].resample('H').count()
    hourly_2019['ride_count'] = trips_2019['hour_x'].resample('H').count()
    # Hours without any ride have NaN means; treat them as zeros.
    hourly_2018 = hourly_2018.fillna(0)
    hourly_2019 = hourly_2019.fillna(0)

    y_train = hourly_2018[['ride_count']]
    x_train = hourly_2018.drop(columns=['ride_count'])
    y_test = hourly_2019[['ride_count']]
    x_test = hourly_2019.drop(columns=['ride_count'])

    # The scaler is fit on 2018 only and then applied unchanged to 2019.
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    bst = xgb.XGBRegressor(max_depth=6, n_estimators=500)
    bst.fit(x_train_scaled, y_train)
    hourly_2019['y_pred'] = bst.predict(x_test_scaled)
    train_score = bst.score(x_train_scaled, y_train)
    test_score = bst.score(x_test_scaled, y_test)

    # Persist the fitted model together with the scaler it was trained with.
    bst.save_model('models/20191126_xgb.model')
    dump(scaler, 'models/20191126_scaler.bin', compress=True)

    print(x_test.shape)
    print(train_score)
    print(test_score)
    print(bst.feature_importances_)

    # Both traces are drawn against the primary y-axis.
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=hourly_2019.index, y=hourly_2019['ride_count'],
                             name="Actual Number of Rides Each Hour",
                             line_color='red'))
    fig.add_trace(go.Scatter(x=hourly_2019.index, y=hourly_2019['y_pred'],
                             name="Predicted Rides Each Hour",
                             line_color='black'), secondary_y=False)

    # The x-axis settings are given as plain dicts so update_layout merges
    # them into the existing axis; titles are applied afterwards.
    fig.update_layout(
        title_text='Predicted and Actual Rides Each Hour',
        xaxis=dict(
            type="date",
            rangeslider=dict(visible=True),
            rangeselector=dict(
                buttons=[
                    dict(count=1, label="1d", step="day", stepmode="backward"),
                    dict(count=7, label="1w", step="day", stepmode="backward"),
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(step="all"),
                ]
            ),
        ),
    )
    fig.update_xaxes(title_text="Hour of Day")
    # Only one title can apply to the primary y-axis; this is the one that was
    # effectively shown before (an earlier, overwritten title call was removed).
    fig.update_yaxes(title_text="<b>Rides</b>", secondary_y=False)
    fig.show()

    # Horizontal bar chart with one bar per feature column.
    n_features = x_train.shape[1]
    plt.figure(figsize=(15, 9))
    plt.barh(range(n_features), bst.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), x_train.columns.values)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

In [None]:
# Train, save and plot the model that includes neighborhood features.
# NOTE(review): this passes neighborhood_model_df, not full_df — confirm that is the intended input.
get_full_model(neighborhood_model_df)

In [4]:
def get_good_model(df):
    """Train, save and plot the hourly ride-count model without neighborhood features.

    The trips in ``df`` are one-hot encoded (``precip_cat`` and
    ``weekday_weekend``), resampled into hourly bins, and used to fit an
    ``XGBRegressor`` on 2018 that is evaluated on 2019.

    Side effects:
      * writes ``x_test.pickle`` (2019 features) and ``df_2019.pickle``
        (2019 features, ``ride_count`` and ``y_pred``) to the working
        directory;
      * writes ``models/20191205_xgb.model`` and
        ``models/20191205_scaler.bin``;
      * prints the test-feature shape, the train and test scores and the
        feature importances;
      * shows a plotly figure of actual vs. predicted 2019 counts and draws a
        matplotlib bar chart of the feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records. The code reads the columns ``start_date``,
        ``precip_cat``, ``weekday_weekend`` and ``hour_x``; every remaining
        numeric column is used as a feature. The frame passed in is not
        modified.

    Returns
    -------
    None
    """
    # Build the datetime index on a new frame so the caller's DataFrame is
    # left untouched (the argument used to be modified in place).
    df = df.assign(start_date=pd.to_datetime(df['start_date'])).set_index('start_date')
    df = pd.get_dummies(df, drop_first=True, columns=['precip_cat', 'weekday_weekend'])
    # Text columns cannot be averaged; older pandas dropped them silently in
    # mean(), newer releases raise. Drop them up front.
    df = df.select_dtypes(include=['number', 'bool'])

    trips_2018 = df.loc['2018-01-01':'2018-12-31']
    trips_2019 = df.loc['2019-01-01':'2019-12-31']

    # Features: hourly mean of every column. Target: number of rows per hour
    # (counted on hour_x, as before).
    hourly_2018 = trips_2018.resample('H').mean()
    hourly_2019 = trips_2019.resample('H').mean()
    hourly_2018['ride_count'] = trips_2018['hour_x'].resample('H').count()
    hourly_2019['ride_count'] = trips_2019['hour_x'].resample('H').count()
    # Hours without any ride have NaN means; treat them as zeros.
    hourly_2018 = hourly_2018.fillna(0)
    hourly_2019 = hourly_2019.fillna(0)

    y_train = hourly_2018[['ride_count']]
    x_train = hourly_2018.drop(columns=['ride_count'])
    y_test = hourly_2019[['ride_count']]
    x_test = hourly_2019.drop(columns=['ride_count'])

    # The scaler is fit on 2018 only and then applied unchanged to 2019.
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    bst = xgb.XGBRegressor(max_depth=6, n_estimators=500)
    bst.fit(x_train_scaled, y_train)
    hourly_2019['y_pred'] = bst.predict(x_test_scaled)
    train_score = bst.score(x_train_scaled, y_train)
    test_score = bst.score(x_test_scaled, y_test)

    # Persist the 2019 frames, the fitted model and the scaler it was trained with.
    pd.to_pickle(x_test, 'x_test.pickle')
    pd.to_pickle(hourly_2019, 'df_2019.pickle')
    bst.save_model('models/20191205_xgb.model')
    dump(scaler, 'models/20191205_scaler.bin', compress=True)

    print(x_test.shape)
    print(train_score)
    print(test_score)
    print(bst.feature_importances_)

    # Both traces are drawn against the primary y-axis.
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=hourly_2019.index, y=hourly_2019['ride_count'],
                             name="Actual Number of Rides Each Hour",
                             line_color='red'))
    fig.add_trace(go.Scatter(x=hourly_2019.index, y=hourly_2019['y_pred'],
                             name="Predicted Rides Each Hour",
                             line_color='black'), secondary_y=False)

    # The x-axis settings are given as plain dicts so update_layout merges
    # them into the existing axis; titles are applied afterwards.
    fig.update_layout(
        title_text='Predicted and Actual Rides Each Hour',
        xaxis=dict(
            type="date",
            rangeslider=dict(visible=True),
            rangeselector=dict(
                buttons=[
                    dict(count=1, label="1d", step="day", stepmode="backward"),
                    dict(count=7, label="1w", step="day", stepmode="backward"),
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(step="all"),
                ]
            ),
        ),
    )
    fig.update_xaxes(title_text="Hour of Day")
    # Only one title can apply to the primary y-axis; this is the one that was
    # effectively shown before (an earlier, overwritten title call was removed).
    fig.update_yaxes(title_text="<b>Rides</b>", secondary_y=False)
    fig.show()

    # Horizontal bar chart with one bar per feature column.
    n_features = x_train.shape[1]
    plt.figure(figsize=(15, 9))
    plt.barh(range(n_features), bst.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), x_train.columns.values)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

# Ride Count Prediction Testing

In [None]:
# Example inputs for a single prediction; input1 is used as the temperature feature.
input1 = 75

In [None]:
# Hour, later encoded with a period of 24.
input2 = 20

In [None]:
# Month, later encoded with a period of 12.
input3 = 6

In [None]:
# Precipitation label.
input4 = 'rain'

In [None]:
# Weekday/weekend label.
input5 = 'weekday'

In [None]:
# Collect the five inputs under the column names used in the cells below.
model_dict = {'temperature': input1, 'hour': input2, 'month': input3, 'precip': input4, 'weekday': input5}

In [None]:
# One-row frame holding the inputs.
model_df = pd.DataFrame([model_dict])

In [None]:
model_df

In [None]:
# Encode the hour as sine/cosine of its angle on a 24-step cycle.
model_df['hour_x']=np.sin(2.*np.pi*model_df.hour/24.)
model_df['hour_y']=np.cos(2.*np.pi*model_df.hour/24.)

In [None]:
# Remove the raw hour now that it is encoded.
# NOTE(review): the in-place drop means this cell and the hour-encoding cell above fail if re-run,
# because the 'hour' column no longer exists afterwards.
model_df.drop('hour', axis=1, inplace=True)

In [None]:
model_df

In [None]:
# Encode the month as sine/cosine of its angle on a 12-step cycle; the raw month column is kept.
model_df['month_x']=np.sin(2.*np.pi*model_df.month/12.)
model_df['month_y']=np.cos(2.*np.pi*model_df.month/12.)

In [None]:
model_df

In [None]:
# Declare every possible level so get_dummies later creates a column for each
# one, even when a level does not occur in the data. Series.astype no longer
# accepts a `categories` keyword, so the levels are given via pd.Categorical.
model_df['precip'] = pd.Categorical(model_df['precip'], categories=['rain', 'trace', 'no rain'])
model_df['weekday'] = pd.Categorical(model_df['weekday'], categories=['weekday', 'weekend'])

In [None]:
# Check the column dtypes before one-hot encoding.
model_df.dtypes

In [None]:
# One-hot encode precip and weekday, keeping every level (drop_first=False).
model_df = pd.get_dummies(model_df, drop_first=False, columns=['precip', 'weekday'])

In [None]:
model_df

In [None]:
# Keep these eight columns in this fixed order; columns not listed here are discarded.
# NOTE(review): this order presumably matches the columns the saved scaler/model were fit on — confirm.
model_df = model_df[['temperature', 'hour_x', 'hour_y', 'month_x', 'month_y', 'precip_trace', 'precip_rain', 'weekday_weekend']]

In [None]:
model_df

In [None]:
# Empty booster to load the saved model into.
bst = xgb.Booster()

In [None]:
# Load the model file written by get_good_model.
bst.load_model('models/20191205_xgb.model')

In [None]:
# Load the scaler file written by get_good_model.
scaler = load('models/20191205_scaler.bin')

In [None]:
# Apply the saved scaling to the input row.
scaled_model_df = scaler.transform(model_df)

In [None]:
scaled_model_df

In [None]:
# Wrap the scaled values in a DMatrix, labelled with the model_df column names.
scaled_dmatrix = xgb.DMatrix(scaled_model_df, feature_names=model_df.columns)

In [None]:
# Prediction for the single input row.
bst.predict(scaled_dmatrix)

In [None]:
# Three-row variant of the inputs: the base values, temperature minus 10, and hour minus 12.
model_dict = {'temperature': [input1, (input1-10), input1], 'hour': [input2, input2, (input2-12)], 'month': [input3, input3, input3], 'precip': [input4,input4,input4], 'weekday': [input5,input5,input5]}

In [None]:
def get_numbers(input1, input2, input3, input4, input5):
    """Predict hourly ride counts for three scenarios and plot the results.

    Three feature rows are built from the inputs: the inputs as given, the
    same inputs with the temperature lowered by 10, and the same inputs with
    the hour shifted by 12. They are scaled with the saved scaler and scored
    with the saved XGBoost model. The function also re-scores the saved 2019
    feature set, shows the actual vs. predicted 2019 counts, and shows a bar
    chart of the three scenario predictions.

    Files read: ``models/20191205_xgb.model``, ``models/20191205_scaler.bin``,
    ``x_test.pickle`` and ``df_2019.pickle``.

    Parameters
    ----------
    input1 : number
        Temperature.
    input2 : number
        Hour, encoded with a period of 24.
    input3 : number
        Month, encoded with a period of 12.
    input4 : str
        Precipitation label: ``'rain'``, ``'trace'`` or ``'no rain'``. Any
        other value yields zeros in both precipitation features.
    input5 : str
        ``'weekday'`` or ``'weekend'``. Any other value yields zero in the
        weekend feature.

    Returns
    -------
    numpy.ndarray
        The three predicted ride counts, in the order: inputs as given,
        temperature minus 10, hour shifted by 12.
    """
    scenario_labels = ['Input Value', 'Subtract 10 Degrees', 'Add 12 Hours']
    feature_columns = ['temperature', 'hour_x', 'hour_y', 'month_x', 'month_y',
                       'precip_trace', 'precip_rain', 'weekday_weekend']

    # One row per scenario. Subtracting 12 hours gives the same sine/cosine
    # encoding as adding 12, which is why the third row is labelled "Add 12 Hours".
    model_df = pd.DataFrame({
        'temperature': [input1, input1 - 10, input1],
        'hour': [input2, input2, input2 - 12],
        'month': [input3, input3, input3],
        'precip': [input4, input4, input4],
        'weekday': [input5, input5, input5],
    })

    # Cyclical encoding of hour and month.
    model_df['hour_x'] = np.sin(2. * np.pi * model_df['hour'] / 24.)
    model_df['hour_y'] = np.cos(2. * np.pi * model_df['hour'] / 24.)
    model_df = model_df.drop(columns='hour')
    model_df['month_x'] = np.sin(2. * np.pi * model_df['month'] / 12.)
    model_df['month_y'] = np.cos(2. * np.pi * model_df['month'] / 12.)

    # Declare every level so get_dummies creates all dummy columns even though
    # each call only contains one level. Series.astype no longer accepts a
    # `categories` keyword, so pd.Categorical is used instead.
    model_df['precip'] = pd.Categorical(model_df['precip'], categories=['rain', 'trace', 'no rain'])
    model_df['weekday'] = pd.Categorical(model_df['weekday'], categories=['weekday', 'weekend'])
    model_df = pd.get_dummies(model_df, drop_first=False, columns=['precip', 'weekday'])
    model_df = model_df[feature_columns]

    bst = xgb.Booster()
    bst.load_model('models/20191205_xgb.model')
    scaler = load('models/20191205_scaler.bin')

    # Feature names are passed as a plain list rather than a pandas Index.
    scaled_model_df = scaler.transform(model_df)
    scaled_dmatrix = xgb.DMatrix(scaled_model_df, feature_names=feature_columns)
    numbers = bst.predict(scaled_dmatrix)

    # Re-score the saved 2019 features. These pickles are written with
    # pd.to_pickle by get_good_model; unpickling executes arbitrary code, so
    # only load files produced locally.
    x_test = pd.read_pickle('x_test.pickle')
    df_2019 = pd.read_pickle('df_2019.pickle')
    x_test_scaled = scaler.transform(x_test)
    scaled_full_dmatrix = xgb.DMatrix(x_test_scaled, feature_names=feature_columns)
    df_2019['y_pred'] = bst.predict(scaled_full_dmatrix)

    # Both traces are drawn against the primary y-axis.
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=df_2019.index, y=df_2019['ride_count'],
                             name="Actual Number of Rides Each Hour",
                             line_color='red'))
    fig.add_trace(go.Scatter(x=df_2019.index, y=df_2019['y_pred'],
                             name="Predicted Rides Each Hour",
                             line_color='black'), secondary_y=False)

    # The x-axis settings are given as plain dicts so update_layout merges
    # them into the existing axis; titles are applied afterwards.
    fig.update_layout(
        title_text='Predicted and Actual Rides Each Hour',
        xaxis=dict(
            type="date",
            rangeslider=dict(visible=True),
            rangeselector=dict(
                buttons=[
                    dict(count=1, label="1d", step="day", stepmode="backward"),
                    dict(count=7, label="1w", step="day", stepmode="backward"),
                    dict(count=1, label="1m", step="month", stepmode="backward"),
                    dict(count=6, label="6m", step="month", stepmode="backward"),
                    dict(step="all"),
                ]
            ),
        ),
    )
    fig.update_xaxes(title_text="Hour of Day")
    # Only one title can apply to the primary y-axis; this is the one that was
    # effectively shown before (an earlier, overwritten title call was removed).
    fig.update_yaxes(title_text="<b>Rides</b>", secondary_y=False)
    fig.show()

    # Bar chart of the three scenario predictions. The former
    # range_color='red' argument was dropped: range_color expects a
    # [min, max] pair for a continuous colour scale, which this chart lacks.
    display_df = pd.DataFrame({'Expected Ride Count': numbers}, index=scenario_labels)
    figure = px.bar(display_df, x=display_df.index,
                    y=display_df['Expected Ride Count'].values,
                    labels={'y': 'Expected Ride Count'})
    figure.show()

    # Hand the predictions back so callers can tabulate or plot them.
    return numbers

In [None]:
import pickle
import plotly.express as px

In [None]:
# Run the three-scenario prediction for one set of inputs.
# NOTE(review): the cells below need get_numbers to return its prediction array — confirm it does.
prediction = get_numbers(95, 15, 6, 'no rain', 'weekday')

In [None]:
# Tabulate the predictions, one row per scenario.
display_df = pd.DataFrame(prediction)

In [None]:
# Label the rows with the three scenarios.
display_df.index = ['Input Value', 'Subtract 10 Degrees', 'Add 12 Hours']

In [None]:
display_df.columns = ['Expected Ride Count']

In [None]:
display_df

In [None]:
import plotly.express as px

In [None]:
# Bar chart of the expected ride count for each scenario.
fig = px.bar(display_df, x=display_df.index, y=display_df['Expected Ride Count'].values, labels={'y': 'Expected Ride Count'})
fig.show()