In [None]:
import plotly as py
import plotly.offline as po
import plotly.graph_objs as go
import ipywidgets as widegets
from scipy import special

# Enable plotly rendering inside the notebook (same call, via the `po` alias).
po.init_notebook_mode(connected=True)

In [None]:

import numpy as np
import pandas as pd 
import datetime 
import time 
import seaborn as sns 
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from pylab import *
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Default matplotlib figure size for the notebook.
plt.rcParams["figure.figsize"] = [9,5]
# NOTE(review): sns.set() writes the rc values it receives into matplotlib's
# rcParams, so this (6, 3) size presumably supersedes the [9, 5] set just above —
# confirm which of the two is intended.
sns.set(rc={'figure.figsize':(6,3)})

In [None]:
# Raise the column display limit first so wide frames are shown in full.
pd.set_option('display.max_columns', 500)

# Raw inputs, read from the working directory.
cab_df = pd.read_csv('cab_rides.csv')
weather_df = pd.read_csv('weather.csv')

## Converting UNIX Date to DatetimeNS format

In [None]:
# cab_df stores epoch milliseconds, weather_df stores epoch seconds.
# Parsing the cab column with unit='ms' avoids the float division by 1000 (and the
# rounding that comes with it) that the previous unit='s' form needed.
cab_df['date_time_ns'] = pd.to_datetime(cab_df['time_stamp'], unit='ms')
weather_df['date_time_ns'] = pd.to_datetime(weather_df['time_stamp'], unit='s')


In [None]:
# Earliest timestamp in the weather data.
weather_df['date_time_ns'].min()

In [None]:
# Latest timestamp in the weather data.
weather_df['date_time_ns'].max()

In [None]:
# Earliest timestamp in the cab data.
cab_df['date_time_ns'].min()

### Defining Functions

In [None]:

def zscore(series):
    """Standardize ``series``: subtract its mean, then divide by its standard deviation.

    Uses ``series.mean()`` and ``series.std()``, so ``series`` must provide both
    (for example a pandas Series).
    """
    centre = series.mean()
    spread = series.std()
    return (series - centre) / spread

def mean(series):
    """Return ``series.sum()`` divided by ``series.count()``."""
    total = series.sum()
    n_values = series.count()
    return total / n_values

def log_function(series):
    """Return the element-wise natural logarithm of ``series`` (via ``np.log``)."""
    logged = np.log(series)
    return logged

### Preprocessing datasets in order to merge them

### Creating a Primary Key taking (Location, Date & Time) from respective datasets

In [None]:
# Merge key for every ride: "<source> - <date> - <hour>".
cab_df['merge_col'] = (
    cab_df.source.astype(str)
    + " - " + cab_df.date_time_ns.dt.date.astype("str")
    + " - " + cab_df.date_time_ns.dt.hour.astype("str")
)

In [None]:
# Merge key for every weather row: "<location> - <date> - <hour>".
weather_df['merge_col'] = (
    weather_df.location.astype(str)
    + " - " + weather_df.date_time_ns.dt.date.astype("str")
    + " - " + weather_df.date_time_ns.dt.hour.astype("str")
)

In [None]:
# Column dtypes and non-null counts of the weather frame.
weather_df.info()

In [None]:
# Use the merge key as the row index; the 'merge_col' column itself is kept too.
weather_df.index = weather_df['merge_col']

In [None]:
# Number of distinct merge keys in the weather frame.
weather_df['merge_col'].nunique()

In [None]:
# drop_duplicates(inplace=True) returns None, so the rows that are about to be
# removed are captured first. `.values` turns the mask into a plain boolean array,
# so no index alignment is involved.
duplicateRowsDF = weather_df[weather_df.duplicated(subset='merge_col', keep='last').values]
# Keep only the last row for every merge key.
weather_df.drop_duplicates(keep='last', subset='merge_col', inplace=True)

In [None]:
# Duplicates are determined on 'merge_col' only and the last occurrence is kept,
# so the label states exactly that.
print("Duplicate rows based on 'merge_col' (all but the last occurrence) are :")
print(duplicateRowsDF)

In [None]:
# Column dtypes and non-null counts of the weather frame at this point.
weather_df.info()

In [None]:
# Left-join every ride to the weather row whose index equals the ride's merge key;
# weather columns whose names clash with cab columns get the '_w' suffix.
merged_df = cab_df.join(weather_df, on='merge_col', how='left', rsuffix='_w')

In [None]:
# Non-null count per column after the join.
merged_df.count()

## Filling 0 for missing values for 'Rain' (Missing Data at Random)

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate object and is not
# guaranteed to update merged_df (chained assignment).
merged_df['rain'] = merged_df['rain'].fillna(0)

As observed in the null-value plot, merging the datasets on the primary key left some rides without a matching weather record. We can drop those rows.

In [None]:
# Keep only rides that found a matching weather row. `.copy()` makes the result an
# independent frame, so the column assignments that follow do not operate on a
# slice (which can trigger SettingWithCopyWarning).
merged_df = merged_df[merged_df['date_time_ns_w'].notnull()].copy()

In [None]:
# Null map of merged_df: one heatmap cell per (row, column) value.
fig, ax = plt.subplots(figsize=(9, 5))
sns.heatmap(merged_df.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)

In [None]:
# Column labels available after the merge.
merged_df.columns

In [None]:
# Relative frequency of each cab type (normalize=True gives fractions, not counts).
print(merged_df['cab_type'].value_counts(normalize=True))

### Fetching | 'Hour' | 'Day of Week' | 'Month' | 'Day in Month' | from Time Stamp of the cab bookings

In [None]:
# Calendar features taken from the ride timestamp.
ride_time = merged_df['date_time_ns'].dt
merged_df['hour'] = ride_time.hour
merged_df['dayofweek'] = ride_time.dayofweek
merged_df['dayofmonth'] = ride_time.day

In [None]:
# Column dtypes and non-null counts of the merged frame.
merged_df.info()

In [None]:
# NOTE(review): set_index returns a new frame and the result is not assigned, so
# this cell only displays the re-indexed frame; merged_df keeps its current index.
merged_df.set_index('date_time_ns_w')

In [None]:
plt.rcParams["figure.figsize"] = [10, 3]

# Non-null 'id' count per hour, one line per cab type.
rides_by_hour = merged_df.groupby(['hour', 'cab_type'])['id'].count().unstack()
rides_by_hour.plot(kind='line', marker='.')
plt.show()

In [None]:

# Independent copy of merged_df as it stands at this point.
merged_df_peak=merged_df.copy()

In [None]:
plt.rcParams["figure.figsize"] = [10, 3]

# Mean price per hour, one line per cab type.
mean_price_by_hour = merged_df.groupby(['hour', 'cab_type'])['price'].mean().unstack()
mean_price_by_hour.plot(kind='line', marker='.')
plt.show()

In [None]:
# Average price per hour, grouped by cab type.
fig = px.histogram(merged_df, x='hour', y='price', color='cab_type', histfunc='avg', barmode='group')
# The '.html' extension belongs at the end of the file name
# (it was 'price vs hour.html_avg'); this also matches the '_max' file name.
po.plot(fig, filename='price vs hour_avg.html', auto_open=False)

In [None]:
# Maximum price per hour, grouped by cab type, written to an HTML file.
fig = px.histogram(
    merged_df,
    x='hour',
    y='price',
    color='cab_type',
    histfunc='max',
    barmode='group',
)
po.plot(fig, filename='price vs hour_max.html', auto_open=False)

In [None]:
# Joint distribution of hour and price, drawn as a kernel density estimate.
sns.jointplot(x='hour',y='price', data=merged_df, kind='kde')

In [None]:
# Preview the first rows of the merged frame.
merged_df.head()

In [None]:
# Flag rides whose surge multiplier is above 1. np.where evaluates the whole column
# at once instead of calling a Python lambda per row; a NaN multiplier compares
# False and therefore maps to 'No', exactly as with the lambda.
merged_df['highsurge'] = np.where(merged_df['surge_multiplier'] > 1, 'Yes', 'No')

In [None]:
# Price against hour, coloured by the surge flag, with marginal distributions.
fig = px.scatter(
    merged_df.sort_values(by=['hour']),
    x='hour',
    y='price',
    color='highsurge',
    marginal_y='box',
    marginal_x='histogram',
)
po.plot(fig, filename='price vs hour.html', auto_open=False)

In [None]:
# Price against day of week, coloured by the surge flag, with an OLS trendline.
fig = px.scatter(
    merged_df.sort_values(by=['dayofweek']),
    x='dayofweek',
    y='price',
    color='highsurge',
    marginal_y='box',
    marginal_x='histogram',
    trendline='ols',
)
po.plot(fig, filename='price vs day.html', auto_open=False)

In [None]:
# 3-D view: hour (x), price (y) and day of week (z), coloured by the surge flag.
fig = px.scatter_3d(
    merged_df.sort_values(by=['hour']),
    x='hour',
    y='price',
    z='dayofweek',
    color='highsurge',
)
po.plot(fig, filename='price vs hour vs day.html', auto_open=False)

In [None]:
plt.rcParams["figure.figsize"] = [14, 8]
# Correlate the numeric columns only: merged_df also holds string columns
# (e.g. 'cab_type'), and recent pandas versions raise on them in corr() instead of
# silently skipping them.
cab_corr = merged_df.select_dtypes(include='number').corr()
sns.heatmap(cab_corr, cmap='YlGnBu', annot=True)
plt.show()

In [None]:
# Pairwise price/distance relationships, one hue per cab type.
# seaborn renamed pairplot's `size` argument to `height`; the old keyword is no
# longer accepted by current releases.
sns.pairplot(merged_df,
            vars=['price', 'distance'],
            hue='cab_type',
            palette='husl',
            plot_kws={'alpha':0.8},
            height=5)
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = [10, 3]
# merged_df is not indexed by time (the earlier set_index result was never
# assigned), so a date-string slice on it fails. Slice a time-indexed, sorted
# view instead.
df_check = (
    merged_df.set_index('date_time_ns_w')
    .sort_index()
    .loc['2018-11-26':'2018-12-18', :]
)
# Maximum surge multiplier per hour, one line per cab type.
merged_df.groupby(['hour', 'cab_type'])['surge_multiplier'].max().unstack().plot(kind='line', marker='.')
plt.show()

In [None]:
# Largest Uber surge multiplier: per-hour maxima first, then the maximum of those.
(
    merged_df.loc[merged_df['cab_type'] == 'Uber']
    .groupby('hour')['surge_multiplier']
    .max()
    .max()
)

In [None]:
# Rows whose cab_type is 'Lyft'.
is_lyft = merged_df['cab_type'] == 'Lyft'
merged_df_lyft = merged_df[is_lyft]

In [None]:
# Rows that have no price.
price_missing = merged_df['price'].isna()
merged_df_null = merged_df[price_missing]

In [None]:
# Rows with a known price, indexed by the weather timestamp. set_index (without
# inplace) returns a new frame, so the later in-place edits and column assignments
# act on an independent object rather than on a slice of merged_df, which can
# trigger SettingWithCopyWarning.
merged_df_notnull = merged_df[merged_df['price'].notnull()].set_index('date_time_ns_w')

In [None]:
# Remove both raw epoch columns and the ride timestamp column, in place.
merged_df_notnull.drop(columns=['time_stamp_w', 'time_stamp', 'date_time_ns'], inplace=True)

In [None]:
# Null map of merged_df_notnull: one heatmap cell per (row, column) value.
fig, ax = plt.subplots(figsize=(10, 2))
sns.heatmap(merged_df_notnull.isnull(), yticklabels=False, cbar=False, cmap='viridis', ax=ax)

In [None]:
# Enlarge figures for this plot through both the matplotlib and the seaborn setting.
plt.rcParams["figure.figsize"] = [16,9]
sns.set(rc={'figure.figsize':(16,9)})
# NOTE(review): a swarm plot draws every row as its own non-overlapping marker; if
# merged_df_notnull is large this is presumably very slow — consider sampling first.
sns.swarmplot(x='cab_type', y='price', data=merged_df_notnull, hue='hour',size=10)
plt.show()

In [None]:
# dtype of 'cab_type' at this point in the notebook.
merged_df_notnull['cab_type'].dtype

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, each with its own ``LabelEncoder``.

    The encoders are created inside ``transform`` and are not stored, so the
    object keeps no fitted state between calls.
    """

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        """No-op kept for the fit/transform protocol; returns ``self``."""
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        X itself is left untouched; an encoded copy is returned.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that works on old and new versions alike.
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X,y).transform(X)

In [None]:
# Label-encode every object-dtype column (after casting its values to str) and keep
# the encoder of each column in le_dict, keyed by column name.
le_dict = {}
column_dtypes = merged_df_notnull.dtypes
object_columns = column_dtypes[column_dtypes == "object"].index
for col in object_columns:
    print(col)
    encoder = preprocessing.LabelEncoder()
    le_dict[col] = encoder
    merged_df_notnull[col] = encoder.fit_transform(merged_df_notnull[col].astype(str))

In [None]:
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on every column of merged_df_notnull; 'price' has not been
# separated from the features at this point, so it is included.
scaler = StandardScaler()

scaler.fit(merged_df_notnull)

In [None]:
# Apply the fitted scaler to the same frame it was fitted on.
scaled_features=scaler.transform(merged_df_notnull)

In [None]:
# Wrap the scaled values in a DataFrame that carries the original column labels.
scaled_df = pd.DataFrame(scaled_features, columns=merged_df_notnull.columns)
scaled_df.head()

In [None]:

def lasso_regressor(X,y, names):
    """Fit a Lasso model (alpha=0.1) on ``X``/``y`` and plot its coefficients.

    Parameters
    ----------
    X : feature matrix passed to ``Lasso.fit``.
    y : target values passed to ``Lasso.fit``.
    names : feature labels, one per coefficient, used as x-axis tick labels.

    Returns
    -------
    None. The coefficient plot is shown with ``plt.show()``.
    """
    lasso = Lasso(alpha=0.1)

    # Fit on the arguments. The previous version read the notebook globals
    # X_new / y_new here and ignored whatever the caller passed in.
    lasso_coef = lasso.fit(X, y).coef_

    positions = range(len(names))
    plt.plot(positions, lasso_coef)
    plt.xticks(positions, names, rotation=90)
    plt.ylabel('Coefficients')
    plt.show()

In [None]:

def random_forest_regressor(X, y, n_estimators=500, min_samples_leaf=0.1,
                            test_size=0.3, random_state=21):
    """Fit a RandomForestRegressor on the training part of a train/test split.

    Parameters
    ----------
    X, y : features and target handed to ``train_test_split``.
    n_estimators, min_samples_leaf : forwarded to ``RandomForestRegressor``.
    test_size : share of the rows held out by the split (not used for fitting).
    random_state : seed used for both the split and the forest.

    The defaults reproduce the values that used to be hard-coded here.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    # Only the training portion is used; the held-out rows are discarded.
    X_train, _, y_train, _ = train_test_split(X, y,
                                              test_size=test_size,
                                              random_state=random_state)

    rf = RandomForestRegressor(n_estimators=n_estimators,
                               min_samples_leaf=min_samples_leaf,
                               random_state=random_state)
    rf.fit(X_train, y_train)

    return rf

In [None]:
# Every column except 'price' is a feature; 'price' is the target.
lasso_features = merged_df_notnull.drop(columns='price')
names = lasso_features.columns
X_new = lasso_features.values
y_new = merged_df_notnull['price'].values
lasso_regressor(X_new, y_new, names)

In [None]:
# Every column except 'price' is a feature; 'price' is the target.
feature_frame = merged_df_notnull.drop(columns='price')
X = feature_frame.values
y = merged_df_notnull['price'].values

rf = random_forest_regressor(X, y)

# One importance value per feature column, sorted ascending for the bar chart.
importances_rf = pd.Series(rf.feature_importances_, index=feature_frame.columns)
sorted_importances_rf = importances_rf.sort_values()
sorted_importances_rf.plot(kind='barh', color='lightgreen')

plt.show()

In [None]:
# Model on three selected predictors. The target is taken with single brackets so
# that y is 1-D: scikit-learn estimators expect y of shape (n_samples,), and a
# column vector makes several of them emit DataConversionWarning.
X = merged_df_notnull[['distance','surge_multiplier','hour']].values
y = merged_df_notnull['price'].values

In [None]:
def perform_grid_search_cv(grid_dt, X_train, X_test, y_train, y_test):
    """Fit ``grid_dt`` on the training data and print a four-line summary.

    Printed, in order: the best parameters, the best cross-validation score, the
    best estimator, and that estimator's ``score`` on the test data.
    Nothing is returned.
    """
    grid_dt.fit(X_train, y_train)

    winning_params = grid_dt.best_params_
    winning_cv_score = grid_dt.best_score_
    winner = grid_dt.best_estimator_
    held_out_score = winner.score(X_test, y_test)

    for item in (winning_params, winning_cv_score, winner, held_out_score):
        print(item)

In [None]:
gbt = GradientBoostingRegressor(random_state=21)
params_dt = {
    'n_estimators': [100,300,500,700],
    'min_samples_leaf': [3, 4, 6, 8],
    'max_depth':[3,4,5,6],
    # learning_rate has to be strictly positive. The grid used to contain 0, which
    # made every fit using that value fail, so the entry is removed.
    'learning_rate': [0.10, 0.5, 0.0012]
}

# 10-fold cross-validated search scored by R^2, using all available cores.
grid_dt = GridSearchCV(estimator=gbt,
                       param_grid=params_dt,
                       scoring='r2',
                       cv=10,
                       n_jobs=-1
                      )

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.3, 
                                                    random_state=1)

In [None]:
# Fit grid_dt on the training split and print its results, including the score of
# the best estimator on the test split.
perform_grid_search_cv(grid_dt, X_train, X_test, y_train, y_test)

In [None]:
rf = RandomForestRegressor(random_state=21)

# Grid: number of trees x minimum leaf size (floats are fractions of the samples).
params_dt = dict(
    n_estimators=[100, 300, 500, 700],
    min_samples_leaf=[0.04, 0.06, 0.08],
)

# 10-fold cross-validated search scored by R^2, run on a single worker.
grid_dt = GridSearchCV(rf, params_dt, scoring='r2', cv=10, n_jobs=1)

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

perform_grid_search_cv(grid_dt, X_train, X_test, y_train, y_test)


In [None]:
dt = DecisionTreeRegressor(random_state=1)

# Grid: tree depth x minimum leaf size x share of features tried per split.
params_dt = dict(
    max_depth=[3, 4, 5, 6],
    min_samples_leaf=[0.04, 0.06, 0.08],
    max_features=[0.2, 0.4, 0.6, 0.8],
)

# 10-fold cross-validated search scored by R^2, run on a single worker.
grid_dt = GridSearchCV(dt, params_dt, scoring='r2', cv=10, n_jobs=1)

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

perform_grid_search_cv(grid_dt, X_train, X_test, y_train, y_test)

In [None]:
# Same three predictors as before. The target is taken with single brackets so that
# y is 1-D, the shape scikit-learn estimators expect, instead of a column vector.
X = merged_df_notnull[['distance','surge_multiplier','hour']].values
y = merged_df_notnull['price'].values

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [None]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# 'price' is a continuous target, so this is a regression problem.
# KNeighborsClassifier rejects continuous labels; KNeighborsRegressor predicts from
# the prices of the 3 nearest neighbours instead.
knn = KNeighborsRegressor(n_neighbors=3)

knn.fit(X_train, y_train)