In [1]:
import pandas as pd
# Read the training CSV and keep only the three columns used in this notebook.
dataset = pd.read_csv('training_dataset.csv')[['fhv', 'boro', 'bus']]
# Preview the first three rows.
dataset.head(3)

Unnamed: 0,fhv,boro,bus
0,8971309.0,Bronx,538711
1,12832770.0,Brooklyn,615477
2,40729540.0,Manhattan,424707


### We will use one-hot encoding (OHE) for the categorical 'boro' (borough) column.

In [2]:
# Take an independent copy of the frame; the 'boro' column is read from this copy
# when the encoder input is built.
dataset_onehot = dataset.copy()

In [3]:
from sklearn.preprocessing import OneHotEncoder
# Dense (non-sparse) one-hot encoder that ignores categories not seen during fit.
# The dense-output flag is spelled `sparse_output` in newer scikit-learn releases
# and `sparse` in older ones; an unknown keyword raises TypeError, so try the new
# spelling first and fall back to the old one.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [4]:
# Borough labels as a single-column 2-D array (one row per sample), the shape
# the encoder is fitted on.
x1 = dataset_onehot[['boro']].values

In [5]:
# Fit the encoder on the single-column array of borough labels.
enc.fit(x1)

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=False)

In [6]:
# Inspect the categories the encoder learned: a list with one array of labels
# for the single input column.
enc.categories_

[array(['Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island'],
       dtype=object)]

In [7]:
# Print the dense one-hot matrix. fit_transform refits the encoder on the same
# x1 it was already fitted on before transforming.
print(enc.fit_transform(x1))

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]


In [8]:
# Wrap the one-hot matrix in a DataFrame labelled by the learned categories.
# NOTE(review): enc.categories_ is a list holding one array, so the column labels
# end up as 1-tuples such as ('Bronx',) rather than plain strings. Later cells
# select columns using those tuple labels, so switching to enc.categories_[0]
# (plain names) would require updating those selections at the same time.
enc_df = pd.DataFrame(enc.fit_transform(x1),columns=enc.categories_)
# Preview the first five encoded rows.
enc_df[:5]

Unnamed: 0,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0


In [9]:
# Join the one-hot borough columns onto the original training columns
# (column-wise, aligned on the row index).
# The left-hand side is the untouched `dataset_onehot` copy rather than `dataset`
# itself, so this cell is idempotent: re-running it rebuilds the same frame
# instead of stacking a second set of borough columns onto an already-widened one.
dataset = pd.concat([dataset_onehot, enc_df], axis=1)
dataset

Unnamed: 0,fhv,boro,bus,"(Bronx,)","(Brooklyn,)","(Manhattan,)","(Queens,)","(Staten Island,)"
0,8971309.0,Bronx,538711,1.0,0.0,0.0,0.0,0.0
1,12832770.0,Brooklyn,615477,0.0,1.0,0.0,0.0,0.0
2,40729540.0,Manhattan,424707,0.0,0.0,1.0,0.0,0.0
3,6830110.0,Queens,358402,0.0,0.0,0.0,1.0,0.0
4,101464.0,Staten Island,128997,0.0,0.0,0.0,0.0,1.0
5,4096583.0,Bronx,522826,1.0,0.0,0.0,0.0,0.0
6,20551960.0,Brooklyn,612752,0.0,1.0,0.0,0.0,0.0
7,53682800.0,Manhattan,410479,0.0,0.0,1.0,0.0,0.0
8,12687580.0,Queens,359768,0.0,0.0,0.0,1.0,0.0
9,430712.0,Staten Island,128042,0.0,0.0,0.0,0.0,1.0


In [10]:
# Feature matrix for the first model: the 'fhv' column followed by the five
# one-hot borough indicator columns. Those columns carry 1-tuple labels, so the
# labels are built as (name,) tuples here.
X = dataset[
    ['fhv'] + [(name,) for name in ('Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island')]
].values
X

array([[8.97130921e+06, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [1.28327732e+07, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [4.07295407e+07, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [6.83011011e+06, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 0.00000000e+00],
       [1.01464049e+05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00],
       [4.09658300e+06, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [2.05519560e+07, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [5.36827990e+07, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00],
       [1.26875780e+07, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 

In [11]:
# Regression target: the 'bus' column as an array. Later cells (including
# train()) read this module-level Y directly.
Y = dataset['bus'].values
Y

array([538711, 615477, 424707, 358402, 128997, 522826, 612752, 410479,
       359768, 128042, 483663, 581227, 380075, 350548, 123960])

In [12]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression of the 'bus' target on 'fhv' plus the one-hot
# borough indicators.
model = LinearRegression()
model.fit(X,Y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [13]:
# Predict the bus count for 10 million FHV rides (1E7) in the Bronx. The trailing
# 1,0,0,0,0 is the one-hot borough vector in the encoder's category order
# (Bronx, Brooklyn, Manhattan, Queens, Staten Island).
model.predict([[1E7,1,0,0,0,0]])

array([516492.89369266])

In [14]:
# 1-D array of the 'fhv' column.
# NOTE(review): X1 is reassigned to a two-column matrix in a later cell before it
# is used anywhere, so this assignment appears to be dead — confirm and remove.
X1 = dataset['fhv'].values

In [15]:
# Load the ACS 2017 1-year economic-characteristics workbook published by NYC
# Planning, used here to look up median household income by borough.
# Note: this downloads from the network on every run.
acs = pd.read_excel('https://www1.nyc.gov/assets/planning/download/office/data-maps/nyc-population/acs/econ_2017acs1yr_nyc.xlsx')

In [16]:
# Check the header of the first column, which holds the row labels of the table.
acs.columns[0]

'DP03: SELECTED ECONOMIC CHARACTERISTICS'

In [17]:
# Row 82 is the "Median household income (dollars)" row of the workbook.
# NOTE(review): the row position is hard-coded; it will silently point elsewhere
# if the sheet layout changes.
acs.loc[82]

DP03: SELECTED ECONOMIC CHARACTERISTICS          Median household income (dollars)
Unnamed: 1                                                                  60,879
Unnamed: 2                                                                  +/-500
Unnamed: 3                                                                     (X)
Unnamed: 4                                                                     (X)
Unnamed: 5                                                                  37,397
Unnamed: 6                                                                +/-1,444
Unnamed: 7                                                                     (X)
Unnamed: 8                                                                     (X)
Unnamed: 9                                                                  56,942
Unnamed: 10                                                               +/-1,151
Unnamed: 11                                                                    (X)
Unna

In [18]:
# Median household income (dollars) per borough — presumably transcribed from
# the ACS "Median household income" row loaded earlier; confirm against the source.
BORO_MEDIAN_INCOME = {
    'Bronx': 37397,
    'Brooklyn': 56942,
    'Manhattan': 85071,
    'Queens': 64509,
    'Staten Island': 79201,
}

# Look each row's income up from its 'boro' label instead of relying on a
# positional list, so the values stay attached to the right borough whatever
# the row order or row count of the training set.
income_by_row = dataset['boro'].map(BORO_MEDIAN_INCOME)
assert income_by_row.notna().all(), "dataset contains a borough with no income entry"

# Keep `incomes` as a plain list of ints, one entry per row.
incomes = income_by_row.astype(int).tolist()
dataset['incomes'] = incomes
dataset[:5]

Unnamed: 0,fhv,boro,bus,"(Bronx,)","(Brooklyn,)","(Manhattan,)","(Queens,)","(Staten Island,)",incomes
0,8971309.0,Bronx,538711,1.0,0.0,0.0,0.0,0.0,37397
1,12832770.0,Brooklyn,615477,0.0,1.0,0.0,0.0,0.0,56942
2,40729540.0,Manhattan,424707,0.0,0.0,1.0,0.0,0.0,85071
3,6830110.0,Queens,358402,0.0,0.0,0.0,1.0,0.0,64509
4,101464.0,Staten Island,128997,0.0,0.0,0.0,0.0,1.0,79201


In [19]:
# Two candidate feature sets:
#   X1 - FHV rides and median income only
#   X2 - the same two columns plus the five one-hot borough indicators
#        (whose column labels are 1-tuples)
X1 = dataset[['fhv', 'incomes']].values
X2 = dataset[
    ['fhv', 'incomes']
    + [(name,) for name in ('Bronx', 'Brooklyn', 'Manhattan', 'Queens', 'Staten Island')]
].values

In [20]:
from sklearn.model_selection import cross_val_score
# Parallel lists filled by train(): each call appends one fitted estimator to
# `models` and its mean cross-validation score to `scores`.
models = []
scores = []

def train(model, X):
    """Score ``model`` by cross-validation, fit it on all rows, and record both.

    The mean 3-fold cross-validated negative mean absolute error of ``model``
    on ``X`` against the module-level target ``Y`` is computed, the model is
    fitted on the full data, and the fitted model and its score are appended
    to the module-level ``models`` and ``scores`` lists.

    Scoring and fitting both run before either list is modified, so an
    exception in either step cannot leave ``models`` and ``scores`` with
    different lengths (which would make index-based lookups across the two
    lists point at the wrong model).

    Parameters
    ----------
    model : estimator exposing ``fit``, usable with ``cross_val_score``.
    X : feature matrix whose rows line up with the module-level ``Y``.

    Returns
    -------
    The mean cross-validation score (negative MAE, so closer to zero is better).
    """
    fold_scores = cross_val_score(
        model, X, Y, cv=3,
        scoring='neg_mean_absolute_error',
    )
    score = fold_scores.mean()

    # Fit on the full data only after scoring succeeded.
    model.fit(X, Y)

    # Append to both lists together so they stay index-aligned.
    models.append(model)
    scores.append(score)
    return score

In [21]:
# Linear regression on rides + income only (X1).
train(LinearRegression(),X1)

-58656.85960888894

In [22]:
# Linear regression on rides + income + borough indicators (X2).
train(LinearRegression(),X2)

-20911.170092623724

In [23]:
from sklearn.linear_model import BayesianRidge
# Bayesian ridge regression on rides + income only (X1).
train(BayesianRidge(), X1)

-131180.95309426452

In [24]:
# Bayesian ridge regression on rides + income + borough indicators (X2).
train(BayesianRidge(), X2)

-131180.953094359

In [25]:
from sklearn.svm import SVR
# Support vector regression (gamma='scale') on rides + income only (X1).
train(SVR(gamma='scale'), X1)

-131998.33333333334

In [26]:
# Support vector regression (gamma='scale') on rides + income + borough indicators (X2).
train(SVR(gamma='scale'), X2)

-131998.33333333334

In [27]:
from sklearn.linear_model import LassoLars
# LassoLars with default settings on rides + income only (X1).
train(LassoLars(), X1)

-58657.83723905733

In [28]:
# LassoLars with default settings on rides + income + borough indicators (X2).
train(LassoLars(),X2)

-20888.164483087643

In [29]:
import numpy as np
# Scores are negative mean absolute errors, so the largest value (closest to
# zero) marks the best model; argmax returns its position in `scores`.
best_index = np.argmax(scores)
best_index

7

In [30]:
# Pick the fitted estimator at the same position in `models` as the best score
# in `scores`.
best_model = models[best_index]
best_model

LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)

In [33]:
import pickle
# Persist the selected model. The `with` block guarantees the file handle is
# flushed and closed even if pickling raises.
with open('model3.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)