# Predicting demand of Uber Service
## Project 2: Classification Green and Uber Cabs
## Jose Oros, Annamali Kathir

In [1]:
#import the libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn import neighbors
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, r2_score, mean_absolute_error, mean_squared_error

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, classification_report, fbeta_score
from imblearn.over_sampling import RandomOverSampler


%matplotlib inline

Read cab data

In [2]:
green_trip = pd.read_csv('../Data/green_trips.csv')
#yellow_15q2= pd.read_csv('../Data/yellow_trips_2014Q2.csv')
#yellow_14q3= pd.read_csv('../Data/yellow_trips_2014Q3.csv')
#yellow_15q1= pd.read_csv('../Data/yellow_trips_2015Q1.csv')
#yellow_15q2= pd.read_csv('../Data/yellow_trips_2015Q2.csv')

Take a sample

In [3]:
green_trip_sub = green_trip.sample(50000)
green_trip_sub = green_trip_sub[(green_trip_sub.pickup_longitude != 0) & (green_trip_sub.pickup_latitude != 0)]
green_trip_sub = green_trip_sub.drop(['dropoff_datetime','dropoff_longitude','passenger_count','trip_distance', 'total_amount', 'dropoff_latitude'],axis=1)

In [4]:
green_trip_sub.columns

Index(['pickup_datetime', 'pickup_longitude', 'pickup_latitude'], dtype='object')

Map the data

In [5]:
zones_df = pd.read_csv('../Data/zones.csv')
geo_df = pd.read_csv('../Data/geographic.csv')

In [6]:
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [7]:
#creating a polygon for each of the NTA CODES in a dictionary, having the NTA code as the key and the polygon as the value
geo_new = {}
for col in geo_df:
    count = 0
    long = []
    lat = []
    for coord in geo_df[col].dropna():
        if count % 2 == 0:
            long.append(coord)
        else:
            lat.append(coord)
        count += 1
    
    poly = Polygon(list(zip(lat,long)))
    geo_new[col] = poly
    

In [8]:
#Function to check coordinates and output what NTA code they belong to
def check_coords(point, geo_new):
    for key,area in geo_new.items():
        if area.contains(Point(point)):
            return key
    

In [9]:
#run the function
green_trip_sub['zipped'] = list(zip(green_trip_sub['pickup_latitude'],green_trip_sub['pickup_longitude']))
green_trip_sub['nta_code'] = [check_coords(x,geo_new) for x in green_trip_sub['zipped']]

In [35]:
green_trip_sub.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,zipped,nta_code,time,hour,day_week,day_month,day_year,month,date_only,week_year,trip
0,2015-05-04 08:49:09,-73.961197,40.716579,"(40.7165794373, -73.9611968994)",BK73,08:49:09,8,0,4,124,5,2015-05-04,19,0
1,2015-02-11 15:21:41,-73.953438,40.82259,"(40.8225898743, -73.9534378052)",MN04,15:21:41,15,2,11,42,2,2015-02-11,7,0
2,2015-06-18 18:34:44,-73.912041,40.775227,"(40.775226593, -73.9120407104)",QN72,18:34:44,18,3,18,169,6,2015-06-18,25,0
3,2014-08-03 00:43:22,-73.885834,40.842884,"(40.8428840637, -73.8858337402)",BX17,00:43:22,0,6,3,215,8,2014-08-03,31,0
4,2015-01-15 13:36:28,-73.953346,40.787098,"(40.7870979309, -73.9533462524)",MN33,13:36:28,13,3,15,15,1,2015-01-15,3,0


In [36]:
green_trip_sub.dropna(inplace=True)
green_trip_sub.drop(['zipped','pickup_longitude','pickup_latitude'], axis=1, inplace=True)
green_trip_sub = green_trip_sub.reset_index(drop=True)

Now, include time information features

In [38]:
green_trip_sub['pickup_datetime'] = pd.to_datetime(green_trip_sub['pickup_datetime'])

In [39]:
green_trip_sub['time'] = green_trip_sub.pickup_datetime.dt.time
green_trip_sub['hour'] = green_trip_sub.pickup_datetime.dt.hour
green_trip_sub['day_week'] = green_trip_sub.pickup_datetime.dt.dayofweek
green_trip_sub['day_month'] = green_trip_sub.pickup_datetime.dt.day
green_trip_sub['day_year'] = green_trip_sub.pickup_datetime.dt.dayofyear
green_trip_sub['month'] = green_trip_sub.pickup_datetime.dt.month
green_trip_sub['date_only'] = green_trip_sub.pickup_datetime.dt.date
green_trip_sub['week_year'] = green_trip_sub.pickup_datetime.dt.weekofyear

# Classification Model

In [40]:
#import uber trips

In [17]:
uber_trip = pd.read_csv('uber_trip.csv')
uber_trip_sub = uber_trip.sample(50000)

In [41]:
#all data
uber_trip_sub['pickup_datetime'] = pd.to_datetime(uber_trip_sub['pickup_datetime'])
uber_trip_sub['time'] = uber_trip_sub.pickup_datetime.dt.time
uber_trip_sub['hour'] =uber_trip_sub.pickup_datetime.dt.hour
uber_trip_sub['day_week'] = uber_trip_sub.pickup_datetime.dt.dayofweek
uber_trip_sub['day_month'] = uber_trip_sub.pickup_datetime.dt.day
uber_trip_sub['day_year'] = uber_trip_sub.pickup_datetime.dt.dayofyear
uber_trip_sub['month'] = uber_trip_sub.pickup_datetime.dt.month
uber_trip_sub['date_only'] = uber_trip_sub.pickup_datetime.dt.date
uber_trip_sub['week_year'] = uber_trip_sub.pickup_datetime.dt.weekofyear

In [42]:
uber_trip_sub.head()

Unnamed: 0,pickup_datetime,nta_code,time,hour,day_week,day_month,day_year,month,date_only,trip,week_year
0,2015-03-09 15:17:00,MN24,15:17:00,15,0,9,68,3,2015-03-09,1,11
1,2015-06-13 12:09:00,BK21,12:09:00,12,5,13,164,6,2015-06-13,1,24
2,2015-03-24 23:17:00,MN34,23:17:00,23,1,24,83,3,2015-03-24,1,13
3,2015-04-25 14:40:00,BK09,14:40:00,14,5,25,115,4,2015-04-25,1,17
4,2015-05-15 16:22:00,QN01,16:22:00,16,4,15,135,5,2015-05-15,1,20


In [43]:
uber_trip_sub.shape

(50000, 11)

In [45]:
#uber_trip_sub.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1, inplace=True)

In [46]:
uber_trip_sub.reset_index(drop=True,inplace=True)

In [166]:
uber_trip_sub.head()

Unnamed: 0,pickup_datetime,nta_code,time,hour,day_week,day_month,day_year,month,date_only,trip,week_year
0,2015-03-09 15:17:00,MN24,15:17:00,15,0,9,68,3,2015-03-09,1,11
1,2015-06-13 12:09:00,BK21,12:09:00,12,5,13,164,6,2015-06-13,1,24
2,2015-03-24 23:17:00,MN34,23:17:00,23,1,24,83,3,2015-03-24,1,13
3,2015-04-25 14:40:00,BK09,14:40:00,14,5,25,115,4,2015-04-25,1,17
4,2015-05-15 16:22:00,QN01,16:22:00,16,4,15,135,5,2015-05-15,1,20


Now we label the data

In [167]:
uber_trip_sub['trip'] = 1
green_trip_sub['trip'] = 0

In [168]:
#green_trip_sub.drop(['pickup_longitude', 'pickup_latitude'], axis=1, inplace=True)

In [169]:
#Concatenate data sets
all_cabs = pd.concat([uber_trip_sub, green_trip_sub])

### Create training and testing data

In [170]:
all_cabs.head()

Unnamed: 0,date_only,day_month,day_week,day_year,hour,month,nta_code,pickup_datetime,time,trip,week_year
0,2015-03-09,9,0,68,15,3,MN24,2015-03-09 15:17:00,15:17:00,1,11
1,2015-06-13,13,5,164,12,6,BK21,2015-06-13 12:09:00,12:09:00,1,24
2,2015-03-24,24,1,83,23,3,MN34,2015-03-24 23:17:00,23:17:00,1,13
3,2015-04-25,25,5,115,14,4,BK09,2015-04-25 14:40:00,14:40:00,1,17
4,2015-05-15,15,4,135,16,5,QN01,2015-05-15 16:22:00,16:22:00,1,20


In [171]:
nta_dummies = pd.get_dummies(all_cabs['nta_code'])
all_cabs_cla = pd.concat([all_cabs,nta_dummies], axis=1)
all_cabs_cla = all_cabs_cla.drop(['nta_code'], axis=1)

In [172]:
len(all_cabs_cla[all_cabs_cla['trip']==0])

48716

In [173]:
all_cabs_cla.columns

Index(['date_only', 'day_month', 'day_week', 'day_year', 'hour', 'month',
       'pickup_datetime', 'time', 'trip', 'week_year',
       ...
       'SI07', 'SI08', 'SI12', 'SI14', 'SI24', 'SI25', 'SI28', 'SI35', 'SI36',
       'SI37'],
      dtype='object', length=193)

In [174]:
#delete datetime objects
all_cabs_cla.drop(['pickup_datetime','time', 'date_only'], axis=1, inplace=True)
all_cabs_cla.dropna(inplace=True)

In [175]:
all_cabs_cla.shape

(98716, 190)

In [176]:
all_cabs_cla.drop(['day_year','month', 'week_year'], axis=1, inplace=True)

In [177]:
#all_cabs_cla.drop(['day_month','day_week', 'hour'], axis=1, inplace=True)

In [178]:
x = all_cabs_cla.drop(['trip'], axis=1)
y = all_cabs_cla.trip

In [188]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.20)

## Classifiers

In [189]:
classifiers = {}
classifier_parameters = {}

##### Random Forest classifier
classifiers['Random Forest'] = Pipeline([('clf', RandomForestClassifier())])
classifier_parameters['Random Forest'] = {'clf__max_depth':(1, 3, 9, 12, 15)}

#AdaBoost classifier
classifiers['AdaBoost'] = Pipeline([('clf', AdaBoostClassifier())])
classifier_parameters['AdaBoost'] = {'clf__n_estimators':(30, 40, 50, 60, 70)}

##### SVM
#classifiers['SVM'] = Pipeline([('clf', SVC())])
#classifier_parameters['SVM'] = {'clf__C':(0.1, 1, 10), 'clf__kernel': ('poly', 'rbf'), 'clf__gamma': (0.1, 0.5, 1)}

#### Logistic Regression with Lasso
classifiers['LogReg'] = Pipeline([('clf', LogisticRegression(penalty='l1'))])
classifier_parameters['LogReg'] = {'clf__C':(0.1, 1, 10)}

#### kNN
classifiers['kNN'] = Pipeline([('clf', neighbors.KNeighborsClassifier())])
classifier_parameters['kNN'] = {'clf__n_neighbors':(3,5,7), 'clf__weights': ('uniform', 'distance')}

### Train Algorithm - Cross Validation

In [190]:
# Create a label encoder to transform output labels.
le = LabelEncoder() 

# Split features and class into two dataframes.
X_training = x_train.values
y_training = le.fit_transform(y_train.values)

# Initialize scores dictionary
scores = pd.DataFrame(columns=['fold', 'algorithm', 'parameters', 'accuracy', 'precision', 'recall', 'fbeta_score', 'f1_score'])

# 10 fold CV
kf = KFold(n_splits=10, shuffle=True)

# Outer Cross Validation
fold = 0
for train_index, test_index in kf.split(X_training):
    X_train, X_test = X_training[train_index], X_training[test_index]
    Y_train, Y_test = y_training[train_index], y_training[test_index]
    
    fold = fold + 1

    # Inner CV
    for name, clf in classifiers.items():
        print('Fold ' + str(fold) + ': ' + name)
        if name in classifier_parameters:
            gs = GridSearchCV(estimator=clf, param_grid=classifier_parameters[name])
            gs.fit(X_train, Y_train)
            y_pred = gs.predict(X_test)
            best_params = str(gs.best_params_)
        else:
            clf.fit(X_train, Y_train)
            y_pred = clf.predict(Y_test)
            best_params = 'default'
        
        # collect the scores for printing out later
        scores = scores.append(pd.DataFrame(data={'fold':[fold],
                                                  'algorithm':[name], 
                                                  'parameters':[best_params], 
                                                  'accuracy':[accuracy_score(Y_test, y_pred)], 
                                                  'precision':[precision_score(Y_test, y_pred, average='weighted')],
                                                  'recall':[recall_score(Y_test, y_pred, average='weighted')],
                                                  'fbeta_score':[fbeta_score(Y_test, y_pred, beta=1)],
                                                  'f1_score':[f1_score(Y_test, y_pred, average='weighted')]}), 
                               ignore_index=True)
        

Fold 1: kNN
Fold 1: Random Forest
Fold 1: LogReg
Fold 1: AdaBoost
Fold 2: kNN
Fold 2: Random Forest
Fold 2: LogReg
Fold 2: AdaBoost
Fold 3: kNN
Fold 3: Random Forest
Fold 3: LogReg
Fold 3: AdaBoost
Fold 4: kNN
Fold 4: Random Forest
Fold 4: LogReg
Fold 4: AdaBoost
Fold 5: kNN
Fold 5: Random Forest
Fold 5: LogReg
Fold 5: AdaBoost
Fold 6: kNN
Fold 6: Random Forest
Fold 6: LogReg
Fold 6: AdaBoost
Fold 7: kNN
Fold 7: Random Forest
Fold 7: LogReg
Fold 7: AdaBoost
Fold 8: kNN
Fold 8: Random Forest
Fold 8: LogReg
Fold 8: AdaBoost
Fold 9: kNN
Fold 9: Random Forest
Fold 9: LogReg
Fold 9: AdaBoost
Fold 10: kNN
Fold 10: Random Forest
Fold 10: LogReg
Fold 10: AdaBoost


In [191]:
scores[['algorithm', 'accuracy', 'precision', 'recall', 'f1_score', 'fbeta_score']].groupby(['algorithm']).mean()

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,fbeta_score
algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AdaBoost,0.872056,0.891459,0.872056,0.870635,0.857913
LogReg,0.878045,0.892859,0.878045,0.877038,0.86668
Random Forest,0.850035,0.881719,0.850035,0.847118,0.827099
kNN,0.781733,0.781782,0.781733,0.781725,0.784545


In [192]:
scores[['algorithm', 'f1_score', 'parameters']][scores['algorithm']=='Random Forest']

Unnamed: 0,algorithm,f1_score,parameters
1,Random Forest,0.840427,{'clf__max_depth': 12}
5,Random Forest,0.855153,{'clf__max_depth': 15}
9,Random Forest,0.852105,{'clf__max_depth': 15}
13,Random Forest,0.841352,{'clf__max_depth': 12}
17,Random Forest,0.864996,{'clf__max_depth': 12}
21,Random Forest,0.857036,{'clf__max_depth': 12}
25,Random Forest,0.8443,{'clf__max_depth': 15}
29,Random Forest,0.829346,{'clf__max_depth': 12}
33,Random Forest,0.842266,{'clf__max_depth': 15}
37,Random Forest,0.844194,{'clf__max_depth': 12}


### Test in test data set

In [184]:
le = LabelEncoder() 

# Split features and class into two dataframes.
X_training = x_train.values
y_training = le.fit_transform(y_train.values)

clf = RandomForestClassifier(max_depth=15)
clf.fit(X_training, y_training)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [186]:
y_pred = clf.predict(x_test)
y_test_le = le.fit_transform(y_test.values)

print(fbeta_score(y_test_le, y_pred, beta=1))

0.83803301238


Let us look at how the top locations of each type of cab

In [158]:
green_trip_sub.groupby('nta_code').count().sort_values(['trip'], ascending=False).head(10)

Unnamed: 0_level_0,pickup_datetime,time,hour,day_week,day_month,day_year,month,date_only,week_year,trip
nta_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BK73,3601,3601,3601,3601,3601,3601,3601,3601,3601,3601
BK38,2437,2437,2437,2437,2437,2437,2437,2437,2437,2437
MN34,2433,2433,2433,2433,2433,2433,2433,2433,2433,2433
QN70,2416,2416,2416,2416,2416,2416,2416,2416,2416,2416
MN11,2319,2319,2319,2319,2319,2319,2319,2319,2319,2319
MN09,2127,2127,2127,2127,2127,2127,2127,2127,2127,2127
MN33,2049,2049,2049,2049,2049,2049,2049,2049,2049,2049
QN29,1931,1931,1931,1931,1931,1931,1931,1931,1931,1931
BK37,1749,1749,1749,1749,1749,1749,1749,1749,1749,1749
MN03,1743,1743,1743,1743,1743,1743,1743,1743,1743,1743


In [159]:
uber_trip_sub.groupby('nta_code').count().sort_values(['trip'], ascending=False).head(10)

Unnamed: 0_level_0,pickup_datetime,time,hour,day_week,day_month,day_year,month,date_only,trip,week_year
nta_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MN17,6404,6404,6404,6404,6404,6404,6404,6404,6404,6404
MN13,4784,4784,4784,4784,4784,4784,4784,4784,4784,4784
MN24,3675,3675,3675,3675,3675,3675,3675,3675,3675,3675
MN23,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660
QN98,1898,1898,1898,1898,1898,1898,1898,1898,1898,1898
MN40,1878,1878,1878,1878,1878,1878,1878,1878,1878,1878
MN22,1774,1774,1774,1774,1774,1774,1774,1774,1774,1774
MN25,1600,1600,1600,1600,1600,1600,1600,1600,1600,1600
MN15,1447,1447,1447,1447,1447,1447,1447,1447,1447,1447
MN20,1429,1429,1429,1429,1429,1429,1429,1429,1429,1429
