In [2]:
import sys
# Appends a machine-specific absolute directory to the module search path.
# NOTE(review): presumably this makes project-local modules importable; the
# path only exists on the original author's machine, so adjust it elsewhere.
sys.path.append('/Users/hongyili/cloudfinal')

In [3]:
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
import geohash
from sklearn.metrics import mean_squared_error

In [4]:
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    """Grid-search ``parameters`` for ``clf`` and return the best estimator.

    Parameters
    ----------
    clf : estimator handed to ``GridSearchCV``.
    parameters : forwarded to ``GridSearchCV`` as ``param_grid``.
    X, y : data passed to ``GridSearchCV.fit``.
    n_jobs : forwarded to ``GridSearchCV`` as ``n_jobs``.
    n_folds : forwarded to ``GridSearchCV`` as ``cv``.
    score_func : forwarded as ``scoring`` when truthy; any falsy value
        selects the grid search's default scoring (``scoring=None``).

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted grid search.

    Side effects
    ------------
    Prints one line starting with ``BEST`` that shows the best parameters,
    the best score and the per-candidate grid scores.
    """
    # A single construction covers both cases: scoring=None is what the
    # search uses when no scoring argument is supplied.
    gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs,
                      scoring=score_func or None)
    gs.fit(X, y)
    # %-formatting produces the same space-separated text as the former
    # print statement while being valid syntax on both Python 2 and 3.
    print("BEST %s %s %s" % (gs.best_params_, gs.best_score_, gs.grid_scores_))
    return gs.best_estimator_

In [5]:
# Column labels for the header-less training CSV (supplied via `names=`).
names = [
    "time_cat", "time_num", "time_cos", "time_sin",
    "day_cat", "day_num", "day_cos", "day_sin",
    "weekend", "geohash", "pickups",
]
dftaxi = pd.read_csv(
    "/Users/hongyili/Desktop/big_Data_final/train_csv/train2.csv",
    header=None,
    names=names,
)
# Report (rows, columns) of the loaded frame.
print(dftaxi.shape)

(218979, 11)


In [6]:
# Preview the first rows of the loaded frame.
print(dftaxi.head())

  time_cat  time_num  time_cos  time_sin    day_cat   day_num   day_cos  \
0     4:00  0.177083  0.442289  0.896873  Wednesday  0.311012 -0.374029   
1    13:00  0.552083 -0.946930 -0.321439   Saturday  0.793155  0.267839   
2     0:00  0.010417  0.997859  0.065403     Friday  0.572917 -0.896873   
3     8:30  0.364583 -0.659346  0.751840     Friday  0.623512 -0.713687   
4    13:00  0.552083 -0.946930 -0.321439  Wednesday  0.364583 -0.659346   

    day_sin  weekend  geohash  pickups  
0  0.927417        0  dr5rege        1  
1 -0.963464        1  dr72mtw        1  
2 -0.442289        0  dr5rt8b        1  
3 -0.700465        0  dr5ruv6        4  
4  0.751840        0  dr5rvj4        3  


In [7]:
# Fixed seed so the 80/20 split (and every number derived from it) is the
# same on each run of the notebook.
SPLIT_SEED = 0
n_rows = dftaxi.shape[0]
itrain, itest = train_test_split(xrange(n_rows), train_size=0.8,
                                 random_state=SPLIT_SEED)
# Boolean row mask: True marks a training row, False a test row. Every row
# starts as training and the positions drawn for the test set are cleared.
mask = np.ones(n_rows, dtype=bool)
mask[itest] = False
mask[:10]

In [8]:
# Feature columns: every column of dftaxi except the target `pickups`.
Xnames = ["time_cat", "time_num", "time_cos", "time_sin", "day_cat",
          "day_num", "day_cos", "day_sin", "weekend", "geohash"]
# Take an explicit copy: new columns are assigned onto X afterwards, and
# assigning into a bare column selection of dftaxi raises pandas'
# SettingWithCopyWarning (the write may or may not reach the intended frame).
X = dftaxi[Xnames].copy()
# Target is log10(pickups + 1).
y = np.log10(dftaxi['pickups'] + 1)

In [9]:
def decodegeo(geo, which):
    """Return one component of the decoded geohash ``geo``.

    ``geo`` is decoded with ``geohash.decode`` only when it is exactly seven
    characters long; element ``which`` of the decoded result is returned.
    For any other length the function returns 0 without decoding.
    """
    if len(geo) != 7:
        return 0
    return geohash.decode(geo)[which]
# Element 0 of each decoded geohash becomes `latitude`, element 1 `longitude`;
# the index is handed to decodegeo as its second positional argument.
X['latitude'] = X['geohash'].apply(decodegeo, args=(0,))
X['longitude'] = X['geohash'].apply(decodegeo, args=(1,))

In [10]:
# Show the first rows of X, now including the latitude/longitude columns.
print(X.head())

  time_cat  time_num  time_cos  time_sin    day_cat   day_num   day_cos  \
0     4:00  0.177083  0.442289  0.896873  Wednesday  0.311012 -0.374029   
1    13:00  0.552083 -0.946930 -0.321439   Saturday  0.793155  0.267839   
2     0:00  0.010417  0.997859  0.065403     Friday  0.572917 -0.896873   
3     8:30  0.364583 -0.659346  0.751840     Friday  0.623512 -0.713687   
4    13:00  0.552083 -0.946930 -0.321439  Wednesday  0.364583 -0.659346   

    day_sin  weekend  geohash   latitude  longitude  
0  0.927417        0  dr5rege  40.713272 -74.010086  
1 -0.963464        1  dr72mtw  40.856094 -73.929062  
2 -0.442289        0  dr5rt8b  40.698166 -73.937302  
3 -0.700465        0  dr5ruv6  40.766830 -73.967514  
4  0.751840        0  dr5rvj4  40.765457 -73.956528  


In [11]:
# One indicator column per distinct time-of-day label and per weekday label.
time_dummies = pd.get_dummies(X['time_cat'])
day_dummies = pd.get_dummies(X['day_cat'])
# Append the indicators, then drop the text columns they replace together
# with the raw geohash string.
X = X.join(time_dummies).join(day_dummies)
X = X.drop(['time_cat', 'day_cat', 'geohash'], axis=1)

In [12]:
# Rows where mask is True form the training set; the rest form the test set.
Xtrain, ytrain = X[mask], y[mask]
Xtest, ytest = X[~mask], y[~mask]
# Training-set dimensions: (rows, columns).
n_samples, n_features = Xtrain.shape
print(Xtrain.head())

   time_num  time_cos  time_sin   day_num   day_cos   day_sin  weekend  \
0  0.177083  0.442289  0.896873  0.311012 -0.374029  0.927417        0   
1  0.552083 -0.946930 -0.321439  0.793155  0.267839 -0.963464        1   
2  0.010417  0.997859  0.065403  0.572917 -0.896873 -0.442289        0   
3  0.364583 -0.659346  0.751840  0.623512 -0.713687 -0.700465        0   
4  0.552083 -0.946930 -0.321439  0.364583 -0.659346  0.751840        0   

    latitude  longitude  0:00    ...      8:30  9:00  9:30  Friday  Monday  \
0  40.713272 -74.010086   0.0    ...       0.0   0.0   0.0     0.0     0.0   
1  40.856094 -73.929062   0.0    ...       0.0   0.0   0.0     0.0     0.0   
2  40.698166 -73.937302   1.0    ...       0.0   0.0   0.0     1.0     0.0   
3  40.766830 -73.967514   0.0    ...       1.0   0.0   0.0     1.0     0.0   
4  40.765457 -73.956528   0.0    ...       0.0   0.0   0.0     0.0     0.0   

   Saturday  Sunday  Thursday  Tuesday  Wednesday  
0       0.0     0.0       0.0     

In [13]:
# Show the first rows of the fully encoded feature frame.
print(X.head())

   time_num  time_cos  time_sin   day_num   day_cos   day_sin  weekend  \
0  0.177083  0.442289  0.896873  0.311012 -0.374029  0.927417        0   
1  0.552083 -0.946930 -0.321439  0.793155  0.267839 -0.963464        1   
2  0.010417  0.997859  0.065403  0.572917 -0.896873 -0.442289        0   
3  0.364583 -0.659346  0.751840  0.623512 -0.713687 -0.700465        0   
4  0.552083 -0.946930 -0.321439  0.364583 -0.659346  0.751840        0   

    latitude  longitude  0:00    ...      8:30  9:00  9:30  Friday  Monday  \
0  40.713272 -74.010086   0.0    ...       0.0   0.0   0.0     0.0     0.0   
1  40.856094 -73.929062   0.0    ...       0.0   0.0   0.0     0.0     0.0   
2  40.698166 -73.937302   1.0    ...       0.0   0.0   0.0     1.0     0.0   
3  40.766830 -73.967514   0.0    ...       1.0   0.0   0.0     1.0     0.0   
4  40.765457 -73.956528   0.0    ...       0.0   0.0   0.0     0.0     0.0   

   Saturday  Sunday  Thursday  Tuesday  Wednesday  
0       0.0     0.0       0.0     

In [33]:
# k-nearest-neighbours regressor constructed with no arguments, i.e. with the
# library's default settings.
knn_estimator = KNeighborsRegressor()


In [21]:
# Column-wise standardisation. Mean and standard deviation come from the
# training rows only and are reused unchanged for the test rows.
Xtrain_mean = Xtrain.mean()
# A column that is constant across the training rows has a standard deviation
# of 0; dividing by it would yield NaN/inf. Such columns are divided by 1
# instead, so they are only centred.
Xtrain_std_dev = Xtrain.std().replace(0, 1)
Xtrain_normalized = (Xtrain - Xtrain_mean) / Xtrain_std_dev
Xtest_normalized = (Xtest - Xtrain_mean) / Xtrain_std_dev

In [36]:
# Candidate neighbour counts for the grid search.
knn_parameters = {"n_neighbors": [1, 2, 5]}
# Cross-validated search on the normalised training data, scored with the
# 'mean_squared_error' scorer.
knn_best = cv_optimize(
    knn_estimator,
    knn_parameters,
    X=Xtrain_normalized,
    y=ytrain,
    score_func='mean_squared_error',
)

BEST {'n_neighbors': 5} -0.0760098565416 [mean: -0.11498, std: 0.00055, params: {'n_neighbors': 1}, mean: -0.09028, std: 0.00061, params: {'n_neighbors': 2}, mean: -0.07601, std: 0.00062, params: {'n_neighbors': 5}]


In [37]:
# Refit the selected estimator on the full normalised training set.
knn_reg = knn_best.fit(X=Xtrain_normalized, y=ytrain)
# Values returned by the regressor's `score` method on each split (the names
# say "accuracy" but the notebook reports these numbers as R^2).
knn_training_accuracy = knn_reg.score(X=Xtrain_normalized, y=ytrain)
knn_test_accuracy = knn_reg.score(X=Xtest_normalized, y=ytest)

In [38]:
# Report both scores to four decimal places.
print("R^2 on training data: %0.4f" % knn_training_accuracy)
print("R^2 on test data:     %0.4f" % knn_test_accuracy)

R^2 on training data: 0.5598
R^2 on test data:     0.3229


In [39]:
# Column 0: predictions for the test rows, column 1: the true targets. Both
# are on the log10(pickups + 1) scale.
test_log_pairs = np.column_stack((knn_reg.predict(Xtest_normalized), ytest))
# Invert the log transform (10**v - 1) and round to whole counts.
np.round(np.power(10, test_log_pairs) - 1, decimals=0).astype(int)

In [15]:
import pickle

In [17]:
# Load a previously saved model from disk. The `with` block guarantees the
# file handle is closed once loading finishes (or fails).
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load model files that come from a trusted source.
with open('/Users/hongyili/Desktop/big_Data_final/finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Hand-entered feature vector.
# NOTE(review): the first seven values equal row 0 of the printed
# Xtrain_normalized frame, but the next two (40.7769271, -73.8739659) look
# like raw latitude/longitude and the 0/1 run looks like unscaled indicator
# columns, while the trailing seven values look standardised again. Confirm
# the intended scaling before passing this to a model fitted on normalised
# features.
# The name `data` is rebound to a one-row DataFrame in a later cell.
data =[-1.301637,0.662949,1.456215,-0.740259,-0.542234,1.364702,-0.675616,40.7769271,-73.8739659,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.409786,-0.390731,-0.429153,-0.432979,-0.402595,-0.393212,2.510074
]

In [None]:
# Score the unpickled model on the normalised training features and targets.
# NOTE(review): assumes loaded_model was fitted on columns matching
# Xtrain_normalized (same features, same order, same scaling) - confirm.
loaded_model.score(Xtrain_normalized, ytrain)

In [1]:
# Show the first rows of the encoded feature frame.
print(X.head())

   time_num  time_cos  time_sin   day_num   day_cos   day_sin  weekend  \
0  0.177083  0.442289  0.896873  0.311012 -0.374029  0.927417        0   
1  0.552083 -0.946930 -0.321439  0.793155  0.267839 -0.963464        1   
2  0.010417  0.997859  0.065403  0.572917 -0.896873 -0.442289        0   
3  0.364583 -0.659346  0.751840  0.623512 -0.713687 -0.700465        0   
4  0.552083 -0.946930 -0.321439  0.364583 -0.659346  0.751840        0   

    latitude  longitude  0:00    ...      8:30  9:00  9:30  Friday  Monday  \
0  40.713272 -74.010086   0.0    ...       0.0   0.0   0.0     0.0     0.0   
1  40.856094 -73.929062   0.0    ...       0.0   0.0   0.0     0.0     0.0   
2  40.698166 -73.937302   1.0    ...       0.0   0.0   0.0     1.0     0.0   
3  40.766830 -73.967514   0.0    ...       1.0   0.0   0.0     1.0     0.0   
4  40.765457 -73.956528   0.0    ...       0.0   0.0   0.0     0.0     0.0   

   Saturday  Sunday  Thursday  Tuesday  Wednesday  
0       0.0     0.0       0.0     

In [2]:
# Print the normalised training frame itself (not just its head).
print(Xtrain_normalized)

        time_num  time_cos  time_sin   day_num   day_cos   day_sin   weekend  \
0      -1.301637  0.662949  1.456215 -0.740259 -0.542234  1.364702 -0.675616   
1       0.030142 -1.273624 -0.314704  0.933328  0.369747 -1.301538  1.480122   
2      -1.893539  1.437415  0.247604  0.168850 -1.285104 -0.566654 -0.675616   
4       0.030142 -1.273624 -0.314704 -0.554305 -0.947620  1.117129 -0.675616   
5      -0.487772 -1.112672  0.960103 -0.626620 -0.800173  1.229411 -0.675616   
6      -0.709735 -0.728068  1.361146 -0.657613 -0.732691  1.271491 -0.675616   
7       1.065971  0.820864 -1.056076  0.086204 -1.364531 -0.371204 -0.675616   
8       1.139958  0.965527 -0.940327 -1.391098  1.003219  1.044687 -0.675616   
9       1.139958  0.965527 -0.940327  1.584167  1.399536 -0.113974  1.480122   
10      0.178118 -1.112672 -0.655033 -0.037766 -1.426602 -0.061520 -0.675616   
11      1.361921  1.296640 -0.490369 -0.368351 -1.247864  0.750574 -0.675616   
12      0.104130 -1.203844 -0.490369 -1.

In [3]:
# Show the first rows of the normalised training frame.
print(Xtrain_normalized.head())

   time_num  time_cos  time_sin   day_num   day_cos   day_sin   weekend  \
0 -1.301637  0.662949  1.456215 -0.740259 -0.542234  1.364702 -0.675616   
1  0.030142 -1.273624 -0.314704  0.933328  0.369747 -1.301538  1.480122   
2 -1.893539  1.437415  0.247604  0.168850 -1.285104 -0.566654 -0.675616   
4  0.030142 -1.273624 -0.314704 -0.554305 -0.947620  1.117129 -0.675616   
5 -0.487772 -1.112672  0.960103 -0.626620 -0.800173  1.229411 -0.675616   

   latitude  longitude     0:00    ...          8:30     9:00      9:30  \
0 -0.861566  -1.124154 -0.15127    ...     -0.153953 -0.15294 -0.152940   
1  2.514268   0.841296 -0.15127    ...     -0.153953 -0.15294 -0.152940   
2 -1.218625   0.641420  6.61065    ...     -0.153953 -0.15294 -0.152940   
4  0.371912   0.175042 -0.15127    ...     -0.153953 -0.15294 -0.152940   
5 -0.179907  -0.924277 -0.15127    ...     -0.153953 -0.15294  6.538483   

     Friday    Monday  Saturday    Sunday  Thursday   Tuesday  Wednesday  
0 -0.409786 -0.390731 -

In [4]:
# Predictions (log10(pickups + 1) scale) for the first training rows.
first_training_rows = Xtrain_normalized.head()
knn_reg.predict(first_training_rows)

In [5]:
# Print the predictions (log10(pickups + 1) scale) for the first training rows.
print(knn_reg.predict(Xtrain_normalized.head()))

[ 0.39645425  0.33624825  0.4316725   0.803139    1.0433577 ]


In [6]:
# Column 0: predictions for the test rows, column 1: the true targets, both
# on the log10(pickups + 1) scale.
predicted_vs_actual_log = np.column_stack((knn_reg.predict(Xtest_normalized), ytest))
# Invert the log transform (10**v - 1), round to whole counts and print.
print(np.round(np.power(10, predicted_vs_actual_log) - 1, decimals=0).astype(int))

[[ 8  4]
 [ 5 15]
 [ 2  2]
 ..., 
 [ 1  1]
 [ 3  2]
 [ 2  1]]


In [7]:
# Same predicted-vs-actual comparison, restricted to the first test rows.
head_pairs_log = np.column_stack((knn_reg.predict(Xtest_normalized.head()), ytest.head()))
# Invert the log transform (10**v - 1), round to whole counts and print.
print(np.round(np.power(10, head_pairs_log) - 1, decimals=0).astype(int))

[[ 8  4]
 [ 5 15]
 [ 2  2]
 [ 5  5]
 [ 5 17]]


In [8]:
# First test-set targets, on the log10(pickups + 1) scale.
print(ytest.head())

3     0.698970
20    1.204120
21    0.477121
26    0.778151
29    1.255273
Name: pickups, dtype: float64


In [11]:
# Keep the first normalised test row as a one-row DataFrame. This rebinds
# `data`, which an earlier cell assigned to a plain list.
data = Xtest_normalized.iloc[:1]

In [13]:
# Show the first normalised test row.
print(Xtest_normalized.head(1))

   time_num  time_cos  time_sin   day_num   day_cos   day_sin   weekend  \
3 -0.635747 -0.872731  1.245398  0.344473 -1.024829 -0.930695 -0.675616   

   latitude  longitude     0:00    ...          8:30     9:00     9:30  \
3  0.404372   -0.09146 -0.15127    ...      6.495447 -0.15294 -0.15294   

     Friday    Monday  Saturday    Sunday  Thursday   Tuesday  Wednesday  
3  2.440284 -0.390731 -0.429153 -0.432979 -0.402595 -0.393212  -0.398392  

[1 rows x 64 columns]


In [15]:
# Show the first normalised test row.
print(Xtest_normalized.head(1))

   time_num  time_cos  time_sin   day_num   day_cos   day_sin   weekend  \
3 -0.635747 -0.872731  1.245398  0.344473 -1.024829 -0.930695 -0.675616   

   latitude  longitude     0:00    ...          8:30     9:00     9:30  \
3  0.404372   -0.09146 -0.15127    ...      6.495447 -0.15294 -0.15294   

     Friday    Monday  Saturday    Sunday  Thursday   Tuesday  Wednesday  
3  2.440284 -0.390731 -0.429153 -0.432979 -0.402595 -0.393212  -0.398392  

[1 rows x 64 columns]
