In [2]:
import sys
# Add a local folder to the module search path so modules stored there can be imported.
# NOTE(review): absolute, machine-specific path — this only resolves on the original author's machine.
sys.path.append('/Users/hongyili/Untitled Folder')

In [3]:
import numpy as np
import pandas as pd
# Display settings: 500-character output width, up to 100 columns, HTML table rendering in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
# NOTE(review): sklearn.grid_search and sklearn.cross_validation are legacy module names; newer
# scikit-learn releases expose these helpers from sklearn.model_selection instead.
# NOTE(review): train_test_split, Pipeline and cross_val_score are not referenced in the visible cells.
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
# geohash.decode() is used below to turn geohash strings into coordinates.
import geohash
from sklearn.metrics import mean_squared_error
import random
import math
import datetime

In [4]:
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, verbose=0):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func, verbose=verbose)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds, verbose=verbose)
    gs.fit(X, y)
    print "BEST", gs.best_params_, gs.best_score_, gs.grid_scores_, gs.scorer_
    print "Best score: ", gs.best_score_
    best = gs.best_estimator_
    return best

In [5]:
# Column layout applied to the header-less pickup CSV files.
names = [
    "year", "month", "day",
    "time_cat", "time_num", "time_cos", "time_sin",
    "day_cat", "day_num", "day_cos", "day_sin",
    "weekend", "geohash", "pickups",
]
dftrain = pd.read_csv(
    "/Users/hongyili/Desktop/big_Data_final/train_csv/train.csv",
    header=None,
    names=names,
)
print(dftrain.shape)

(868979, 14)


In [6]:
# Validation split: same header-less layout as the training file.
dfvalid = pd.read_csv(
    "/Users/hongyili/Desktop/big_Data_final/valid_csv/valid.csv",
    header=None,
    names=names,
)
print(dfvalid.shape)

(770979, 14)


In [7]:
# Load the daily weather table and reshape it into merge-ready features keyed by
# (year, month, day).
weather = pd.read_csv("/Users/hongyili/Desktop/big_Data_final/nyc-weather-data.csv")

# Readings at or below -9999 are treated as "no value" and zeroed.
# .loc replaces the deprecated .ix indexer; the selection (boolean mask + column
# label) is the same.
# NOTE(review): PRCP / TMAX / TMIN are not checked for the same sentinel — confirm
# that they never carry it.
weather.loc[weather['SNWD'] <= -9999, 'SNWD'] = 0
weather.loc[weather['SNOW'] <= -9999, 'SNOW'] = 0
weather.loc[weather['AWND'] <= -9999, 'AWND'] = 0

# DATE is handled as a YYYYMMDD number.  Vectorised integer arithmetic splits it
# into calendar parts and keeps the merge keys integral (apply(math.floor) yields
# floats under Python 2).
weather['year'] = weather['DATE'] // 10000
weather['month'] = (weather['DATE'] % 10000) // 100
weather['day'] = weather['DATE'] % 100

# .copy() makes the selection an independent frame, so the conversions below do not
# write into a slice of the raw table (avoids SettingWithCopyWarning).
weather = weather[['year','month','day','PRCP','SNWD','SNOW','TMAX','TMIN','AWND']].copy()

# Rescale raw readings.  Presumably the file stores tenths of a unit and AWND in
# m/s (x3.6 -> km/h) — TODO confirm against the data source's documentation.
weather['PRCP'] = weather['PRCP'] / 10.
weather['TMAX'] = weather['TMAX'] / 10.
weather['TMIN'] = weather['TMIN'] / 10.
weather['AWND'] = weather['AWND'] / 10. * 3.6

weather.columns = ['year','month','day','precipitation','snow_depth','snowfall','max_temp','min_temp','avg_wind']
weather.head()

In [8]:
def get_yearday(df):
    """Return the position of a date within its year as a fraction in [0, 1).

    df -- mapping-like row (e.g. a DataFrame row) with 'year', 'month' and 'day'
          entries; values are coerced to int, so float or numpy scalars are accepted.

    1 January maps to 0.0.  The denominator is the real length of the year, so
    31 December of a leap year stays below 1.0 instead of landing exactly on 1.0
    (which would be indistinguishable from 1 January once passed through sin/cos).
    For non-leap years the result is unchanged: (day_of_year - 1) / 365.

    Raises ValueError (from datetime.date) for an invalid calendar date.
    """
    year, month, day = int(df['year']), int(df['month']), int(df['day'])
    day_of_year = datetime.date(year, month, day).timetuple().tm_yday
    # Day-of-year of 31 December == number of days in that year (365 or 366).
    days_in_year = datetime.date(year, 12, 31).timetuple().tm_yday
    return (day_of_year - 1) / float(days_in_year)
def decodegeo(geo, which):
    """Return one coordinate of a geohash, or 0 when the hash has fewer than 6 characters.

    geo   -- geohash string
    which -- index into the tuple returned by ``geohash.decode``; the callers in
             this notebook pass 0 for latitude and 1 for longitude
    """
    if len(geo) < 6:
        # Hashes shorter than 6 characters are not decoded.
        return 0
    return geohash.decode(geo)[which]
def further_data_prep(df):
    """Join weather onto the pickup rows and add cyclical-date and location features.

    df -- frame with at least 'year', 'month', 'day' and 'geohash' columns.

    Returns a new frame (the merge creates it; the input is not modified) holding
    the weather columns plus year_num, month_num, year_sin, year_cos, month_sin,
    month_cos, latitude and longitude.

    Relies on the module-level ``weather`` frame and on ``get_yearday`` /
    ``decodegeo`` defined in this notebook.
    """
    df = pd.merge(df, weather, how='left', on=['year','month','day'])

    df['year_num'] = df.apply(get_yearday, axis=1)
    # NOTE(review): day 31 yields month_num == 1.0, which the sin/cos encoding
    # below cannot tell apart from day 1 — left as is so feature values stay
    # unchanged; confirm whether that is intended.
    df['month_num'] = (df['day']-1)/30.

    year_angle = df['year_num'] * 2 * math.pi
    month_angle = df['month_num'] * 2 * math.pi
    df['year_sin'] = year_angle.apply(math.sin)
    df['year_cos'] = year_angle.apply(math.cos)
    df['month_sin'] = month_angle.apply(math.sin)
    df['month_cos'] = month_angle.apply(math.cos)

    # Decode each distinct geohash once and look the result up per row, instead of
    # calling the decoder twice for every row.  The resulting values are the same.
    distinct_hashes = df['geohash'].unique()
    latitude_of = dict((geo, decodegeo(geo, 0)) for geo in distinct_hashes)
    longitude_of = dict((geo, decodegeo(geo, 1)) for geo in distinct_hashes)
    df['latitude'] = df['geohash'].map(latitude_of)
    df['longitude'] = df['geohash'].map(longitude_of)
    return df

In [9]:
# Enrich both splits with weather, cyclical-date and latitude/longitude features.
# NOTE(review): each frame is overwritten by its enriched version, so running this cell a
# second time merges weather onto an already-enriched frame — re-load the CSVs first.
dftrain = further_data_prep(dftrain)
dfvalid = further_data_prep(dfvalid)

In [10]:
# Preview the first rows of the enriched validation frame.
dfvalid.head()

In [11]:
# Plain-text version of the same preview.
print dfvalid.head()

   year  month  day time_cat  time_num  time_cos  time_sin    day_cat   day_num   day_cos   day_sin  weekend  geohash  pickups  precipitation  snow_depth  snowfall  max_temp  min_temp  avg_wind  year_num  month_num  year_sin  year_cos  month_sin  month_cos   latitude  longitude
0  2015      4   28    14:30  0.614583 -0.751840 -0.659346    Tuesday  0.230655  0.121251  0.992622        0  dr5ru54        1            0.0           0         0      21.7      10.0     12.60  0.320548   0.900000  0.903356 -0.428892  -0.587785   0.809017  40.754471 -74.000473
1  2015      4    5    20:30  0.864583  0.659346 -0.751840     Sunday  0.980655  0.992622 -0.121251        1  dr5ru34        4            0.0           0         0      16.1       5.6      7.92  0.257534   0.133333  0.998880 -0.047321   0.743145   0.669131  40.743484 -73.989487
2  2015      4   23    09:30  0.406250 -0.831470  0.555570   Thursday  0.486607 -0.996461  0.084051        0  dr5rusy       14            0.0           0         0

In [12]:
# Feature columns fed to the model.
Xnames = [
    'month', 'day', 'latitude', 'longitude',
    'year_num', 'year_cos', 'year_sin',
    'month_cos', 'month_sin',
    'day_num', 'day_cos', 'day_sin',
    'time_num', 'time_cos', 'time_sin',
    'max_temp', 'min_temp', 'avg_wind',
]
Xtrain, Xvalid = dftrain[Xnames], dfvalid[Xnames]

# Targets are modelled as log10(pickups + 1).
ytrain = np.log10(1 + dftrain['pickups'])
yvalid = np.log10(1 + dfvalid['pickups'])

In [13]:
# Cap the training set at max_samples rows by sampling without replacement.
max_samples = 2000000
if Xtrain.shape[0] > max_samples:
    # A dedicated, seeded generator gives the same subset on every run without
    # touching the global random state.  list(...) hands random.sample a real sequence.
    rows = random.Random(0).sample(list(Xtrain.index), max_samples)
    # .loc (label-based) replaces the deprecated .ix; rows holds index labels.
    Xtrain = Xtrain.loc[rows]
    ytrain = ytrain.loc[rows]
print(Xtrain.shape)

(868979, 18)


In [15]:
# Grow the forest one tree at a time: with warm_start=True each fit() call keeps the
# existing trees and only adds the missing ones.  Train/validation quality is recorded
# after every added tree (previously the metrics were recomputed and overwritten on
# each pass, so only the last values survived).
# random_state pins the forest's randomness so a re-run gives the same model.
reg = RandomForestRegressor(n_estimators=1, max_depth=30, n_jobs=-1, warm_start=True, random_state=0)
fit_history = []
for n in range(1,51):
    reg.set_params(n_estimators=n)
    reg.fit(Xtrain,ytrain)
    # reg.score() is the estimator's default score for a regressor (R^2), despite the
    # "accuracy" in the variable names.
    training_accuracy = reg.score(Xtrain, ytrain)
    valid_accuracy = reg.score(Xvalid, yvalid)
    # Conventional (y_true, y_pred) argument order; the value is unchanged because the
    # squared error is symmetric.
    rmsetrain = np.sqrt(mean_squared_error(ytrain, reg.predict(Xtrain)))
    rmsevalid = np.sqrt(mean_squared_error(yvalid, reg.predict(Xvalid)))
    fit_history.append((n, training_accuracy, valid_accuracy, rmsetrain, rmsevalid))
# One row per forest size, for inspection or plotting.
fit_history = pd.DataFrame(fit_history,
                           columns=['n_estimators', 'train_r2', 'valid_r2', 'train_rmse', 'valid_rmse'])

In [16]:
# Side-by-side table for the first 20 validation rows: column 0 is the prediction, column 1 the
# actual value, both mapped back from log10(pickups + 1) to rounded integer pickup counts.
pd.DataFrame(np.round(np.power(10,np.column_stack((reg.predict(Xvalid),yvalid))) - 1,decimals=0).astype(int)).head(20)

In [17]:
# Validation RMSE of the fitted forest, measured on the log10(pickups + 1) scale.
# NOTE(review): the value is assigned but not displayed in this cell.
rmse = np.sqrt(mean_squared_error(reg.predict(Xvalid),yvalid))

In [18]:
import operator  # kept so the name stays available to any later cell
# Map each feature name to its importance in the fitted forest, then list the
# (name, importance) pairs from most to least important.
dict_feat_imp = {feature: weight
                 for feature, weight in zip(Xtrain.columns.values, reg.feature_importances_)}
sorted_features = sorted(dict_feat_imp.items(), key=lambda item: item[1], reverse=True)
sorted_features

In [19]:
# Plain-text dump of the (feature, importance) pairs, most important first.
print sorted_features

[('longitude', 0.35018536691969482), ('latitude', 0.3220133537075871), ('time_num', 0.065631530472179261), ('day_sin', 0.031831495796949653), ('time_cos', 0.030141845295291292), ('day_num', 0.029319608386783661), ('time_sin', 0.024142720688378517), ('day_cos', 0.023740400079713262), ('avg_wind', 0.018060849844586214), ('month_sin', 0.014656824678866634), ('year_sin', 0.014575039636754265), ('month_cos', 0.013946798218800929), ('min_temp', 0.013172274920847066), ('day', 0.013106469328412243), ('max_temp', 0.012822307514109286), ('year_cos', 0.010454026453752749), ('year_num', 0.010182849968188168), ('month', 0.0020162380891047644)]


In [20]:
# Test split: header-less CSV read with the same column names as the train/valid files.
# NOTE(review): absolute, machine-specific path.
dftest=pd.read_csv("/Users/hongyili/Desktop/big_Data_final/test_csv/test.csv", header=None, names = names)


In [21]:
# (rows, columns) of the raw test split.
print dftest.shape

(2414160, 14)


In [22]:
# Enrich the test split with the same weather, cyclical-date and location features.
# NOTE(review): overwrites dftest, so re-running this cell would enrich an already-enriched frame.
dftest = further_data_prep(dftest)

In [23]:
# Preview the first rows of the enriched test frame.
dftest.head()

In [24]:
# Plain-text version of the same preview.
print dftest.head()

   year  month  day time_cat  time_num  time_cos  time_sin   day_cat   day_num   day_cos   day_sin  weekend  geohash  pickups  precipitation  snow_depth  snowfall  max_temp  min_temp  avg_wind  year_num  month_num  year_sin  year_cos     month_sin  month_cos   latitude  longitude
0  2015      5   31    06:00  0.260417 -0.065403  0.997859    Sunday  0.894345  0.787627 -0.616153        1  dr5rsp7        5           37.1           0         0      30.6      13.9      9.00  0.410959   1.000000  0.530730 -0.847541 -2.449294e-16   1.000000  40.733871 -73.999100
1  2015      5   16    00:30  0.031250  0.980785  0.195090  Saturday  0.718750 -0.195090 -0.980785        1  dr5ryj1        1            7.6           0         0      23.9      13.9      3.24  0.369863   0.500000  0.729558 -0.683919  1.224647e-16  -1.000000  40.765457 -73.913956
2  2015      5   16    21:00  0.885417  0.751840 -0.659346  Saturday  0.840774  0.539926 -0.841713        1  dr5rury        4            7.6           0     

In [25]:
# Distinct time/weather feature rows for 1 May: take that day's test rows, remove the
# identifier, label and location columns, then de-duplicate what is left.
time_data = (
    dftest[(dftest['month'] == 5) & (dftest['day'] == 1)]
    .drop(['geohash', 'pickups', 'year', 'time_cat', 'day_cat', 'latitude', 'longitude'], axis=1)
    .drop_duplicates()
)
print(time_data.shape)
time_data.head()

(48, 21)


In [26]:
# Every feature column except the two coordinates.
time_cols = list(col for col in Xtrain.columns.values if col not in ('latitude', 'longitude'))

# Distinct (latitude, longitude) pairs seen in the training features ...
loc_data = Xtrain.drop(time_cols, axis=1).drop_duplicates()

# ... restricted to an open bounding box (strict inequalities on all four sides).
loc_data = loc_data[
    loc_data['latitude'].gt(40.5) & loc_data['latitude'].lt(41.1)
    & loc_data['longitude'].gt(-74.1) & loc_data['longitude'].lt(-73.6)
]

print(loc_data.shape)
loc_data.head()

(17059, 2)


In [27]:
# Cross join: pair every time-slot row with every location through a constant helper key.
# assign() adds the key to temporary copies, so time_data / loc_data are not modified
# (no stray 'key' column left behind, and no write into the filtered loc_data slice).
result = pd.merge(time_data.assign(key=1), loc_data.assign(key=1), on='key').drop(['key'], axis=1)
print(result.shape[0])
# Keep exactly the model's feature columns, in training order.  The explicit copy makes
# result an independent frame, so columns can be added to it without SettingWithCopyWarning.
result = result[Xnames].copy()
print(result.head())

818832
   month  day   latitude  longitude  year_num  year_cos  year_sin  month_cos  month_sin  day_num   day_cos   day_sin  time_num  time_cos  time_sin  max_temp  min_temp  avg_wind
0      5    1  40.754471 -74.000473  0.328767 -0.474951  0.880012        1.0        0.0  0.61756 -0.739379 -0.673289  0.322917 -0.442289  0.896873      17.2       9.4      8.28
1      5    1  40.766830 -73.978500  0.328767 -0.474951  0.880012        1.0        0.0  0.61756 -0.739379 -0.673289  0.322917 -0.442289  0.896873      17.2       9.4      8.28
2      5    1  40.765457 -73.913956  0.328767 -0.474951  0.880012        1.0        0.0  0.61756 -0.739379 -0.673289  0.322917 -0.442289  0.896873      17.2       9.4      8.28
3      5    1  40.746231 -73.948288  0.328767 -0.474951  0.880012        1.0        0.0  0.61756 -0.739379 -0.673289  0.322917 -0.442289  0.896873      17.2       9.4      8.28
4      5    1  40.746231 -74.004593  0.328767 -0.474951  0.880012        1.0        0.0  0.61756 -0.739379 -

In [28]:
# Observed pickups on 1 May in the test split, with the columns needed to join them
# onto the prediction grid.
yy = dftest[['month','day','day_num','latitude','longitude','pickups']]
yy = yy[(yy['month'] == 5) & (yy['day'] == 1)]

# Predict from the feature columns only, in training order, and map the output back
# from log10(pickups + 1) to pickup counts.  Selecting result[Xnames] explicitly keeps
# the cell re-runnable once result carries extra (non-feature) columns.
result['pred_pickups'] = np.power(10, reg.predict(result[Xnames])) - 1

# Left join keeps every grid row; rows without an observation get NaN pickups.
# Merging from an explicit column list means a re-run does not pile up
# pickups_x / pickups_y columns.
# NOTE(review): there is no explicit time column among the join keys; this presumably
# relies on day_num being distinct per time slot — confirm.
result = pd.merge(result[Xnames + ['pred_pickups']], yy, how='left',
                  on=['month','day','day_num','latitude','longitude'])
result.head(10)

In [29]:
# Row count after the left join with the observed pickups, then a preview.
print result.shape[0]
result.head()

818832


In [30]:
# Keep only location, time of day, prediction and observation.  Selecting the columns
# to keep leaves the same frame as dropping the other fifteen feature columns, and the
# cell can be re-run without raising on columns that are already gone.
result = result[['latitude', 'longitude', 'time_num', 'pred_pickups', 'pickups']]
result.head()

In [31]:
# Write the trimmed prediction table to disk.
# NOTE(review): absolute, machine-specific path; index= is not specified, so to_csv's default applies.
result.to_csv('/Users/hongyili/Desktop/big_Data_final/weather.csv')