In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV



In [2]:
# Load both competition CSVs; the first row of each file is the header.
data, test_data = (
    pd.read_csv(csv_name, header=0) for csv_name in ("train.csv", "test.csv")
)

In [3]:
# How the competition is scored
def get_rmsle(pred, actual):
    """Return the root mean squared logarithmic error between two sequences.

    Parameters
    ----------
    pred : array-like
        Predicted values (lists, tuples and NumPy arrays are all accepted).
    actual : array-like
        Ground-truth values, same length as ``pred``.

    Returns
    -------
    float
        sqrt(mean((log(pred + 1) - log(actual + 1)) ** 2)).
    """
    # Coerce up front so plain Python lists work (list + 1 would raise).
    pred = np.asarray(pred, dtype=float)
    actual = np.asarray(actual, dtype=float)
    # log1p(x) == log(x + 1) but keeps precision when x is close to zero.
    diff = np.log1p(pred) - np.log1p(actual)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

In [4]:
# Extract month, day of week, day, hour and year from the datetime column
def engineer_data(data):
    """Add calendar feature columns derived from ``data['datetime']``.

    The columns ``month``, ``dayofweek``, ``day``, ``hour`` and ``year`` are
    written onto ``data`` itself (the frame is modified in place) and the same
    object is returned so the call can be used in an assignment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``datetime`` column that ``pd.DatetimeIndex`` can parse.

    Returns
    -------
    pandas.DataFrame
        The input frame, now carrying the five extra columns.
    """
    # Parse the timestamps once instead of once per derived column.
    timestamps = pd.DatetimeIndex(data['datetime'])
    for part in ('month', 'dayofweek', 'day', 'hour', 'year'):
        data[part] = getattr(timestamps, part)
    return data

In [5]:
# Separate the casual and registered targets from the main features
def process_data(data, features):
    """Split a frame into a feature matrix and two log-transformed targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the requested feature columns plus ``casual`` and
        ``registered``.
    features : list
        Names of the columns to use as model inputs.

    Returns
    -------
    tuple
        ``(feature_matrix, log_casual, log_registered)`` as NumPy arrays,
        where each target is ``log(value + 1)``.
    """
    feature_matrix = data[features].values
    log_casual, log_registered = (
        np.log(data[target].values + 1) for target in ('casual', 'registered')
    )
    return feature_matrix, log_casual, log_registered

In [6]:
# Custom split, meant to resemble how the training and test data are separated
def custom_split(data, cutoff_day=15):
    """Split rows by day of month into an early part and a late part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``day`` column.
    cutoff_day : int, optional
        Last day of the month that goes into the first part (default 15).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``: rows with ``day <= cutoff_day`` and rows with
        ``day > cutoff_day`` respectively.
    """
    day = data['day']
    train = data[day <= cutoff_day]
    test = data[day > cutoff_day]
    return train, test

In [7]:
# Tune the algorithm on a hold-out split of the training data
df = engineer_data(data)
features = ['weather', 'holiday', 'atemp', 'temp', 'humidity', 'windspeed',
            'workingday', 'season', 'hour', 'year', 'month',
            'dayofweek']

train, test = custom_split(df)
features_train, labels_c_train, labels_r_train = process_data(train, features)
features_test, labels_c_test, labels_r_test = process_data(test, features)

params = {'n_estimators': [1000],
          'max_features': [.8],
          'min_samples_split': [11]}

base_forest = RandomForestRegressor(random_state=111, n_jobs=-1)

# One grid search per target. fit() returns the estimator itself, so fitting a
# single shared search twice would leave reg_c and reg_r pointing at the same
# (registered) fit and make the report below describe the wrong model.
reg_c = GridSearchCV(base_forest, params).fit(features_train, labels_c_train)
pred_c = np.exp(reg_c.predict(features_test)) - 1  # undo the log(x + 1) transform

reg_r = GridSearchCV(base_forest, params).fit(features_train, labels_r_train)
pred_r = np.exp(reg_r.predict(features_test)) - 1

# Total count is the sum of the two rider groups
pred = np.round(pred_c + pred_r)

# Single-argument print(...) behaves the same on Python 2 and Python 3
print(reg_c.best_estimator_)
print(" ")
# Rebuild the actual totals from the two log-transformed label arrays
get_rmsle(pred, np.exp(labels_c_test) + np.exp(labels_r_test) - 2)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.8, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=11,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=111, verbose=0, warm_start=False)
 


0.33002074861715619

In [9]:
# Train on the full training set and predict the competition test set
df_test = engineer_data(test_data)

features_train, labels_c_train, labels_r_train = process_data(df, features)
# NOTE: this rebinds `test` (the hold-out DataFrame in the tuning cell) to a
# plain feature array for the competition test set.
test = df_test[features].values

# Same hyperparameters as the grid used in the tuning cell
forest_settings = dict(n_estimators=1000, max_features=0.8,
                       min_samples_split=11, random_state=111, n_jobs=-1)

# One forest per target. fit() returns the estimator itself, so refitting a
# single shared forest would leave reg_c and reg_r as the same object.
reg_c = RandomForestRegressor(**forest_settings).fit(features_train, labels_c_train)
pred_c = np.exp(reg_c.predict(test)) - 1  # undo the log(x + 1) transform

reg_r = RandomForestRegressor(**forest_settings).fit(features_train, labels_r_train)
pred_r = np.exp(reg_r.predict(test)) - 1

# Total count is the sum of the two rider groups
df_test['count'] = np.round(pred_c + pred_r)

# Write the submission CSV
final = df_test[['datetime', 'count']]
final.to_csv('my_prediction.csv', index=False)