In [19]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [20]:
# Read the training CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='./data/train.csv')
data.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [21]:
# Inspect the raw timestamp column (still plain strings at this point, per its object dtype).
data.loc[:, 'datetime']

0        2011-01-01 00:00:00
1        2011-01-01 01:00:00
2        2011-01-01 02:00:00
3        2011-01-01 03:00:00
4        2011-01-01 04:00:00
                ...         
10881    2012-12-19 19:00:00
10882    2012-12-19 20:00:00
10883    2012-12-19 21:00:00
10884    2012-12-19 22:00:00
10885    2012-12-19 23:00:00
Name: datetime, Length: 10886, dtype: object

In [18]:
# Sanity check: a parsed timestamp exposes its calendar month as an integer.
pd.Timestamp('2011-01-01 00:00:00').month

1

In [22]:
# Report the (rows, columns) dimensions of the raw training frame.
print('Train Shape: ', (len(data.index), len(data.columns)))

Train Shape:  (10886, 12)


In [23]:
def preprocess_inputs(data, train_size=0.7, random_state=1, drop_columns=()):
    """Build scaled train/test feature frames and targets from the raw frame.

    Steps, in order:
      1. Parse ``datetime`` and replace it with ``month``, ``day`` and ``hour``.
      2. One-hot encode ``weather`` into ``weather_<value>`` columns.
      3. Split off ``count`` as the target.
      4. Make a shuffled train/test split.
      5. Standardize the features with a scaler fitted on the training rows only.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least ``datetime``, ``weather`` and ``count``.
        It is copied first, so the caller's frame is not modified.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split``.
    random_state : int or None, default 1
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    drop_columns : iterable of str, default ()
        Additional feature columns to remove before splitting.
        NOTE(review): in the sample rows displayed in this notebook,
        ``casual + registered == count``, so keeping those two columns as
        features appears to leak the target. Passing
        ``drop_columns=('casual', 'registered')`` removes them; the default
        keeps the original behaviour. Confirm against the full data set.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized features, keeping the original row index and column names.
    y_train, y_test : pandas.Series
        The matching ``count`` values.
    """
    data = data.copy()

    # Vectorised .dt accessors instead of a Python-level lambda per row.
    timestamps = pd.to_datetime(data['datetime'])
    data['month'] = timestamps.dt.month
    data['day'] = timestamps.dt.day
    data['hour'] = timestamps.dt.hour
    data = data.drop('datetime', axis=1)

    # Weather is a category code, so expand it into indicator columns.
    weather_one_hot = pd.get_dummies(data['weather'], prefix='weather')
    data = pd.concat([data, weather_one_hot], axis=1)
    data = data.drop('weather', axis=1)

    y = data['count']
    X = data.drop(columns=['count', *list(drop_columns)])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Fit the scaler on the training rows only so no test statistics leak in.
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [24]:
# Run the full preprocessing pipeline on the raw frame and unpack the four splits.
X_train, X_test, y_train, y_test = preprocess_inputs(data=data)

In [28]:
# List the feature columns returned by preprocess_inputs (the `count` target is excluded).
# NOTE(review): `casual` and `registered` are still among the features; in the sample rows
# displayed earlier they sum exactly to `count` — confirm whether they should be excluded.
X_train.columns

Index(['season', 'holiday', 'workingday', 'temp', 'atemp', 'humidity',
       'windspeed', 'casual', 'registered', 'month', 'day', 'hour',
       'weather_1', 'weather_2', 'weather_3', 'weather_4'],
      dtype='object')

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
# The target passed to fit is the numeric `count` column, so this is a regression task.
# A classifier would treat every distinct count value as a separate, unordered class.
# random_state pins the bootstrap and feature sampling so re-runs build the same forest.
model = RandomForestRegressor(n_estimators=300, bootstrap=True, max_features='sqrt', random_state=1)

In [None]:
# Train the forest on the standardized training features and their targets.
model.fit(X=X_train, y=y_train)