In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error as mae

In [2]:
def trainModelTestTrainSplit(data, model, test_size=0.3, random_state=None):
    """Fit ``model`` on a random train split of ``data`` and print the hold-out MAE.

    Every column of ``data`` except ``'Count'`` is used as a feature;
    ``'Count'`` is the regression target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Count'`` column plus the feature columns.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is modified in
        place: its ``verbose`` attribute is set to 1 and it is fitted.
    test_size : float, optional
        Share of rows held out for evaluation (default 0.3, the value that
        was previously hard-coded).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. ``None`` (default) keeps the
        original unseeded behaviour. It does not seed ``model`` itself.

    Returns
    -------
    The same ``model`` object, fitted on the training part of the split.
    """
    # Import the splitter explicitly instead of relying on the attribute
    # chain ``sklearn.cross_validation``, which only resolves if that
    # submodule was imported elsewhere. Newer scikit-learn releases ship it
    # in ``model_selection``; fall back to the legacy location otherwise.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    feature_columns = data.columns.difference(['Count'])
    X_train, X_test, y_train, y_test = train_test_split(
        data[feature_columns], data['Count'],
        test_size=test_size, random_state=random_state)

    model.verbose = 1
    # ``.values`` replaces ``.as_matrix()``, which no longer exists in
    # current pandas. Predict on an ndarray as well so that fit and predict
    # see the same input type.
    model.fit(X_train.values, y_train.values)
    pred = model.predict(X_test.values)
    # mean_absolute_error expects (y_true, y_pred).
    print ("result ", mae(y_test, pred))
    return model

In [3]:
def trainModelKfold(data, model, countFold = 5, random_state=None):
    """Run shuffled K-fold cross-validation of ``model`` on ``data``.

    Every column of ``data`` except ``'Count'`` is used as a feature;
    ``'Count'`` is the regression target. The MAE of each fold is printed,
    followed by the mean over all folds.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Count'`` column plus the feature columns.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place once per fold.
    countFold : int, optional
        Number of folds (default 5).
    random_state : int or None, optional
        Seed for the fold shuffling. ``None`` (default) keeps the original
        unseeded behaviour. It does not seed ``model`` itself.

    Returns
    -------
    The same ``model`` object as left by the LAST fold, i.e. fitted on that
    fold's training rows only; it is not refitted on the full ``data``.
    """
    feature_columns = data.columns.difference(['Count'])
    # The feature/target arrays do not depend on the fold, so build them once.
    # ``.values`` replaces ``.as_matrix()``, which no longer exists in
    # current pandas.
    X = data[feature_columns].values
    y = data['Count'].values

    # ``KFold`` is the one imported at the top of the file from
    # ``sklearn.cross_validation``: it takes the number of samples and is
    # directly iterable over (train_index, test_index) pairs.
    kf = KFold(len(data), n_folds=countFold, shuffle=True, random_state=random_state)

    foldMaes = []
    for train_index, test_index in kf:
        model.fit(X[train_index], y[train_index])
        pred = model.predict(X[test_index])
        # Score each fold once; mean_absolute_error expects (y_true, y_pred).
        foldMae = mae(y[test_index], pred)
        print (foldMae)
        foldMaes.append(foldMae)

    meanMae = sum(foldMaes) / len(foldMaes)
    print ("result ", meanMae)
    return model

In [7]:
# Load the prepared feature table.
# NOTE(review): the captured output of this cell looks like the tail of a
# pandas DtypeWarning (columns with mixed types when the file is parsed in
# chunks). low_memory=False makes pandas infer each column's dtype from the
# whole file at once; confirm the affected columns and consider passing an
# explicit dtype= mapping instead.
features = pd.read_csv('data_transform/all.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
# Replace every missing value in the table with a large sentinel (10**8),
# presumably so that no NaN reaches the estimators -- TODO confirm the choice
# of sentinel. Non-feature columns such as 'Point' and 'Date' are not dropped
# here; that happens later via remove_column.
data = features.fillna(value=10 ** 8)

In [9]:
# Columns that are stripped from the table before modelling: the split
# indicator flags, identifying columns and a handful of 'MeanFor...' columns.
remove_column = [
    'IsTest',
    'IsTrain',
    'IsValidation',
    'Date',
    'Point',
    'CityName',
    'MeanForThroughDay',
    'MeanForMonthDayByPoints',
    'MeanForThroughDayByPoints',
    'MeanForThroughWeekByPoints',
    'MeanForYearDayByPoints',
]

In [17]:
# Training rows (IsTrain == 1), restricted to the columns that are not listed
# in remove_column. Index.difference returns the remaining labels sorted.
dataTrain = data.loc[data.IsTrain == 1, data.columns.difference(remove_column)]

# Validation rows (IsValidation == 1), restricted to the same columns.
dataValidation = data.loc[data.IsValidation == 1, data.columns.difference(remove_column)]

# Target and features of the validation rows, used to score fitted models.
y_test = dataValidation['Count']
X_test = dataValidation[dataValidation.columns.difference(['Count'])]

In [18]:
def trainModelKfoldOnValidation(data, model, countFold = 5):
    """Run shuffled K-fold cross-validation of ``model`` on ``data``.

    Thin wrapper around ``trainModelKfold``: the two functions had identical
    bodies, so this one delegates to keep them from drifting apart. The name
    is kept for existing callers. Despite the name, only the frame passed as
    ``data`` is used; no separate validation set is involved here.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``'Count'`` target column plus the feature columns.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; refitted in place
        once per fold.
    countFold : int, optional
        Number of folds (default 5).

    Returns
    -------
    The same ``model`` object as left by the last fold's fit.
    """
    return trainModelKfold(data, model, countFold=countFold)

In [16]:
# Fit a default random forest on a random split of the training rows (the
# helper prints the MAE on its own hold-out part), then score the fitted
# model on the validation rows.
model = trainModelTestTrainSplit(dataTrain, RandomForestRegressor())
pred = model.predict(X_test)
# mean_absolute_error expects (y_true, y_pred); the value is unchanged.
print (mae(y_test, pred))

result  17.1075044669
17.2152733584


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [19]:
# 25-fold cross-validation of a default random forest on the training rows
# (the helper prints each fold's MAE and their mean). The returned model is
# the one left by the last fold's fit; score it on the validation rows.
model = trainModelKfoldOnValidation(dataTrain, RandomForestRegressor(), countFold = 25)
pred = model.predict(X_test)
# mean_absolute_error expects (y_true, y_pred); the value is unchanged.
print (mae(y_test, pred))

16.632610083
18.7105296745
18.443905552
16.315252074
16.8954690491
17.0089342693
16.1523292916
17.0751754946
17.5008296107
16.970134014
17.7065092534
16.8759412891
15.4571155073
16.1772814295
16.3549457562
16.8608806637
16.846330568
17.8841097639
16.939948947
17.321952776
16.6019783025
16.478366305
16.4514358647
16.62744097
16.6901723038
result  16.9191831525
16.875267573


In [None]:
# Linear baseline, cross-validated on the same cleaned training frame as the
# random forest runs. The raw `data` frame still holds the rows of every
# split and the columns listed in remove_column, so it is not a valid
# training table.
m = trainModelKfold(dataTrain, LinearRegression(), countFold = 25)

# trainModelKfold uses data.columns.difference(['Count']) as its feature
# columns, so the same index labels the coefficients in matching order.
pd.Series(m.coef_, index=dataTrain.columns.difference(['Count']))