In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error as mae

In [2]:
def trainModelTestTrainSplit(data, model):
    c = data.columns.difference(['Count'])
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(data[c], data['Count'], test_size=0.3)
    model.verbose=1
    model.fit(X_train.as_matrix(), y_train.as_matrix())
    pred = model.predict(X_test)
    print ("result ", mae(pred, y_test))
    return model

In [3]:
def trainModelKfold(data, model, countFold = 5):
    meanMae = 0
    
    c = data.columns.difference(['Count'])
    kf = KFold(len(data), n_folds=countFold, shuffle=True)
    
    for train_index, test_index in kf:
        X_train, X_test = data[c].iloc[train_index], data[c].iloc[test_index]
        y_train, y_test = data['Count'].iloc[train_index], data['Count'].iloc[test_index]
        model.fit(X_train.as_matrix(), y_train.as_matrix())
        pred = model.predict(X_test)
        print (mae(pred, y_test))
        meanMae += mae(pred, y_test)
    meanMae /= countFold
    print ("result ", meanMae)
    return model

In [2]:
features = pd.read_csv('data_transform/all.csv')
features.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Count,IsTest,IsTrain,IsValidation,Date,Year,Month,MonthDay,WeekDay,YearDay,...,MeanForYear,MeanForYearDay,MeanForMonthByPoints,MeanForMonthDayByPoints,MeanForThroughDayByPoints,MeanForThroughMonthByPoints,MeanForThroughWeekByPoints,MeanForWeekDayByPoints,MeanForYearByPoints,MeanForYearDayByPoints
0,8.0,0,1,0,2011-10-06,2011,10,6,3,278,...,87.318182,94.327731,259.317757,225.255814,8.0,32.333333,32.333333,214.42623,87.318182,171.25
1,88.0,0,1,0,2011-10-07,2011,10,7,4,279,...,87.318182,91.530201,259.317757,254.404762,88.0,32.333333,32.333333,291.581522,87.318182,185.25
2,1.0,0,1,0,2011-10-09,2011,10,9,6,281,...,87.318182,143.346821,259.317757,244.022222,1.0,32.333333,32.333333,262.022099,87.318182,223.8
3,1.0,0,1,0,2011-11-15,2011,11,15,1,318,...,87.318182,124.594937,257.108696,229.959184,1.0,1.0,1.0,198.868852,87.318182,190.25
4,2.0,0,0,1,2011-12-08,2011,12,8,3,341,...,87.318182,78.402299,230.669643,247.133333,,101.277778,18.0,214.42623,87.318182,227.75


In [7]:
features.columns

Index(['Count', 'IsTest', 'IsTrain', 'IsValidation', 'Date', 'Year', 'Month',
       'MonthDay', 'WeekDay', 'YearDay', 'ThroughMonth', 'ThroughWeek',
       'ThroughDay', 'Point', 'CityName', 'CityID', 'BranchNumber',
       'MeanForMonth', 'MeanForMonthDay', 'MeanForThroughDay',
       'MeanForThroughMonth', 'MeanForThroughWeek', 'MeanForWeekDay',
       'MeanForYear', 'MeanForYearDay', 'MeanForMonthByPoints',
       'MeanForMonthDayByPoints', 'MeanForThroughDayByPoints',
       'MeanForThroughMonthByPoints', 'MeanForThroughWeekByPoints',
       'MeanForWeekDayByPoints', 'MeanForYearByPoints',
       'MeanForYearDayByPoints'],
      dtype='object')

In [23]:
#data = features.fillna(100000000)
#data = data[data.columns.difference(['Point', 'Date'])]

In [83]:
remove_column = ['IsTest', 'IsTrain', 'IsValidation', 'Date', 'Point', 'CityName', 'MeanForThroughDay', 'MeanForMonthDayByPoints' \
                 ,'MeanForThroughDayByPoints', 'MeanForThroughWeekByPoints', 'MeanForYearDayByPoints']

In [17]:
dataTrain = data[data.IsTrain == 1]
dataTrain = dataTrain[dataTrain.columns.difference(remove_column)]

dataValidation =  data[data.IsValidation == 1]
dataValidation = dataValidation[dataValidation.columns.difference(remove_column)]

X_test = dataValidation[dataValidation.columns.difference(['Count'])]
y_test = dataValidation.Count

In [35]:
def trainModelKfoldOnValidation(data, model, countFold = 5):
    meanMae = 0
    
    c = data.columns.difference(['Count'])
    kf = KFold(len(data), n_folds=countFold, shuffle=True)
    
    for train_index, test_index in kf:
        X_train, X_test = data[c].iloc[train_index], data[c].iloc[test_index]
        y_train, y_test = data['Count'].iloc[train_index], data['Count'].iloc[test_index]
        model.fit(X_train.as_matrix(), y_train.as_matrix())
        pred = model.predict(X_test)
        print (mae(pred, y_test))
        meanMae += mae(pred, y_test)
    meanMae /= countFold
    print ("result ", meanMae)
    return model

In [16]:
model = trainModelTestTrainSplit(dataTrain, RandomForestRegressor())
pred = model.predict(X_test)
print (mae(pred, y_test))

result  17.1075044669
17.2152733584


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [19]:
model = trainModelKfoldOnValidation(dataTrain, RandomForestRegressor(), countFold = 25)
pred = model.predict(X_test)
print (mae(pred, y_test))

16.632610083
18.7105296745
18.443905552
16.315252074
16.8954690491
17.0089342693
16.1523292916
17.0751754946
17.5008296107
16.970134014
17.7065092534
16.8759412891
15.4571155073
16.1772814295
16.3549457562
16.8608806637
16.846330568
17.8841097639
16.939948947
17.321952776
16.6019783025
16.478366305
16.4514358647
16.62744097
16.6901723038
result  16.9191831525
16.875267573


In [None]:
m = trainModelKfold(data, LinearRegression(), countFold = 25)
print(data.columns.difference(['Count']))
m.coef_

# Weather (be or not to be)

In [52]:
features = pd.read_csv('data_transform/all.csv')
remove_column = ['IsTest', 'IsTrain', 'IsValidation', 'lat', 'lng', 'FirstOrder', 'BranchNumber', 'CityID', 'Date', 'Point', 'CityName', 'MeanForThroughDay', 'MeanForMonthDayByPoints' \
                 ,'MeanForThroughDayByPoints', 'MeanForThroughWeekByPoints', 'MeanForYearDayByPoints']

  interactivity=interactivity, compiler=compiler, result=result)


## without weather

In [53]:
weather = pd.read_csv('data_transform/tmp_weather.csv')
weather = weather[pd.notnull(weather.summary)]
weather.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Date,CityName,lat,lng,summary,icon,precipType,temperature,apparentTemperature,dewPoint,humidity,windSpeed,windBearing,visibility,cloudCover,pressure
0,2011-10-06,Сыктывкар,61.647851,50.833903,Overcast,cloudy,rain,4.95,1.77,4.13,0.94,4.05,195.0,8.32,1.0,990.66
1,2011-10-07,Сыктывкар,61.647851,50.833903,Overcast,cloudy,rain,7.22,5.09,5.04,0.86,3.13,240.0,11.27,1.0,
2,2011-10-09,Сыктывкар,61.647851,50.833903,Partly Cloudy,partly-cloudy-night,rain,9.71,9.0,8.08,0.9,1.82,229.0,8.48,0.5,1007.93
3,2011-11-15,Сыктывкар,61.647851,50.833903,Overcast,cloudy,snow,-1.04,-5.66,-1.32,0.98,4.14,205.0,5.63,1.0,988.27
4,2011-12-08,Сыктывкар,61.647851,50.833903,Overcast,cloudy,snow,-1.78,-3.57,-2.19,0.97,1.38,171.0,10.54,1.0,1013.8


In [54]:
#column with text to number
uniq = weather.summary.unique()
weather.summary = weather.summary.apply(lambda x: np.where(uniq == x)[0][0]) 

uniq = weather.icon.unique()
weather.icon = weather.icon.apply(lambda x: np.where(uniq == x)[0][0]) 

uniq = weather.precipType.unique()
weather.precipType = weather.precipType.apply(lambda x: np.where(uniq == x)[0][0]) 

weather.head()

Unnamed: 0,Date,CityName,lat,lng,summary,icon,precipType,temperature,apparentTemperature,dewPoint,humidity,windSpeed,windBearing,visibility,cloudCover,pressure
0,2011-10-06,Сыктывкар,61.647851,50.833903,0,0,0,4.95,1.77,4.13,0.94,4.05,195.0,8.32,1.0,990.66
1,2011-10-07,Сыктывкар,61.647851,50.833903,0,0,0,7.22,5.09,5.04,0.86,3.13,240.0,11.27,1.0,
2,2011-10-09,Сыктывкар,61.647851,50.833903,1,1,0,9.71,9.0,8.08,0.9,1.82,229.0,8.48,0.5,1007.93
3,2011-11-15,Сыктывкар,61.647851,50.833903,0,0,1,-1.04,-5.66,-1.32,0.98,4.14,205.0,5.63,1.0,988.27
4,2011-12-08,Сыктывкар,61.647851,50.833903,0,0,1,-1.78,-3.57,-2.19,0.97,1.38,171.0,10.54,1.0,1013.8


In [55]:
indexes = weather.index
with_weather = features.iloc[indexes]

dataTrain = with_weather[with_weather.IsTrain == 1]
dataTrain = dataTrain[dataTrain.columns.difference(remove_column)]

X_train = dataTrain[dataTrain.columns.difference(['Count'])].as_matrix()
y_train = dataTrain.Count.as_matrix()

dataValidation =  with_weather[with_weather.IsValidation == 1]
dataValidation = dataValidation[dataValidation.columns.difference(remove_column)]

X_test = dataValidation[dataValidation.columns.difference(['Count'])].as_matrix()
y_test = dataValidation.Count.as_matrix()
dataTrain

Unnamed: 0,Count,MeanForMonth,MeanForMonthByPoints,MeanForMonthDay,MeanForThroughMonth,MeanForThroughMonthByPoints,MeanForThroughWeek,MeanForWeekDay,MeanForWeekDayByPoints,MeanForYear,MeanForYearByPoints,MeanForYearDay,Month,MonthDay,ThroughDay,ThroughMonth,ThroughWeek,WeekDay,Year,YearDay
0,8.0,112.145788,259.317757,104.137491,32.333333,32.333333,32.333333,90.875335,214.426230,87.318182,87.318182,94.327731,10,6,0,0,0,3,2011,278
1,88.0,112.145788,259.317757,107.172263,32.333333,32.333333,32.333333,123.721962,291.581522,87.318182,87.318182,91.530201,10,7,1,0,0,4,2011,279
2,1.0,112.145788,259.317757,107.880699,32.333333,32.333333,32.333333,123.640722,262.022099,87.318182,87.318182,143.346821,10,9,3,0,0,6,2011,281
3,1.0,104.213605,257.108696,108.351462,1.000000,1.000000,1.000000,87.286115,198.868852,87.318182,87.318182,124.594937,11,15,40,1,6,1,2011,318
5,43.0,105.650704,230.669643,107.880699,101.277778,101.277778,18.000000,123.721962,291.581522,87.318182,87.318182,72.645161,12,9,64,2,9,4,2011,342
6,10.0,105.650704,230.669643,102.119765,101.277778,101.277778,18.000000,142.693482,312.527174,87.318182,87.318182,81.650000,12,10,65,2,9,5,2011,343
7,1.0,105.650704,230.669643,102.062201,101.277778,101.277778,18.000000,123.640722,262.022099,87.318182,87.318182,103.202532,12,11,66,2,9,6,2011,344
8,60.0,105.650704,230.669643,100.133333,101.277778,101.277778,100.400000,87.286115,198.868852,87.318182,87.318182,122.585106,12,13,68,2,10,1,2011,346
10,96.0,105.650704,230.669643,108.351462,101.277778,101.277778,100.400000,90.875335,214.426230,87.318182,87.318182,81.652632,12,15,70,2,10,3,2011,348
11,134.0,105.650704,230.669643,103.290323,101.277778,101.277778,100.400000,123.721962,291.581522,87.318182,87.318182,77.175824,12,16,71,2,10,4,2011,349


In [56]:
iter_count = 100
av = 0
for i in range(iter_count):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    av += mae(pred, y_test)
print(av / iter_count)

15.7560147679


In [57]:
weather_columns = weather.columns.difference(['Date', 'CityName', 'lat', 'lng'])
with_weather[weather_columns] = weather[weather_columns]
with_weather
with_weather.fillna(method='bfill', inplace=True)
with_weather.fillna(method='ffill', inplace=True)
dataTrain = with_weather[with_weather.IsTrain == 1]
dataTrain = dataTrain[dataTrain.columns.difference(remove_column)]

X_train = dataTrain[dataTrain.columns.difference(['Count'])].as_matrix()
y_train = dataTrain.Count.as_matrix()

dataValidation =  with_weather[with_weather.IsValidation == 1]
dataValidation = dataValidation[dataValidation.columns.difference(remove_column)]

X_test = dataValidation[dataValidation.columns.difference(['Count'])].as_matrix()
y_test = dataValidation.Count.as_matrix()
dataTrain

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


Unnamed: 0,Count,MeanForMonth,MeanForMonthByPoints,MeanForMonthDay,MeanForThroughMonth,MeanForThroughMonthByPoints,MeanForThroughWeek,MeanForWeekDay,MeanForWeekDayByPoints,MeanForYear,...,dewPoint,humidity,icon,precipType,pressure,summary,temperature,visibility,windBearing,windSpeed
0,8.0,112.145788,259.317757,104.137491,32.333333,32.333333,32.333333,90.875335,214.426230,87.318182,...,4.13,0.94,0,0,990.66,0,4.95,8.32,195.0,4.05
1,88.0,112.145788,259.317757,107.172263,32.333333,32.333333,32.333333,123.721962,291.581522,87.318182,...,5.04,0.86,0,0,1007.93,0,7.22,11.27,240.0,3.13
2,1.0,112.145788,259.317757,107.880699,32.333333,32.333333,32.333333,123.640722,262.022099,87.318182,...,8.08,0.90,1,0,1007.93,1,9.71,8.48,229.0,1.82
3,1.0,104.213605,257.108696,108.351462,1.000000,1.000000,1.000000,87.286115,198.868852,87.318182,...,-1.32,0.98,0,1,988.27,0,-1.04,5.63,205.0,4.14
5,43.0,105.650704,230.669643,107.880699,101.277778,101.277778,18.000000,123.721962,291.581522,87.318182,...,-4.04,0.89,0,1,1026.65,0,-2.44,9.06,188.0,0.89
6,10.0,105.650704,230.669643,102.119765,101.277778,101.277778,18.000000,142.693482,312.527174,87.318182,...,-6.47,0.86,0,1,1026.66,0,-4.43,10.61,163.0,2.45
7,1.0,105.650704,230.669643,102.062201,101.277778,101.277778,18.000000,123.640722,262.022099,87.318182,...,-8.93,0.88,0,1,1018.29,0,-7.28,8.42,173.0,4.10
8,60.0,105.650704,230.669643,100.133333,101.277778,101.277778,100.400000,87.286115,198.868852,87.318182,...,-11.34,0.88,0,1,1017.16,0,-9.77,8.40,192.0,3.49
10,96.0,105.650704,230.669643,108.351462,101.277778,101.277778,100.400000,90.875335,214.426230,87.318182,...,-11.45,0.84,0,1,1024.86,0,-9.23,10.80,170.0,3.61
11,134.0,105.650704,230.669643,103.290323,101.277778,101.277778,100.400000,123.721962,291.581522,87.318182,...,-6.72,0.94,0,1,1022.50,0,-5.90,7.19,162.0,2.77


In [58]:
iter_count = 100
av = 0
for i in range(iter_count):
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    av += mae(pred, y_test)
print(av / iter_count)

15.7694493671


In [59]:
f = pd.DataFrame()
f['val'] = model.feature_importances_
f['Name'] = dataTrain.columns.difference(['Count'])
f.sort_values(by='val')

Unnamed: 0,val,Name
8,2.2e-05,MeanForYear
17,2.6e-05,Year
24,0.000147,precipType
23,0.000402,icon
26,0.000446,summary
9,0.000508,MeanForYearByPoints
11,0.000575,Month
14,0.000619,ThroughMonth
0,0.000997,MeanForMonth
1,0.001,MeanForMonthByPoints
