In [2]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error as mae
from __future__ import division
from sklearn.linear_model import Ridge

In [3]:
# Load the complete feature table; the train/validation subsets are carved
# out of this single frame in later cells.
features = pd.read_csv(filepath_or_buffer='data_transform/all.csv')

In [3]:
# Columns stripped from the train/validation frames before modelling
# (the IsTrain / IsValidation split indicators are among them).
remove_column = [
    'IsTest', 'IsTrain', 'IsValidation',
    'Date', 'Point', 'CityName', 'FirstOrder',
]

In [4]:
# Replace every missing value in the table with a large numeric sentinel.
# NOTE(review): fillna is applied to all columns, so a missing 'Count'
# target would also become 100000000 -- confirm this is intended.
data = features.fillna(value=100000000)

In [5]:
# Training rows, restricted to the columns not listed in remove_column.
dataTrain = data.loc[data['IsTrain'] == 1, data.columns.difference(remove_column)]

# Validation rows, same column restriction.
dataValidation = data.loc[data['IsValidation'] == 1, data.columns.difference(remove_column)]

# Validation target and feature matrix (every remaining column except 'Count').
y_test = dataValidation['Count']
X_test = dataValidation.loc[:, dataValidation.columns.difference(['Count'])]

In [6]:
# Snapshot of the training-frame column index.
# NOTE(review): this name is not referenced again in the visible cells.
currentColumn = dataTrain.columns

In [7]:
def trainModelTestTrainSplit(data, model):
    """Fit ``model`` on a random 70% sample of ``data`` and return it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``'Count'`` column, which is used as the target.
        Every other column is used as a feature, in the sorted order
        produced by ``Index.difference``.
    model : estimator
        Any object exposing ``fit(X, y)``; it is fitted in place.

    Returns
    -------
    The same ``model`` object, after fitting.

    Notes
    -----
    The split is not seeded, so the rows used for fitting differ from call
    to call. The 30% held-out part is not used for anything: the original
    code predicted on it and threw the result away, so that dead prediction
    has been dropped.
    """
    feature_columns = data.columns.difference(['Count'])
    X_train, _, y_train, _ = sklearn.cross_validation.train_test_split(
        data[feature_columns], data['Count'], test_size=0.3)
    # `.values` yields the same ndarray as the deprecated `as_matrix()`
    # (removed in pandas 1.0) and exists in every pandas release.
    model.fit(X_train.values, y_train.values)
    return model

In [8]:
def fitOnTrainCheckOnVal(dataTrain, dataValidation, model):
    """Fit ``model`` on the training frame and score it on the validation frame.

    The fitting itself is delegated to ``trainModelTestTrainSplit``. The
    validation features are every column of ``dataValidation`` except
    ``'Count'``, which is the target.

    Returns the mean absolute error between the predictions and
    ``dataValidation.Count``.
    """
    fitted = trainModelTestTrainSplit(dataTrain, model)

    feature_names = dataValidation.columns.difference(['Count'])
    validation_features = dataValidation[feature_names]
    validation_target = dataValidation.Count

    predictions = fitted.predict(validation_features)
    return mae(predictions, validation_target)

In [9]:
def trainModelKfoldOnValidation(dataTrain, dataValidation, model, countFold = 5):
    """Average validation MAE of ``model`` refitted on each K-fold training part.

    ``dataTrain`` is split into ``countFold`` shuffled folds. For every fold
    the model is refitted on the fold's training indices and scored on the
    full ``dataValidation`` frame. The fold's own held-out rows are not
    used for scoring.

    Parameters
    ----------
    dataTrain, dataValidation : pandas.DataFrame
        Both must contain a ``'Count'`` target column. The feature columns
        are taken from ``dataTrain`` (everything except ``'Count'``) and must
        also be present in ``dataValidation``.
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``; refitted in place once
        per fold.
    countFold : int, optional
        Number of folds (default 5).

    Returns
    -------
    float
        Sum of the per-fold validation MAEs divided by ``countFold``.

    Notes
    -----
    The shuffle is not seeded, so the score varies between calls.
    """
    feature_columns = dataTrain.columns.difference(['Count'])
    folds = KFold(len(dataTrain), n_folds=countFold, shuffle=True)

    # Loop-invariant selections, done once instead of twice per fold.
    X_all = dataTrain[feature_columns]
    y_all = dataTrain['Count']
    X_valid = dataValidation[feature_columns]
    y_valid = dataValidation.Count

    totalMae = 0.0
    # Only the training indices are needed: scoring always happens on the
    # validation frame, so the held-out fold is never materialised.
    for train_index, _ in folds:
        # `.values` replaces the deprecated `as_matrix()` (removed in pandas 1.0).
        model.fit(X_all.iloc[train_index].values, y_all.iloc[train_index].values)
        totalMae += mae(model.predict(X_valid), y_valid)
    return totalMae / countFold

In [10]:
def RemoveTrash(dataTrain, dataValidation, model):
    """Greedy backward feature elimination driven by validation MAE.

    Each pass:

    1. computes a baseline score for the current column set with
       ``trainModelKfoldOnValidation``;
    2. for every feature, scores the set without that feature using
       ``fitOnTrainCheckOnVal``;
    3. drops the single feature whose removal gave the lowest score, if
       that score beats the baseline.

    The search stops when no removal beats the baseline. It also stops when
    the new baseline is more than 2% worse than the previous one
    (``previous / current < 0.98``). In that case the column set from
    *before* the last removal is returned, because the last removal is
    what degraded the score.

    Parameters
    ----------
    dataTrain, dataValidation : pandas.DataFrame
        Frames containing the ``'Count'`` target plus feature columns.
        Candidate columns are taken from ``dataTrain`` itself (minus the
        bookkeeping columns listed below), so the function no longer
        re-reads the CSV from disk.
    model : estimator
        Passed through to the scoring helpers; refitted many times.

    Returns
    -------
    pandas.Index
        The retained columns, ``'Count'`` included.
    """
    import sys  # progress output without the Python-2-only ``print x,`` form

    remove_column = ['IsTest', 'IsTrain', 'IsValidation', 'Date', 'Point', 'CityName', 'FirstOrder']
    column = dataTrain.columns.difference(remove_column)
    # Column set as it was before the most recent removal (for rollback).
    previousColumn = column

    # Sentinels chosen so the loop is entered and the first ratio is huge.
    startScore = 10000000000000000000000000000000000000000
    localScore = 100000

    while (localScore < startScore):
        name = "None"
        prevStartScore = startScore
        startScore = trainModelKfoldOnValidation(dataTrain[column], dataValidation[column], model)
        localScore = startScore
        # A perfect baseline (MAE == 0) must not raise on division.
        if startScore != 0:
            ratio = float(prevStartScore) / startScore
        else:
            ratio = float('inf')
        print("Coef between pasted baseline and current " + str(ratio))
        if ratio < 0.98:
            # The last removal made the baseline clearly worse: undo it.
            return previousColumn

        for i in column:
            if i == 'Count':
                continue
            c = column.difference([i])
            res = fitOnTrainCheckOnVal(dataTrain[c], dataValidation[c], model)
            if res < localScore:
                localScore = res
                name = i
            # Same visible output as ``print i,``: names separated by spaces.
            sys.stdout.write(str(i) + " ")
        if localScore < startScore:
            previousColumn = column
            column = column.drop(name)
        print("")
        print("|_end iteration_| start score: " + str(startScore) + ", end score: " + str(localScore) + ", name feature: " + name)
    return column

In [11]:
# Feature elimination scored with a default Ridge regressor.
# NOTE(review): several later cells also assign `column`; whichever of them
# ran last determines the value seen downstream.
column = RemoveTrash(dataTrain=dataTrain, dataValidation=dataValidation, model=Ridge())

Coef between pasted baseline and current 1.00000110466e+32
BranchNumber CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughDayByPoints MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth ThroughWeek WeekDay Year YearDay 
|_end iteration_| start score: 99999889.5338, end score: 29644148.4493, name feature: MeanForThroughDayByPoints
Coef between pasted baseline and current 3.46158329857
BranchNumber CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth ThroughW

In [18]:
# Feature elimination scored with a default BayesianRidge regressor.
column = RemoveTrash(
    dataTrain=dataTrain,
    dataValidation=dataValidation,
    model=sklearn.linear_model.BayesianRidge(),
)

 Coef between pasted baseline and current 1.00000105768e+32
BranchNumber CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughDayByPoints MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth ThroughWeek WeekDay Year YearDay lat lng 
|_end iteration_| start score: 99999894.2325, end score: None, name feature: BranchNumber
Coef between pasted baseline and current 0.999999997655
CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughDayByPoints MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth Through

KeyboardInterrupt: 

In [225]:
# Feature elimination scored with a default ElasticNet regressor.
column = RemoveTrash(
    dataTrain=dataTrain,
    dataValidation=dataValidation,
    model=sklearn.linear_model.ElasticNet(),
)

10002.5404826
BranchNumber CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughDayByPoints MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth ThroughWeek WeekDay Year YearDay lat lng 
|_end iteration_| start score: 99974601.6267, end score: 29823154.2782, name feature: MeanForThroughDayByPoints
3.32924409581
BranchNumber CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth ThroughWeek WeekDay Year YearDay lat lng 
|_end iteration_| start score: 30029219.4713



In [14]:
# Feature elimination scored with a default RandomForestRegressor.
column = RemoveTrash(
    dataTrain=dataTrain,
    dataValidation=dataValidation,
    model=RandomForestRegressor(),
)

Coef between pasted baseline and current 8.84849514138e+36
BranchNumber CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughDayByPoints MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth ThroughWeek WeekDay Year YearDay 
|_end iteration_| start score: 1130.13567169, end score: 67.3198176028, name feature: MeanForThroughDayByPoints
Coef between pasted baseline and current 14.2328561493
BranchNumber CityID MeanForMonth MeanForMonthByPoints MeanForMonthDay MeanForMonthDayByPoints MeanForThroughDay MeanForThroughMonth MeanForThroughMonthByPoints MeanForThroughWeek MeanForThroughWeekByPoints MeanForWeekDay MeanForWeekDayByPoints MeanForYear MeanForYearByPoints MeanForYearDay MeanForYearDayByPoints Month MonthDay ThroughDay ThroughMonth ThroughW

In [15]:
# Display the column set returned by the most recently executed RemoveTrash cell.
column

Index([u'CityID', u'Count', u'MeanForMonthDay', u'MeanForThroughMonthByPoints',
       u'MeanForThroughWeek', u'MeanForWeekDay', u'MeanForWeekDayByPoints',
       u'MeanForYear', u'MeanForYearByPoints', u'MeanForYearDay',
       u'ThroughDay', u'Year', u'YearDay'],
      dtype='object')

In [16]:
# Validation MAE of a 100-tree random forest on the selected columns.
print(fitOnTrainCheckOnVal(
    dataTrain=dataTrain[column],
    dataValidation=dataValidation[column],
    model=RandomForestRegressor(n_estimators=100),
))

16.7736942675


In [17]:
# Validation MAE of a 900-tree random forest on the selected columns.
print(fitOnTrainCheckOnVal(
    dataTrain=dataTrain[column],
    dataValidation=dataValidation[column],
    model=RandomForestRegressor(n_estimators=900),
))

16.5949390401


In [56]:
# Hand-written column subset ('Count' target included) scored in the cells below.
col = [
    u'BranchNumber',
    u'CityID',
    u'Count',
    u'MeanForThroughMonthByPoints',
    u'MeanForThroughWeek',
    u'MeanForWeekDay',
    u'MeanForWeekDayByPoints',
    u'MeanForYearByPoints',
    u'MeanForYearDay',
    u'Month',
    u'ThroughDay',
    u'lat',
    u'CourseUS',
]

In [57]:
# Validation MAE of a 100-tree random forest on the hand-written `col` subset.
print(fitOnTrainCheckOnVal(
    dataTrain=dataTrain[col],
    dataValidation=dataValidation[col],
    model=RandomForestRegressor(n_estimators=100),
))

16.7121868672


In [58]:
# Validation MAE of a 900-tree random forest on the hand-written `col` subset.
print(fitOnTrainCheckOnVal(
    dataTrain=dataTrain[col],
    dataValidation=dataValidation[col],
    model=RandomForestRegressor(n_estimators=900),
))

16.6755277537
