In [23]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error as mae


features = pd.read_csv('data_transform/all.csv')

data = features.fillna(100000000)

remove_column = ['IsTest', 'IsTrain', 'IsValidation', 'Date', 'Point', 'CityName'
				 , 'MeanForThroughDay', 'MeanForMonthDayByPoints'
                 , 'MeanForThroughDayByPoints', 'MeanForThroughWeekByPoints'
                 , 'MeanForYearDayByPoints', u'Prev2Day', u'Prev3Day'
                 , u'Next2Day', u'Next3Day', 'NextExisting21Value'
                 , 'PrevExisting21Value', 'NextExisting14Value'
                 , 'PrevExisting14Value', 'NextExisting3Value'
                 , 'PrevExisting3Value'
                 , 'BranchNumber', 'MeanForMonth', 'Next1Day', 'Prev7Day'
                 ]


dataTrain = data[data.IsTrain == 1]
dataTrain = dataTrain[dataTrain.columns.difference(remove_column)]

X_train = dataTrain[dataTrain.columns.difference(['Count'])]
y_train = dataTrain[['Count']]

dataValidation =  data[data.IsValidation == 1]
dataValidation = dataValidation[dataValidation.columns.difference(remove_column)]

X_test = dataValidation[dataValidation.columns.difference(['Count'])]
y_test = dataValidation[['Count']]

dataResult =  data[data.IsValidation == 1]
dataResult = dataResult[dataResult.columns.difference(remove_column)]

X_res = dataResult[dataResult.columns.difference(['Count'])]


dtrain = xgb.DMatrix( X_train, label=y_train)
dtest = xgb.DMatrix( X_test, label=y_test)
dres = xgb.DMatrix( X_res )
evallist  = [(dtest,'eval'), (dtrain,'train')]

param = {'max_depth': 6,
 		'silent': 1,
 		'eta': 0.01,
 		'eval_metric': 'mae'
 }

num_round = 1000

bst = xgb.train( param, dtrain, num_round, evallist, verbose_eval=300 )


result = bst.predict(dres)

[0]	eval-mae:103.633	train-mae:104.724
[300]	eval-mae:16.9867	train-mae:15.8775
[600]	eval-mae:16.0142	train-mae:14.6194
[900]	eval-mae:15.6822	train-mae:14.0441


In [24]:
resDataFrame = pd.DataFrame()
resDataFrame["Date"] = data.Date[data.IsTest == True]
resDataFrame.Date = resDataFrame.Date.apply(lambda x: pd.to_datetime(x))
resDataFrame["Point"] = data.Point[data.IsTest == True]
resDataFrame.index = range(resDataFrame.shape[0])
resDataFrame["Count"] = pd.Series(result)

In [25]:
resDataFrame.head()

Unnamed: 0,Date,Point,Count
0,2011-12-14,Сыктывкар-1,107.465218
1,2011-12-19,Сыктывкар-1,146.673203
2,2011-12-26,Сыктывкар-1,100.477905
3,2011-12-27,Сыктывкар-1,133.700912
4,2012-01-12,Сыктывкар-1,136.116058


In [26]:
resDataFrame.tail()

Unnamed: 0,Date,Point,Count
7332,2016-09-27,Новоалтайск-1,
7333,2016-09-27,Энгельс-1,
7334,2016-09-27,Зеленоград-1,
7335,2016-09-27,Ленинск-Кузнецкий-1,
7336,2016-09-27,Самара-3,


In [27]:
resDataFrame.shape

(7337, 3)

In [28]:
originTest = pd.read_csv("data_original/test.csv")

In [29]:
originTest.Date = originTest.Date.apply(lambda x: pd.to_datetime(x, format='%d.%m.%Y'))

originTest.head()

In [30]:
resDataFrame[(resDataFrame.Date == originTest.Date[0]) & (resDataFrame.Point == originTest.Point[0]) ]

Unnamed: 0,Date,Point,Count
6495,2016-07-08,Ульяновск-2,224.535782


In [31]:
resNP = np.array([np.nan] * originTest.shape[0])
for i in originTest.index:
    resNP[i] = resDataFrame.Count[(resDataFrame.Date == originTest.Date[i]) &
                                             (resDataFrame.Point == originTest.Point[i])]

In [32]:
resNP.shape

(7337,)

In [33]:
originTest.Count = pd.Series(resNP)

In [34]:
originTest.Count = originTest.Count.apply(lambda x: 1 if x < 1 else x)

In [35]:
originTest.Count = originTest.Count.apply(lambda x: round(x) if abs(x - round(x)) < 0.15 else x)


In [36]:
originTest['Year'] = originTest.Date.apply(lambda x: x.year)
originTest['Month'] = originTest.Date.apply(lambda x: x.month)
originTest['MonthDay'] = originTest.Date.apply(lambda x: x.day)

In [37]:
OriginalDate = []
k = 9
for i in range(originTest.shape[0]):
    daystr = str(originTest.MonthDay[i]) if originTest.MonthDay[i] >= 10 else  "0" + str(originTest.MonthDay[i])
    monthstr = str(originTest.Month[i]) if originTest.Month[i] >= 10 else  "0" + str(originTest.Month[i])
    OriginalDate.append(daystr + "." + monthstr + "." +  str(originTest.Year[i]))

In [38]:
originTest["OriginalDate"] = pd.Series(OriginalDate)

In [39]:
originTest.head()

Unnamed: 0,Date,Point,Count,Year,Month,MonthDay,OriginalDate
0,2016-07-08,Ульяновск-2,224.535782,2016,7,8,08.07.2016
1,2016-07-25,Балашиха-1,222.713669,2016,7,25,25.07.2016
2,2014-11-09,Ухта-1,101.448448,2014,11,9,09.11.2014
3,2015-11-12,Альметьевск-1,85.229507,2015,11,12,12.11.2015
4,2014-07-17,Ухта-1,67.0,2014,7,17,17.07.2014


In [40]:
originTest.tail()

Unnamed: 0,Date,Point,Count,Year,Month,MonthDay,OriginalDate
7332,2015-05-27,Белгород-1,109.673622,2015,5,27,27.05.2015
7333,2015-01-31,Самара-3,51.0,2015,1,31,31.01.2015
7334,2015-11-12,Вологда-1,97.3582,2015,11,12,12.11.2015
7335,2016-05-07,Домодедово-1,85.723549,2016,5,7,07.05.2016
7336,2015-04-18,Махачкала-1,47.0,2015,4,18,18.04.2015


In [41]:
originTest = originTest[["OriginalDate", "Point", "Count"]]

In [42]:
originTest.head()

Unnamed: 0,OriginalDate,Point,Count
0,08.07.2016,Ульяновск-2,224.535782
1,25.07.2016,Балашиха-1,222.713669
2,09.11.2014,Ухта-1,101.448448
3,12.11.2015,Альметьевск-1,85.229507
4,17.07.2014,Ухта-1,67.0


In [43]:
originTest.to_csv('submission.csv', index=None, header=None)