In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.linear_model as linear_model 
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error as mae
import datetime
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_model import ARIMA
%matplotlib inline

In [2]:
train = pd.read_csv('data_original/train.csv', header=None, names=['Date', 'Point', 'Count'])

cities = train.Point.apply(lambda x: x.split('-')[0])
cities_set = cities.unique()
cities = cities.apply(lambda x: np.where(cities_set==x)[0][0])
train.insert(len(train.columns)-1, 'City', cities)

points_set = pd.Series(train.Point).unique()
train.Point = train.Point.apply(lambda x: np.where(points_set==x)[0][0])

train.Date = train.Date.apply(lambda x: pd.to_datetime(x, format='%d.%m.%Y'))

week_day = train.Date.apply(lambda x: x.weekday())
month = train.Date.apply(lambda x: x.month + (x.year - 2000) * 12)
year = train.Date.apply(lambda x: x.year)
start_date = pd.to_datetime('1.1.2000')
week = train.Date.apply(lambda x: int((x - start_date).days / 7))
year_day = train.Date.apply(lambda x: (x-pd.to_datetime(str(x.year)+'-01-01')).days)
monthweek_day = train.Date.apply(lambda x: x.month * 7  + x.weekday())

train.insert(len(train.columns) - 1, 'WeekDay', week_day)
train.insert(len(train.columns) - 1, 'Month', month)
train.insert(len(train.columns) - 1, 'Year', year)
train.insert(len(train.columns) - 1, 'Week', week)
train.insert(len(train.columns) - 1, 'MonthWeekDay', monthweek_day)
train.insert(len(train.columns) - 1, 'YearDay', year_day)
train

Unnamed: 0,Date,Point,City,WeekDay,Month,Year,Week,MonthWeekDay,YearDay,Count
0,2016-08-08,0,0,0,200,2016,866,56,220,42
1,2015-03-07,1,1,5,183,2015,792,26,65,81
2,2016-10-05,2,2,2,202,2016,874,72,278,138
3,2015-02-22,3,3,6,182,2015,790,20,52,94
4,2016-08-06,4,4,5,200,2016,866,61,218,42
5,2016-09-28,5,5,2,201,2016,873,65,271,123
6,2016-09-13,6,6,1,201,2016,871,64,256,186
7,2016-01-29,7,7,4,193,2016,838,11,28,30
8,2016-08-17,8,8,2,200,2016,867,58,229,64
9,2013-08-23,9,7,4,164,2013,711,60,234,48


In [3]:
train = train.sort_values(by=['Date'], axis=0)
start_date = pd.to_datetime('2016-01-01')
sub_train = train.sort_values(by=['YearDay'], axis=0)
to_plot = pd.DataFrame()
unique_city = sub_train.City.unique()

groupped = sub_train.groupby(['YearDay'])['Count']
mean = groupped.mean().sort_values()
edge = [135, 165, 180]
max_days = mean.tail(30).index
train


Unnamed: 0,Date,Point,City,WeekDay,Month,Year,Week,MonthWeekDay,YearDay,Count
6717,2011-10-06,43,7,3,142,2011,613,73,278,8
39451,2011-10-07,43,7,4,142,2011,613,74,279,88
4145,2011-10-09,43,7,6,142,2011,614,76,281,1
832,2011-11-15,43,7,1,143,2011,619,78,318,1
23494,2011-12-08,43,7,3,144,2011,622,87,341,2
11011,2011-12-09,43,7,4,144,2011,622,88,342,43
40849,2011-12-10,43,7,5,144,2011,623,89,343,10
44356,2011-12-11,43,7,6,144,2011,623,90,344,1
22017,2011-12-13,43,7,1,144,2011,623,85,346,60
22219,2011-12-15,43,7,3,144,2011,623,87,348,96


Среднее за месяц, неделю, год, день недели


In [4]:
year_point_mean = train.groupby(['Year', 'Point'])['Count'].mean()
month_point_mean = train.groupby(['Month', 'Point'])['Count'].mean()
week_point_mean = train.groupby(['Week', 'Point'])['Count'].mean()
weekday_point_mean = train.groupby(['WeekDay', 'Point'])['Count'].mean()
monthweekday_point_mean = train.groupby(['MonthWeekDay', 'Point'])['Count'].mean()

train_mean = pd.DataFrame()

def get_year(x, p):
    return year_point_mean[x][p]
def get_month(x, p):
    return month_point_mean[x][p]
def get_week(x, p):
    return week_point_mean[x][p]
def get_weekday(x, p):
    return weekday_point_mean[x][p]
def get_monthweekday(x, p):
    return monthweekday_point_mean[x][p]
train_mean.insert(len(train_mean.columns), 'YearPoint', train[['Year', 'Point']].apply(lambda x: get_year(*x), axis=1))
train_mean.insert(len(train_mean.columns), 'MonthPoint', train[['Month', 'Point']].apply(lambda x: get_month(*x), axis=1))
train_mean.insert(len(train_mean.columns), 'WeekPoint', train[['Week', 'Point']].apply(lambda x: get_week(*x), axis=1))
train_mean.insert(len(train_mean.columns), 'WeekDayPoint', train[['WeekDay', 'Point']].apply(lambda x: get_weekday(*x), axis=1))
train_mean.insert(len(train_mean.columns), 'MonthWeekDayPoint', train[['MonthWeekDay', 'Point']].apply(lambda x: get_monthweekday(*x), axis=1))

train_mean.insert(len(train_mean.columns), 'Count', train.Count)
train_mean

Unnamed: 0,YearPoint,MonthPoint,WeekPoint,WeekDayPoint,MonthWeekDayPoint,Count
6717,83.608696,32.333333,48.000000,213.454128,212.705882,8
39451,83.608696,32.333333,48.000000,288.495614,315.900000,88
4145,83.608696,32.333333,1.000000,263.560185,308.833333,1
832,83.608696,1.000000,1.000000,195.944954,179.533333,1
23494,83.608696,96.052632,22.500000,213.454128,195.750000,2
11011,83.608696,96.052632,22.500000,288.495614,270.421053,43
40849,83.608696,96.052632,60.200000,311.777778,316.947368,10
44356,83.608696,96.052632,60.200000,263.560185,265.421053,1
22017,83.608696,96.052632,60.200000,195.944954,202.157895,60
22219,83.608696,96.052632,60.200000,213.454128,195.750000,96


In [5]:
def trainModelTestTrainSplit(data, model):
    c = data.columns.difference(['Count'])
    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(data[c], data['Count'], test_size=0.3, random_state=0)
    model.verbose=1
    model.fit(X_train.as_matrix(), y_train.as_matrix())
    pred = model.predict(X_test)
    #pred = map(round, pred)
    print ("result ", mae(pred, y_test))
    return model

def trainModelKfold(data, model, countFold = 5):
    meanMae = 0
    
    c = data.columns.difference(['Count'])
    countFold = 5
    kf = KFold(len(data), n_folds=countFold, shuffle=True)
    
    for train_index, test_index in kf:
        X_train, X_test = data[c].loc[train_index], data[c].loc[test_index]
        y_train, y_test = data['Count'].loc[train_index], data['Count'].loc[test_index]
        model.fit(X_train.as_matrix(), y_train.as_matrix())
        pred = model.predict(X_test)
        print (mae(pred, y_test))
        meanMae += mae(pred, y_test)
    meanMae /= countFold
    print ("result ", meanMae)
    return model

In [11]:
linear_regression = linear_model.LinearRegression(n_jobs = -1)
kf = trainModelKfold(train_mean, linear_regression)
tts = trainModelTestTrainSplit(train_mean[['WeekDayPoint', 'Count']], linear_regression)
print (train_mean.columns)
print (kf.coef_, tts.coef_)


20.8964920853
21.1940985058
20.3454379805
20.7114141995
20.8931211593
result  20.8081127861
result  31.2953308967
Index(['YearPoint', 'MonthPoint', 'WeekPoint', 'WeekDayPoint',
       'MonthWeekDayPoint', 'Count'],
      dtype='object')
[ 1.00454886] [ 1.00454886]


In [188]:
train_mean.corr()

Unnamed: 0,YearPoint,MonthPoint,WeekPoint,WeekDayPoint,MonthWeekDayPoint,Count
YearPoint,1.0,0.965297,0.924771,0.826435,0.793343,0.786046
MonthPoint,0.965297,1.0,0.95702,0.799404,0.80022,0.814305
WeekPoint,0.924771,0.95702,1.0,0.769345,0.77622,0.85056
WeekDayPoint,0.826435,0.799404,0.769345,1.0,0.94127,0.717462
MonthWeekDayPoint,0.793343,0.80022,0.77622,0.94127,1.0,0.762228
Count,0.786046,0.814305,0.85056,0.717462,0.762228,1.0
