In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_absolute_error

Populating the interactive namespace from numpy and matplotlib


** Загрузим данные **

In [2]:
file_list = ['data_april.csv', 'data_may.csv', 'data_june.csv']

In [3]:
def load_data(f_list):
    first_file = True
    for data_file in f_list:
        data = pd.read_csv(data_file, parse_dates=['hour'], dtype={"area": int16, "trip_count": float32})
        if first_file:
            result = data
            first_file = False
        else:
            result = pd.concat([result, data], ignore_index=True)
    return result    

In [4]:
data = load_data(file_list)

In [5]:
data.index = data.hour
data.drop(['hour'], axis = 1, inplace=True)

In [6]:
data.head(7)

Unnamed: 0_level_0,area,trip_count
hour,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-04-01 00:00:00,1075,46.0
2016-04-01 01:00:00,1075,25.0
2016-04-01 02:00:00,1075,12.0
2016-04-01 03:00:00,1075,8.0
2016-04-01 04:00:00,1075,9.0
2016-04-01 05:00:00,1075,7.0
2016-04-01 06:00:00,1075,16.0


In [7]:
print data.index.min(), '-', data.index.max()

2016-04-01 00:00:00 - 2016-06-30 23:00:00


In [8]:
regions_list = data.area.unique()
regions_list

array([1075, 1076, 1077, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132,
       1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182,
       1183, 1184, 1221, 1222, 1223, 1224, 1225, 1227, 1228, 1229, 1230,
       1231, 1232, 1233, 1234, 1235, 1272, 1273, 1274, 1278, 1279, 1280,
       1281, 1282, 1283, 1284, 1285, 1286, 1287, 1326, 1327, 1331, 1332,
       1333, 1334, 1335, 1336, 1337, 1338, 1339, 1376, 1377, 1378, 1380,
       1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1426, 1431,
       1434, 1435, 1436, 1437, 1438, 1439, 1441, 1442, 1480, 1482, 1483,
       1530, 1532, 1533, 1580, 1630, 1684, 1733, 1734, 1783, 2068, 2069,
       2118, 2119, 2168])

In [9]:
train_begin = '2016-04-01 00:00:00'
train_end = '2016-04-30 23:00:00'

test_begin = '2016-05-01 00:00:00'
test_end = '2016-05-31 23:00:00'

verification_begin = '2016-06-01 00:00:00'
verification_end = '2016-06-30 23:00:00'

** Создаём набор признаков **

In [10]:
def make_features(data):
    #value = pd.DataFrame()
    value = data
    
    # часы
    value['hour'] = data.index.hour
    # дни недели
    value['weekday'] = data.index.weekday
    # выходные
    value['weekend'] = value['weekday'].isin([5, 6]).astype(int)
    # дни месяца
    value['day'] = data.index.day
    # месяц
    value['month'] = data.index.month    
    
    # суммарное количество поездок за предшествующие полдня
    value['prev_12h_sum'] = data.trip_count.rolling(12).sum()
    # суммарное количество поездок за предшествующий день
    value['prev_24h_sum'] = data.trip_count.rolling(24).sum()
    # суммарное количество поездок за предшествующую неделю
    value['prev_Week_sum'] = data.trip_count.rolling(168).sum()
    
    # количество поездок в моменты времени Yt-6
    value['prev_6h'] = data.trip_count.shift(6).values
    # количество поездок в моменты времени Yt-8
    value['prev_7h'] = data.trip_count.shift(7).values
    # количество поездок в моменты времени Yt-10
    value['prev_8h'] = data.trip_count.shift(8).values
    # количество поездок в моменты времени Yt-12
    value['prev_9h'] = data.trip_count.shift(9).values
    
    # количество поездок в моменты времени Yt-48
    value['prev_48h'] = data.trip_count.shift(48).values
    # количество поездок в моменты времени Yt-72
    value['prev_72h'] = data.trip_count.shift(72).values
        
    # Добавляем целевые значения для каждой из 6 можделей
    for i in xrange(1, 7):
        value['target_' + str(i)] = data.trip_count.shift(-i).values
    
    value.fillna(0, inplace=True)
    
    return value

In [11]:
frame = make_features(data)

In [12]:
#frame.index = frame.date_time
#frame.fillna(0, inplace=True)

In [13]:
frame.head(10)

Unnamed: 0_level_0,area,trip_count,hour,weekday,weekend,day,month,prev_12h_sum,prev_24h_sum,prev_Week_sum,...,prev_8h,prev_9h,prev_48h,prev_72h,target_1,target_2,target_3,target_4,target_5,target_6
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-04-01 00:00:00,1075,46.0,0,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,25.0,12.0,8.0,9.0,7.0,16.0
2016-04-01 01:00:00,1075,25.0,1,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,12.0,8.0,9.0,7.0,16.0,43.0
2016-04-01 02:00:00,1075,12.0,2,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,9.0,7.0,16.0,43.0,70.0
2016-04-01 03:00:00,1075,8.0,3,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.0,7.0,16.0,43.0,70.0,59.0
2016-04-01 04:00:00,1075,9.0,4,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.0,16.0,43.0,70.0,59.0,72.0
2016-04-01 05:00:00,1075,7.0,5,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,16.0,43.0,70.0,59.0,72.0,86.0
2016-04-01 06:00:00,1075,16.0,6,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,43.0,70.0,59.0,72.0,86.0,110.0
2016-04-01 07:00:00,1075,43.0,7,4,0,1,4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,70.0,59.0,72.0,86.0,110.0,104.0
2016-04-01 08:00:00,1075,70.0,8,4,0,1,4,0.0,0.0,0.0,...,46.0,0.0,0.0,0.0,59.0,72.0,86.0,110.0,104.0,93.0
2016-04-01 09:00:00,1075,59.0,9,4,0,1,4,0.0,0.0,0.0,...,25.0,46.0,0.0,0.0,72.0,86.0,110.0,104.0,93.0,111.0


---

In [14]:
features_list = ['hour', 'weekday', 'weekend', 'day', 'month',
                 'prev_12h_sum', 'prev_24h_sum', 'prev_Week_sum',
                 'prev_6h', 'prev_7h', 'prev_8h', 'prev_9h', 'prev_48h', 'prev_72h',
                 'area', 'trip_count', 'target_1', 'target_2', 'target_3', 'target_4',
                 'target_5', 'target_6']

---

In [31]:
area_Q = []
total_Q = []

for area in regions_list:
        
    train_ = train[train.area == area]
    test_ = test[test.area == area]    
        
    for i in xrange(1,7): 
        model_fitted = model.fit(train_[features_list].values, train_['target_' + str(i)].values)
        predictions = model_fitted.predict(test_[features_list].values)
        Q = mean_absolute_error(test_['target_' + str(i)], predictions)
    
        area_Q.append(Q)
        mean_area_Q = np.array(area_Q).mean()

        total_Q.append(mean_area_Q) 

print 'Q_may = ',np.array(total_Q).mean()

Q_may =  0.0540731739438


---