# Week 1

In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
%matplotlib inline

In [3]:
# Read every input CSV from the notebook's working directory in one pass;
# the unpacking order matches the order of the paths below.
item_categories, sales_train_v2, test, items, shops = (
    pd.read_csv(csv_path)
    for csv_path in (
        './item_categories.csv',
        './sales_train_v2.csv',
        './test.csv',
        './items.csv',
        './shops.csv',
    )
)

In [6]:
# Preview the first five rows of the item-category table.
item_categories.head(n=5)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2
3,Аксессуары - PS4,3
4,Аксессуары - PSP,4


In [7]:
# Preview the first five rows of the daily sales records.
sales_train_v2.head(n=5)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [8]:
# Preview the first five (shop_id, item_id) pairs to predict.
test.head(n=5)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [9]:
# Preview the first five rows of the item table.
items.head(n=5)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,3,40
4,***КОРОБКА (СТЕКЛО) D,4,40


In [10]:
# Preview the first five rows of the shop table.
shops.head(n=5)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2
3,"Балашиха ТРК ""Октябрь-Киномир""",3
4,"Волжский ТЦ ""Волга Молл""",4


In [12]:
# Inspect column dtypes; note that `date` is read as object (plain strings),
# not as a datetime type.
sales_train_v2.dtypes

date               object
date_block_num      int64
shop_id             int64
item_id             int64
item_price        float64
item_cnt_day      float64
dtype: object

In [27]:
# Parse the 'DD.MM.YYYY' strings into datetime64 values with a single
# vectorised call instead of a per-row strptime. This also makes the cell safe
# to re-run: an already-parsed column is passed through unchanged, whereas
# strptime would raise TypeError on Timestamp input.
sales_train_v2['date'] = pd.to_datetime(sales_train_v2['date'], format='%d.%m.%Y')

In [35]:
# Derive calendar features with the vectorised `.dt` accessor rather than a
# Python-level lambda per row (same values, far less overhead).
# NOTE(review): on pandas >= 2.0 these fields are int32 rather than int64.
dayofweek = sales_train_v2['date'].dt.dayofweek  # Monday=0 ... Sunday=6
year = sales_train_v2['date'].dt.year
month = sales_train_v2['date'].dt.month
day = sales_train_v2['date'].dt.day
sales_train_v2['dayofweek'] = dayofweek
sales_train_v2['year'] = year
sales_train_v2['month'] = month
sales_train_v2['day'] = day
# Revenue per row; rows with a negative item_cnt_day yield negative revenue.
sales_train_v2['revenue'] = sales_train_v2['item_price'] * sales_train_v2['item_cnt_day']

sales_train_v2.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,dayofweek,year,month,day,revenue
0,2013-01-02,0,59,22154,999.0,1.0,2,2013,1,2,999.0
1,2013-01-03,0,25,2552,899.0,1.0,3,2013,1,3,899.0
2,2013-01-05,0,25,2552,899.0,-1.0,5,2013,1,5,-899.0
3,2013-01-06,0,25,2554,1709.05,1.0,6,2013,1,6,1709.05
4,2013-01-15,0,25,2555,1099.0,1.0,1,2013,1,15,1099.0


In [47]:
# Collapse the daily rows to one row per (year, month, shop_id, item_id),
# summing units sold and revenue, then label the totals as monthly figures.
month_sales_sum = (
    sales_train_v2
    .groupby(['year', 'month', 'shop_id', 'item_id'], as_index=False)
    [['item_cnt_day', 'revenue']]
    .sum()
    .rename(columns=dict(item_cnt_day='item_cnt_month', revenue='revenue_month'))
)
month_sales_sum

Unnamed: 0,year,month,shop_id,item_id,item_cnt_month,revenue_month
0,2013,1,0,32,6.0,1326.0
1,2013,1,0,33,3.0,1041.0
2,2013,1,0,35,1.0,247.0
3,2013,1,0,43,1.0,221.0
4,2013,1,0,51,2.0,257.0
5,2013,1,0,61,1.0,195.0
6,2013,1,0,75,1.0,76.0
7,2013,1,0,88,1.0,76.0
8,2013,1,0,95,1.0,193.0
9,2013,1,0,96,1.0,70.0


In [48]:
# Average item price per (year, month, shop_id, item_id) group, exposed as
# `mean_price_month`.
month_sales_mean = (
    sales_train_v2
    .groupby(['year', 'month', 'shop_id', 'item_id'], as_index=False)
    ['item_price']
    .mean()
    .rename(columns=dict(item_price='mean_price_month'))
)
month_sales_mean

Unnamed: 0,year,month,shop_id,item_id,mean_price_month
0,2013,1,0,32,221.0
1,2013,1,0,33,347.0
2,2013,1,0,35,247.0
3,2013,1,0,43,221.0
4,2013,1,0,51,128.5
5,2013,1,0,61,195.0
6,2013,1,0,75,76.0
7,2013,1,0,88,76.0
8,2013,1,0,95,193.0
9,2013,1,0,96,70.0


In [67]:
# Join the monthly totals with the monthly mean price on the shared group keys
# (inner join, the pd.merge default).
month_sales = month_sales_sum.merge(
    month_sales_mean,
    how='inner',
    on=['year', 'month', 'shop_id', 'item_id'],
)
# Model inputs are the four key columns; the target is the monthly unit count
# limited to the range [0, 20].
train_features = month_sales[['year', 'month', 'shop_id', 'item_id']]
train_targets = month_sales['item_cnt_month'].clip(lower=0, upper=20)

In [68]:
# Show the first rows of the features and of the targets, one after the other.
print(train_features.head(), train_targets.head(), sep='\n')

   year  month  shop_id  item_id
0  2013      1        0       32
1  2013      1        0       33
2  2013      1        0       35
3  2013      1        0       43
4  2013      1        0       51
0    6.0
1    3.0
2    1.0
3    1.0
4    2.0
Name: item_cnt_month, dtype: float64


In [69]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features,
    train_targets,
    test_size=0.2,
    random_state=42,
)

In [70]:
# Fit a shallow regression tree. random_state is pinned because scikit-learn
# permutes features at every split, so equally good splits could otherwise be
# chosen differently from run to run; 42 matches the seed used for the split.
regt = DecisionTreeRegressor(max_depth=3, random_state=42)
regt.fit(X_train, y_train)
# Clip predictions to the same [0, 20] range that was applied to the targets.
y_predict = regt.predict(X_valid).clip(0, 20)
# RMSE on the held-out validation rows.
rms = sqrt(mean_squared_error(y_valid, y_predict))
print('root mean squared error: {}'.format(rms))

root mean squared error: 2.5205139623040105


In [83]:
# Build the test feature frame with .assign(), which returns a new DataFrame.
# Adding columns directly to the `test[[...]]` selection triggers pandas'
# SettingWithCopyWarning.
# NOTE(review): year=2015 / month=11 is presumably the month being forecast;
# confirm against the task description.
test_features = (
    test[['shop_id', 'item_id']]
    .assign(year=2015, month=11)
    # Same column order as the frame the model was fitted on.
    [['year', 'month', 'shop_id', 'item_id']]
)
# Clip predictions to [0, 20], as was done for the training targets.
test_targets = regt.predict(test_features).clip(0, 20)

In [88]:
# Start from the sample submission file and overwrite its prediction column.
# The predictions are assigned positionally, so this relies on the sample file
# listing rows in the same order as `test`.
submit_frame = pd.read_csv('./sample_submission.csv').assign(item_cnt_month=test_targets)
submit_frame.to_csv('predict_future_sales_test.csv', index=False)