In [1]:
# Import libraries

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from patsy import dmatrix
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import datetime
from dateutil.parser import parse
# Pretty display for notebooks

%matplotlib inline


# Allows the use of display() for DataFrames
from IPython.display import display 

# Ignore the warnings
import warnings
warnings.filterwarnings('ignore')

# Load the raw CSV files from ../asset (relative to the kernel's working directory).
# NOTE(review): the cell that loads `total` reads from "./asset/..." (single dot),
# while these reads use "../asset/..." — confirm both locations really exist,
# otherwise one of the two cells fails on a fresh kernel.
train = pd.read_csv("../asset/train.csv")
test = pd.read_csv("../asset/test.csv")
weather = pd.read_csv("../asset/weather.csv")
key = pd.read_csv("../asset/key.csv")
submission_example = pd.read_csv("../asset/sampleSubmission.csv")

# NOTE(review): leftover template comment ("Success - Display the first record");
# nothing is displayed by this cell.


import pickle
import awesome_functions as cf

# Work on copies so that the frames loaded from disk stay unmodified
df_train, df_weather, df_key, df_test = (
    frame.copy() for frame in (train, weather, key, test)
)

In [2]:
# Load the pre-built modelling table (timestamped file name: 201807051435).
# NOTE(review): this path starts with "./asset" whereas the raw CSVs in the first
# cell are read from "../asset" — confirm which working directory is expected.
total = pd.read_csv("./asset/total_201807051435.csv")

In [3]:
# Dimensions of `total` as (rows, columns)
total.shape

(229230, 32)

In [4]:
# Column labels available in `total`
total.columns

Index(['date', 'store_nbr', 'item_nbr', 'units', 'log1p', 'date2',
       'station_nbr', 'preciptotal_flag', 'depart_flag', 'weekday',
       'is_weekend', 'is_holiday', 'is_holiday_weekday', 'is_holiday_weekend',
       'day', 'month', 'year', 'holiday_name', 'around_BlackFriday', 'tmax',
       'tmin', 'tavg', 'depart', 'dewpoint', 'wetbulb', 'heat', 'cool',
       'preciptotal', 'stnpressure', 'resultspeed', 'resultdir', 'avgspeed'],
      dtype='object')

In [5]:
# Preview the first five rows of the modelling table
total.head(n=5)

Unnamed: 0,date,store_nbr,item_nbr,units,log1p,date2,station_nbr,preciptotal_flag,depart_flag,weekday,...,depart,dewpoint,wetbulb,heat,cool,preciptotal,stnpressure,resultspeed,resultdir,avgspeed
0,2012-01-01,1,9,29,3.401197,2012-01-01,1,0.0,0.0,6,...,11,36,40,24,0,0.05,29.78,3.6,20.0,4.6
1,2012-01-01,1,28,2,1.098612,2012-01-01,1,0.0,0.0,6,...,11,36,40,24,0,0.05,29.78,3.6,20.0,4.6
2,2012-01-01,1,40,0,0.0,2012-01-01,1,0.0,0.0,6,...,11,36,40,24,0,0.05,29.78,3.6,20.0,4.6
3,2012-01-01,1,47,0,0.0,2012-01-01,1,0.0,0.0,6,...,11,36,40,24,0,0.05,29.78,3.6,20.0,4.6
4,2012-01-01,1,51,1,0.693147,2012-01-01,1,0.0,0.0,6,...,11,36,40,24,0,0.05,29.78,3.6,20.0,4.6


In [6]:
# Distinct values taken by the weekend flag
pd.unique(total["is_weekend"])

array([1, 0], dtype=int64)

In [7]:
# OLS on the `log1p` column: every predictor is interacted with the item dummy,
# and the trailing "+ 0" removes the global intercept from the formula.
model = sm.OLS.from_formula(
    formula=(
        "log1p ~ C(item_nbr):scale(cool)+ "
        "C(item_nbr):scale(heat) + "
        "C(item_nbr):scale(preciptotal) + "
        "C(item_nbr):scale(depart) + "
        "C(item_nbr):C(weekday) + 0"
    ),
    data=total,
)

In [8]:
# Fit the OLS model defined above; `result` is the fitted-results object
result = model.fit()

In [9]:
# Print the plain-text regression summary (fit statistics followed by the coefficient table)
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.860
Model:                            OLS   Adj. R-squared:                  0.859
Method:                 Least Squares   F-statistic:                     1176.
Date:                Fri, 06 Jul 2018   Prob (F-statistic):               0.00
Time:                        15:29:45   Log-Likelihood:            -2.3080e+05
No. Observations:              229230   AIC:                         4.640e+05
Df Residuals:                  228042   BIC:                         4.763e+05
Df Model:                        1187                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

# 해당 모델을 Sklearn을 활용하여 교차검증

In [10]:
# Build the right-hand side of the statsmodels formula as a plain design matrix
# so the same specification can be cross-validated with scikit-learn.
# return_type="dataframe" keeps patsy's term names as column labels; wrapping the
# raw matrix in pd.DataFrame(...) replaced them with 0..n-1, which made fitted
# coefficients impossible to map back to item/feature terms.
# NOTE(review): scale() is evaluated on all rows of `total` here, i.e. before any
# cross-validation split — confirm that this is acceptable for the comparison.
matrix_df = dmatrix(
    "C(item_nbr):scale(cool)+ "
    "C(item_nbr):scale(heat) + "
    "C(item_nbr):scale(preciptotal) + "
    "C(item_nbr):scale(depart) + "
    "C(item_nbr):C(weekday) + 0",
    data=total,
    return_type="dataframe",
)

In [11]:
# Plain least squares without an intercept term, matching the "+ 0" in the formula
model = linear_model.LinearRegression(fit_intercept=False)

In [12]:
# Fit on the full design matrix against the `log1p` target
result = model.fit(X=matrix_df, y=total["log1p"])

In [13]:
# In-sample R^2 (scored on the same rows the model was fitted on)
result.score(X=matrix_df, y=total["log1p"])

0.8595395681031526

In [14]:
# Ten folds. KFold does not shuffle unless asked, so each fold is a contiguous
# slice of `total` in its stored row order.
# NOTE(review): if rows are ordered (the head() preview starts at 2012-01-01),
# folds may differ systematically from each other — consider shuffle=True with a
# fixed random_state.
cv = KFold(n_splits=10)

In [15]:
# Out-of-fold R^2 for each of the folds defined by `cv`
kfold = cross_val_score(
    estimator=result,
    X=matrix_df,
    y=total["log1p"],
    scoring="r2",
    cv=cv,
)

In [16]:
# Per-fold R^2 values together with their mean
kfold, np.mean(kfold)

(array([-3.19445563e+23, -3.67977033e+21,  7.85324707e-01, -6.00033821e+22,
         8.40436742e-01, -7.71048608e+23, -1.96752250e+22,  8.74172282e-01,
        -2.57623019e+22, -4.38873370e+22]), -1.2435021868456443e+23)

# Lasso를 활용하여 정규화한 뒤, 교차검증

In [18]:
# L1-regularised linear model with penalty strength alpha = 0.01.
# NOTE(review): fit_intercept is left at its default (True in scikit-learn), whereas
# the LinearRegression cell above passes fit_intercept=False — confirm this is intended.
model = linear_model.Lasso(alpha = 0.01)

In [19]:
# Fit the Lasso on every row of the design matrix
result = model.fit(X=matrix_df, y=total["log1p"])

In [20]:
# Training-set R^2 of the fitted Lasso
result.score(X=matrix_df, y=total["log1p"])

0.6842481055650056

In [21]:
# Same 10-fold, unshuffled splitter as used for the unregularised model
cv = KFold(n_splits=10)

In [22]:
# Cross-validated R^2 of the Lasso, one value per fold
kfold = cross_val_score(
    estimator=result,
    X=matrix_df,
    y=total["log1p"],
    scoring="r2",
    cv=cv,
)

In [23]:
# Fold-wise scores and their average
kfold, np.mean(kfold)

(array([0.81270983, 0.56045926, 0.58906582, 0.50703572, 0.79559728,
        0.63467041, 0.62473362, 0.82614631, 0.41730078, 0.34265746]),
 0.6110376502470286)

# Month 변수 추가

In [17]:
# Design matrix with an additional per-item month effect.
# return_type="dataframe" keeps patsy's term names as column labels; wrapping the
# raw matrix in pd.DataFrame(...) replaced them with 0..n-1, so coefficients could
# not be traced back to their item/feature terms.
matrix_df = dmatrix(
    "C(item_nbr):scale(cool)+ "
    "C(item_nbr):scale(heat) + "
    "C(item_nbr):scale(preciptotal) + "
    "C(item_nbr):scale(depart) + "
    "C(item_nbr):C(weekday) + "
    "C(item_nbr):C(month)+ 0",
    data=total,
    return_type="dataframe",
)

In [18]:
# Fresh Lasso with the same penalty strength (alpha = 0.01) for the month-augmented matrix.
# NOTE(review): fit_intercept keeps its default (True in scikit-learn), unlike the
# LinearRegression cell earlier, which passes fit_intercept=False — confirm this is intended.
model = linear_model.Lasso(alpha = 0.01)

In [19]:
# Fit the Lasso on the month-augmented design matrix
result = model.fit(X=matrix_df, y=total["log1p"])

In [20]:
# Training-set R^2 with the month terms included.
# NOTE(review): the saved output (0.412) is lower than the 0.684 obtained with
# fewer predictors and the same alpha. Warnings are silenced at the top of the
# notebook, so a solver convergence warning may be hidden — verify before
# trusting this number.
result.score(X=matrix_df, y=total["log1p"])

0.41214555590487995

In [21]:
# 10-fold splitter (contiguous, unshuffled folds) for the month model
cv = KFold(n_splits=10)

In [None]:
# Cross-validated R^2 for the month-augmented Lasso
kfold = cross_val_score(
    estimator=result,
    X=matrix_df,
    y=total["log1p"],
    scoring="r2",
    cv=cv,
)

In [None]:
# Scores for each fold plus the mean across folds
kfold, np.mean(kfold)

# Holiday 변수 추가

In [31]:
# Design matrix: weather, weekday and month terms plus a per-item holiday effect.
# return_type="dataframe" keeps patsy's term names as column labels instead of
# the anonymous 0..n-1 columns produced by pd.DataFrame(dmatrix(...)).
matrix_df = dmatrix(
    "C(item_nbr):scale(cool)+ "
    "C(item_nbr):scale(heat) + "
    "C(item_nbr):scale(preciptotal) + "
    "C(item_nbr):scale(depart) + "
    "C(item_nbr):C(weekday) + "
    "C(item_nbr):C(month) + "
    "C(item_nbr):C(is_holiday) + 0",
    data=total,
    return_type="dataframe",
)
model = linear_model.Lasso(alpha=0.01)
result = model.fit(matrix_df, total["log1p"])

# The in-sample R^2 used to be a bare expression in the middle of the cell, so it
# was computed and silently discarded; keep it and print it explicitly.
train_r2 = result.score(matrix_df, total["log1p"])
print("train R^2:", train_r2)

cv = KFold(10)
kfold = cross_val_score(result, matrix_df, total["log1p"], scoring="r2", cv=cv)

# NOTE(review): the saved output of this cell is identical, digit for digit, to
# the Lasso run without month/holiday terms — it looks stale; re-run on a fresh kernel.
kfold, kfold.mean()

(array([0.81270983, 0.56045926, 0.58906582, 0.50703572, 0.79559728,
        0.63467041, 0.62473362, 0.82614631, 0.41730078, 0.34265746]),
 0.6110376502470286)

In [54]:
# Total units per (store, item); store/item pairs with no rows appear as NaN.
# The aggregation is named by the string "sum": passing the np.sum callable is
# deprecated in recent pandas (FutureWarning), and the string keeps the same result.
total.pivot_table(values="units", index="store_nbr", columns="item_nbr", aggfunc="sum")

item_nbr,1,2,3,4,5,6,7,8,9,10,...,102,103,104,105,106,107,108,109,110,111
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,27396.0,,...,,,,,,,,,,
2,,,,,55104.0,,,,,,...,,,,,,,,,,
3,,1083.0,,,56663.0,,,,,,...,31.0,,,,,,,1394.0,,
4,,,,,,,,,117123.0,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,100.0,,,,,,
6,,,,,,,,,50431.0,,...,,,,,,534.0,,,,
7,,,,,62230.0,,721.0,,,,...,,,,,,,,,,
8,,,,,32647.0,,,,,,...,,,,,,,,,,
9,,,,,45024.0,,,,,745.0,...,,,,73.0,,,,,,
10,,,,,61866.0,,,,,,...,,,,,,,,,,


# 35번 스토어에서만 팔린 제품

In [61]:
# Rows of the raw training data belonging to items 24, 63 or 66
df_exception = train[train["item_nbr"].isin([24, 63, 66])]

In [62]:
# Narrow the selection down to store 35
df_exception = df_exception.loc[df_exception["store_nbr"].eq(35)]

In [65]:
# First five rows of the store-35 selection
df_exception.head(n=5)

Unnamed: 0,date,store_nbr,item_nbr,units
746165,2012-06-01,35,24,0
746204,2012-06-01,35,63,0
746207,2012-06-01,35,66,9
751160,2012-06-02,35,24,0
751199,2012-06-02,35,63,8
