In [1]:
# Import libraries

import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from patsy import dmatrix
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import datetime
from dateutil.parser import parse
# Pretty display for notebooks

%matplotlib inline


# Allows the use of display() for DataFrames
from IPython.display import display 

# Ignore the warnings
import warnings
warnings.filterwarnings('ignore')

# Load the dataset
train = pd.read_csv("../asset/train.csv")
test = pd.read_csv("../asset/test.csv")
weather = pd.read_csv("../asset/weather.csv")
key = pd.read_csv("../asset/key.csv")
submission_example = pd.read_csv("../asset/sampleSubmission.csv")

# Success - Display the first record


import pickle
import awesome_functions as cf

# 원본을 유지하기 위해서 카피
df_train = train.copy()
df_weather = weather.copy()
df_key = key.copy()
df_test = test.copy()

# 35번 스토어에서만 팔린 제품

In [2]:
df_exception = train[(train["item_nbr"]==24)|(train["item_nbr"]==63)|(train["item_nbr"]==66)]

In [3]:
df_exception =df_exception[df_exception["store_nbr"]==35]

In [4]:
df_exception.head()

Unnamed: 0,date,store_nbr,item_nbr,units
746165,2012-06-01,35,24,0
746204,2012-06-01,35,63,0
746207,2012-06-01,35,66,9
751160,2012-06-02,35,24,0
751199,2012-06-02,35,63,8


In [5]:
# date타입을 parsing해서 year를 반환
def handle_date_Y(a):
    return parse(a).year

# date타입을 parsing해서 month를 반환
def handle_date_M(a):
    return parse(a).month

# date타입을 parsing해서 day를 반환
def handle_date_D(a):
    return parse(a).day

# date타입을 parsing해서 weekday를 반환
def handle_date_WeekDay(a):
    return parse(a).weekday()

In [6]:
df_exception['year'] = df_exception['date'].apply(handle_date_Y)
df_exception['month'] = df_exception['date'].apply(handle_date_M)
df_exception['weekday'] = df_exception['date'].apply(handle_date_WeekDay)

In [7]:
df_exception.head()

Unnamed: 0,date,store_nbr,item_nbr,units,year,month,weekday
746165,2012-06-01,35,24,0,2012,6,4
746204,2012-06-01,35,63,0,2012,6,4
746207,2012-06-01,35,66,9,2012,6,4
751160,2012-06-02,35,24,0,2012,6,5
751199,2012-06-02,35,63,8,2012,6,5


In [8]:
import re
with open('./asset/holiday_names.txt') as file:  
    data = file.readlines()
    holidays = []
    BF = []
    months = ["", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
    for holiday in data:
        if re.findall("BlackFriday", holiday) != []:
            BF.append(holiday.split(" ")[:3])
        else:
            holidays.append(holiday.split(" ")[:3])
    for i in range(len(holidays)): 
        month_idx = months.index(holidays[i][1])
        if month_idx < 10:
            holidays[i][1] = "0" + str(month_idx)
        else:
            holidays[i][1] = str(month_idx)
    for z in range(len(holidays)): 
        if len(holidays[z][2]) == 1: 
            holidays[z][2] = "0" + holidays[z][2]
    for z in range(len(BF)) :
        BF[z][1] = "11"
        
        
# holiday 구분하기 
def isholiday(dt): 
    if dt.split("-") in BF :
        return "black friday"
    elif dt.split("-") in holidays :
        return "holiday"
    else :
        return "regular day"
    
df = df_weather.copy() # weather 데이터프레임

# holiday라는 컬럼추가. 위에서 보듯 세가지로만 구분되어있음.
# from_formular에서 C('holiday')사용하면 3가지 컬럼으로 바뀐다는 의미.
df_exception['holiday'] = df_exception['date'].apply(isholiday) 

In [9]:
df_exception["is_weekend"] = np.where(df_exception["weekday"]<5, 0,1)

In [10]:
df_exception.head()

Unnamed: 0,date,store_nbr,item_nbr,units,year,month,weekday,holiday,is_weekend
746165,2012-06-01,35,24,0,2012,6,4,regular day,0
746204,2012-06-01,35,63,0,2012,6,4,regular day,0
746207,2012-06-01,35,66,9,2012,6,4,regular day,0
751160,2012-06-02,35,24,0,2012,6,5,regular day,1
751199,2012-06-02,35,63,8,2012,6,5,regular day,1


In [11]:
df_exception["holiday"].unique()

array(['regular day', 'holiday', 'black friday'], dtype=object)

In [12]:
df_exception["is_holiday"] = np.where(df_exception["holiday"]=="regular day", 0,1)

In [13]:
df_exception["log1p"] = np.log1p(df_exception["units"])

In [14]:
df_exception.head()

Unnamed: 0,date,store_nbr,item_nbr,units,year,month,weekday,holiday,is_weekend,is_holiday,log1p
746165,2012-06-01,35,24,0,2012,6,4,regular day,0,0,0.0
746204,2012-06-01,35,63,0,2012,6,4,regular day,0,0,0.0
746207,2012-06-01,35,66,9,2012,6,4,regular day,0,0,2.302585
751160,2012-06-02,35,24,0,2012,6,5,regular day,1,0,0.0
751199,2012-06-02,35,63,8,2012,6,5,regular day,1,0,2.197225


In [15]:
model = sm.OLS.from_formula("log1p ~ C(item_nbr):C(year):C(month) + C(item_nbr):C(is_holiday) + C(item_nbr):C(is_weekend) + 0", data=df_exception)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.871
Model:                            OLS   Adj. R-squared:                  0.866
Method:                 Least Squares   F-statistic:                     187.1
Date:                Fri, 06 Jul 2018   Prob (F-statistic):               0.00
Time:                        14:48:47   Log-Likelihood:                -1668.5
No. Observations:                2553   AIC:                             3517.
Df Residuals:                    2463   BIC:                             4043.
Df Model:                          89                                         
Covariance Type:            nonrobust                                         
                                                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------

In [16]:
matrix_df = pd.DataFrame(dmatrix("C(item_nbr):C(year):C(month) + C(item_nbr):C(is_holiday) + C(item_nbr):C(is_weekend) + 0", data=df_exception))
model = LinearRegression(fit_intercept=True)
result = model.fit(matrix_df,df_exception["log1p"])
result.score(matrix_df,df_exception["log1p"])
cv = KFold(5)
kfold = cross_val_score(result,matrix_df,df_exception["log1p"], scoring="r2", cv=cv)
kfold, kfold.mean()

(array([-7.30237582e+25, -9.51214773e+25, -6.46896766e+23, -1.56609050e+26,
        -5.70387154e+23]), -6.519431387701235e+25)

In [17]:
model = linear_model.Lasso(alpha = 0.01)
result = model.fit(matrix_df,df_exception["log1p"])
result.score(matrix_df,df_exception["log1p"])
cv = KFold(5)
kfold = cross_val_score(result,matrix_df,df_exception["log1p"], scoring="r2", cv=cv)
kfold, kfold.mean()

(array([0.79897529, 0.17999407, 0.38815732, 0.39885093, 0.78307913]),
 0.5098113471161632)

In [18]:
model = sm.OLS.from_formula("log1p ~ C(item_nbr):C(month) + C(item_nbr):C(is_holiday) + C(item_nbr):C(is_weekend) + 0", data=df_exception)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  log1p   R-squared:                       0.706
Model:                            OLS   Adj. R-squared:                  0.701
Method:                 Least Squares   F-statistic:                     147.2
Date:                Fri, 06 Jul 2018   Prob (F-statistic):               0.00
Time:                        14:48:48   Log-Likelihood:                -2720.3
No. Observations:                2553   AIC:                             5525.
Df Residuals:                    2511   BIC:                             5770.
Df Model:                          41                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
C(it

In [19]:
matrix_df = pd.DataFrame(dmatrix("C(item_nbr):C(month) + C(item_nbr):C(is_holiday) + C(item_nbr):C(is_weekend) + 0", data=df_exception))
model = LinearRegression(fit_intercept=True)
result = model.fit(matrix_df,df_exception["log1p"])
result.score(matrix_df,df_exception["log1p"])
cv = KFold(5)
kfold = cross_val_score(result,matrix_df,df_exception["log1p"], scoring="r2", cv=cv)
kfold, kfold.mean()

(array([ 0.70064003, -0.0692108 ,  0.2581782 ,  0.20091234,  0.69064889]),
 0.3562337314436138)

In [20]:
model = linear_model.Lasso(alpha = 0.01)
result = model.fit(matrix_df,df_exception["log1p"])
result.score(matrix_df,df_exception["log1p"])
cv = KFold(5)
kfold = cross_val_score(result,matrix_df,df_exception["log1p"], scoring="r2", cv=cv)
kfold, kfold.mean()

(array([0.60812387, 0.25266849, 0.30300443, 0.37535456, 0.6233774 ]),
 0.43250575043562395)

In [21]:
df_test2 = pd.read_csv("./dt_test2_201807061248.csv")

In [22]:
model = sm.OLS.from_formula("log1p ~ C(item_nbr):C(month) + C(item_nbr):C(is_holiday) + C(item_nbr):C(weekday) + 0", data=df_exception)
result = model.fit_regularized(alpha=0.01, L1_wt=0)
y_new = result.predict(df_test2)

In [23]:
y_new.head()

0    0.658363
1    1.012718
2    1.210705
3    0.658363
4    1.012718
dtype: float64

In [24]:
y_new = pd.DataFrame(y_new,columns=["result"])

In [25]:
y_new.head()

Unnamed: 0,result
0,0.658363
1,1.012718
2,1.210705
3,0.658363
4,1.012718


In [26]:
y_new["result"] = np.exp(y_new["result"])-1

In [27]:
y_new["result"]

0         0.931628
1         1.753074
2         2.355848
3         0.931628
4         1.753074
5         2.355848
6         0.931628
7         1.753074
8         2.355848
9         0.931628
10        1.753074
11        2.355848
12        1.763476
13        2.321529
14        9.972644
15        1.763476
16        2.321529
17        9.972644
18        1.763476
19        2.321529
20        9.972644
21        1.763476
22        2.321529
23        9.972644
24        1.390170
25        2.579164
26        9.239724
27        1.390170
28        2.579164
29        9.239724
           ...    
14211     0.205839
14212     0.322340
14213     9.388565
14214     0.205839
14215     0.322340
14216     9.388565
14217     0.205839
14218     0.322340
14219     9.388565
14220     0.205839
14221     0.322340
14222     9.388565
14223     0.205839
14224     0.322340
14225     9.388565
14226     0.284427
14227     0.448111
14228    11.674939
14229     0.284427
14230     0.448111
14231    11.674939
14232     0.

In [28]:
test_retult2 = pd.concat([df_test2,y_new],axis=1)

In [29]:
test_retult2.head()

Unnamed: 0,date,store_nbr,item_nbr,station_nbr,tmax,tmin,tavg,depart,dewpoint,wetbulb,...,resultdir,avgspeed,year,month,day,weekday,holiday,is_weekend,is_holiday,result
0,2013-04-01,2,24,14,71,42,56,1,41,48,...,4.0,11.0,2013,4,1,0,regular day,0,0,0.931628
1,2013-04-01,2,63,14,71,42,56,1,41,48,...,4.0,11.0,2013,4,1,0,regular day,0,0,1.753074
2,2013-04-01,2,66,14,71,42,56,1,41,48,...,4.0,11.0,2013,4,1,0,regular day,0,0,2.355848
3,2013-04-01,6,24,14,71,42,56,1,41,48,...,4.0,11.0,2013,4,1,0,regular day,0,0,0.931628
4,2013-04-01,6,63,14,71,42,56,1,41,48,...,4.0,11.0,2013,4,1,0,regular day,0,0,1.753074


In [30]:
test_retult2 = test_retult2[["date","store_nbr","item_nbr","result"]]

In [36]:
test_retult2.head()

Unnamed: 0,date,store_nbr,item_nbr,result
0,2013-04-01,2,24,0.931628
1,2013-04-01,2,63,1.753074
2,2013-04-01,2,66,2.355848
3,2013-04-01,6,24,0.931628
4,2013-04-01,6,63,1.753074


In [37]:
test_retult2

Unnamed: 0,date,store_nbr,item_nbr,result
0,2013-04-01,2,24,0.931628
1,2013-04-01,2,63,1.753074
2,2013-04-01,2,66,2.355848
3,2013-04-01,6,24,0.931628
4,2013-04-01,6,63,1.753074
5,2013-04-01,6,66,2.355848
6,2013-04-01,38,24,0.931628
7,2013-04-01,38,63,1.753074
8,2013-04-01,38,66,2.355848
9,2013-04-01,42,24,0.931628


In [32]:
cf.saveDataFrameToCsv(test_retult2,"test_result2")