# 제목
### 부제목

#### **목차**
* __1. 데이터 전처리__
* __2. 데이터 살펴보기__
* __3. 날씨가 구매에 영향을 미칠까__
* __4. 날씨가 sns 언급횟수에 영향을 미칠까__ 

#### **사용 데이터 목록**

* buy2018_1.csv
* buy2018_2.csv
* buy2019_1.csv
* buy2019_2.csv
* sns2018_1.csv
* sns2018_2.csv
* sns2019_1.csv
* sns2019_2.csv

In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

In [None]:
# 그래프 한글 폰트 설정 (한번 돌린 후 런타임 다시시작하고 다시 돌려야 제대로 나옴)

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

%config InlineBackend.figure_format = 'retina'
!apt -qq -y install fonts-nanum

fontpath =  '/usr/share/fonts/un-core/UnDotum.ttf',
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='UnDotum') 
mpl.font_manager._rebuild()

from IPython.display import set_matplotlib_formats
#폰트 선명하게 보이게 하기 위해
set_matplotlib_formats('retina')

## Test
plt.figure(figsize=(2,2))
plt.text(0.3, 0.3, '한글 폰트 테스트', size=15)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

## __1. 데이터 전처리__ 

### __1-1. buy, sns 데이터 전처리__

In [None]:
# Load the purchase (buy) and SNS-mention (sns) tables, two files per year.
# pd.read_csv already returns DataFrame objects, so no pd.DataFrame(...) re-wrapping is needed.
buy2018_1 = pd.read_csv("buy2018_1.csv")
buy2018_2 = pd.read_csv("buy2018_2.csv")
buy2019_1 = pd.read_csv("buy2019_1.csv")
buy2019_2 = pd.read_csv("buy2019_2.csv")

sns2018_1 = pd.read_csv("sns2018_1.csv")
sns2018_2 = pd.read_csv("sns2018_2.csv")
sns2019_1 = pd.read_csv("sns2019_1.csv")
sns2019_2 = pd.read_csv("sns2019_2.csv")

In [None]:
def _drop_unnamed_and_rename(frame, names):
    """Return frame without its 'Unnamed: 0' column and with columns relabelled to names."""
    cleaned = frame.drop(columns="Unnamed: 0")
    cleaned.columns = names
    return cleaned


# Short column labels shared by all purchase tables / all SNS tables
buy_columns = ['date', 'sex', 'age', 'bcat', 'scat', 'qty']
sns_columns = ['date', 'bcat', 'scat', 'cnt']

buy2018_1 = _drop_unnamed_and_rename(buy2018_1, buy_columns)
buy2018_2 = _drop_unnamed_and_rename(buy2018_2, buy_columns)
buy2019_1 = _drop_unnamed_and_rename(buy2019_1, buy_columns)
buy2019_2 = _drop_unnamed_and_rename(buy2019_2, buy_columns)

sns2018_1 = _drop_unnamed_and_rename(sns2018_1, sns_columns)
sns2018_2 = _drop_unnamed_and_rename(sns2018_2, sns_columns)
sns2019_1 = _drop_unnamed_and_rename(sns2019_1, sns_columns)
sns2019_2 = _drop_unnamed_and_rename(sns2019_2, sns_columns)

In [None]:
# Stack the two half-year files of each year into one table with a fresh 0..n-1 index
buy2018 = pd.concat([buy2018_1, buy2018_2], axis=0, ignore_index=True)
buy2019 = pd.concat([buy2019_1, buy2019_2], axis=0, ignore_index=True)

sns2018 = pd.concat([sns2018_1, sns2018_2], axis=0, ignore_index=True)
sns2019 = pd.concat([sns2019_1, sns2019_2], axis=0, ignore_index=True)

In [None]:
# Parse the YYYYMMDD 'date' column of every table into datetimes (in place)
for frame in (buy2018, buy2019, sns2018, sns2019):
    frame["date"] = pd.to_datetime(frame["date"], format='%Y%m%d')

In [None]:
buy2018.head()

### __1-2. 날씨 데이터 전처리__

In [None]:
ta18 = pd.read_csv("ta18.csv")  # temperature
rhm18 = pd.read_csv("rhm18.csv")  # relative humidity
cl18 = pd.read_csv("cloud18.csv")  # cloud cover

# Precipitation: one file per month, rn18_1.csv ... rn18_12.csv
(rn18_1, rn18_2, rn18_3, rn18_4, rn18_5, rn18_6,
 rn18_7, rn18_8, rn18_9, rn18_10, rn18_11, rn18_12) = [
    pd.read_csv(f"rn18_{month}.csv") for month in range(1, 13)
]

In [None]:
ta19 = pd.read_csv("ta19.csv")  # temperature
rhm19 = pd.read_csv("rhm19.csv")  # relative humidity
cl19 = pd.read_csv("cloud19.csv")  # cloud cover

# Precipitation: one file per month, rn19_1.csv ... rn19_12.csv
(rn19_1, rn19_2, rn19_3, rn19_4, rn19_5, rn19_6,
 rn19_7, rn19_8, rn19_9, rn19_10, rn19_11, rn19_12) = [
    pd.read_csv(f"rn19_{month}.csv") for month in range(1, 13)
]

In [None]:
# The weather tables loaded above with pd.read_csv are already DataFrame objects,
# so no additional pd.DataFrame(...) conversion is required before using them.

In [None]:
# Keep only the timestamp column ('tma', relabelled 'date') and the measurement column
ta18 = ta18[["tma", "avg_ta"]].rename(columns={"tma": "date"})
ta19 = ta19[["tma", "avg_ta"]].rename(columns={"tma": "date"})

rhm18 = rhm18[["tma", "avg_rhm"]].rename(columns={"tma": "date"})
rhm19 = rhm19[["tma", "avg_rhm"]].rename(columns={"tma": "date"})

# The cloud measurement 'avg_tca' is additionally relabelled 'avg_cl'
cl18 = cl18[["tma", "avg_tca"]].rename(columns={"tma": "date", "avg_tca": "avg_cl"})
cl19 = cl19[["tma", "avg_tca"]].rename(columns={"tma": "date", "avg_tca": "avg_cl"})

In [None]:
# Stack the twelve monthly precipitation files of each year, then keep only the
# timestamp ('tma', relabelled 'date') and the 'sum_rn' column
rn18 = pd.concat(
    [rn18_1, rn18_2, rn18_3, rn18_4, rn18_5, rn18_6,
     rn18_7, rn18_8, rn18_9, rn18_10, rn18_11, rn18_12],
    axis=0,
)[["tma", "sum_rn"]].rename(columns={"tma": "date"})

rn19 = pd.concat(
    [rn19_1, rn19_2, rn19_3, rn19_4, rn19_5, rn19_6,
     rn19_7, rn19_8, rn19_9, rn19_10, rn19_11, rn19_12],
    axis=0,
)[["tma", "sum_rn"]].rename(columns={"tma": "date"})

In [None]:
# Parse each timestamp and keep only its calendar date, so that the rows of one
# day can be grouped together afterwards
for frame in (ta18, rhm18, cl18, rn18, ta19, rhm19, cl19, rn19):
    parsed = pd.to_datetime(frame["date"], format='%Y-%m-%d %H:%M:%S')
    frame["date"] = parsed.dt.date

In [None]:
# Collapse every weather table to one row per date by averaging its measurements
ta18, rhm18, cl18, rn18 = [
    frame.groupby("date").mean() for frame in (ta18, rhm18, cl18, rn18)
]
ta19, rhm19, cl19, rn19 = [
    frame.groupby("date").mean() for frame in (ta19, rhm19, cl19, rn19)
]

In [None]:
weather18=pd.concat([ta18,rhm18,cl18,rn18],axis=1)
weather19=pd.concat([ta19,rhm19,cl19,rn19],axis=1)

In [None]:
weather18.columns = ['avg_ta','avg_rhm','avg_tca','sum_rn']
weather19.columns = ['avg_ta','avg_rhm','avg_tca','sum_rn']

In [None]:
weather18.isnull().sum()

In [None]:
weather18["sum_rn"]=weather18["sum_rn"].fillna(0)

In [None]:
weather19.isnull().sum()

In [None]:
weather19["sum_rn"]=weather19["sum_rn"].fillna(0)

In [None]:
# Write the combined daily weather tables to CSV files.
# NOTE(review): index=False omits the date index from the files, so the saved rows carry
# no date. The modelling cells later concatenate weather18/weather19 with the daily
# purchase totals by row position -- confirm that every source covers exactly the same dates.
weather18.to_csv("weather18.csv", index=False)
weather19.to_csv("weather19.csv", index=False)

## __2. 데이터 살펴보기__ 

In [None]:
buy2018['sex'].value_counts()

In [None]:
buy2018['age'].value_counts()

In [None]:
buy2018['bcat'].value_counts()

In [None]:
buy2019['sex'].value_counts()

In [None]:
buy2019['age'].value_counts()

In [None]:
buy2019['bcat'].value_counts()

### __2-1.뷰티 카테고리__

In [None]:
# Beauty ('뷰티') rows of the purchase and SNS tables, re-indexed from 0
buy2018_b = buy2018.loc[buy2018["bcat"] == "뷰티"].reset_index(drop=True)
buy2019_b = buy2019.loc[buy2019["bcat"] == "뷰티"].reset_index(drop=True)
sns2018_b = sns2018.loc[sns2018["bcat"] == "뷰티"].reset_index(drop=True)
sns2019_b = sns2019.loc[sns2019["bcat"] == "뷰티"].reset_index(drop=True)

#### 2018년

In [None]:
# Daily time series of beauty purchase quantity (all of 2018).
# Only 'qty' is summed; aggregating the whole frame would also try to sum the other columns.
b_18=buy2018_b.groupby(by='date')[['qty']].sum()
plt.figure(figsize=(20,10))
plt.plot(b_18['qty'])
plt.show()

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_b['sex'])

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_b['age'])

In [None]:
buy2018_b.scat.value_counts().sort_values(ascending=False)[:10]

In [None]:
# Rows belonging to the ten sub-categories listed below
b_top10 = buy2018_b[buy2018_b['scat'].isin([
    '기초 화장용 크림',
    '샴푸',
    '헤어스타일링용 염색약',
    '기초 화장용 에센스',
    '기초 화장용 로션',
    '스킨케어 마스크팩',
    '기초 화장용 스킨',
    '클렌징 폼',
    '화장 비누',
    '기능성 아이케어 화장품',
])]

In [None]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
sns.barplot(data=b_top10, x='scat', hue='sex', y='qty', estimator=np.sum)

__소카테고리 분류__

In [None]:
def findcat(i):
    """Map a sub-category name to a coarser group label.

    The keywords are tried in order and the first one contained in ``i``
    decides the group. The names "샴푸" and "트리트먼트" only match exactly
    and share one group. Anything else is labelled "기타".
    """
    # (keyword contained in the name, resulting group); order matters
    keyword_groups = (
        ("기능성", "기능성"),
        ("기초", "스킨케어"),
        ("스킨케어", "스킨케어"),
        ("남성", "남성"),
        ("네일", "네일"),
        ("바디", "바디"),
        ("메이크업", "메이크업"),
        ("뷰티", "뷰티"),
        ("클렌징", "클렌징"),
        ("헤어", "헤어"),
        ("선", "선"),
        ("풋", "풋"),
        ("핸드", "핸드"),
    )
    for keyword, group in keyword_groups:
        if keyword in i:
            return group
    if i == "샴푸" or i == "트리트먼트":
        return "샴푸/트리트먼트"
    return "기타"

In [None]:
# Add a coarse 'cat' group column derived from each row's sub-category name
for frame in (buy2018_b, buy2019_b, sns2018_b, sns2019_b):
    frame["cat"] = frame["scat"].apply(findcat)

In [None]:
buy2018_b.cat.value_counts().sort_values(ascending=False)

In [None]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_b['cat'])

In [None]:
# Total quantity per (date, cat) pair as a flat table with columns date, cat, qty
buy2018_b_gb = buy2018_b.groupby(['date', 'cat'])['qty'].sum().reset_index()

In [None]:
plt.figure(figsize=(10,5))
for cat in buy2018_b["cat"].unique():
    d_ = buy2018_b_gb[(buy2018_b_gb["cat"]==cat)]
    plt.plot(d_["date"], d_["qty"], "-", label=str(cat), alpha=.8)
plt.grid()
plt.legend(bbox_to_anchor=(1.15, 1),loc='upper right', fontsize='xx-small')
plt.xticks(rotation=90)
plt.show()

#### 2019년

In [None]:
# Daily time series of beauty purchase quantity (all of 2019).
# Only 'qty' is summed; aggregating the whole frame would also try to sum the other columns.
b_19=buy2019_b.groupby(by='date')[['qty']].sum()
plt.figure(figsize=(20,10))
plt.plot(b_19['qty'])
plt.show()

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2019_b['sex'])

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2019_b['age'])

In [None]:
buy2019_b.scat.value_counts().sort_values(ascending=False)[:10]

In [None]:
# Rows belonging to the ten sub-categories listed below
b_top10 = buy2019_b[buy2019_b['scat'].isin([
    '기초 화장용 에센스',
    '기초 화장용 크림',
    '샴푸',
    '헤어스타일링용 염색약',
    '기초 화장용 로션',
    '기초 화장용 스킨',
    '화장 비누',
    '클렌징 폼',
    '트리트먼트',
    '스킨케어 마스크팩',
])]

In [None]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
sns.barplot(data=b_top10, x='scat', hue='sex', y='qty', estimator=np.sum)

__소카테고리 분류__

In [None]:
buy2019_b.cat.value_counts().sort_values(ascending=False)

In [None]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2019_b['cat'])

In [None]:
# Total quantity per (date, cat) pair as a flat table with columns date, cat, qty
buy2019_b_gb = buy2019_b.groupby(['date', 'cat'])['qty'].sum().reset_index()

In [None]:
plt.figure(figsize=(10,5))
for cat in buy2019_b["cat"].unique():
    d_ = buy2019_b_gb[(buy2019_b_gb["cat"]==cat)]
    plt.plot(d_["date"], d_["qty"], "-", label=str(cat), alpha=.8)
plt.grid()
plt.legend(bbox_to_anchor=(1.15, 1),loc='upper right', fontsize='xx-small')
plt.xticks(rotation=90)
plt.show()

### __2-2. 냉난방 카테고리__

#### __2018년__

In [None]:
#2018 구매 냉난방만 추출
buy2018_EA=buy2018[(buy2018['bcat']=='냉난방가전')]

In [None]:
# Distribution by sex
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_EA['sex'])

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_EA['age'])

In [None]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_EA['scat'])

In [None]:
# Keep only the eight sub-categories selected as the 2018 top 8 by purchase count
buy18_EA_T8 = buy2018_EA[buy2018_EA['scat'].isin([
    '온열매트',
    '공기정화 용품',
    '공기청정기',
    '히터',
    '초음파식 가습기',
    '온수매트',
    '휴대용 선풍기',
    '의류건조기',
])]

In [None]:
plt.figure(figsize=(12,5))
plt.xticks(rotation=90)
sns.barplot(data=buy18_EA_T8, x='scat', hue='sex', y='qty', estimator=np.sum)

In [None]:
# Daily time series of heating/cooling appliance purchase quantity (all of 2018).
# Only 'qty' is summed; aggregating the whole frame would also try to sum the other columns.
date3=buy2018_EA.groupby(by='date')[['qty']].sum()
plt.figure(figsize=(20,10))
plt.plot(date3['qty'])
plt.show()

#### __2019년__

In [None]:
# Keep only heating/cooling appliance purchases of 2019.
# '냉난방가전' is a value of the main-category column 'bcat' (as in the 2018 cell), not of 'scat'.
buy2019_EA=buy2019[(buy2019['bcat']=='냉난방가전')]

In [None]:
# Distribution by sex
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2019_EA['sex'])

In [None]:
# Daily time series of heating/cooling appliance purchase quantity in 2019.
# Uses the category subset buy2019_EA (mirroring the 2018 plot) rather than all purchases.
date4=buy2019_EA.groupby(by='date')['qty'].sum()
plt.figure(figsize=(20,10))
plt.plot(date4)
plt.show()

### __2-3. 식품 카테고리__

In [None]:
# Food ('식품') rows of the purchase and SNS tables, re-indexed from 0
buy2018_f = buy2018.loc[buy2018["bcat"] == "식품"].reset_index(drop=True)
buy2019_f = buy2019.loc[buy2019["bcat"] == "식품"].reset_index(drop=True)
sns2018_f = sns2018.loc[sns2018["bcat"] == "식품"].reset_index(drop=True)
sns2019_f = sns2019.loc[sns2019["bcat"] == "식품"].reset_index(drop=True)

#### 2018년

In [None]:
# Daily time series of food purchase quantity (all of 2018).
# Only 'qty' is summed; aggregating the whole frame would also try to sum the other columns.
f_18=buy2018_f.groupby(by='date')[['qty']].sum()
plt.figure(figsize=(20,10))
plt.plot(f_18['qty'])
plt.show()

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_f['sex'])

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2018_f['age'])

In [None]:
buy2018_f.scat.value_counts().sort_values(ascending=False)[:10]

In [None]:
# Rows belonging to the ten sub-categories listed below
f_top10 = buy2018_f[buy2018_f['scat'].isin([
    '두유',
    '쌀',
    '생수',
    '인스턴트커피',
    '비타민',
    '회',
    '한방 분말/환제품',
    '홍삼액/홍삼정',
    '프로바이오틱스',
    '탄산음료',
])]

In [None]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
sns.barplot(data=f_top10, x='scat', hue='sex', y='qty', estimator=np.sum)

In [None]:
# Total quantity per (date, scat) pair as a flat table with columns date, scat, qty
f_top10_gb = f_top10.groupby(['date', 'scat'])['qty'].sum().reset_index()

In [None]:
plt.figure(figsize=(10,5))
for cat in f_top10_gb["scat"].unique():
    d_ = f_top10_gb[(f_top10_gb["scat"]==cat)]
    plt.plot(d_["date"], d_["qty"], "-", label=str(cat), alpha=.8)
plt.grid()
plt.legend(bbox_to_anchor=(1.15, 1),loc='upper right', fontsize='xx-small')
plt.xticks(rotation=90)
plt.show()

#### 2019년

In [None]:
# Daily time series of food purchase quantity (all of 2019).
# Only 'qty' is summed; aggregating the whole frame would also try to sum the other columns.
f_19=buy2019_f.groupby(by='date')[['qty']].sum()
plt.figure(figsize=(20,10))
plt.plot(f_19['qty'])
plt.show()

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2019_f['sex'])

In [None]:
# Pass the column through the x keyword; a positional Series is interpreted differently across seaborn versions
sns.countplot(x=buy2019_f['age'])

In [None]:
buy2019_f.scat.value_counts().sort_values(ascending=False)[:10]

In [None]:
# Rows belonging to the ten sub-categories listed below
f_top10 = buy2019_f[buy2019_f['scat'].isin([
    '원두커피',
    '루테인/눈 영양제',
    '프로바이오틱스',
    '커피음료',
    '두유',
    '비타민',
    '인스턴트커피',
    '생수',
    '한방 분말/환제품',
    '흰우유',
])]

In [None]:
plt.figure(figsize=(10,5))
plt.xticks(rotation=90)
sns.barplot(data=f_top10, x='scat', hue='sex', y='qty', estimator=np.sum)

In [None]:
# Total quantity per (date, scat) pair as a flat table with columns date, scat, qty
f_top10_gb = f_top10.groupby(['date', 'scat'])['qty'].sum().reset_index()

In [None]:
plt.figure(figsize=(10,5))
for cat in f_top10_gb["scat"].unique():
    d_ = f_top10_gb[(f_top10_gb["scat"]==cat)]
    plt.plot(d_["date"], d_["qty"], "-", label=str(cat), alpha=.8)
plt.grid()
plt.legend(bbox_to_anchor=(1.15, 1),loc='upper right', fontsize='xx-small')
plt.xticks(rotation=90)
plt.show()

# 여기부터 실행하시면 될거예요

In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


from datetime import datetime
from sklearn.preprocessing import MinMaxScaler

In [None]:
# 그래프 한글 폰트 설정 (한번 돌린 후 런타임 다시시작하고 다시 돌려야 제대로 나옴)

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

%config InlineBackend.figure_format = 'retina'
!apt -qq -y install fonts-nanum

fontpath =  '/usr/share/fonts/un-core/UnDotum.ttf',
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='UnDotum') 
mpl.font_manager._rebuild()

from IPython.display import set_matplotlib_formats
#폰트 선명하게 보이게 하기 위해
set_matplotlib_formats('retina')

## Test
plt.figure(figsize=(2,2))
plt.text(0.3, 0.3, '한글 폰트 테스트', size=15)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the purchase (buy) and SNS-mention (sns) tables, two files per year.
# pd.read_csv already returns DataFrame objects, so no pd.DataFrame(...) re-wrapping is needed.
buy2018_1 = pd.read_csv("buy2018_1.csv")
buy2018_2 = pd.read_csv("buy2018_2.csv")
buy2019_1 = pd.read_csv("buy2019_1.csv")
buy2019_2 = pd.read_csv("buy2019_2.csv")

sns2018_1 = pd.read_csv("sns2018_1.csv")
sns2018_2 = pd.read_csv("sns2018_2.csv")
sns2019_1 = pd.read_csv("sns2019_1.csv")
sns2019_2 = pd.read_csv("sns2019_2.csv")

In [None]:
def _drop_unnamed_and_rename(frame, names):
    """Return frame without its 'Unnamed: 0' column and with columns relabelled to names."""
    cleaned = frame.drop(columns="Unnamed: 0")
    cleaned.columns = names
    return cleaned


# Short column labels shared by all purchase tables / all SNS tables
buy_columns = ['date', 'sex', 'age', 'bcat', 'scat', 'qty']
sns_columns = ['date', 'bcat', 'scat', 'cnt']

buy2018_1 = _drop_unnamed_and_rename(buy2018_1, buy_columns)
buy2018_2 = _drop_unnamed_and_rename(buy2018_2, buy_columns)
buy2019_1 = _drop_unnamed_and_rename(buy2019_1, buy_columns)
buy2019_2 = _drop_unnamed_and_rename(buy2019_2, buy_columns)

sns2018_1 = _drop_unnamed_and_rename(sns2018_1, sns_columns)
sns2018_2 = _drop_unnamed_and_rename(sns2018_2, sns_columns)
sns2019_1 = _drop_unnamed_and_rename(sns2019_1, sns_columns)
sns2019_2 = _drop_unnamed_and_rename(sns2019_2, sns_columns)

In [None]:
# Stack the two half-year files of each year into one table with a fresh 0..n-1 index
buy2018 = pd.concat([buy2018_1, buy2018_2], axis=0, ignore_index=True)
buy2019 = pd.concat([buy2019_1, buy2019_2], axis=0, ignore_index=True)

sns2018 = pd.concat([sns2018_1, sns2018_2], axis=0, ignore_index=True)
sns2019 = pd.concat([sns2019_1, sns2019_2], axis=0, ignore_index=True)

In [None]:
# Parse the YYYYMMDD 'date' column of every table into datetimes (in place)
for frame in (buy2018, buy2019, sns2018, sns2019):
    frame["date"] = pd.to_datetime(frame["date"], format='%Y%m%d')

In [None]:
weather18=pd.read_csv("weather18.csv")
weather19=pd.read_csv("weather19.csv")


## __3. 날씨가 구매에 영향을 미칠까__ 

### __3-1. 2018년도 구매량__

In [None]:
time1=buy2018.groupby(by='date')['qty'].sum()
plt.figure(figsize=(20,10))
plt.plot(time1)
plt.show()

### __3-1-1. 2018년도 전체 구매량__

In [None]:
# Daily totals of purchase quantity and SNS mentions, re-indexed 0..n-1 so that
# pd.concat(axis=1) lines them up by row position with weather18
# (assumes weather18 has a default 0..n-1 index with one row per date -- TODO confirm)
b18_tot = buy2018.groupby("date")["qty"].sum().reset_index(drop=True)
s18_tot = sns2018.groupby("date")["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features: every column after 'buy' and 'sns'; target: 'buy'
X18 = df18.iloc[:, 2:]
y18 = df18["buy"]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale the features to [0, 1]
scaler=MinMaxScaler()
X18_scaled=scaler.fit_transform(X18)
# The target gets its own scaler so the feature scaler keeps its fitted state.
# MinMaxScaler only accepts 2-D input, so the Series is passed as a one-column
# frame and the result is flattened back to 1-D for the regressors.
y_scaler=MinMaxScaler()
y18_scaled=y_scaler.fit_transform(y18.to_frame()).ravel()

#### Random Forest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X18_scaled,y18_scaled,test_size=0.2,random_state=0)

In [None]:
params={
    'n_estimators':[500],
    'max_depth':[6,8,10,12],
    'min_samples_leaf':[8,12,18],
    'min_samples_split':[8,16,20]
}

rf_reg=RandomForestRegressor(random_state=0,n_jobs=-1)
grid_cv=GridSearchCV(rf_reg,param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train,y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
rf_rega=RandomForestRegressor(n_estimators=500, max_depth=6, min_samples_leaf=18, min_samples_split=8,random_state=0)
rf_rega.fit(X_train, y_train)
pred=rf_rega.predict(X_test)
print(mean_squared_error(y_test,pred))

In [None]:
pd.Series(rf_rega.feature_importances_,index=X18.columns).sort_values(ascending=True).plot.barh()

Random Forest 결과 : 2018년 전체 구매량은 평균 기온에 가장 큰 영향을 받았다.

#### 회귀

회귀를 위한 함수

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

def rmsle(y, pred):
    """Return the root mean squared logarithmic error between y and pred.

    Both inputs are transformed with log1p before the squared differences
    are averaged and the square root is taken.
    """
    log_diff = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_diff ** 2))

def rmse(y,pred):
    """Return the square root of mean_squared_error(y, pred)."""
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)

def evaluate_regr(y,pred):
    """Print RMSLE, RMSE, MSE and MAE of predictions pred against targets y."""
    metrics = (
        rmsle(y, pred),
        rmse(y, pred),
        mean_squared_error(y, pred),
        # MAE is computed with scikit-learn's mean_absolute_error()
        mean_absolute_error(y, pred),
    )
    print('RMSLE: {0:.6f}, RMSE: {1:.6F}, MSE: {2:.6F}, MAE: {3:.6F}'.format(*metrics))
    
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X18_scaled,y18_scaled,test_size=0.2,random_state=0)
lr_reg=LinearRegression()
lr_reg.fit(X_train,y_train)
pred=lr_reg.predict(X_test)

evaluate_regr(y_test, pred)

In [None]:
# 각피처의 회귀 계수값 시각화
coef= pd.Series(lr_reg.coef_,index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(x=coef_sort.values, y=coef_sort.index)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

n_estimators=[50,80,100,200]
learning_rate=[0.05,0.1,0.5]

param_grid = {"n_estimators":n_estimators,
              "learning_rate":learning_rate}
          
X_train, X_test, y_train, y_test = train_test_split(X18_scaled,y18_scaled,test_size=0.2,random_state=0)

cv=GridSearchCV(estimator=AdaBoostRegressor(random_state=0),
               param_grid=param_grid, verbose=True)
cv.fit(X_train,y_train)

print("GridSearchCV 최적 하이퍼 파라미터:",cv.best_params_)
print("GridSearchCV 최고 정확도",cv.best_score_)
print("GridSearchCV 최적 모형",cv.best_estimator_)

In [None]:
model=cv.best_estimator_
pd.Series(model.feature_importances_, df18.columns[2:]).sort_values(ascending=True).plot.barh(width=0.8)

In [None]:
from sklearn.metrics import mean_squared_error

# Refit the selected AdaBoost model on the current train split and score it on the
# held-out split (X_train / X_test / y_train / y_test come from train_test_split).
model.fit(X_train, y_train)
pred=model.predict(X_test)

mean_squared_error(y_test, pred)

### __3-1-2. 2018년도 뷰티 구매량__ 

In [None]:
# Beauty ('뷰티') rows only, re-indexed from 0
buy2018_b = buy2018.loc[buy2018["bcat"] == "뷰티"].reset_index(drop=True)
sns2018_b = sns2018.loc[sns2018["bcat"] == "뷰티"].reset_index(drop=True)

# Daily totals, re-indexed 0..n-1 so that pd.concat(axis=1) lines them up by row
# position with weather18 (assumes weather18 has a default 0..n-1 index -- TODO confirm)
b18_tot = buy2018_b.groupby("date")["qty"].sum().reset_index(drop=True)
s18_tot = sns2018_b.groupby("date")["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features: every column after 'buy' and 'sns'; target: 'buy'
X18 = df18.iloc[:, 2:]
y18 = df18["buy"]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale the features to [0, 1]
scaler=MinMaxScaler()
X18_scaled=scaler.fit_transform(X18)
# The target gets its own scaler so the feature scaler keeps its fitted state.
# MinMaxScaler only accepts 2-D input, so the Series is passed as a one-column
# frame and the result is flattened back to 1-D for the regressors.
y_scaler=MinMaxScaler()
y18_scaled=y_scaler.fit_transform(y18.to_frame()).ravel()

#### Random Forest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X18_scaled,y18_scaled,test_size=0.2,random_state=0)

In [None]:
params={
    'n_estimators':[500],
    'max_depth':[6,8,10,12],
    'min_samples_leaf':[8,12,18],
    'min_samples_split':[8,16,20]
}

rf_reg=RandomForestRegressor(random_state=0,n_jobs=-1)
grid_cv=GridSearchCV(rf_reg,param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train,y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
rf_reg5=RandomForestRegressor(n_estimators=500, max_depth=6, min_samples_leaf=18, min_samples_split=8,random_state=0)
rf_reg5.fit(X_train, y_train)
pred=rf_reg5.predict(X_test)
print(mean_squared_error(y_test,pred))

In [None]:
pd.Series(rf_reg5.feature_importances_,index=X18.columns).sort_values(ascending=True).plot.barh()

Random Forest 결과 : 2018년 뷰티 구매횟수는 평균 기온에 가장 큰 영향 받았다. 

#### 회귀

In [None]:
X_train, X_test,y_train, y_test= train_test_split(X18_scaled, y18_scaled, test_size=0.3, random_state=0)
lr_reg=LinearRegression()
lr_reg.fit(X_train,y_train)
pred=lr_reg.predict(X_test)

evaluate_regr(y_test, pred)

In [None]:
# 각피처의 회귀 계수값 시각화
coef= pd.Series(lr_reg.coef_,index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(x=coef_sort.values, y=coef_sort.index)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for AdaBoost
n_estimators=[50,80,100,200]
learning_rate=[0.05,0.1,0.5]

param_grid = {"n_estimators":n_estimators,
              "learning_rate":learning_rate}

X_train, X_test,y_train, y_test= train_test_split(X18_scaled, y18_scaled, test_size=0.3, random_state=0)

cv=GridSearchCV(estimator=AdaBoostRegressor(random_state=0),
               param_grid=param_grid, verbose=True)
cv.fit(X_train,y_train)

# Report the search results only after cv has been fitted on this section's data
print("GridSearchCV 최적 하이퍼 파라미터:",cv.best_params_)
print("GridSearchCV 최고 정확도",cv.best_score_)
print("GridSearchCV 최적 모형",cv.best_estimator_)

In [None]:
model=cv.best_estimator_
pd.Series(model.feature_importances_, df18.columns[2:]).sort_values(ascending=True).plot.barh(width=0.8)

In [None]:
from sklearn.metrics import mean_squared_error

# Refit the selected AdaBoost model on the current train split and score it on the
# held-out split (X_train / X_test / y_train / y_test come from train_test_split).
model.fit(X_train, y_train)
pred=model.predict(X_test)

mean_squared_error(y_test, pred)

### __3-1-3. 2018년도 냉난방가전 구매량__

In [None]:
# Heating/cooling appliance ('냉난방가전') rows only, re-indexed from 0
buy2018_n = buy2018.loc[buy2018["bcat"] == "냉난방가전"].reset_index(drop=True)
sns2018_n = sns2018.loc[sns2018["bcat"] == "냉난방가전"].reset_index(drop=True)

# Daily totals, re-indexed 0..n-1 so that pd.concat(axis=1) lines them up by row
# position with weather18 (assumes weather18 has a default 0..n-1 index -- TODO confirm)
b18_tot = buy2018_n.groupby("date")["qty"].sum().reset_index(drop=True)
s18_tot = sns2018_n.groupby("date")["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features: every column after 'buy' and 'sns'; target: 'buy'
X18 = df18.iloc[:, 2:]
y18 = df18["buy"]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale the features to [0, 1]
scaler=MinMaxScaler()
X18_scaled=scaler.fit_transform(X18)
# The target gets its own scaler so the feature scaler keeps its fitted state.
# MinMaxScaler only accepts 2-D input, so the Series is passed as a one-column
# frame and the result is flattened back to 1-D for the regressors.
y_scaler=MinMaxScaler()
y18_scaled=y_scaler.fit_transform(y18.to_frame()).ravel()

#### Random Forest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X18_scaled,y18_scaled,test_size=0.2,random_state=0)

In [None]:
params={
    'n_estimators':[500],
    'max_depth':[6,8,10,12],
    'min_samples_leaf':[8,12,18],
    'min_samples_split':[8,16,20]
}

rf_reg=RandomForestRegressor(random_state=0,n_jobs=-1)
grid_cv=GridSearchCV(rf_reg,param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train,y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
rf_reg1=RandomForestRegressor(n_estimators=500, max_depth=6, min_samples_leaf=18, min_samples_split=8,random_state=0)
rf_reg1.fit(X_train, y_train)
pred=rf_reg1.predict(X_test)
print(mean_squared_error(y_test,pred))

In [None]:
pd.Series(rf_reg1.feature_importances_,index=X18.columns).sort_values(ascending=True).plot.barh()

Random Forest 결과: 평균기온이 냉난방 구매횟수에 가장 큰 영향을 미친다. 

#### 회귀

In [None]:
X_train, X_test,y_train, y_test= train_test_split(X18_scaled, y18_scaled, test_size=0.3, random_state=0)
lr_reg=LinearRegression()
lr_reg.fit(X_train,y_train)
pred=lr_reg.predict(X_test)

evaluate_regr(y_test, pred)

In [None]:
# 각피처의 회귀 계수값 시각화
coef= pd.Series(lr_reg.coef_,index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(x=coef_sort.values, y=coef_sort.index)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

n_estimators=[50,80,100,200]
learning_rate=[0.05,0.1,0.5]

param_grid = {"n_estimators":n_estimators,
              "learning_rate":learning_rate}
          
X_train, X_test, y_train, y_test = train_test_split(X18_scaled,y18_scaled,test_size=0.2,random_state=0)          
          
cv=GridSearchCV(estimator=AdaBoostRegressor(random_state=0),
               param_grid=param_grid, verbose=True)
cv.fit(X_train,y_train)

print("GridSearchCV 최적 하이퍼 파라미터:",cv.best_params_)
print("GridSearchCV 최고 정확도",cv.best_score_)
print("GridSearchCV 최적 모형",cv.best_estimator_)

In [None]:
model=cv.best_estimator_
pd.Series(model.feature_importances_, df18.columns[2:]).sort_values(ascending=True).plot.barh(width=0.8)

In [None]:
from sklearn.metrics import mean_squared_error

# Refit the selected AdaBoost model on the current train split and score it on the
# held-out split (X_train / X_test / y_train / y_test come from train_test_split).
model.fit(X_train, y_train)
pred=model.predict(X_test)

mean_squared_error(y_test, pred)

### __3-1-4. 2018년도 식품 구매량__

In [None]:
# Food ('식품') rows only, re-indexed from 0
buy2018_f = buy2018.loc[buy2018["bcat"] == "식품"].reset_index(drop=True)
sns2018_f = sns2018.loc[sns2018["bcat"] == "식품"].reset_index(drop=True)

# Daily totals, re-indexed 0..n-1 so that pd.concat(axis=1) lines them up by row
# position with weather18 (assumes weather18 has a default 0..n-1 index -- TODO confirm)
b18_tot = buy2018_f.groupby("date")["qty"].sum().reset_index(drop=True)
s18_tot = sns2018_f.groupby("date")["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features: every column after 'buy' and 'sns'; target: 'buy'
X18 = df18.iloc[:, 2:]
y18 = df18["buy"]

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Scale the features to [0, 1]
scaler=MinMaxScaler()
X18_scaled=scaler.fit_transform(X18)
# The target gets its own scaler so the feature scaler keeps its fitted state.
# MinMaxScaler only accepts 2-D input, so the Series is passed as a one-column
# frame and the result is flattened back to 1-D for the regressors.
y_scaler=MinMaxScaler()
y18_scaled=y_scaler.fit_transform(y18.to_frame()).ravel()

#### RandomForest

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X18_scaled,y18_scaled,test_size=0.2,random_state=0)

In [None]:
params={
    'n_estimators':[500],
    'max_depth':[6,8,10,12],
    'min_samples_leaf':[8,12,18],
    'min_samples_split':[8,16,20]
}

rf_reg=RandomForestRegressor(random_state=0,n_jobs=-1)
grid_cv=GridSearchCV(rf_reg,param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train,y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
rf_regx=RandomForestRegressor(n_estimators=500, max_depth=6, min_samples_leaf=12, min_samples_split=8,random_state=0)
rf_regx.fit(X_train, y_train)
pred=rf_regx.predict(X_test)
print(mean_squared_error(y_test,pred))

In [None]:
pd.Series(rf_regx.feature_importances_,index=X18.columns).sort_values(ascending=True).plot.barh()

#### 회귀

In [None]:
X_train, X_test,y_train, y_test= train_test_split(X18_scaled, y18_scaled, test_size=0.3, random_state=0)
lr_reg=LinearRegression()
lr_reg.fit(X_train,y_train)
pred=lr_reg.predict(X_test)

evaluate_regr(y_test, pred)

In [None]:
# 각피처의 회귀 계수값 시각화
coef= pd.Series(lr_reg.coef_,index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(x=coef_sort.values, y=coef_sort.index)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df18.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X18_*/y18_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __3-2. 2019년도 구매량__

In [None]:
# Line chart of the total 2019 purchase quantity per date.
time2 = buy2019.groupby('date')['qty'].sum()
plt.figure(figsize=(20, 10))
plt.plot(time2)
plt.show()

### __3-2-1. 2019년도 전체 구매량__

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# Total purchases and SNS mentions per date over all categories; the date index
# is dropped so the concat below lines rows up by position.
# NOTE(review): assumes weather19 has one row per date in the same order - confirm.
b19_tot = buy2019.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is purchase quantity.
X19 = df19[df19.columns[2:]]
y19 = df19["buy"]

scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y19_scaled = MinMaxScaler().fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_regc = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=12,
    min_samples_split=8,
    random_state=0,
)
pred = rf_regc.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_regc.feature_importances_, index=X19.columns)
    .sort_values()
    .plot.barh()
)

Random Forest 결과: 2019년 구매횟수는 평균 상대습도에 가장 큰 영향을 받았다. 

#### 회귀

In [None]:
# Re-split with a 30% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, random_state=0, test_size=0.3
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X19.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df19.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X19_*/y19_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __3-2-2. 2019년도 뷰티 구매량__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2019 tables to the beauty ("뷰티") category.
buy2019_b = buy2019[buy2019["bcat"] == "뷰티"].reset_index(drop=True)
sns2019_b = sns2019[sns2019["bcat"] == "뷰티"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather19 has one row per date in the same order - confirm.
b19_tot = buy2019_b.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019_b.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is purchase quantity.
X19 = df19[df19.columns[2:]]
y19 = df19["buy"]

scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y19_scaled = MinMaxScaler().fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_reg6 = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=12,
    min_samples_split=8,
    random_state=0,
)
pred = rf_reg6.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_reg6.feature_importances_, index=X19.columns)
    .sort_values()
    .plot.barh()
)

Random Forest 결과: 2019년 뷰티 구매횟수는 평균 상대습도에 가장 큰 영향을 받았다.

#### 회귀

In [None]:
# Split with a 20% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, random_state=0, test_size=0.2
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X19.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df19.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X19_*/y19_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __3-2-3. 2019년도 냉난방가전 구매량__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2019 tables to the heating/cooling appliance ("냉난방가전") category.
buy2019_n = buy2019[buy2019["bcat"] == "냉난방가전"].reset_index(drop=True)
sns2019_n = sns2019[sns2019["bcat"] == "냉난방가전"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather19 has one row per date in the same order - confirm.
b19_tot = buy2019_n.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019_n.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is purchase quantity.
X19 = df19[df19.columns[2:]]
y19 = df19["buy"]

scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y19_scaled = MinMaxScaler().fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_reg4 = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
pred = rf_reg4.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_reg4.feature_importances_, index=X19.columns)
    .sort_values()
    .plot.barh()
)

Random Forest 결과: 2019년 냉난방가전 구매횟수는 평균 기온에 가장 큰 영향을 받았다. 

#### 회귀

In [None]:
# Re-split with a 30% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, random_state=0, test_size=0.3
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X19.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df19.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X19_*/y19_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __3-2-4. 2019년도 식품 구매량__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2019 tables to the food ("식품") category.
buy2019_f = buy2019[buy2019["bcat"] == "식품"].reset_index(drop=True)
sns2019_f = sns2019[sns2019["bcat"] == "식품"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather19 has one row per date in the same order - confirm.
b19_tot = buy2019_f.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019_f.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is purchase quantity.
X19 = df19[df19.columns[2:]]
y19 = df19["buy"]

scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y19_scaled = MinMaxScaler().fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_regy = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=12,
    min_samples_split=8,
    random_state=0,
)
pred = rf_regy.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_regy.feature_importances_, index=X19.columns)
    .sort_values()
    .plot.barh()
)

#### 회귀

In [None]:
# Re-split with a 30% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, random_state=0, test_size=0.3
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X19.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df19.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X19_*/y19_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

## __4. 날씨가 sns 언급횟수에 영향을 미칠까__

### __4-1. 2018년도 언급횟수__

In [None]:
# Line chart of the total 2018 SNS mention count per date.
time3 = sns2018.groupby('date')['cnt'].sum()
plt.figure(figsize=(20, 10))
plt.plot(time3)
plt.show()

### __4-1-1. 2018년도 전체 언급횟수__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Total purchases and SNS mentions per date over all categories; the date index
# is dropped so the concat below lines rows up by position.
# NOTE(review): assumes weather18 has one row per date in the same order - confirm.
b18_tot = buy2018.groupby(["date"])["qty"].sum().reset_index(drop=True)
s18_tot = sns2018.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is the SNS mention count.
X18 = df18[df18.columns[2:]]
y18 = df18["sns"]

scaler = MinMaxScaler()
X18_scaled = scaler.fit_transform(X18)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y18_scaled = MinMaxScaler().fit_transform(y18.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled,
    y18_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_reg8 = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
pred = rf_reg8.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_reg8.feature_importances_, index=X18.columns)
    .sort_values()
    .plot.barh()
)

#### 회귀

In [None]:
# Split with a 20% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, random_state=0, test_size=0.2
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df18.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X18_*/y18_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __4-1-2. 2018년도 뷰티 언급횟수__ 

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2018 tables to the beauty ("뷰티") category.
buy2018_b = buy2018[buy2018["bcat"] == "뷰티"].reset_index(drop=True)
sns2018_b = sns2018[sns2018["bcat"] == "뷰티"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather18 has one row per date in the same order - confirm.
b18_tot = buy2018_b.groupby(["date"])["qty"].sum().reset_index(drop=True)
s18_tot = sns2018_b.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is the SNS mention count.
X18 = df18[df18.columns[2:]]
y18 = df18["sns"]

scaler = MinMaxScaler()
X18_scaled = scaler.fit_transform(X18)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y18_scaled = MinMaxScaler().fit_transform(y18.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled,
    y18_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_reg7 = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=8,
    min_samples_split=8,
    random_state=0,
)
pred = rf_reg7.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_reg7.feature_importances_, index=X18.columns)
    .sort_values()
    .plot.barh()
)

#### 회귀

In [None]:
# Split with a 20% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, random_state=0, test_size=0.2
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df18.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X18_*/y18_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __4-1-3. 2018년도 냉난방가전 언급횟수__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2018 tables to the heating/cooling appliance ("냉난방가전") category.
buy2018_n = buy2018[buy2018["bcat"] == "냉난방가전"].reset_index(drop=True)
sns2018_n = sns2018[sns2018["bcat"] == "냉난방가전"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather18 has one row per date in the same order - confirm.
b18_tot = buy2018_n.groupby(["date"])["qty"].sum().reset_index(drop=True)
s18_tot = sns2018_n.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns. This section analyses SNS mentions, so
# the target is the "sns" column (not "buy"), matching the other mention sections.
X18 = df18[df18.columns[2:]]
y18 = df18["sns"]

scaler = MinMaxScaler()
X18_scaled = scaler.fit_transform(X18)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y18_scaled = MinMaxScaler().fit_transform(y18.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled,
    y18_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_reg3 = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
pred = rf_reg3.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_reg3.feature_importances_, index=X18.columns)
    .sort_values()
    .plot.barh()
)

#### 회귀

In [None]:
# Split with a 20% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, random_state=0, test_size=0.2
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df18.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X18_*/y18_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __4-1-4. 2018년도 식품 언급횟수__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2018 tables to the food ("식품") category.
buy2018_f = buy2018[buy2018["bcat"] == "식품"].reset_index(drop=True)
sns2018_f = sns2018[sns2018["bcat"] == "식품"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather18 has one row per date in the same order - confirm.
b18_tot = buy2018_f.groupby(["date"])["qty"].sum().reset_index(drop=True)
s18_tot = sns2018_f.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df18 = pd.concat([b18_tot, s18_tot, weather18], axis=1)
df18.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is the SNS mention count.
X18 = df18[df18.columns[2:]]
y18 = df18["sns"]

scaler = MinMaxScaler()
X18_scaled = scaler.fit_transform(X18)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y18_scaled = MinMaxScaler().fit_transform(y18.to_frame()).ravel()

#### RandomForest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled,
    y18_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_regz = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
pred = rf_regz.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Plot the importances of rf_regz, the forest fitted in the preceding cell for the
# 2018 food data. rf_regd is only created later, for the 2019 overall mentions,
# so it is the wrong model for this chart.
(
    pd.Series(rf_regz.feature_importances_, index=X18.columns)
    .sort_values(ascending=True)
    .plot.barh()
)

#### 회귀

In [None]:
# Split with a 20% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, random_state=0, test_size=0.2
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X18.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X18_scaled, y18_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df18.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X18_*/y18_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __4-2. 2019년도 언급횟수__

In [None]:
# Line chart of the total 2019 SNS mention count per date.
time4 = sns2019.groupby('date')['cnt'].sum()
plt.figure(figsize=(20, 10))
plt.plot(time4)
plt.show()

### __4-2-1. 2019년도 전체 언급횟수__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Total purchases and SNS mentions per date over all categories; the date index
# is dropped so the concat below lines rows up by position.
# NOTE(review): assumes weather19 has one row per date in the same order - confirm.
b19_tot = buy2019.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is the SNS mention count.
X19 = df19[df19.columns[2:]]
y19 = df19["sns"]

scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y19_scaled = MinMaxScaler().fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_regd = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
pred = rf_regd.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_regd.feature_importances_, index=X19.columns)
    .sort_values()
    .plot.barh()
)

#### 회귀

In [None]:
# Re-split with a 30% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, random_state=0, test_size=0.3
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X19.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df19.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X19_*/y19_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __4-2-2. 2019년도 뷰티 언급횟수__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2019 tables to the beauty ("뷰티") category.
buy2019_b = buy2019[buy2019["bcat"] == "뷰티"].reset_index(drop=True)
sns2019_b = sns2019[sns2019["bcat"] == "뷰티"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather19 has one row per date in the same order - confirm.
b19_tot = buy2019_b.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019_b.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is the SNS mention count.
X19 = df19[df19.columns[2:]]
y19 = df19["sns"]

scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y19_scaled = MinMaxScaler().fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_regt = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
pred = rf_regt.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_regt.feature_importances_, index=X19.columns)
    .sort_values()
    .plot.barh()
)

#### 회귀

In [None]:
# Re-split with a 30% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, random_state=0, test_size=0.3
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X19.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Candidate values searched for the AdaBoost regressor.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, test_size=0.2, random_state=0
)

# Grid search with the default cross-validation setting and progress output.
cv = GridSearchCV(
    AdaBoostRegressor(random_state=0),
    param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Feature importances of the best AdaBoost model found by the grid search.
model = cv.best_estimator_
(
    pd.Series(model.feature_importances_, index=df19.columns[2:])
    .sort_values()
    .plot.barh(width=0.8)
)

In [None]:
from sklearn.metrics import mean_squared_error

# Score the tuned AdaBoost model on the X_train/X_test/y_train/y_test split made
# in the AdaBoost grid-search cell; the X19_*/y19_* split names are not created
# anywhere in this section.
model.fit(X_train, y_train)
pred = model.predict(X_test)

mean_squared_error(y_test, pred)

### __4-2-3. 2019년도 냉난방가전 언급횟수__

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Restrict both 2019 tables to the heating/cooling appliance ("냉난방가전") category.
buy2019_n = buy2019[buy2019["bcat"] == "냉난방가전"].reset_index(drop=True)
sns2019_n = sns2019[sns2019["bcat"] == "냉난방가전"].reset_index(drop=True)

# Total purchases and SNS mentions per date; the date index is dropped so the
# concat below lines rows up by position.
# NOTE(review): assumes weather19 has one row per date in the same order - confirm.
b19_tot = buy2019_n.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019_n.groupby(["date"])["cnt"].sum().reset_index(drop=True)

df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is the SNS mention count.
X19 = df19[df19.columns[2:]]
y19 = df19["sns"]

scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)
# MinMaxScaler rejects 1-D input, so scale the target as a one-column frame with
# its own scaler (leaving `scaler` fitted on the features) and flatten it back to
# the 1-D shape the regressors expect.
y19_scaled = MinMaxScaler().fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Hyper-parameter grid for the random forest; the tree count is fixed at 500.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# Exhaustive search with 2-fold cross-validation, parallelised with n_jobs=-1.
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=0)
grid_cv = GridSearchCV(rf_reg, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the winning parameter set and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Train a random forest with fixed hyper-parameters and print its test-set MSE.
rf_reg4 = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
pred = rf_reg4.fit(X_train, y_train).predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, sorted ascending.
(
    pd.Series(rf_reg4.feature_importances_, index=X19.columns)
    .sort_values()
    .plot.barh()
)

2019년 냉난방 sns 언급 횟수는 평균 기온에 가장 큰 영향을 받았다. 

#### 회귀

In [None]:
# Re-split with a 30% hold-out, fit a linear regression, and predict the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled, y19_scaled, random_state=0, test_size=0.3
)
lr_reg = LinearRegression().fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# Hand the predictions to the notebook's evaluate_regr helper.
evaluate_regr(y_test, pred)

In [None]:
# Visualize each feature's regression coefficient, largest first.
coef = pd.Series(data=lr_reg.coef_, index=X19.columns)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(y=coef_sort.index, x=coef_sort.values)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Search grid for AdaBoost: number of boosting rounds and learning rate.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
)

# Re-split 80/20 because the linear-regression cell replaced the split with a 70/30 one.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

# Grid search with the default cross-validation; verbose prints the search progress.
cv = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=0),
    param_grid=param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Take the best AdaBoost model from the grid search and plot its feature importances.
model = cv.best_estimator_
ada_importances = pd.Series(model.feature_importances_, df19.columns[2:])
ada_importances.sort_values(ascending=True).plot.barh(width=0.8)

In [None]:
from sklearn.metrics import mean_squared_error

# Refit and score the tuned AdaBoost model on this section's own split.
# The X19_train / X19_test names used before are never created in this
# section, so they were either undefined or left over from other data.
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Arguments follow the documented (y_true, y_pred) order.
mean_squared_error(y_test, pred)

### __4-2-4. 2019년도 식품 언급횟수__

In [None]:
# Build the 2019 modelling table for the "식품" category:
# daily purchase quantity, daily SNS mention count and the weather features.
buy2019_f = buy2019[buy2019["bcat"] == "식품"].reset_index(drop=True)
sns2019_f = sns2019[sns2019["bcat"] == "식품"].reset_index(drop=True)

# Sum per date, then drop the date index so the series get a 0..n-1 index.
b19_tot = buy2019_f.groupby(["date"])["qty"].sum().reset_index(drop=True)
s19_tot = sns2019_f.groupby(["date"])["cnt"].sum().reset_index(drop=True)

# NOTE(review): the concat aligns rows purely by position; this assumes
# weather19 has a 0..n-1 index in the same date order and exactly four
# columns -- confirm against where weather19 is built.
df19 = pd.concat([b19_tot, s19_tot, weather19], axis=1)
df19.columns = ['buy', 'sns', 'avg_ta', 'avg_rhm', 'avg_cl', 'rain']

# Features are the four weather columns; the target is the SNS mention count.
X19 = df19[df19.columns[2:]]
y19 = df19["sns"]

from sklearn.preprocessing import MinMaxScaler

# Scale the features with their own scaler so it stays fitted on X19.
scaler = MinMaxScaler()
X19_scaled = scaler.fit_transform(X19)

# MinMaxScaler only accepts 2-D input, so the target Series is scaled as a
# single-column frame with a separate scaler and flattened back to 1-D,
# which is the shape the regressors and coefficient plots below expect.
y_scaler = MinMaxScaler()
y19_scaled = y_scaler.fit_transform(y19.to_frame()).ravel()

#### Random Forest

In [None]:
# Hold out 20% of the scaled samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Candidate hyperparameters for a 500-tree random forest:
# tree depth and the minimum leaf / split sizes are searched exhaustively.
params = dict(
    n_estimators=[500],
    max_depth=[6, 8, 10, 12],
    min_samples_leaf=[8, 12, 18],
    min_samples_split=[8, 16, 20],
)

# 2-fold cross-validated grid search, using all available cores.
rf_reg = RandomForestRegressor(random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(
    rf_reg,
    param_grid=params,
    cv=2,
    n_jobs=-1,
)
grid_cv.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도:{0:.4f}'.format(grid_cv.best_score_))

In [None]:
# Fit a random forest with fixed hyperparameters and report the test-set MSE.
# NOTE(review): the values are hard-coded, presumably copied from the grid
# search output above -- confirm they still match grid_cv.best_params_.
rf_regp = RandomForestRegressor(
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=18,
    min_samples_split=8,
    random_state=0,
)
rf_regp.fit(X_train, y_train)

pred = rf_regp.predict(X_test)
print(mean_squared_error(y_test, pred))

In [None]:
# Horizontal bar chart of the forest's feature importances, smallest first.
rf_importances = pd.Series(rf_regp.feature_importances_, index=X19.columns)
rf_importances.sort_values(ascending=True).plot.barh()

#### 회귀

In [None]:
# Linear regression baseline on a 70/30 split of the scaled data.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.3,
    random_state=0,
)

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train)
pred = lr_reg.predict(X_test)

# evaluate_regr is defined elsewhere in the notebook; it receives the true and predicted targets.
evaluate_regr(y_test, pred)

In [None]:
# Visualize the regression coefficient of each feature, largest first.
coef = pd.Series(
    lr_reg.coef_,
    index=X19.columns,
)
coef_sort = coef.sort_values(ascending=False)
sns.barplot(
    x=coef_sort.values,
    y=coef_sort.index,
)

#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV

# Search grid for AdaBoost: number of boosting rounds and learning rate.
n_estimators = [50, 80, 100, 200]
learning_rate = [0.05, 0.1, 0.5]
param_grid = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
)

# Re-split 80/20 because the linear-regression cell replaced the split with a 70/30 one.
X_train, X_test, y_train, y_test = train_test_split(
    X19_scaled,
    y19_scaled,
    test_size=0.2,
    random_state=0,
)

# Grid search with the default cross-validation; verbose prints the search progress.
cv = GridSearchCV(
    estimator=AdaBoostRegressor(random_state=0),
    param_grid=param_grid,
    verbose=True,
)
cv.fit(X_train, y_train)

print("GridSearchCV 최적 하이퍼 파라미터:", cv.best_params_)
print("GridSearchCV 최고 정확도", cv.best_score_)
print("GridSearchCV 최적 모형", cv.best_estimator_)

In [None]:
# Take the best AdaBoost model from the grid search and plot its feature importances.
model = cv.best_estimator_
ada_importances = pd.Series(model.feature_importances_, df19.columns[2:])
ada_importances.sort_values(ascending=True).plot.barh(width=0.8)

In [None]:
from sklearn.metrics import mean_squared_error

# Refit and score the tuned AdaBoost model on this section's own split.
# The X19_train / X19_test names used before are never created in this
# section, so they were either undefined or left over from other data.
model.fit(X_train, y_train)
pred = model.predict(X_test)

# Arguments follow the documented (y_true, y_pred) order.
mean_squared_error(y_test, pred)