# Data

## 0. Data load

In [1]:
# Mount Google Drive at /content/drive so files stored there can be read (requires Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the Drive data folder the working directory; the read_csv/to_csv calls in this
# notebook use bare file names and therefore resolve relative to it.
import os
os.chdir('/content/drive/MyDrive/기상청/데이터')

In [59]:
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta
from tqdm.notebook import tqdm
tqdm.pandas()

import random
from functools import reduce

import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
%matplotlib inline

# Select the 'MalgunGothic' font family for plot text (presumably so Korean labels render;
# NOTE(review): confirm this font is installed in the runtime, otherwise matplotlib falls back).
rc('font', family='MalgunGothic')
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

  from pandas import Panel


In [78]:
# Load the raw inputs from the working directory. All files except weather1819.csv are
# read with the CP949 encoding.
# weather1: per-date, per-station temperature / rainfall / wind columns (see preview below).
weather1 = pd.read_csv('weather1819.csv')
# weather2: per-date, per-station PM10 / PM25 / humidity / sunshine / pressure columns.
weather2 = pd.read_csv('weather1819_2.csv', encoding='CP949')
# weather3: contents are not previewed in this part of the notebook.
weather3 = pd.read_csv('weather_final.csv',encoding='CP949')
# customer: consumer sentiment index, one row per region ('시도') and one column per month.
customer = pd.read_csv('소비자심리지수.csv',encoding='CP949')
# online: online purchase records keyed by date, sex, age and product category.
online = pd.read_csv('data.csv',encoding='CP949')

In [80]:
# Preview the first two rows of weather1.
weather1.head(2)

Unnamed: 0,date,stn_id,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6


In [79]:
# Preview the first two rows of weather2.
weather2.head(2)

Unnamed: 0.1,Unnamed: 0,date,aws_id,region,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,0,2018-01-01,105,강릉,20.066667,13.4,25.4,57.9,1023.0
1,1,2018-01-01,108,서울,42.307692,21.470696,57.1,51.6,1018.1


In [15]:
# Preview the first two rows of the consumer sentiment table (wide format: one column per month).
customer.head(2)

Unnamed: 0,시도,2018. 01,2018. 02,2018. 03,2018. 04,2018. 05,2018. 06,2018. 07,2018. 08,2018. 09,2018. 10,2018. 11,2018. 12,2019. 01,2019. 02,2019. 03,2019. 04,2019. 05,2019. 06,2019. 07,2019. 08,2019. 09,2019. 10,2019. 11,2019. 12,2020. 01,2020. 02,2020. 03,2020. 04,2020. 05,2020. 06,2020. 07,2020. 08,2020. 09,2020. 10,2020. 11,2020. 12
0,강릉,102.5,98.333333,101.333333,97.333333,101.833333,99.0,97.666667,93.0,86.166667,86.333333,82.166667,82.666667,86.666667,88.0,87.5,87.0,84.166667,86.333333,86.166667,83.166667,87.333333,86.833333,88.5,87.666667,89.666667,86.666667,70.0,67.166667,75.0,79.0,79.0,80.0,71.333333,78.833333,83.833333,78.333333
1,강원,103.166667,100.0,101.666667,98.333333,101.833333,98.5,95.0,91.5,86.0,87.333333,84.5,84.5,87.0,88.5,88.166667,89.833333,86.833333,87.166667,87.0,84.666667,88.166667,88.166667,88.333333,88.833333,90.666667,88.0,72.833333,70.0,75.666667,78.166667,79.166667,80.666667,72.833333,81.166667,85.0,78.833333


In [19]:
# Preview the first two rows of the online purchase data.
online.head(2)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,cnt
0,20180101,F,20,식품,가공란,37,0.480964
1,20180101,F,30,식품,가공란,16,0.480964


## 1. Data preprocessing

In [50]:
# Reshape the sentiment table from wide to long: one row per (region, month column),
# with the month label in 'variable' and the index value in 'value'.
df_melt = customer.melt(id_vars=['시도'])

In [53]:
# Build the 'yymm' key from the month label: characters 0-3 are the year and
# characters 6-7 are the month (e.g. '2018. 01' -> '201801').
month_label = df_melt['variable']
df_melt['yymm'] = month_label.str[:4] + month_label.str[6:8]

In [83]:
# Rename the station column so weather1 uses the same 'aws_id' name as weather2.
weather1 = weather1.rename(columns={'stn_id': 'aws_id'})

In [81]:
# Remove the unneeded 'Unnamed: 0' column.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a second
# time raises KeyError because the column has already been dropped.
weather2.drop(['Unnamed: 0'], axis=1, inplace=True, errors='ignore')

In [84]:
# Parse the purchase date (stored as YYYYMMDD) into a datetime column.
online = online.assign(date=pd.to_datetime(online['date'], format='%Y%m%d'))

## 2. 소비자 심리지수 반영한 날씨지수 

In [89]:
def score(data, col_name, sentiment=None):
    """Weight one weather measurement by the regional consumer sentiment index.

    Every row of ``data`` gets a region name from its ``aws_id`` and a ``YYYYMM``
    key from its ``date``.  The row's ``col_name`` value is multiplied by
    ``value / 100`` of the sentiment row that has the same ``yymm`` and whose
    ``시도`` contains the row's region name (if several sentiment rows qualify,
    the last one in table order is used).  Rows with no matching sentiment row,
    or with a station id that is not in the mapping, get a missing score.  The
    weighted values are then averaged per date.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``aws_id``, ``date`` (``%Y-%m-%d`` strings or
        datetimes) and ``col_name``.  The frame is not modified and may carry
        any index.
    col_name : str
        Name of the measurement column to weight.
    sentiment : pandas.DataFrame, optional
        Long-format sentiment table with the columns ``시도``, ``yymm`` and
        ``value``.  Defaults to the notebook-level ``df_melt``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` and ``score`` (mean weighted value per date, missing
        scores excluded from the mean), sorted by date.
    """
    if sentiment is None:
        sentiment = df_melt

    # Station id -> region name used in the sentiment table.
    station_to_region = {
        108: '서울', 159: '부산', 143: '대구', 112: '인천', 156: '광주',
        133: '대전', 152: '울산', 119: '경기', 105: '강원', 131: '충북', 232: '충남',
        146: '전북', 168: '전남', 136: '경북', 155: '경남', 184: '제주',
    }
    region = data['aws_id'].map(station_to_region)
    date = pd.to_datetime(data['date'], format='%Y-%m-%d')
    yymm = date.dt.strftime('%Y%m')

    # Group the sentiment rows by month once, preserving table order, so each
    # lookup only has to look at the rows of its own month.
    rows_by_month = {}
    for month, name, value in zip(sentiment['yymm'], sentiment['시도'], sentiment['value']):
        rows_by_month.setdefault(month, []).append((name, value))

    def _sentiment_for(month, region_name):
        """Return the sentiment value for one (month, region) pair, or NaN."""
        matched = np.nan
        if isinstance(region_name, str):  # unmapped stations come through as NaN
            for name, value in rows_by_month.get(month, ()):
                # Containment test (not equality); later rows overwrite earlier ones.
                if isinstance(name, str) and region_name in name:
                    matched = value
        return matched

    # Resolve each distinct (month, region) pair only once.
    resolved = {}
    weights = []
    for key in zip(yymm, region):
        if key not in resolved:
            resolved[key] = _sentiment_for(*key)
        weights.append(resolved[key])

    # Work on plain arrays so the result does not depend on the index of `data`.
    weighted = pd.DataFrame({
        'date': date.to_numpy(),
        'score': data[col_name].to_numpy(dtype=float) * (np.array(weights, dtype=float) / 100),
    })
    return weighted.groupby('date')['score'].mean().reset_index()

In [85]:
# Show the sorted station ids of both weather tables to check that they cover the same stations.
for weather in (weather1, weather2):
    print(sorted(weather.aws_id.unique()))

[105, 108, 112, 119, 131, 133, 136, 143, 146, 152, 155, 156, 159, 168, 184, 232]
[105, 108, 112, 119, 131, 133, 136, 143, 146, 152, 155, 156, 159, 168, 184, 232]


In [88]:
# List the column names of both weather tables.
for weather in (weather1, weather2):
    print(list(weather.columns))

['date', 'aws_id', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws']
['date', 'aws_id', 'region', 'PM10', 'PM25', 'hm_max', 'sum_ss_hr', 'max_pa']


In [86]:
# Preview weather1 again; the station column now appears as 'aws_id'.
weather1.head(2)

Unnamed: 0,date,aws_id,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6


In [57]:
# Sentiment-weighted daily index for each weather1 measurement (one score() call per column,
# evaluated in the listed order).
avg_ta, max_ta, min_ta, rn_day, rn_hr1, avg_ws = (
    score(weather1, column)
    for column in ('avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws')
)

In [60]:
# Rename the generic 'score' column of each result to the measurement it came from.
for frame, label in zip(
    (avg_ta, max_ta, min_ta, rn_day, rn_hr1, avg_ws),
    ('avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws'),
):
    frame.columns = ['date', label]

In [87]:
# Preview weather2 again; the extra 'Unnamed: 0' column is no longer present.
weather2.head(2)

Unnamed: 0,date,aws_id,region,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,105,강릉,20.066667,13.4,25.4,57.9,1023.0
1,2018-01-01,108,서울,42.307692,21.470696,57.1,51.6,1018.1


In [91]:
# Sentiment-weighted daily index for each weather2 measurement (one score() call per column,
# evaluated in the listed order).
PM10, PM25, hm_max, sum_ss_hr, max_pa = (
    score(weather2, column)
    for column in ('PM10', 'PM25', 'hm_max', 'sum_ss_hr', 'max_pa')
)

HBox(children=(FloatProgress(value=0.0, max=11680.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11680.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11680.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11680.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11680.0), HTML(value='')))




In [92]:
# Rename the generic 'score' column of each result to the measurement it came from.
for frame, label in zip(
    (PM10, PM25, hm_max, sum_ss_hr, max_pa),
    ('PM10', 'PM25', 'hm_max', 'sum_ss_hr', 'max_pa'),
):
    frame.columns = ['date', label]

In [93]:
# Combine the per-measurement frames into one table, joining them on 'date' from left to right.
dfs2 = [PM10, PM25, hm_max, sum_ss_hr, max_pa]
df_merge2 = dfs2[0]
for frame in dfs2[1:]:
    df_merge2 = pd.merge(df_merge2, frame, on='date')

In [94]:
# Preview the first two rows of the combined weather2 index table.
df_merge2.head(2)

Unnamed: 0,date,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,44.015611,21.477208,61.923313,52.276365,1020.592385
1,2018-01-02,59.179487,33.836826,74.169781,37.996677,1021.524802


## 3. 온라인 구매건수 merge

In [71]:
# Preview the online purchase data; 'date' is now shown as a parsed date.
online.head(2)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,cnt
0,2018-01-01,F,20,식품,가공란,37,0.480964
1,2018-01-01,F,30,식품,가공란,16,0.480964


In [95]:
# Build df_merge (the weather1-based index table) here so this cell does not depend on a
# variable left over from an earlier kernel session: join the six per-measurement frames on
# 'date', in the same way df_merge2 is built from the weather2 frames.
dfs1 = [avg_ta, max_ta, min_ta, rn_day, rn_hr1, avg_ws]
df_merge = reduce(lambda left, right: pd.merge(left, right, on='date'), dfs1)

# Attach both daily index tables to every purchase record. Left joins keep all purchase
# rows even when a date has no weather index.
merge_1 = pd.merge(online, df_merge, how='left', on='date')
df_final = pd.merge(merge_1, df_merge2, how='left', on='date')

In [96]:
# Preview the first two rows of the merged purchase + weather index table.
df_final.head(2)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,cnt,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,F,20,식품,가공란,37,0.480964,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
1,2018-01-01,F,30,식품,가공란,16,0.480964,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385


In [97]:
# Save the merged table to the working directory as a CP949-encoded CSV, without the index column.
df_final.to_csv('data_with_weather.csv', index=False, encoding='CP949')

In [98]:
# Read the exported file back (presumably to confirm it loads with the CP949 encoding).
pd.read_csv('data_with_weather.csv', encoding='CP949')

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,cnt,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,F,20,식품,가공란,37,0.480964,0.885198,5.464146,-2.880740,0.0,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
1,2018-01-01,F,30,식품,가공란,16,0.480964,0.885198,5.464146,-2.880740,0.0,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
2,2018-01-01,F,40,식품,가공란,9,0.480964,0.885198,5.464146,-2.880740,0.0,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
3,2018-01-01,F,50,식품,가공란,3,0.480964,0.885198,5.464146,-2.880740,0.0,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
4,2018-01-01,M,20,식품,가공란,13,0.480964,0.885198,5.464146,-2.880740,0.0,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2056894,2019-12-31,M,20,냉난방가전,히터,8,71.295163,-3.153771,0.872542,-5.469094,0.0,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910
2056895,2019-12-31,M,30,냉난방가전,히터,22,71.295163,-3.153771,0.872542,-5.469094,0.0,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910
2056896,2019-12-31,M,40,냉난방가전,히터,38,71.295163,-3.153771,0.872542,-5.469094,0.0,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910
2056897,2019-12-31,M,50,냉난방가전,히터,23,71.295163,-3.153771,0.872542,-5.469094,0.0,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910
