# 최종 데이터 생성 

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd 
import numpy as np
pd.set_option("max_rows", 500)
pd.set_option("max_columns", 500)
pd.set_option('float_format', '{:f}'.format)

import dataload

import os 
import datetime
from tqdm import tqdm 
from functools import reduce

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt 
plt.style.use('seaborn')
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

In [2]:
# Data Load 
weather = pd.read_csv('weather_final2.csv', encoding='cp949')
weatherindex = pd.read_csv('data_with_weather.csv', encoding='cp949')
search = pd.read_csv('trend_with_weather.csv', encoding='cp949')

product = pd.read_csv("ProductFiltering.csv", encoding='cp949')
steady = pd.read_csv("SteadySeller.csv", encoding='cp949')

weather.shape, weatherindex.shape, search.shape, product.shape, steady.shape

((11680, 29), (2056899, 18), (2568668, 12), (192, 9), (191, 9))

In [8]:
weather = weather.rename(columns = {"날짜" : "date"})

tqdm.pandas()
weather["date"] = weather["date"].progress_apply(lambda x : pd.to_datetime(str(x), format="%Y-%m-%d"))
weatherindex["date"] = weatherindex["date"].progress_apply(lambda x : pd.to_datetime(str(x), format="%Y-%m-%d"))
search["date"] = search["date"].progress_apply(lambda x : pd.to_datetime(str(x), format="%Y-%m-%d"))

100%|██████████| 11680/11680 [00:00<00:00, 13172.70it/s]
100%|██████████| 2056899/2056899 [02:27<00:00, 13922.19it/s]
100%|██████████| 2568668/2568668 [03:02<00:00, 14074.06it/s]


## 1. Data Merge 

In [50]:
# 날씨 지수 데이터 
weatherindex2 = weatherindex[['date', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws', 'PM10', 'PM25', 'hm_max', 'sum_ss_hr', 'max_pa']]
weatherindex2 = weatherindex2.drop_duplicates('date', keep='first').reset_index(drop=True)

print(weatherindex2.shape)
weatherindex2.head()

(730, 12)


Unnamed: 0,date,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
1,2018-01-02,1.208896,6.014385,-3.135167,0.0,0.0,1.903604,59.179487,33.836826,74.169781,37.996677,1021.524802
2,2018-01-03,-0.798604,3.236083,-3.619615,0.0,0.0,2.623562,32.757902,18.242391,59.310396,53.216885,1023.2775
3,2018-01-04,-1.075625,2.345708,-4.636646,0.0,0.0,1.842729,38.685406,23.045234,63.96175,33.707167,1021.539615
4,2018-01-05,0.584646,5.053948,-2.273062,0.316688,0.087646,2.280937,46.151746,30.785155,71.747365,40.40324,1014.525396


In [51]:
# 날짜 변수
weather2 = weatherindex2.merge(weather[['date', '연', '월', '분기', '요일', '공휴일여부', '주말여부', '계절']], on="date", how='left').drop_duplicates().reset_index(drop=True)

# 날짜 더미변수 전처리 
weather2 = pd.get_dummies(weather2, columns=['연'], prefix='연', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['월'], prefix='월', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['분기'], prefix='분기', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['요일'], prefix='요일', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['계절'], prefix='계절', drop_first=True) 

print(weather2.shape)
weather2.head()

(730, 38)


Unnamed: 0,date,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa,공휴일여부,주말여부,연_2019,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12,분기_2,분기_3,분기_4,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6,계절_1,계절_2,계절_3
0,2018-01-01,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2018-01-02,1.208896,6.014385,-3.135167,0.0,0.0,1.903604,59.179487,33.836826,74.169781,37.996677,1021.524802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
2,2018-01-03,-0.798604,3.236083,-3.619615,0.0,0.0,2.623562,32.757902,18.242391,59.310396,53.216885,1023.2775,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,2018-01-04,-1.075625,2.345708,-4.636646,0.0,0.0,1.842729,38.685406,23.045234,63.96175,33.707167,1021.539615,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
4,2018-01-05,0.584646,5.053948,-2.273062,0.316688,0.087646,2.280937,46.151746,30.785155,71.747365,40.40324,1014.525396,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [53]:
# 검색량 데이터 + 날씨 지수 데이터 
final = search.merge(weather2, on='date', how='left')

print(final.shape)
final.head()

(2568668, 49)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa,공휴일여부,주말여부,연_2019,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12,분기_2,분기_3,분기_4,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6,계절_1,계절_2,계절_3
0,2018-01-01,F,20,식품,가공란,37.0,0.0,10.69387,4.09137,0.0,0.0,0.0444,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2018-01-01,F,30,식품,가공란,16.0,0.0,12.42969,6.35294,0.0,0.51282,0.42417,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,2018-01-01,F,40,식품,가공란,9.0,0.0,13.02249,4.88037,0.0,0.4065,0.0,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,2018-01-01,F,50,식품,가공란,3.0,0.0,11.6908,4.5335,1.21951,0.90909,0.0,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,2018-01-01,M,20,식품,가공란,13.0,0.0,11.23032,3.61296,0.3891,2.22222,0.0406,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [54]:
# 공통 더미변수 전처리 
final["sex"] = final["sex"].progress_apply(lambda x : 1 if x=='F' else 0)
final = pd.get_dummies(final, columns=['age'], prefix='age', drop_first=False) 
final = final.drop(['age_60'], axis=1)

100%|██████████| 2568668/2568668 [00:02<00:00, 1103261.28it/s]


In [55]:
# 날씨에 영향 받는/받지 않는 상품군 나누기 
final_weather = final[final.sm_cat.isin(list(product.sm_cat.unique()))]
final_steady = final[final.sm_cat.isin(list(steady.sm_cat.unique()))]

print(final_weather.shape, final_steady.shape)
final.shape[0] == final_weather.shape[0] + final_steady.shape[0]

(1303336, 52) (1265332, 52)


True

In [72]:
# 최종 dataframe 
final.to_csv("final_0613.csv", index=False, encoding="cp949")

# 날씨에 영향 받는/받지 않는 상품군 
final_weather.to_csv("final_weather_0613.csv", index=False, encoding="cp949")
final_steady.to_csv("final_steady_0613.csv", index=False, encoding="cp949")

In [83]:
# 진짜 최종 dataframe 
corr_result = pd.read_csv("adf_corr_group_0617.csv", encoding="cp949")
corr_result.group.value_counts()

비정상_high    134
정상_high     130
정상_low       99
비정상_low      20
Name: group, dtype: int64

In [84]:
nonst_high = final[final.sm_cat.isin(list(corr_result[corr_result.group == "비정상_high"].sm_cat.unique()))]
nonst_low  = final[final.sm_cat.isin(list(corr_result[corr_result.group == "비정상_low"].sm_cat.unique()))]

st_high = final[final.sm_cat.isin(list(corr_result[corr_result.group == "정상_high"].sm_cat.unique()))]
st_low  = final[final.sm_cat.isin(list(corr_result[corr_result.group == "정상_low"].sm_cat.unique()))]


print(nonst_high.shape, nonst_low.shape, st_high.shape, st_low.shape)
final.shape[0] == nonst_high.shape[0] + nonst_low.shape[0] + st_high.shape[0] + st_low.shape[0] 

(913113, 52) (142880, 52) (871890, 52) (640785, 52)


True

In [85]:
nonst_high.to_csv("nonst_high_0617.csv", index=False, encoding="cp949")
nonst_low.to_csv("nonst_low_0617.csv", index=False, encoding="cp949")

st_high.to_csv("st_high_0617.csv", index=False, encoding="cp949")
st_low.to_csv("st_low_0617.csv", index=False, encoding="cp949")

## 2. 지역별 데이터프레임 생성 

In [69]:
def region_weather(loc):
    weather_true = weather.drop(['연', '월', '일', '분기', '요일', '공휴일명', '공휴일여부', '주말여부', '계절', 'year', 'month'], axis=1) 
    weather1 = pd.pivot_table(weather_true[weather_true['지역'] == loc], index='date', columns='지역').reset_index(level=0)
    weather1.columns = [l+"_"+w for l,w in zip(weather1.columns.get_level_values(1), weather1.columns.get_level_values(0))]
    return weather1.rename(columns = {'_date' : 'date'})

In [70]:
region_weather('서울')

Unnamed: 0,date,서울_1시간최대강수량,서울_1시간최대습도,서울_PM10,서울_PM25,서울_강수여부,서울_열지수,서울_일별강수량,서울_일조시간합,서울_지점번호,서울_체감온도,서울_최고기온,서울_최고현지기압,서울_최저기온,서울_평균기온,서울_평균풍속,서울_폭염여부
0,2018-01-01,0.000000,57.100000,42.307692,21.470696,0,-44.993999,0.000000,51.600000,108,9.649709,3.800000,1018.100000,-5.100000,-1.300000,1.400000,0
1,2018-01-02,0.000000,71.600000,40.470588,22.711397,0,-46.012971,0.000000,48.400000,108,8.521194,1.800000,1019.600000,-4.300000,-1.800000,1.800000,0
2,2018-01-03,0.000000,54.500000,35.242478,19.042478,0,-52.098536,0.000000,53.600000,108,5.540740,-0.400000,1021.900000,-7.100000,-4.700000,2.200000,0
3,2018-01-04,0.000000,57.000000,46.385027,25.381462,0,-52.095698,0.000000,41.500000,108,7.234635,-0.700000,1020.400000,-8.700000,-4.700000,1.400000,0
4,2018-01-05,0.000000,56.900000,57.841918,37.159858,0,-48.525270,0.000000,49.100000,108,7.839316,1.600000,1012.700000,-5.600000,-3.000000,1.700000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,2019-12-27,0.000000,64.000000,35.617137,23.327549,0,-45.813935,0.000000,56.500000,108,8.016649,2.600000,1016.900000,-4.600000,-1.700000,2.100000,0
726,2019-12-28,0.000000,59.500000,44.451233,25.655949,0,-40.073796,0.000000,39.900000,108,10.304818,6.100000,1018.600000,-3.600000,1.100000,2.000000,0
727,2019-12-29,0.300000,75.200000,45.688437,27.728051,1,-34.620696,0.900000,0.000000,108,10.905129,6.200000,1018.300000,1.100000,3.800000,2.900000,0
728,2019-12-30,0.500000,89.500000,37.671010,25.619978,1,-36.810104,0.400000,0.000000,108,10.019105,6.800000,1018.400000,-5.700000,2.700000,2.900000,0


In [71]:
# 지역별로 붙여서 모델링 
final_weather.merge(region_weather('서울'), on="date", how='left')

Unnamed: 0,date,sex,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa,공휴일여부,주말여부,연_2019,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12,분기_2,분기_3,분기_4,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6,계절_1,계절_2,계절_3,age_20,age_30,age_40,age_50,서울_1시간최대강수량,서울_1시간최대습도,서울_PM10,서울_PM25,서울_강수여부,서울_열지수,서울_일별강수량,서울_일조시간합,서울_지점번호,서울_체감온도,서울_최고기온,서울_최고현지기압,서울_최저기온,서울_평균기온,서울_평균풍속,서울_폭염여부
0,2018-01-01,1,냉난방가전,가열식 가습기,1.000000,24.759610,10.693870,4.091370,0.000000,0.000000,0.044400,0.885198,5.464146,-2.880740,0.000000,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0.000000,57.100000,42.307692,21.470696,0,-44.993999,0.000000,51.600000,108,9.649709,3.800000,1018.100000,-5.100000,-1.300000,1.400000,0
1,2018-01-01,1,냉난방가전,가열식 가습기,0.000000,17.938930,12.429690,6.352940,0.000000,0.512820,0.424170,0.885198,5.464146,-2.880740,0.000000,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0.000000,57.100000,42.307692,21.470696,0,-44.993999,0.000000,51.600000,108,9.649709,3.800000,1018.100000,-5.100000,-1.300000,1.400000,0
2,2018-01-01,1,냉난방가전,가열식 가습기,3.000000,11.070550,13.022490,4.880370,0.000000,0.406500,0.000000,0.885198,5.464146,-2.880740,0.000000,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.000000,57.100000,42.307692,21.470696,0,-44.993999,0.000000,51.600000,108,9.649709,3.800000,1018.100000,-5.100000,-1.300000,1.400000,0
3,2018-01-01,1,냉난방가전,가열식 가습기,0.000000,12.605040,11.690800,4.533500,1.219510,0.909090,0.000000,0.885198,5.464146,-2.880740,0.000000,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0.000000,57.100000,42.307692,21.470696,0,-44.993999,0.000000,51.600000,108,9.649709,3.800000,1018.100000,-5.100000,-1.300000,1.400000,0
4,2018-01-01,1,냉난방가전,가열식 가습기,0.000000,4.255310,13.927120,5.478870,0.000000,0.000000,0.000000,0.885198,5.464146,-2.880740,0.000000,0.000000,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.000000,57.100000,42.307692,21.470696,0,-44.993999,0.000000,51.600000,108,9.649709,3.800000,1018.100000,-5.100000,-1.300000,1.400000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303331,2019-12-31,0,냉난방가전,히터,8.000000,25.602960,28.469310,1.273920,1.361860,7.407400,0.081200,-3.153771,0.872542,-5.469094,0.000000,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0.000000,67.600000,25.641304,14.782609,0,-58.910956,0.000000,57.400000,108,1.016984,-4.500000,1025.900000,-10.900000,-7.900000,3.100000,0
1303332,2019-12-31,0,냉난방가전,히터,22.000000,24.163560,31.705780,1.416210,0.000000,3.636360,0.000000,-3.153771,0.872542,-5.469094,0.000000,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0.000000,67.600000,25.641304,14.782609,0,-58.910956,0.000000,57.400000,108,1.016984,-4.500000,1025.900000,-10.900000,-7.900000,3.100000,0
1303333,2019-12-31,0,냉난방가전,히터,38.000000,25.116270,37.039630,1.604750,0.000000,4.909560,0.178140,-3.153771,0.872542,-5.469094,0.000000,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0.000000,67.600000,25.641304,14.782609,0,-58.910956,0.000000,57.400000,108,1.016984,-4.500000,1025.900000,-10.900000,-7.900000,3.100000,0
1303334,2019-12-31,0,냉난방가전,히터,23.000000,25.185180,49.942120,2.410130,0.000000,4.814810,0.106490,-3.153771,0.872542,-5.469094,0.000000,0.005885,3.414438,19.590719,10.860786,61.658000,47.797865,927.437910,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0.000000,67.600000,25.641304,14.782609,0,-58.910956,0.000000,57.400000,108,1.016984,-4.500000,1025.900000,-10.900000,-7.900000,3.100000,0
