# #1. Data Preprocessing 
1. 날씨 : (기온, 강수량, 풍속)  
2. 외부이슈 : 네이버 검색어트렌드 
3. final_0620 데이터 생성 

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd 
import numpy as np
pd.set_option("max_rows", 500)
pd.set_option("max_columns", 500)
pd.set_option('float_format', '{:f}'.format)

import dataload

import datetime
from tqdm import tqdm 
from functools import reduce
tqdm.pandas()

# crawling 
import os
import sys
import urllib.request

import re
from bs4 import BeautifulSoup

import json
from pandas.io.json import json_normalize

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt 
plt.style.use('seaborn')
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

# 1. 날씨 

In [None]:
def load_file(weather) : 
    
    '''
    weather : 추출하고 싶은 날씨 데이터 변수 
    '''
    
    # folder path 설정 
    file_path = os.path.join(os.getcwd(), 'weather')
    file_list = os.listdir(file_path)
    
    # startswith('i') : i로 시작하는 파일 모두 불러오기 / endswith('csv') : csv 파일 모두 불러오기 
    data_file = sorted([file for file in file_list if file.startswith(weather)])
 
    # csv 파일들을 DataFrame으로 불러와서 concat
    df = pd.DataFrame()
    for d in data_file:
        data = pd.read_csv(os.path.join(file_path, d))
        df = pd.concat([df, data], axis=0).reset_index(drop=True)
    
    # index column 삭제 ..ㅎㅎ 
    df = df.drop('Unnamed: 0', axis=1)
    
    return df

In [281]:
''' 해당 지역, 해당 년월의 평균치로 결측치를 대체하는 함수'''

def missing_value(data, aws_id, year, month, var):
    data['year'] = data['date'].apply(lambda x : x.year)
    data['month'] = data['date'].apply(lambda x : x.month)
    sample = data[(data.stn_id==aws_id)&(data.year==year)&(data.month==month)]
    alt = sample[var].mean() # 평균
    # 대체
    data[(data.stn_id==aws_id)&(data.year==year)&(data.month==month)] = data[(data.stn_id==aws_id)&(data.year==year)&(data.month==month)].fillna(alt) 
    data = data.drop(['year', 'month'], axis=1) # 월 변수 제거
    return data

In [194]:
temp = load_file('temp')
wind = load_file('wind')
rain = load_file('rain')
forecast = load_file('forecast')

temp.shape, wind.shape, rain.shape, forecast.shape

((17536, 5), (17536, 3), (420864, 4), (256464, 5))

## 1. 기온
* 일별 데이터 : 시각, 지점번호, 평균기온, 최고기온, 최저기온

In [195]:
tqdm.pandas()
temp['date'] = temp['tma'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████| 17536/17536 [00:01<00:00, 14506.81it/s]


In [196]:
'''
avg_ta : 평균기온 
max_ta : 최고기온 
min_ta : 최저기온 
'''

temp.head().append(temp.tail())

Unnamed: 0,tma,stn_id,avg_ta,max_ta,min_ta,date
0,2018-01-01 00:00:00.0,105,1.3,5.7,-2.1,2018-01-01
1,2018-01-01 00:00:00.0,112,-0.3,2.7,-2.7,2018-01-01
2,2018-01-01 00:00:00.0,119,-1.7,4.7,-6.9,2018-01-01
3,2018-01-01 00:00:00.0,136,-1.0,4.7,-6.5,2018-01-01
4,2018-01-01 00:00:00.0,152,2.1,6.2,-0.4,2018-01-01
17531,2020-12-31 00:00:00.0,152,-3.8,1.5,-8.9,2020-12-31
17532,2020-12-31 00:00:00.0,155,-4.1,2.2,-9.8,2020-12-31
17533,2020-12-31 00:00:00.0,159,-2.9,2.9,-8.0,2020-12-31
17534,2020-12-31 00:00:00.0,168,-2.1,2.8,-7.0,2020-12-31
17535,2020-12-31 00:00:00.0,184,2.9,4.6,-0.3,2020-12-31


In [197]:
print("관측 지역 수 : ", temp.stn_id.nunique())
print(temp.stn_id.unique())

관측 지역 수 :  16
[105 112 119 136 152 184 232 131 143 155 168 159 108 156 146 133]


In [206]:
temp.isnull().sum()

tma       0
stn_id    0
avg_ta    2
max_ta    0
min_ta    1
date      0
dtype: int64

In [207]:
temp[temp["avg_ta"].isna()]

Unnamed: 0,tma,stn_id,avg_ta,max_ta,min_ta,date
8449,2019-11-25 00:00:00.0,232,,6.8,0.5,2019-11-25
8458,2019-11-26 00:00:00.0,232,,13.4,,2019-11-26


In [233]:
# 실제 기상청에도 자료가 없어서, avg_ta = (max+min)/2 로 결측값 채움 
temp.loc[8449, 'avg_ta'] = (temp.loc[8449]['max_ta'] + temp.loc[8449]['min_ta']) / 2 

temp.loc[8458, 'min_ta'] = 8.2
temp.loc[8458, 'avg_ta'] = (temp.loc[8458]['max_ta'] + temp.loc[8458]['min_ta']) / 2 

In [234]:
temp[temp["avg_ta"].isna()]

Unnamed: 0,tma,stn_id,avg_ta,max_ta,min_ta,date


In [238]:
temp.isnull().sum()

tma       0
stn_id    0
avg_ta    0
max_ta    0
min_ta    0
date      0
dtype: int64

## 2. 강수량
* 관측시간, AWS번호, RN_DAY (누적 강수량, 마지막 23시 데이터), RN_HR1 (1시간 강수량, 일별로 최대값)

In [198]:
rain['date'] = rain['tm'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████| 420864/420864 [00:27<00:00, 15085.17it/s]


In [199]:
print("관측 지역 수 : ", rain.aws_id.nunique())
print(rain.aws_id.unique())

관측 지역 수 :  16
[133 119 136 155 105 108 156 184 159 112 232 131 143 146 152 168]


In [202]:
rain = rain.sort_values('tm').groupby(['date', 'aws_id']).agg({
        'rn_day' : [('rn_day', 'last')], 
        'rn_hr1' : [('rn_hr1', np.max)]
}).reset_index()
rain.columns = rain.columns.get_level_values(level=0)

In [203]:
'''
rn_day : 일별 강수량 
rn_hr1 : 일별 1시간 최다 강수량 
'''

print(rain.shape)
rain.head().append(rain.tail())

(17536, 4)


Unnamed: 0,date,aws_id,rn_day,rn_hr1
0,2018-01-01,105,0.0,0.0
1,2018-01-01,108,0.0,0.0
2,2018-01-01,112,0.0,0.0
3,2018-01-01,119,0.0,0.0
4,2018-01-01,131,0.0,0.0
17531,2020-12-31,156,0.6,0.3
17532,2020-12-31,159,0.0,0.0
17533,2020-12-31,168,0.0,0.0
17534,2020-12-31,184,3.8,2.4
17535,2020-12-31,232,0.0,0.0


In [205]:
rain[rain["rn_day"] != 0].head().append(rain[rain["rn_day"] != 0].tail())

Unnamed: 0,date,aws_id,rn_day,rn_hr1
76,2018-01-05,159,0.8,0.4
78,2018-01-05,184,4.2,1.0
107,2018-01-07,156,2.5,1.5
109,2018-01-07,168,0.9,0.9
110,2018-01-07,184,3.2,1.4
17518,2020-12-30,184,4.3,1.9
17528,2020-12-31,146,0.7,0.5
17529,2020-12-31,152,,
17531,2020-12-31,156,0.6,0.3
17534,2020-12-31,184,3.8,2.4


In [208]:
# 결측값 확인 
rain.isnull().sum()

date       0
aws_id     0
rn_day    16
rn_hr1    17
dtype: int64

In [211]:
# 결측값 0 으로 채우기
rain = rain.fillna(0)

In [212]:
rain.isnull().sum()

date      0
aws_id    0
rn_day    0
rn_hr1    0
dtype: int64

## 3. 풍속 
* 관측시간, 지점번호, 평균풍속

In [213]:
wind['date'] = wind['tma'].progress_apply(lambda x : pd.to_datetime(x[:10]))

100%|██████████| 17536/17536 [00:01<00:00, 15200.28it/s]


In [214]:
'''
avg_ws : 평균풍속 
'''
wind.head().append(wind.tail())

Unnamed: 0,tma,stn_id,avg_ws,date
0,2018-01-01 00:00:00.0,112,1.6,2018-01-01
1,2018-01-01 00:00:00.0,155,1.8,2018-01-01
2,2018-01-01 00:00:00.0,168,4.5,2018-01-01
3,2018-01-01 00:00:00.0,184,3.5,2018-01-01
4,2018-01-02 00:00:00.0,119,1.6,2018-01-02
17531,2020-12-30 00:00:00.0,232,3.3,2020-12-30
17532,2020-12-31 00:00:00.0,112,3.0,2020-12-31
17533,2020-12-31 00:00:00.0,152,3.3,2020-12-31
17534,2020-12-31 00:00:00.0,155,2.0,2020-12-31
17535,2020-12-31 00:00:00.0,159,4.1,2020-12-31


In [215]:
print("관측 지역 수 : ", wind.stn_id.nunique())
print(wind.stn_id.unique())

관측 지역 수 :  16
[112 155 168 184 119 136 156 105 131 133 143 146 159 232 108 152]


In [216]:
wind.isnull().sum()

tma        0
stn_id     0
avg_ws    11
date       0
dtype: int64

In [284]:
wind = missing_value(wind, 112, 2018, 7, 'avg_ws')
wind = missing_value(wind, 143, 2019, 4, 'avg_ws')
wind = missing_value(wind, 133, 2019, 5, 'avg_ws')
wind = missing_value(wind, 152, 2019, 6, 'avg_ws')
wind = missing_value(wind, 131, 2019, 7, 'avg_ws')
wind = missing_value(wind, 232, 2019, 11, 'avg_ws')
wind = missing_value(wind, 133, 2019, 11, 'avg_ws')
# wind = missing_value(wind, 159, 2020, 3, 'avg_ws')

In [285]:
wind.isnull().sum()

tma       0
stn_id    0
avg_ws    0
date      0
dtype: int64

In [287]:
wind.loc[3793]

tma       2018-07-30 00:00:00.0
stn_id                      112
avg_ws                 2.563333
date        2018-07-30 00:00:00
Name: 3793, dtype: object

## Data Merge 

In [289]:
rain = rain.rename(columns = {'aws_id' : 'stn_id'})

In [290]:
weather = reduce(lambda left, right: pd.merge(left, right, on=['date', 'stn_id']), [temp.drop("tma", axis=1), rain, wind.drop("tma", axis=1)])
weather = weather.reindex(columns=['date', 'stn_id', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws'])

In [292]:
print(weather.shape)
weather.head().append(weather.tail())

(11680, 8)


Unnamed: 0,date,stn_id,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3
11675,2019-12-31,112,-7.3,-5.3,-9.9,0.0,0.0,5.0
11676,2019-12-31,131,-5.7,-1.4,-8.0,0.0,0.0,2.2
11677,2019-12-31,143,-2.5,2.6,-5.1,0.0,0.0,5.2
11678,2019-12-31,155,-1.7,4.8,-4.0,0.0,0.0,2.5
11679,2019-12-31,184,2.9,6.4,1.8,0.0,0.0,5.4


In [296]:
weather.to_csv('weather1819.csv', index=False)

# 2. 검색어 트렌드 크롤링 
* https://datalab.naver.com/keyword/trendSearch.naver 
* **sm_cat** 에 해당하는 검색어 트렌드 값 산출 
* `/` 로 나뉘어 있는 경우, 평균값 사용

## Data Load 

In [2]:
# Data Load & Get Category
buy_full = dataload.load_buy()
print(buy_full.shape)

food = list(buy_full[buy_full["big_cat"]=="식품"].sm_cat.unique())
beauty = list(buy_full[buy_full["big_cat"]=="뷰티"].sm_cat.unique())
app = list(buy_full[buy_full["big_cat"]=="냉난방가전"].sm_cat.unique())
print("food : {} | beauty : {} | app : {}".format(len(food), len(beauty), len(app)))

(2056899, 6)
food : 212 | beauty : 131 | app : 40


In [3]:
buy_full.head(10)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty
0,20180101,F,20,식품,가공란,37
1,20180101,F,30,식품,가공란,16
2,20180101,F,40,식품,가공란,9
3,20180101,F,50,식품,가공란,3
4,20180101,M,20,식품,가공란,13
5,20180101,M,30,식품,가공란,6
6,20180101,M,40,식품,가공란,6
7,20180101,F,20,냉난방가전,가열식 가습기,1
8,20180101,F,40,냉난방가전,가열식 가습기,3
9,20180101,M,20,냉난방가전,가열식 가습기,1


In [73]:
# original category 
pd.DataFrame(food).to_csv("food_category.csv", encoding="cp949")
pd.DataFrame(beauty).to_csv("beauty_category.csv", encoding="cp949")
pd.DataFrame(app).to_csv("app_category.csv", encoding="cp949")

In [169]:
# category 수정 
food2 = pd.read_csv("food_category.csv", encoding="cp949")
beauty2 = pd.read_csv("beauty_category.csv", encoding="cp949")
app2 = pd.read_csv("app_category.csv", encoding="cp949")

In [170]:
food2 = food2['0'].tolist()
beauty2 = beauty2['0'].tolist()
app2 = app2['0'].tolist()

print("food : {} | beauty : {} | app : {}".format(len(food2), len(beauty2), len(app2)))

food : 212 | beauty : 131 | app : 40


In [171]:
# original category 정보 저장 
food_cat = pd.concat([pd.DataFrame(food), pd.DataFrame(food2)], axis=1)
beauty_cat = pd.concat([pd.DataFrame(beauty), pd.DataFrame(beauty2)], axis=1)
app_cat = pd.concat([pd.DataFrame(app), pd.DataFrame(app2)], axis=1)

## Crawling 

In [172]:
client_id = "???" ; client_secret = "???"

In [173]:
def keyword_search(client_id, client_secret, keyword) : 
    ages = {20:['3','4'], 30:['5','6'], 40:['7','8'], 50:['9','10'], 60:['11']} ; genders = ["f", "m"]
    df = pd.DataFrame()
    
    for age, age_list in ages.items() : 
        for gender in genders : 
            client_id = client_id ; client_secret = client_secret
            url = "https://openapi.naver.com/v1/datalab/search"
            body = {"startDate":"2018-01-01",
                    "endDate":"2019-12-31",
                    "timeUnit":"date",
                    "keywordGroups":[{"groupName":keyword,"keywords":[keyword]}],
                    "ages": age_list ,
                    "gender": gender}
            body_j = json.dumps(body)

            # get results 
            request = urllib.request.Request(url)
            request.add_header("X-Naver-Client-Id",client_id)
            request.add_header("X-Naver-Client-Secret",client_secret)
            request.add_header("Content-Type","application/json")
            response = urllib.request.urlopen(request, data=body_j.encode("utf-8"))
            rescode = response.getcode()

            if(rescode==200):
                response_body = response.read()
                scraped = response_body.decode('utf-8')
                
                try : 
                    result = json_normalize(json.loads(scraped)["results"][0]["data"])
                    result.insert(1, "age", age) ; result.insert(1, "sex", gender.upper()) 
                    df = df.append(result, ignore_index=True)
                except : 
                    pass

            else:
                print("Error Code:" + rescode) 
    
    try : 
        df.insert(3, "sm_cat", keyword)
    except : 
        pass 
    
    return df

In [179]:
def category_search(big_cat) : 
    df = pd.DataFrame()
    
    for sm_cat in tqdm(big_cat) : 
        try: 
            if not '/' in sm_cat : 
                tmp = keyword_search(client_id, client_secret, sm_cat)
                df = df.append(tmp, ignore_index=True)

            # '/' 나뉘어져 있는 경우 
            else : 
                tmp2 = pd.DataFrame()
                sm_cats = sm_cat.split('/')
                for sm in sm_cats : 
                    tmp = keyword_search(client_id, client_secret, sm)
                    tmp2 = tmp2.append(tmp, ignore_index=True)
                    
                try : 
                    tmp2 = tmp2.groupby(["period", "sex", "age"]).agg({'ratio' : [('ratio', np.mean)]}).reset_index()
                    tmp2.columns = tmp2.columns.get_level_values(0)
                    tmp2.insert(3, "sm_cat", sm_cat)
                except : 
                    pass

                df = df.append(tmp2, ignore_index=True)
        
        except urllib.request.HTTPError : 
            print("Too Many Requests - at {}".format(big_cat.index(sm_cat)))
            
    if big_cat[0] in food2 : df.insert(3, "big_cat", "식품")
    elif big_cat[0] in beauty2 : df.insert(3, "big_cat", "뷰티")
    elif big_cat[0] in app2 : df.insert(3, "big_cat", "냉난방가전")
    else : df.insert(3, "big_cat", "날씨")
        
    return df 

## 1. 날씨 정보 

In [65]:
weather_keywords = ["오늘날씨", "미세먼지", "오늘 비", "기상정보", "태풍정보"]
weather_search = category_search(weather_keywords)

100%|██████████| 5/5 [00:21<00:00,  4.28s/it]


In [66]:
print(weather_search.shape)
weather_search.head(10)

(33070, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,날씨,오늘날씨,10.69387
1,2018-01-02,F,20,날씨,오늘날씨,16.83434
2,2018-01-03,F,20,날씨,오늘날씨,13.63583
3,2018-01-04,F,20,날씨,오늘날씨,12.33925
4,2018-01-05,F,20,날씨,오늘날씨,12.84071
5,2018-01-06,F,20,날씨,오늘날씨,12.89477
6,2018-01-07,F,20,날씨,오늘날씨,19.33421
7,2018-01-08,F,20,날씨,오늘날씨,34.82321
8,2018-01-09,F,20,날씨,오늘날씨,22.80943
9,2018-01-10,F,20,날씨,오늘날씨,16.20355


In [67]:
weather_search.sm_cat.value_counts()

미세먼지    7300
오늘날씨    7300
기상정보    6928
태풍정보    6143
오늘 비    5399
Name: sm_cat, dtype: int64

In [68]:
weather_search.to_csv("weather_trend.csv", index=False, encoding='cp949')

## 2. 검색어 트렌드 

In [120]:
# original category 와 수정된 category 이름 matching 
def to_original_category_name(searchdf, categorydf) : 
    categorydf.columns = ["original", "rename"]
    searchdf = searchdf.merge(categorydf, left_on="sm_cat", right_on="rename").drop(["sm_cat", "rename"], axis=1).rename(columns={'original' : 'sm_cat'})
    searchdf = searchdf[['period', 'sex', 'age', 'big_cat', 'sm_cat', 'ratio']]
    return searchdf

### (1) 가전 

In [97]:
app_search = category_search(app2)

100%|██████████| 40/40 [01:55<00:00,  2.89s/it]


In [121]:
app_search2 = to_original_category_name(app_search, app_cat)

In [122]:
print(app_search2.shape)
app_search2.head()

(208971, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,냉난방가전,가열식 가습기,24.75961
1,2018-01-02,F,20,냉난방가전,가열식 가습기,27.64423
2,2018-01-03,F,20,냉난방가전,가열식 가습기,30.16826
3,2018-01-04,F,20,냉난방가전,가열식 가습기,35.8173
4,2018-01-05,F,20,냉난방가전,가열식 가습기,31.37019


In [123]:
app_search2.to_csv("app_trend.csv", index=False, encoding='cp949')

### (2) 뷰티 

In [130]:
beauty_search1 = category_search(beauty2[:90])

100%|██████████| 90/90 [02:14<00:00,  1.50s/it]


In [149]:
beauty_search2 = category_search(beauty2[90:])

100%|██████████| 41/41 [01:06<00:00,  1.63s/it]


In [151]:
beauty_search = pd.concat([beauty_search1, beauty_search2], axis=0).reset_index(drop=True)
beauty_search = to_original_category_name(beauty_search, beauty_cat)

print(beauty_search.shape)
beauty_search.head()

(718099, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,뷰티,기능성 링클케어 화장품,25.0
1,2018-01-02,F,20,뷰티,기능성 링클케어 화장품,25.0
2,2018-01-03,F,20,뷰티,기능성 링클케어 화장품,50.0
3,2018-01-04,F,20,뷰티,기능성 링클케어 화장품,58.33333
4,2018-01-05,F,20,뷰티,기능성 링클케어 화장품,8.33333


In [152]:
beauty_search.to_csv("beauty_trend.csv", index=False, encoding='cp949')

### (3) 식품 

In [175]:
food_search1 = category_search(food2[:70])

100%|██████████| 70/70 [05:11<00:00,  4.46s/it]


In [182]:
food_search2 = category_search(food2[70:137])

100%|██████████| 67/67 [04:19<00:00,  3.87s/it]


In [184]:
food_search3 = category_search(food2[137:])

 95%|█████████▍| 71/75 [05:20<00:09,  2.46s/it]

Too Many Requests - at 69
Too Many Requests - at 70


 97%|█████████▋| 73/75 [05:20<00:02,  1.27s/it]

Too Many Requests - at 71
Too Many Requests - at 72


100%|██████████| 75/75 [05:20<00:00,  4.28s/it]

Too Many Requests - at 73
Too Many Requests - at 74





In [192]:
food_search4 = category_search(food2[-6:])

100%|██████████| 6/6 [00:22<00:00,  3.72s/it]


In [194]:
food_search = pd.concat([food_search1,food_search2, food_search3, food_search4], axis=0).reset_index(drop=True)
food_search = to_original_category_name(food_search, food_cat)

print(food_search.shape)
food_search.head()

(1326674, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-06-18,F,20,식품,가공란,75.0
1,2018-06-19,F,20,식품,가공란,75.0
2,2018-07-03,F,20,식품,가공란,50.0
3,2019-05-28,F,20,식품,가공란,100.0
4,2018-06-18,M,20,식품,가공란,100.0


In [195]:
food_search.to_csv("food_trend.csv", index=False, encoding='cp949')

## Data Merge 

### 1. 검색어 트렌드 

In [199]:
buy_full['date'] = buy_full['date'].progress_apply(lambda x : pd.to_datetime(str(x), format='%Y%m%d'))

100%|██████████| 2056899/2056899 [03:23<00:00, 10095.74it/s]


In [196]:
search = pd.concat([app_search2, beauty_search, food_search], axis=0).reset_index(drop=True)
search.shape

(2253744, 6)

In [208]:
search = search.rename(columns = {'period' : 'date'})
search['date'] = search['date'].progress_apply(lambda x : pd.to_datetime(str(x), format='%Y-%m-%d'))

100%|██████████| 2253744/2253744 [02:43<00:00, 13780.29it/s]


In [242]:
trend = buy_full.merge(search, on=["date", "sex", "age", "big_cat", "sm_cat"], how='outer').fillna(0)
trend = trend.sort_values(['date', 'sm_cat', 'sex', 'age'], ascending=True).reset_index(drop=True)

print(trend.shape)
trend.head()

(2568668, 7)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio
0,2018-01-01,F,20,식품,가공란,37.0,0.0
1,2018-01-01,F,30,식품,가공란,16.0,0.0
2,2018-01-01,F,40,식품,가공란,9.0,0.0
3,2018-01-01,F,50,식품,가공란,3.0,0.0
4,2018-01-01,M,20,식품,가공란,13.0,0.0


In [None]:
# Data Save 
trend.to_csv(os.path.join("외부데이터", "trendsearch.csv"), index=False, encoding='cp949')

### 2. 날씨 정보 

In [234]:
weather_search = weather_search.rename(columns = {'period' : 'date'})
weather_search['date'] = weather_search['date'].progress_apply(lambda x : pd.to_datetime(str(x), format='%Y-%m-%d'))

100%|██████████| 33070/33070 [00:03<00:00, 10952.54it/s]


In [248]:
for w_trend in weather_search.sm_cat.unique() : 
    tmp = weather_search[weather_search["sm_cat"] == w_trend][['date', 'sex', 'age', 'ratio']].rename(columns = {'ratio' : w_trend})
    trend = trend.merge(tmp, on=['date', 'sex', 'age'], how='left')

In [249]:
trend = trend.sort_values(['date', 'sm_cat', 'sex', 'age'], ascending=True).reset_index(drop=True)
print(trend.shape)
trend.head().append(trend.tail())

(2568668, 12)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보
0,2018-01-01,F,20,식품,가공란,37.0,0.0,10.69387,4.09137,,,0.0444
1,2018-01-01,F,30,식품,가공란,16.0,0.0,12.42969,6.35294,,0.51282,0.42417
2,2018-01-01,F,40,식품,가공란,9.0,0.0,13.02249,4.88037,,0.4065,
3,2018-01-01,F,50,식품,가공란,3.0,0.0,11.6908,4.5335,1.21951,0.90909,
4,2018-01-01,M,20,식품,가공란,13.0,0.0,11.23032,3.61296,0.3891,2.22222,0.0406
2568663,2019-12-31,M,20,냉난방가전,히터,8.0,25.60296,28.46931,1.27392,1.36186,7.4074,0.0812
2568664,2019-12-31,M,30,냉난방가전,히터,22.0,24.16356,31.70578,1.41621,,3.63636,
2568665,2019-12-31,M,40,냉난방가전,히터,38.0,25.11627,37.03963,1.60475,,4.90956,0.17814
2568666,2019-12-31,M,50,냉난방가전,히터,23.0,25.18518,49.94212,2.41013,,4.81481,0.10649
2568667,2019-12-31,M,60,냉난방가전,히터,10.0,21.95121,67.48388,4.62476,,19.13043,0.69605


In [250]:
trend = trend.fillna(0)

In [251]:
trend.head().append(trend.tail())

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보
0,2018-01-01,F,20,식품,가공란,37.0,0.0,10.69387,4.09137,0.0,0.0,0.0444
1,2018-01-01,F,30,식품,가공란,16.0,0.0,12.42969,6.35294,0.0,0.51282,0.42417
2,2018-01-01,F,40,식품,가공란,9.0,0.0,13.02249,4.88037,0.0,0.4065,0.0
3,2018-01-01,F,50,식품,가공란,3.0,0.0,11.6908,4.5335,1.21951,0.90909,0.0
4,2018-01-01,M,20,식품,가공란,13.0,0.0,11.23032,3.61296,0.3891,2.22222,0.0406
2568663,2019-12-31,M,20,냉난방가전,히터,8.0,25.60296,28.46931,1.27392,1.36186,7.4074,0.0812
2568664,2019-12-31,M,30,냉난방가전,히터,22.0,24.16356,31.70578,1.41621,0.0,3.63636,0.0
2568665,2019-12-31,M,40,냉난방가전,히터,38.0,25.11627,37.03963,1.60475,0.0,4.90956,0.17814
2568666,2019-12-31,M,50,냉난방가전,히터,23.0,25.18518,49.94212,2.41013,0.0,4.81481,0.10649
2568667,2019-12-31,M,60,냉난방가전,히터,10.0,21.95121,67.48388,4.62476,0.0,19.13043,0.69605


In [252]:
trend.to_csv(os.path.join("외부데이터", "trend_with_weather.csv"), index=False, encoding='cp949')

# 3. Final Dataset 

In [2]:
# Data Load 
weather = pd.read_csv('weather_final2.csv', encoding='cp949')
weatherindex = pd.read_csv('data_with_weather.csv', encoding='cp949')
trend = pd.read_csv('trend_with_weather.csv', encoding='cp949')

weather.shape, weatherindex.shape, trend.shape

((11680, 29), (2056899, 18), (2568668, 12), (192, 9), (191, 9))

In [8]:
weather = weather.rename(columns = {"날짜" : "date"})

tqdm.pandas()
weather["date"] = weather["date"].progress_apply(lambda x : pd.to_datetime(str(x), format="%Y-%m-%d"))
weatherindex["date"] = weatherindex["date"].progress_apply(lambda x : pd.to_datetime(str(x), format="%Y-%m-%d"))
trend["date"] = trend["date"].progress_apply(lambda x : pd.to_datetime(str(x), format="%Y-%m-%d"))

100%|██████████| 11680/11680 [00:00<00:00, 13172.70it/s]
100%|██████████| 2056899/2056899 [02:27<00:00, 13922.19it/s]
100%|██████████| 2568668/2568668 [03:02<00:00, 14074.06it/s]


In [107]:
weather.head()

Unnamed: 0,date,지점번호,평균기온,최고기온,최저기온,일별강수량,1시간최대강수량,평균풍속,지역,PM10,PM25,1시간최대습도,일조시간합,최고현지기압,연,월,일,분기,요일,공휴일명,공휴일여부,주말여부,계절,체감온도,열지수,폭염여부,강수여부,year,month
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7,강릉,20.066667,13.4,25.4,57.9,1023.0,2018,1,1,1,0,1월1일,1,0,3,7.502056,-39.701524,0,0,2018,1
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6,인천,37.518681,18.641758,67.2,53.8,1020.3,2018,1,1,1,0,1월1일,1,0,3,9.992379,-42.924922,0,0,2018,1
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0,수원,42.782895,21.375,84.7,52.7,1025.1,2018,1,1,1,0,1월1일,1,0,3,10.136402,-45.791893,0,0,2018,1
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2,안동,39.75,28.583333,56.4,58.3,1010.6,2018,1,1,1,0,1월1일,1,0,3,8.356692,-44.375704,0,0,2018,1
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3,울산,38.548193,15.433735,42.8,58.3,1017.1,2018,1,1,1,0,1월1일,1,0,3,8.861429,-38.064334,0,0,2018,1


## 1. Data Merge 

In [112]:
weather_cnt = weatherindex[["date", "sex", "age", "big_cat", "sm_cat", "qty", "cnt"]]

In [113]:
# 날씨 지수 데이터 
weatherindex2 = weatherindex[['date', 'avg_ta', 'max_ta', 'min_ta', 'rn_day', 'rn_hr1', 'avg_ws', 'PM10', 'PM25', 'hm_max', 'sum_ss_hr', 'max_pa']]
weatherindex2 = weatherindex2.drop_duplicates('date', keep='first').reset_index(drop=True)

print(weatherindex2.shape)
weatherindex2.head()

(730, 12)


Unnamed: 0,date,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa
0,2018-01-01,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385
1,2018-01-02,1.208896,6.014385,-3.135167,0.0,0.0,1.903604,59.179487,33.836826,74.169781,37.996677,1021.524802
2,2018-01-03,-0.798604,3.236083,-3.619615,0.0,0.0,2.623562,32.757902,18.242391,59.310396,53.216885,1023.2775
3,2018-01-04,-1.075625,2.345708,-4.636646,0.0,0.0,1.842729,38.685406,23.045234,63.96175,33.707167,1021.539615
4,2018-01-05,0.584646,5.053948,-2.273062,0.316688,0.087646,2.280937,46.151746,30.785155,71.747365,40.40324,1014.525396


In [114]:
# 날짜 변수
weather2 = weatherindex2.merge(weather[['date', '연', '월', '분기', '요일', '공휴일여부', '주말여부', '계절']], on="date", how='left').drop_duplicates().reset_index(drop=True)

# 날짜 더미변수 전처리 
weather2 = pd.get_dummies(weather2, columns=['연'], prefix='연', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['월'], prefix='월', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['분기'], prefix='분기', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['요일'], prefix='요일', drop_first=True) 
weather2 = pd.get_dummies(weather2, columns=['계절'], prefix='계절', drop_first=False) 

print(weather2.shape)
weather2.head()

(730, 39)


Unnamed: 0,date,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa,공휴일여부,주말여부,연_2019,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12,분기_2,분기_3,분기_4,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6,계절_0,계절_1,계절_2,계절_3
0,2018-01-01,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2018-01-02,1.208896,6.014385,-3.135167,0.0,0.0,1.903604,59.179487,33.836826,74.169781,37.996677,1021.524802,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
2,2018-01-03,-0.798604,3.236083,-3.619615,0.0,0.0,2.623562,32.757902,18.242391,59.310396,53.216885,1023.2775,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
3,2018-01-04,-1.075625,2.345708,-4.636646,0.0,0.0,1.842729,38.685406,23.045234,63.96175,33.707167,1021.539615,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,2018-01-05,0.584646,5.053948,-2.273062,0.316688,0.087646,2.280937,46.151746,30.785155,71.747365,40.40324,1014.525396,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [115]:
weather2 = weather2.rename(columns = {"계절_0" : "봄", "계절_1" : "여름", "계절_2" : "가을", "계절_3" : "겨울"})

In [116]:
# 검색량 데이터 + 날씨 지수 데이터 
final = trend.merge(weather2, on='date', how='left')
final = final.merge(weather_cnt, on=["date", "sex", "age", "big_cat", "sm_cat", "qty"], how='left')

print(final.shape)
final.head()

(2568668, 51)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa,공휴일여부,주말여부,연_2019,월_2,월_3,월_4,월_5,월_6,월_7,월_8,월_9,월_10,월_11,월_12,분기_2,분기_3,분기_4,요일_1,요일_2,요일_3,요일_4,요일_5,요일_6,봄,여름,가을,겨울,cnt
0,2018-01-01,F,20,식품,가공란,37.0,0.0,10.69387,4.09137,0.0,0.0,0.0444,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.480964
1,2018-01-01,F,30,식품,가공란,16.0,0.0,12.42969,6.35294,0.0,0.51282,0.42417,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.480964
2,2018-01-01,F,40,식품,가공란,9.0,0.0,13.02249,4.88037,0.0,0.4065,0.0,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.480964
3,2018-01-01,F,50,식품,가공란,3.0,0.0,11.6908,4.5335,1.21951,0.90909,0.0,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.480964
4,2018-01-01,M,20,식품,가공란,13.0,0.0,11.23032,3.61296,0.3891,2.22222,0.0406,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.480964


In [119]:
# 공통 더미변수 전처리 
final["sex"] = final["sex"].progress_apply(lambda x : 1 if x=='F' else 0)
final = pd.get_dummies(final, columns=['age'], prefix='age', drop_first=False)

100%|██████████| 2568668/2568668 [00:02<00:00, 1116496.12it/s]


In [125]:
# 최종 dataframe 
final.to_csv(os.path.join("최종데이터", "final_0622.csv"), index=False, encoding="cp949")

In [121]:
# 범주형 
weather3 = weatherindex2.merge(weather[['date', '연', '월', '분기', '요일', '공휴일여부', '주말여부', '계절']], on="date", how='left').drop_duplicates().reset_index(drop=True)

# 검색량 데이터 + 날씨 지수 데이터 
final2 = trend.merge(weather3, on='date', how='left')
final2 = final2.merge(weather_cnt, on=["date", "sex", "age", "big_cat", "sm_cat", "qty"], how='left')

print(final2.shape)
final2.head()

(2568668, 31)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보,avg_ta,max_ta,min_ta,rn_day,rn_hr1,avg_ws,PM10,PM25,hm_max,sum_ss_hr,max_pa,연,월,분기,요일,공휴일여부,주말여부,계절,cnt
0,2018-01-01,F,20,식품,가공란,37.0,0.0,10.69387,4.09137,0.0,0.0,0.0444,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,2018,1,1,0,1,0,3,0.480964
1,2018-01-01,F,30,식품,가공란,16.0,0.0,12.42969,6.35294,0.0,0.51282,0.42417,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,2018,1,1,0,1,0,3,0.480964
2,2018-01-01,F,40,식품,가공란,9.0,0.0,13.02249,4.88037,0.0,0.4065,0.0,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,2018,1,1,0,1,0,3,0.480964
3,2018-01-01,F,50,식품,가공란,3.0,0.0,11.6908,4.5335,1.21951,0.90909,0.0,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,2018,1,1,0,1,0,3,0.480964
4,2018-01-01,M,20,식품,가공란,13.0,0.0,11.23032,3.61296,0.3891,2.22222,0.0406,0.885198,5.464146,-2.88074,0.0,0.0,2.157094,44.015611,21.477208,61.923313,52.276365,1020.592385,2018,1,1,0,1,0,3,0.480964


In [124]:
final2.to_csv(os.path.join("최종데이터", "final_categorical_0622.csv"), index=False, encoding="cp949")