# 검색어 트렌드 크롤링 
* https://datalab.naver.com/keyword/trendSearch.naver 
* **sm_cat** 에 해당하는 검색어 트렌드 값 산출 
* `/` 로 나뉘어 있는 경우, 평균값 사용

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import sys
import urllib.request

import pandas as pd 
import numpy as np 
from tqdm import tqdm

import dataload

import re
from bs4 import BeautifulSoup

import json
from pandas.io.json import json_normalize

In [2]:
# Data Load & Get Category
buy_full = dataload.load_buy()
print(buy_full.shape)

food = list(buy_full[buy_full["big_cat"]=="식품"].sm_cat.unique())
beauty = list(buy_full[buy_full["big_cat"]=="뷰티"].sm_cat.unique())
app = list(buy_full[buy_full["big_cat"]=="냉난방가전"].sm_cat.unique())
print("food : {} | beauty : {} | app : {}".format(len(food), len(beauty), len(app)))

(2056899, 6)
food : 212 | beauty : 131 | app : 40


In [3]:
buy_full.head(10)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty
0,20180101,F,20,식품,가공란,37
1,20180101,F,30,식품,가공란,16
2,20180101,F,40,식품,가공란,9
3,20180101,F,50,식품,가공란,3
4,20180101,M,20,식품,가공란,13
5,20180101,M,30,식품,가공란,6
6,20180101,M,40,식품,가공란,6
7,20180101,F,20,냉난방가전,가열식 가습기,1
8,20180101,F,40,냉난방가전,가열식 가습기,3
9,20180101,M,20,냉난방가전,가열식 가습기,1


In [73]:
pd.DataFrame(food).to_csv("food_category.csv", encoding="cp949")
pd.DataFrame(beauty).to_csv("beauty_category.csv", encoding="cp949")
pd.DataFrame(app).to_csv("app_category.csv", encoding="cp949")

In [169]:
# category 수정 
food2 = pd.read_csv("food_category.csv", encoding="cp949")
beauty2 = pd.read_csv("beauty_category.csv", encoding="cp949")
app2 = pd.read_csv("app_category.csv", encoding="cp949")

In [170]:
food2 = food2['0'].tolist()
beauty2 = beauty2['0'].tolist()
app2 = app2['0'].tolist()

print("food : {} | beauty : {} | app : {}".format(len(food2), len(beauty2), len(app2)))

food : 212 | beauty : 131 | app : 40


In [171]:
# original category 정보 저장 
food_cat = pd.concat([pd.DataFrame(food), pd.DataFrame(food2)], axis=1)
beauty_cat = pd.concat([pd.DataFrame(beauty), pd.DataFrame(beauty2)], axis=1)
app_cat = pd.concat([pd.DataFrame(app), pd.DataFrame(app2)], axis=1)

# Naver Developers API 
* 참고 : [Parameters](https://developers.naver.com/docs/serviceapi/datalab/search/search.md#%EB%84%A4%EC%9D%B4%EB%B2%84-%ED%86%B5%ED%95%A9-%EA%B2%80%EC%83%89%EC%96%B4-%ED%8A%B8%EB%A0%8C%EB%93%9C-%EC%A1%B0%ED%9A%8C)
* 함정 : 비교 척도 **100** 기준 ㅠㅠ 

In [172]:
client_id = "???" ; client_secret = "???"

In [173]:
def keyword_search(client_id, client_secret, keyword) : 
    ages = {20:['3','4'], 30:['5','6'], 40:['7','8'], 50:['9','10'], 60:['11']} ; genders = ["f", "m"]
    df = pd.DataFrame()
    
    for age, age_list in ages.items() : 
        for gender in genders : 
            client_id = client_id ; client_secret = client_secret
            url = "https://openapi.naver.com/v1/datalab/search"
            body = {"startDate":"2018-01-01",
                    "endDate":"2019-12-31",
                    "timeUnit":"date",
                    "keywordGroups":[{"groupName":keyword,"keywords":[keyword]}],
                    "ages": age_list ,
                    "gender": gender}
            body_j = json.dumps(body)

            # get results 
            request = urllib.request.Request(url)
            request.add_header("X-Naver-Client-Id",client_id)
            request.add_header("X-Naver-Client-Secret",client_secret)
            request.add_header("Content-Type","application/json")
            response = urllib.request.urlopen(request, data=body_j.encode("utf-8"))
            rescode = response.getcode()

            if(rescode==200):
                response_body = response.read()
                scraped = response_body.decode('utf-8')
                
                try : 
                    result = json_normalize(json.loads(scraped)["results"][0]["data"])
                    result.insert(1, "age", age) ; result.insert(1, "sex", gender.upper()) 
                    df = df.append(result, ignore_index=True)
                except : 
                    pass

            else:
                print("Error Code:" + rescode) 
    
    try : 
        df.insert(3, "sm_cat", keyword)
    except : 
        pass 
    
    return df

In [179]:
def category_search(big_cat) : 
    df = pd.DataFrame()
    
    for sm_cat in tqdm(big_cat) : 
        try: 
            if not '/' in sm_cat : 
                tmp = keyword_search(client_id, client_secret, sm_cat)
                df = df.append(tmp, ignore_index=True)

            # '/' 나뉘어져 있는 경우 
            else : 
                tmp2 = pd.DataFrame()
                sm_cats = sm_cat.split('/')
                for sm in sm_cats : 
                    tmp = keyword_search(client_id, client_secret, sm)
                    tmp2 = tmp2.append(tmp, ignore_index=True)
                    
                try : 
                    tmp2 = tmp2.groupby(["period", "sex", "age"]).agg({'ratio' : [('ratio', np.mean)]}).reset_index()
                    tmp2.columns = tmp2.columns.get_level_values(0)
                    tmp2.insert(3, "sm_cat", sm_cat)
                except : 
                    pass

                df = df.append(tmp2, ignore_index=True)
        
        except urllib.request.HTTPError : 
            print("Too Many Requests - at {}".format(big_cat.index(sm_cat)))
            
    if big_cat[0] in food2 : df.insert(3, "big_cat", "식품")
    elif big_cat[0] in beauty2 : df.insert(3, "big_cat", "뷰티")
    elif big_cat[0] in app2 : df.insert(3, "big_cat", "냉난방가전")
    else : df.insert(3, "big_cat", "날씨")
        
    return df 

## 1. 날씨 정보 

In [65]:
weather_keywords = ["오늘날씨", "미세먼지", "오늘 비", "기상정보", "태풍정보"]
weather_search = category_search(weather_keywords)

100%|██████████| 5/5 [00:21<00:00,  4.28s/it]


In [66]:
print(weather_search.shape)
weather_search.head(10)

(33070, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,날씨,오늘날씨,10.69387
1,2018-01-02,F,20,날씨,오늘날씨,16.83434
2,2018-01-03,F,20,날씨,오늘날씨,13.63583
3,2018-01-04,F,20,날씨,오늘날씨,12.33925
4,2018-01-05,F,20,날씨,오늘날씨,12.84071
5,2018-01-06,F,20,날씨,오늘날씨,12.89477
6,2018-01-07,F,20,날씨,오늘날씨,19.33421
7,2018-01-08,F,20,날씨,오늘날씨,34.82321
8,2018-01-09,F,20,날씨,오늘날씨,22.80943
9,2018-01-10,F,20,날씨,오늘날씨,16.20355


In [67]:
weather_search.sm_cat.value_counts()

미세먼지    7300
오늘날씨    7300
기상정보    6928
태풍정보    6143
오늘 비    5399
Name: sm_cat, dtype: int64

In [68]:
weather_search.to_csv("weather_trend.csv", index=False, encoding='cp949')

## 2. 가전 

In [120]:
def to_original_category_name(searchdf, categorydf) : 
    categorydf.columns = ["original", "rename"]
    searchdf = searchdf.merge(categorydf, left_on="sm_cat", right_on="rename").drop(["sm_cat", "rename"], axis=1).rename(columns={'original' : 'sm_cat'})
    searchdf = searchdf[['period', 'sex', 'age', 'big_cat', 'sm_cat', 'ratio']]
    return searchdf

In [97]:
app_search = category_search(app2)

100%|██████████| 40/40 [01:55<00:00,  2.89s/it]


In [121]:
app_search2 = to_original_category_name(app_search, app_cat)

In [122]:
print(app_search2.shape)
app_search2.head()

(208971, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,냉난방가전,가열식 가습기,24.75961
1,2018-01-02,F,20,냉난방가전,가열식 가습기,27.64423
2,2018-01-03,F,20,냉난방가전,가열식 가습기,30.16826
3,2018-01-04,F,20,냉난방가전,가열식 가습기,35.8173
4,2018-01-05,F,20,냉난방가전,가열식 가습기,31.37019


In [123]:
app_search2.to_csv("app_trend.csv", index=False, encoding='cp949')

## 3. 뷰티 

In [130]:
beauty_search1 = category_search(beauty2[:90])

100%|██████████| 90/90 [02:14<00:00,  1.50s/it]


In [149]:
beauty_search2 = category_search(beauty2[90:])

100%|██████████| 41/41 [01:06<00:00,  1.63s/it]


In [151]:
beauty_search = pd.concat([beauty_search1, beauty_search2], axis=0).reset_index(drop=True)
beauty_search = to_original_category_name(beauty_search, beauty_cat)

print(beauty_search.shape)
beauty_search.head()

(718099, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,뷰티,기능성 링클케어 화장품,25.0
1,2018-01-02,F,20,뷰티,기능성 링클케어 화장품,25.0
2,2018-01-03,F,20,뷰티,기능성 링클케어 화장품,50.0
3,2018-01-04,F,20,뷰티,기능성 링클케어 화장품,58.33333
4,2018-01-05,F,20,뷰티,기능성 링클케어 화장품,8.33333


In [152]:
beauty_search.to_csv("beauty_trend.csv", index=False, encoding='cp949')

## 4. 식품 

In [175]:
food_search1 = category_search(food2[:70])

100%|██████████| 70/70 [05:11<00:00,  4.46s/it]


In [182]:
food_search2 = category_search(food2[70:137])

100%|██████████| 67/67 [04:19<00:00,  3.87s/it]


In [184]:
food_search3 = category_search(food2[137:])

 95%|█████████▍| 71/75 [05:20<00:09,  2.46s/it]

Too Many Requests - at 69
Too Many Requests - at 70


 97%|█████████▋| 73/75 [05:20<00:02,  1.27s/it]

Too Many Requests - at 71
Too Many Requests - at 72


100%|██████████| 75/75 [05:20<00:00,  4.28s/it]

Too Many Requests - at 73
Too Many Requests - at 74





In [192]:
food_search4 = category_search(food2[-6:])

100%|██████████| 6/6 [00:22<00:00,  3.72s/it]


In [194]:
food_search = pd.concat([food_search1,food_search2, food_search3, food_search4], axis=0).reset_index(drop=True)
food_search = to_original_category_name(food_search, food_cat)

print(food_search.shape)
food_search.head()

(1326674, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-06-18,F,20,식품,가공란,75.0
1,2018-06-19,F,20,식품,가공란,75.0
2,2018-07-03,F,20,식품,가공란,50.0
3,2019-05-28,F,20,식품,가공란,100.0
4,2018-06-18,M,20,식품,가공란,100.0


In [195]:
food_search.to_csv("food_trend.csv", index=False, encoding='cp949')

# Data Merge

## 1. 검색어 트렌드 

In [199]:
tqdm.pandas()
buy_full['date'] = buy_full['date'].progress_apply(lambda x : pd.to_datetime(str(x), format='%Y%m%d'))

100%|██████████| 2056899/2056899 [03:23<00:00, 10095.74it/s]


In [196]:
search = pd.concat([app_search2, beauty_search, food_search], axis=0).reset_index(drop=True)
search.shape

(2253744, 6)

In [208]:
search = search.rename(columns = {'period' : 'date'})
search['date'] = search['date'].progress_apply(lambda x : pd.to_datetime(str(x), format='%Y-%m-%d'))

100%|██████████| 2253744/2253744 [02:43<00:00, 13780.29it/s]


In [242]:
trend = buy_full.merge(search, on=["date", "sex", "age", "big_cat", "sm_cat"], how='outer').fillna(0)
trend = trend.sort_values(['date', 'sm_cat', 'sex', 'age'], ascending=True).reset_index(drop=True)

print(trend.shape)
trend.head()

(2568668, 7)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio
0,2018-01-01,F,20,식품,가공란,37.0,0.0
1,2018-01-01,F,30,식품,가공란,16.0,0.0
2,2018-01-01,F,40,식품,가공란,9.0,0.0
3,2018-01-01,F,50,식품,가공란,3.0,0.0
4,2018-01-01,M,20,식품,가공란,13.0,0.0


In [243]:
# 1. 판매량 존재하지 않는데, 검색량 존재하는 경우 
trend[trend["qty"] == 0]

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio
7,2018-01-01,F,20,냉난방가전,가스온수기,0.0,10.52631
8,2018-01-01,F,30,냉난방가전,가스온수기,0.0,19.04761
9,2018-01-01,F,40,냉난방가전,가스온수기,0.0,16.66666
10,2018-01-01,F,60,냉난방가전,가스온수기,0.0,14.28571
11,2018-01-01,M,20,냉난방가전,가스온수기,0.0,6.25000
...,...,...,...,...,...,...,...
2568627,2019-12-31,M,40,냉난방가전,황토매트,0.0,12.90322
2568640,2019-12-31,F,20,냉난방가전,휴대용 선풍기,0.0,0.08343
2568643,2019-12-31,F,50,냉난방가전,휴대용 선풍기,0.0,1.02214
2568646,2019-12-31,M,40,냉난방가전,휴대용 선풍기,0.0,0.26624


In [244]:
# 2. 판매량 존재하는데, 검색량 0인 경우 
trend[trend["ratio"] == 0]

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio
0,2018-01-01,F,20,식품,가공란,37.0,0.0
1,2018-01-01,F,30,식품,가공란,16.0,0.0
2,2018-01-01,F,40,식품,가공란,9.0,0.0
3,2018-01-01,F,50,식품,가공란,3.0,0.0
4,2018-01-01,M,20,식품,가공란,13.0,0.0
...,...,...,...,...,...,...,...
2568651,2019-12-31,F,50,식품,흰우유,49.0,0.0
2568652,2019-12-31,F,60,식품,흰우유,11.0,0.0
2568654,2019-12-31,M,30,식품,흰우유,98.0,0.0
2568656,2019-12-31,M,50,식품,흰우유,24.0,0.0


In [245]:
print(trend.shape)
trend.head().append(trend.tail())

(2568668, 7)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio
0,2018-01-01,F,20,식품,가공란,37.0,0.0
1,2018-01-01,F,30,식품,가공란,16.0,0.0
2,2018-01-01,F,40,식품,가공란,9.0,0.0
3,2018-01-01,F,50,식품,가공란,3.0,0.0
4,2018-01-01,M,20,식품,가공란,13.0,0.0
2568663,2019-12-31,M,20,냉난방가전,히터,8.0,25.60296
2568664,2019-12-31,M,30,냉난방가전,히터,22.0,24.16356
2568665,2019-12-31,M,40,냉난방가전,히터,38.0,25.11627
2568666,2019-12-31,M,50,냉난방가전,히터,23.0,25.18518
2568667,2019-12-31,M,60,냉난방가전,히터,10.0,21.95121


In [246]:
trend.to_csv("trendsearch.csv", index=False, encoding='cp949')

In [12]:
trend = pd.read_csv("trendsearch.csv", encoding='cp949')
print(trend.shape)
trend.head()

(2568668, 7)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio
0,2018-01-01,F,20,식품,가공란,37.0,0.0
1,2018-01-01,F,30,식품,가공란,16.0,0.0
2,2018-01-01,F,40,식품,가공란,9.0,0.0
3,2018-01-01,F,50,식품,가공란,3.0,0.0
4,2018-01-01,M,20,식품,가공란,13.0,0.0


In [22]:
trend[(trend.big_cat == '뷰티') & (trend.sm_cat == '핸드크림')]

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio
3139,2018-01-01,F,20,뷰티,핸드크림,92.0,38.02008
3140,2018-01-01,F,30,뷰티,핸드크림,93.0,30.28322
3141,2018-01-01,F,40,뷰티,핸드크림,55.0,21.22905
3142,2018-01-01,F,50,뷰티,핸드크림,21.0,14.00000
3143,2018-01-01,F,60,뷰티,핸드크림,3.0,20.00000
...,...,...,...,...,...,...,...
2568366,2019-12-31,M,20,뷰티,핸드크림,13.0,41.30982
2568367,2019-12-31,M,30,뷰티,핸드크림,29.0,30.29556
2568368,2019-12-31,M,40,뷰티,핸드크림,18.0,24.12280
2568369,2019-12-31,M,50,뷰티,핸드크림,8.0,30.37974


In [19]:
trend.sm_cat.unique()

array(['가공란', '가스온수기', '가열식 가습기', '가자미', '갈비/찜/바비큐용 돈육', '갈비용 우육', '갈치',
       '감/홍시', '감귤/한라봉/오렌지', '감마리놀렌산 영양제', '감말랭이', '감자', '갓김치', '건강즙',
       '건강즙/녹용', '건대추', '건망고', '건바나나', '건어물 건새우', '건어물 노가리', '건어물 마른오징어',
       '건어물 멸치', '건어물 쥐포', '건어물 진미채', '건어물 황태', '건자두', '건포도', '게장류',
       '견과류', '견과류 땅콩', '견과류 마카다미아', '견과류 밤', '견과류 잣/은행', '견과류 카카오닙스',
       '견과류 캐슈넛', '견과류 피스타치오', '견과류 호두', '계란', '고등어', '고추/피망/파프리카', '곡물차',
       '공기정화 용품', '공기청정기', '곶감/반건시', '과실차', '과일류', '과일세트', '과채 음료/주스',
       '구이/수육용 돈육', '국내산 돈육', '굴 생물', '굴비/조기', '글루코사민/키토산 영양제',
       '기능성 링클케어 화장품', '기능성 모공관리 화장품', '기능성 아이케어 화장품', '기능성 영양보습 화장품',
       '기능성 트러블케어 화장품', '기능성 화이트닝 화장품', '기능성 화장품 세트', '기초 화장용 로션',
       '기초 화장용 미스트', '기초 화장용 스킨', '기초 화장용 에센스', '기초 화장용 오일/앰플',
       '기초 화장용 크림', '기타 농산물', '기타 주스류', '기타 한방/환제품', '김치류', '꽃게', '나물',
       '낙지', '남성 로션', '남성 메이크업', '남성 선케어', '남성 세트', '남성 쉐이빙', '남성 스킨',
       '남성 에센스', '남성 크림', '남성 클렌징', '남성향수', '냉풍기', '네일 메이크업 용품',
       '네일관리 소품', '네일리

## 2. 날씨 트렌드 

In [234]:
weather_search = weather_search.rename(columns = {'period' : 'date'})
weather_search['date'] = weather_search['date'].progress_apply(lambda x : pd.to_datetime(str(x), format='%Y-%m-%d'))

100%|██████████| 33070/33070 [00:03<00:00, 10952.54it/s]


In [248]:
for w_trend in weather_search.sm_cat.unique() : 
    tmp = weather_search[weather_search["sm_cat"] == w_trend][['date', 'sex', 'age', 'ratio']].rename(columns = {'ratio' : w_trend})
    trend = trend.merge(tmp, on=['date', 'sex', 'age'], how='left')

In [249]:
trend = trend.sort_values(['date', 'sm_cat', 'sex', 'age'], ascending=True).reset_index(drop=True)
print(trend.shape)
trend.head().append(trend.tail())

(2568668, 12)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보
0,2018-01-01,F,20,식품,가공란,37.0,0.0,10.69387,4.09137,,,0.0444
1,2018-01-01,F,30,식품,가공란,16.0,0.0,12.42969,6.35294,,0.51282,0.42417
2,2018-01-01,F,40,식품,가공란,9.0,0.0,13.02249,4.88037,,0.4065,
3,2018-01-01,F,50,식품,가공란,3.0,0.0,11.6908,4.5335,1.21951,0.90909,
4,2018-01-01,M,20,식품,가공란,13.0,0.0,11.23032,3.61296,0.3891,2.22222,0.0406
2568663,2019-12-31,M,20,냉난방가전,히터,8.0,25.60296,28.46931,1.27392,1.36186,7.4074,0.0812
2568664,2019-12-31,M,30,냉난방가전,히터,22.0,24.16356,31.70578,1.41621,,3.63636,
2568665,2019-12-31,M,40,냉난방가전,히터,38.0,25.11627,37.03963,1.60475,,4.90956,0.17814
2568666,2019-12-31,M,50,냉난방가전,히터,23.0,25.18518,49.94212,2.41013,,4.81481,0.10649
2568667,2019-12-31,M,60,냉난방가전,히터,10.0,21.95121,67.48388,4.62476,,19.13043,0.69605


In [250]:
trend = trend.fillna(0)

In [251]:
trend.head().append(trend.tail())

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty,ratio,오늘날씨,미세먼지,오늘 비,기상정보,태풍정보
0,2018-01-01,F,20,식품,가공란,37.0,0.0,10.69387,4.09137,0.0,0.0,0.0444
1,2018-01-01,F,30,식품,가공란,16.0,0.0,12.42969,6.35294,0.0,0.51282,0.42417
2,2018-01-01,F,40,식품,가공란,9.0,0.0,13.02249,4.88037,0.0,0.4065,0.0
3,2018-01-01,F,50,식품,가공란,3.0,0.0,11.6908,4.5335,1.21951,0.90909,0.0
4,2018-01-01,M,20,식품,가공란,13.0,0.0,11.23032,3.61296,0.3891,2.22222,0.0406
2568663,2019-12-31,M,20,냉난방가전,히터,8.0,25.60296,28.46931,1.27392,1.36186,7.4074,0.0812
2568664,2019-12-31,M,30,냉난방가전,히터,22.0,24.16356,31.70578,1.41621,0.0,3.63636,0.0
2568665,2019-12-31,M,40,냉난방가전,히터,38.0,25.11627,37.03963,1.60475,0.0,4.90956,0.17814
2568666,2019-12-31,M,50,냉난방가전,히터,23.0,25.18518,49.94212,2.41013,0.0,4.81481,0.10649
2568667,2019-12-31,M,60,냉난방가전,히터,10.0,21.95121,67.48388,4.62476,0.0,19.13043,0.69605


In [252]:
trend.to_csv("trend_with_weather.csv", index=False, encoding='cp949')