# 검색어 트렌드 크롤링 
* https://datalab.naver.com/keyword/trendSearch.naver 
* **sm_cat** 에 해당하는 검색어 트렌드 값 산출 
* 카테고리명이 `/` 로 나뉘어 있는 경우, 각 키워드를 따로 검색한 뒤 평균값 사용

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import sys
import urllib.request

import pandas as pd 
import numpy as np 
from tqdm import tqdm

import dataload

import re
from bs4 import BeautifulSoup

import json
from pandas.io.json import json_normalize

In [2]:
# Load the purchase records, then list the distinct sub-categories (sm_cat)
# found under each of the three top-level categories (big_cat).
buy_full = dataload.load_buy()
print(buy_full.shape)

food = buy_full.loc[buy_full["big_cat"] == "식품", "sm_cat"].unique().tolist()
beauty = buy_full.loc[buy_full["big_cat"] == "뷰티", "sm_cat"].unique().tolist()
app = buy_full.loc[buy_full["big_cat"] == "냉난방가전", "sm_cat"].unique().tolist()
print("food : {} | beauty : {} | app : {}".format(len(food), len(beauty), len(app)))

(2056899, 6)
food : 212 | beauty : 131 | app : 40


In [3]:
# Preview the first 10 rows of the purchase data.
buy_full.head(10)

Unnamed: 0,date,sex,age,big_cat,sm_cat,qty
0,20180101,F,20,식품,가공란,37
1,20180101,F,30,식품,가공란,16
2,20180101,F,40,식품,가공란,9
3,20180101,F,50,식품,가공란,3
4,20180101,M,20,식품,가공란,13
5,20180101,M,30,식품,가공란,6
6,20180101,M,40,식품,가공란,6
7,20180101,F,20,냉난방가전,가열식 가습기,1
8,20180101,F,40,냉난방가전,가열식 가습기,3
9,20180101,M,20,냉난방가전,가열식 가습기,1


In [73]:
# Dump each category list to its own cp949-encoded CSV (index column included,
# so the values end up under a column named "0").
# NOTE(review): the following cell is labelled as a category edit, so these
# files are presumably hand-edited before being read back; re-running this
# cell would overwrite such edits -- confirm.
for _names, _csv_path in (
    (food, "food_category.csv"),
    (beauty, "beauty_category.csv"),
    (app, "app_category.csv"),
):
    pd.DataFrame(_names).to_csv(_csv_path, encoding="cp949")

In [92]:
# Read the (edited) category files back in, one DataFrame per top-level category.
food2, beauty2, app2 = (
    pd.read_csv(csv_name, encoding="cp949")
    for csv_name in ("food_category.csv", "beauty_category.csv", "app_category.csv")
)

In [94]:
# Keep only the keyword column (named "0" in the CSV) as plain Python lists.
food2, beauty2, app2 = (frame['0'].tolist() for frame in (food2, beauty2, app2))

print("food : {} | beauty : {} | app : {}".format(len(food2), len(beauty2), len(app2)))

food : 212 | beauty : 131 | app : 40


In [119]:
# Keep the original category names side by side with the names read back from
# the CSV files (column 0: original, column 1: read-back), paired by position.
food_cat, beauty_cat, app_cat = (
    pd.concat([pd.DataFrame(original), pd.DataFrame(renamed)], axis=1)
    for original, renamed in ((food, food2), (beauty, beauty2), (app, app2))
)

# Naver Developers API 
* 참고 : [Parameters](https://developers.naver.com/docs/serviceapi/datalab/search/search.md#%EB%84%A4%EC%9D%B4%EB%B2%84-%ED%86%B5%ED%95%A9-%EA%B2%80%EC%83%89%EC%96%B4-%ED%8A%B8%EB%A0%8C%EB%93%9C-%EC%A1%B0%ED%9A%8C)
* 함정 : `ratio` 는 절대 검색량이 아니라, 요청(키워드·연령·성별 조합)별 최댓값을 **100** 으로 둔 상대값 → 서로 다른 요청 간의 값은 직접 비교 불가 ㅠㅠ

In [140]:
# Naver Open API credentials. Environment variables take precedence so the
# values can be supplied without editing the notebook.
# NOTE(review): the literal fallbacks below are secrets committed to source;
# rotate them and drop the fallbacks once the environment variables are set.
client_id = os.environ.get("NAVER_CLIENT_ID", "x6_VTrxuFQF5fGLEGlTQ")
client_secret = os.environ.get("NAVER_CLIENT_SECRET", "Q0us4DPwgg")

In [63]:
def keyword_search(client_id, client_secret, keyword):
    """Fetch daily Naver DataLab search-trend ratios for a single keyword.

    One request is sent per (age group, gender) combination, each covering
    2018-01-01 through 2019-12-31 with a daily time unit.

    Parameters
    ----------
    client_id, client_secret : str
        Naver Open API credentials, sent as request headers.
    keyword : str
        Search keyword; used as both the group name and its only keyword,
        and stored in the ``sm_cat`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``period, sex, age, sm_cat, ratio`` with one row per data
        point returned by the API. An empty DataFrame (no columns) is
        returned when no request produced any data.

    Raises
    ------
    urllib.error.HTTPError
        Propagated from ``urlopen`` when the API answers with an error
        status, so the caller can decide how to react.
    """
    # Codes sent in the request's "ages" field, keyed by the age label that
    # is written to the output's "age" column.
    ages = {20: ['3', '4'], 30: ['5', '6'], 40: ['7', '8'], 50: ['9', '10'], 60: ['11']}
    genders = ["f", "m"]
    url = "https://openapi.naver.com/v1/datalab/search"
    frames = []

    for age, age_list in ages.items():
        for gender in genders:
            body = {"startDate": "2018-01-01",
                    "endDate": "2019-12-31",
                    "timeUnit": "date",
                    "keywordGroups": [{"groupName": keyword, "keywords": [keyword]}],
                    "ages": age_list,
                    "gender": gender}
            payload = json.dumps(body).encode("utf-8")

            request = urllib.request.Request(url)
            request.add_header("X-Naver-Client-Id", client_id)
            request.add_header("X-Naver-Client-Secret", client_secret)
            request.add_header("Content-Type", "application/json")

            # The context manager closes the connection even on early exit.
            with urllib.request.urlopen(request, data=payload) as response:
                rescode = response.getcode()
                if rescode != 200:
                    # rescode is an int; convert before concatenating.
                    print("Error Code:" + str(rescode))
                    continue
                scraped = response.read().decode('utf-8')

            # Best effort: a malformed or unexpected payload skips this
            # (age, gender) combination instead of aborting the keyword.
            try:
                records = json.loads(scraped)["results"][0]["data"]
                result = pd.DataFrame(records)
            except (ValueError, KeyError, IndexError, TypeError):
                continue
            if result.empty:
                continue

            # Both inserts target position 1, so "sex" ends up before "age".
            result.insert(1, "age", age)
            result.insert(1, "sex", gender.upper())
            frames.append(result)

    if not frames:
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    df.insert(3, "sm_cat", keyword)
    return df

In [156]:
def category_search(big_cat):
    """Collect search-trend ratios for every keyword in a category list.

    Each entry is looked up with ``keyword_search`` using the module-level
    ``client_id`` / ``client_secret``. An entry containing ``/`` is split
    into its parts, each part is searched on its own, and the ratios are
    averaged per (period, sex, age); the averaged rows keep the full
    ``/``-joined name in ``sm_cat``.

    Parameters
    ----------
    big_cat : sequence of str
        Keywords to search. Its first element decides the ``big_cat`` label:
        membership in the module-level ``food2``, ``beauty2`` or ``app2``
        lists, otherwise the weather label.

    Returns
    -------
    pandas.DataFrame
        Columns ``period, sex, age, big_cat, sm_cat, ratio``. An empty
        DataFrame is returned when ``big_cat`` is empty or no keyword
        produced any data.
    """
    # Nothing to search: avoid indexing big_cat[0] on an empty sequence.
    if len(big_cat) == 0:
        return pd.DataFrame()

    frames = []
    for position, sm_cat in enumerate(tqdm(big_cat)):
        try:
            if '/' not in sm_cat:
                frames.append(keyword_search(client_id, client_secret, sm_cat))
            else:
                # Search each part separately, then average the ratios.
                parts = [keyword_search(client_id, client_secret, sm)
                         for sm in sm_cat.split('/')]
                parts = [part for part in parts if not part.empty]
                if parts:
                    merged = pd.concat(parts, ignore_index=True)
                    merged = merged.groupby(["period", "sex", "age"],
                                            as_index=False)["ratio"].mean()
                    merged.insert(3, "sm_cat", sm_cat)
                    frames.append(merged)
        except urllib.request.HTTPError:
            # Report where the run failed and carry on with the next keyword.
            print("Too Many Requests - at {}".format(position))

    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame()
    df = pd.concat(frames, ignore_index=True)

    first = big_cat[0]
    if first in food2:
        label = "식품"
    elif first in beauty2:
        label = "뷰티"
    elif first in app2:
        label = "냉난방가전"
    else:
        label = "날씨"
    df.insert(3, "big_cat", label)

    return df

## 1. 날씨 정보 

In [65]:
# Weather-related search terms. category_search labels a list "날씨" when its
# first entry is in none of the product category lists.
weather_keywords = [
    "오늘날씨",
    "미세먼지",
    "오늘 비",
    "기상정보",
    "태풍정보",
]
weather_search = category_search(weather_keywords)

100%|██████████| 5/5 [00:21<00:00,  4.28s/it]


In [66]:
# Report the result's dimensions, then preview its first 10 rows.
print(weather_search.shape)
weather_search.head(10)

(33070, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,날씨,오늘날씨,10.69387
1,2018-01-02,F,20,날씨,오늘날씨,16.83434
2,2018-01-03,F,20,날씨,오늘날씨,13.63583
3,2018-01-04,F,20,날씨,오늘날씨,12.33925
4,2018-01-05,F,20,날씨,오늘날씨,12.84071
5,2018-01-06,F,20,날씨,오늘날씨,12.89477
6,2018-01-07,F,20,날씨,오늘날씨,19.33421
7,2018-01-08,F,20,날씨,오늘날씨,34.82321
8,2018-01-09,F,20,날씨,오늘날씨,22.80943
9,2018-01-10,F,20,날씨,오늘날씨,16.20355


In [67]:
# Count rows per keyword to see how much data each one returned.
weather_search.sm_cat.value_counts()

미세먼지    7300
오늘날씨    7300
기상정보    6928
태풍정보    6143
오늘 비    5399
Name: sm_cat, dtype: int64

In [68]:
# Save the weather keyword trends as a cp949-encoded CSV without the index column.
weather_search.to_csv("weather_trend.csv", index=False, encoding='cp949')

## 2. 가전 

In [120]:
def to_original_category_name(searchdf, categorydf):
    """Replace searched keyword names in ``sm_cat`` with the original names.

    Parameters
    ----------
    searchdf : pandas.DataFrame
        Search results with columns ``period, sex, age, big_cat, sm_cat,
        ratio``; ``sm_cat`` holds the names that were actually searched.
    categorydf : pandas.DataFrame
        Two-column table: first column the original category name, second
        column the name used for searching. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``period, sex, age, big_cat, sm_cat, ratio`` where ``sm_cat``
        is the original name. The merge is an inner join, so rows whose
        ``sm_cat`` has no match in ``categorydf`` are dropped.
    """
    # Rename on a copy so the caller's DataFrame keeps its own column labels.
    lookup = categorydf.copy()
    lookup.columns = ["original", "rename"]

    merged = searchdf.merge(lookup, left_on="sm_cat", right_on="rename")
    merged = merged.drop(["sm_cat", "rename"], axis=1)
    merged = merged.rename(columns={'original': 'sm_cat'})
    return merged[['period', 'sex', 'age', 'big_cat', 'sm_cat', 'ratio']]

In [97]:
# Fetch search trends for the appliance keywords read back from app_category.csv.
app_search = category_search(app2)

100%|██████████| 40/40 [01:55<00:00,  2.89s/it]


In [121]:
# Map the searched keywords back to the original sm_cat names.
app_search2 = to_original_category_name(app_search, app_cat)

In [122]:
# Report the result's dimensions, then preview its first rows.
print(app_search2.shape)
app_search2.head()

(208971, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,냉난방가전,가열식 가습기,24.75961
1,2018-01-02,F,20,냉난방가전,가열식 가습기,27.64423
2,2018-01-03,F,20,냉난방가전,가열식 가습기,30.16826
3,2018-01-04,F,20,냉난방가전,가열식 가습기,35.8173
4,2018-01-05,F,20,냉난방가전,가열식 가습기,31.37019


In [123]:
# Save the appliance trends as a cp949-encoded CSV without the index column.
app_search2.to_csv("app_trend.csv", index=False, encoding='cp949')

## 3. 뷰티 

In [130]:
# Fetch trends for the first 90 beauty keywords; the rest are fetched separately.
# NOTE(review): presumably split in two to stay under an API request limit -- confirm.
beauty_search1 = category_search(beauty2[:90])

100%|██████████| 90/90 [02:14<00:00,  1.50s/it]


In [149]:
# Fetch trends for the beauty keywords from position 90 onwards.
beauty_search2 = category_search(beauty2[90:])

100%|██████████| 41/41 [01:06<00:00,  1.63s/it]


In [151]:
# Stack the two beauty batches under a fresh 0..n-1 index, then map the
# searched keywords back to the original sm_cat names.
beauty_search = to_original_category_name(
    pd.concat([beauty_search1, beauty_search2], ignore_index=True),
    beauty_cat,
)

print(beauty_search.shape)
beauty_search.head()

(718099, 6)


Unnamed: 0,period,sex,age,big_cat,sm_cat,ratio
0,2018-01-01,F,20,뷰티,기능성 링클케어 화장품,25.0
1,2018-01-02,F,20,뷰티,기능성 링클케어 화장품,25.0
2,2018-01-03,F,20,뷰티,기능성 링클케어 화장품,50.0
3,2018-01-04,F,20,뷰티,기능성 링클케어 화장품,58.33333
4,2018-01-05,F,20,뷰티,기능성 링클케어 화장품,8.33333


In [152]:
# Save the beauty trends as a cp949-encoded CSV without the index column.
beauty_search.to_csv("beauty_trend.csv", index=False, encoding='cp949')

## 4. 식품 

In [157]:
# Fetch trends for the first 30 food keywords only.
food_search1 = category_search(food2[:30])

100%|██████████| 30/30 [01:11<00:00,  2.37s/it]
