In [1]:
#from PyNaver import Datalab
import pandas as pd
import urllib.request
from datetime import datetime, timedelta
import json
import sys
import os
import warnings
import re
from pathlib import Path
warnings.filterwarnings(action='ignore')

today = str(datetime.now().date())
recent_8_days = str(datetime.now().date() - timedelta(weeks=1,days=1))
regex = '\(.*\)|\s-\s'

In [2]:
BASE_DIR = Path().resolve().parent/'Data_Preprocessing'

In [3]:
ratio_df = pd.read_csv(BASE_DIR/'sub_day_search.csv')
recent_search = pd.read_csv(BASE_DIR/'recent_search.csv')

In [4]:
days = pd.read_csv(BASE_DIR/'38_day.csv',encoding='euc-kr')
days = days[~days['기업명'].str.contains('스팩')]
days["청약일"] = days["청약일"].str.replace(pat=r'[^\w]', repl=r'', regex=True)
days['청약일'] = pd.to_datetime(days['청약일'])
days = days.reset_index()
days.drop(['수요예측일','Unnamed: 0','index'], axis = 1,inplace = True)

for i in range(len(days['기업명'])):
    days['기업명'][i] = re.sub(regex,'',days['기업명'][i])
days = days[days['청약일'] > '2016-01-02']
days = days[~days['기업명'].isin(['코썬바이오'])]
days.reset_index(drop=True,inplace=True)

In [5]:
class NaverDataLabOpenAPI():
    """
    네이버 데이터랩 오픈 API 컨트롤러 클래스
    """
    def __init__(self, client_id, client_secret):
        """
        인증키 설정 및 검색어 그룹 초기화
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.keywordGroups = []
        self.url = "https://openapi.naver.com/v1/datalab/search"
    def add_keyword_groups(self, group_dict):
        """
        검색어 그룹 추가
        """
        keyword_gorup = {
            'groupName': group_dict['groupName'],
            'keywords': group_dict['keywords']
        }
        
        self.keywordGroups.append(keyword_gorup)
        #print(f">>> Num of keywordGroups: {len(self.keywordGroups)}")
        
    def get_data(self, startDate, endDate, timeUnit, device, ages, gender):
        """
        요청 결과 반환
        timeUnit - 'date', 'week', 'month'
        device - None, 'pc', 'mo'
        ages = [], ['1' ~ '11']
        gender = None, 'm', 'f'
        """
        # Request body
        body = json.dumps({
            "startDate": startDate,
            "endDate": endDate,
            "timeUnit": timeUnit,
            "keywordGroups": self.keywordGroups,
            "device": device,
            "ages": ages,
            "gender": gender
        }, ensure_ascii=False)
        
        # Results
        request = urllib.request.Request(self.url)
        request.add_header("X-Naver-Client-Id",self.client_id)
        request.add_header("X-Naver-Client-Secret",self.client_secret)
        request.add_header("Content-Type","application/json")
        response = urllib.request.urlopen(request, data=body.encode("utf-8"))
        rescode = response.getcode()
        if(rescode==200):
            # Json Result
            result = json.loads(response.read())
            
            df = pd.DataFrame(result['results'][0]['data'])[['period']]
            for i in range(len(self.keywordGroups)):
                #df['period'] = df.apply(lambda r: [df['period'].values], axis=1, result_type='expand')
                tmp = pd.DataFrame(result['results'][i]['data'])
                tmp = tmp.rename(columns={'ratio': result['results'][i]['title']})
                df = pd.merge(df, tmp, how='left', on=['period'])
            self.df = df.rename(columns={'period': '날짜'})
            self.df['날짜'] = pd.to_datetime(self.df['날짜'])
            
        else:
            print("Error Code:" + rescode)
            
        return self.df

In [6]:
today = str(datetime.now().date())
recent_8_days = str(datetime.now().date() - timedelta(weeks=1,days=1))
# API 인증 정보 설정
client_id = "IasLV39qud8JnmvpzcB0"
client_secret = "1Cax7mYUKK"

for i in range(len(days['청약일'])):
    if days['기업명'][i] not in ratio_df['기업명'].values:
        name = days['기업명'][i]
        sub_date = str(days['청약일'][i].date())
        day_after = str(days['청약일'][i].date() + timedelta(days=1))
    else:
        continue
        
    keyword_group_set = {
    'keyword_group_1': {'groupName': '검색량', 'keywords': [name]}
                    }
    # 요청 파라미터 설정
    startDate = sub_date
    endDate = day_after
    timeUnit = 'date'
    device = ''
    ages = []
    gender = ''

# 데이터 프레임 정의
    naver = NaverDataLabOpenAPI(client_id=client_id, client_secret=client_secret)

    naver.add_keyword_groups(keyword_group_set['keyword_group_1'])

    df = naver.get_data(startDate, endDate, timeUnit, device, ages, gender)
    df['기업명'] = name
    ratio_df = pd.concat([ratio_df,df])

In [7]:
#최근 8일동안 검색량 
client_id = 'IasLV39qud8JnmvpzcB0'
client_secret = '1Cax7mYUKK'
for i in range(len(days['청약일'])):
    if days['기업명'][i] not in ratio_df['기업명'].values:
        name = days['기업명'][i]
    else:
        continue
    url = "https://openapi.naver.com/v1/datalab/search"
    body = "{\"startDate\":\""+recent_8_days+"\",\"endDate\":\""+today+"\",\"timeUnit\":\"date\",\"keywordGroups\":[{\"groupName\":\""+name+"\",\"keywords\":[\""+name+"\"]}]}";
    requested = urllib.request.Request(url)
    requested.add_header("X-Naver-Client-Id", client_id)
    requested.add_header("X-Naver-Client-Secret", client_secret)
    requested.add_header("Content-Type", "application/json")
    response = urllib.request.urlopen(requested, data=body.encode("utf-8"))
    rescode = response.getcode()

    if(rescode==200):
        response_body = response.read()
        output_data = response_body.decode('utf-8')
    else:
        print('Error code:'+ rescode)
        continue


    result = json.loads(output_data)
    
    date = [a['period'] for a in result['results'][0]['data']]
    corp_ratio = [a['ratio'] for a in result['results'][0]['data']]

    ratio = pd.DataFrame({'date':date, 
                  '기업검색량':corp_ratio,
                '기업명':name})
    recent_search = pd.concat([recent_search,ratio])

In [8]:
#청약일2일, 최근 일주일간 검색량 csv
ratio_df.to_csv(BASE_DIR/'sub_day_search.csv',index=False)
recent_search.to_csv(BASE_DIR/'recent_search.csv', index=False)

In [9]:
#ratio_df.drop(['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1'],axis=1,inplace=True)
sub_day_search = ratio_df.groupby('기업명').mean()
sub_day_search.rename(columns={'검색량':'청약일검색량'},inplace=True)
sub_day_search

Unnamed: 0_level_0,청약일검색량
기업명,Unnamed: 1_level_1
ING생명보험,57.449205
LG에너지솔루션,85.464685
NH올원리츠,57.859765
SK리츠,63.899235
SK바이오사이언스,74.984995
...,...
현대중공업,73.482235
호전실업,60.733840
화승엔터프라이즈,61.744120
휴네시온,70.359845


In [10]:
recent_search = recent_search.groupby('기업명').median()
#recent_search.drop(['Unnamed: 0'],axis=1,inplace=True)
recent_search.rename(columns={'기업검색량':'최근일주일검색량'},inplace=True)
recent_search

Unnamed: 0_level_0,Unnamed: 0,최근일주일검색량
기업명,Unnamed: 1_level_1,Unnamed: 2_level_1
BGF리테일,3.0,65.37931
CJ헬로비전,3.0,73.84341
GKL,3.0,77.66393
GMB코리아,3.0,58.44155
GS리테일,3.0,48.65889
...,...,...
휴비스,3.0,95.90973
휴젤,3.0,80.56253
흥국,3.0,94.04761
흥국에프엔비,3.0,55.03838


In [11]:
merged = pd.merge(sub_day_search,recent_search,how='left',on='기업명')
merged.drop(['Unnamed: 0'],axis=1,inplace=True)
merged.reset_index(inplace=True)
merged

Unnamed: 0,기업명,청약일검색량,최근일주일검색량
0,ING생명보험,57.449205,100.00000
1,LG에너지솔루션,85.464685,79.98288
2,NH올원리츠,57.859765,72.54098
3,SK리츠,63.899235,69.37922
4,SK바이오사이언스,74.984995,75.06483
...,...,...,...
489,현대중공업,73.482235,74.68610
490,호전실업,60.733840,77.94117
491,화승엔터프라이즈,61.744120,91.08635
492,휴네시온,70.359845,76.57992


In [12]:
import math
score_list = []
for i in range(len(merged['청약일검색량'])):
    sub_search_amt = round(merged['청약일검색량'][i],3)
    weekly_search_amt = round(merged['최근일주일검색량'][i],3)
    score = math.log10(sub_search_amt) - math.log10(weekly_search_amt)
    score_list.append(score)
merged = merged.assign(비정상검색량지수 = score_list)
merged['비정상검색량지수'] = round(merged['비정상검색량지수'],3)
merged = merged.dropna(axis=0).reset_index(drop=True)
merged.drop(['청약일검색량','최근일주일검색량'], axis = 1,inplace = True)
#merged.to_csv(BASE_DIR/'abnormal_merged.csv')

In [28]:
merged.columns = ['cor_name','search_amt']
final_df = pd.read_csv('final_data_per2.csv')
final_df = pd.merge(final_df,merged,how='left',on='cor_name')
final_df.drop(['Unnamed: 0','search_amt_x','search_amt_y'], axis = 1,inplace = True)
final_df.fillna({'search_amt':final_df['search_amt'].mean()},inplace=True)
final_df.to_csv('final_data_per2.csv')

In [29]:
final_df = pd.read_csv('final_data_per2.csv')
final_df

Unnamed: 0.1,Unnamed: 0,cor_name,sales,profit,shares_to_pub,sub_rate,cor_rate,obligation,trend,score,...,market_type,listed_date,current_price,offer_price,h_exp_offer_price,l_exp_offer_price,target,st_price,end_price,search_amt
0,0,에이치피에스피,61174,17658,1.00,1159.05,1511.36,42.54,0.75,3,...,0,20220715,52700,25000,23000,25000,43250,50000,100.00,0.097000
1,1,영창케미칼,60760,2506,0.83,682.13,1616.27,5.92,0.67,12,...,0,20220714,14250,18600,15000,18600,16450,18400,-1.08,-0.001000
2,2,넥스트칩,10383,-13496,1.00,1727.38,1623.41,11.58,1.00,4,...,0,20220701,12000,13000,9900,11600,14300,17150,31.92,-0.113000
3,3,위니아에이드,418527,24895,1.00,111.26,955.00,1.39,0.88,0,...,0,20220623,9330,16200,14200,16200,11000,14600,-9.88,-0.110000
4,4,레이저쎌,3879,-7665,1.00,1845.11,1442.95,12.43,0.89,2,...,0,20220624,12400,16000,12000,14000,17350,20600,28.75,-0.137000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,386,데브시스터즈,61303,22326,1.00,285.28,651.66,29.98,1.00,-6,...,0,20141006,-,53000,43000,50000,61000,71000,33.96,0.026409
387,387,신화콘텍,51059,6446,1.00,99.30,441.99,10.37,0.67,5,...,0,20140808,-,9100,8100,9100,7100,8190,-10.00,0.026409
388,388,덕신하우징,104434,11718,1.00,899.07,650.72,59.86,0.83,12,...,0,20140801,-,13000,9600,11000,20950,21650,66.54,0.026409
389,389,창해에탄올,69032,14887,1.00,675.79,578.20,59.50,0.80,6,...,0,20140730,-,8300,6000,6900,14600,16600,100.00,0.026409
