In [314]:
import pandas as pd
import numpy as np
import json
import re

In [346]:
def open_files(date):
    """Load the four JSON inputs for one day of February 2018.

    Parameters
    ----------
    date : str
        Day-of-month text (e.g. ``'14'``) that is spliced into the
        ``2018-02-<date>`` file-name prefix.

    Returns
    -------
    tuple
        ``(focus, krx_code, INCREASE, price)``: the decoded contents of the
        focus-group file, ``krx_code.json``, the opening-increase file and
        the price file, in that order.
    """
    day = "2018-02-" + date

    def _read_json(path):
        # Every input is read as UTF-8 encoded JSON.
        with open(path, 'r', encoding='UTF-8') as handle:
            return json.load(handle)

    # Files are read lazily in exactly this order.
    paths = (
        day + "_focus/" + day + "_focus_group.json",
        "krx_code.json",
        day + "_opening_increase.json",
        day + "_price_and_everything.json",
    )
    focus, krx_code, increase, price = (_read_json(path) for path in paths)
    return focus, krx_code, increase, price

In [385]:
def get_time_disc(date):
    """Return the discussion-snapshot file paths for one day.

    Paths are generated for every ten-minute mark from 09:00 through 15:50
    and then sliced to entries 1..39, i.e. 09:10 through 15:30 inclusive.

    Parameters
    ----------
    date : str
        Day-of-month text (e.g. ``'14'``) inserted into the path prefix.

    Returns
    -------
    list of str
        39 paths of the form
        ``2018-02-<date>_focus/discussion_2018-02-<date> HH:M0.json``.
    """
    stem = "2018-02-" + date + "_focus/discussion_2018-02-" + date
    every_slot = [
        stem + " " + str(hour).zfill(2) + ":" + str(tens) + "0.json"
        for hour in range(9, 16)
        for tens in range(6)
    ]
    # Drop the 09:00 slot at the front and the 15:40 / 15:50 slots at the end.
    return every_slot[1:40]

In [386]:
def df_list_dis(date):
    """Load every discussion snapshot of one day as a DataFrame.

    Parameters
    ----------
    date : str
        Day-of-month text forwarded to ``get_time_disc``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per path returned by ``get_time_disc(date)``, in the same
        order, with columns ``name, time, post_num, unique_id, click, like,
        dislike``.
    """
    # Column order used when building the frame from the decoded JSON ...
    load_order = ["post_num", "unique_id", "click", "like", "dislike",
                  "name", "time"]
    # ... and the order the rest of the notebook expects.
    final_order = ["name", "time", "post_num", "unique_id", "click",
                   "like", "dislike"]

    def _load_snapshot(path):
        with open(path, 'r', encoding='UTF-8') as handle:
            records = json.load(handle)
        return pd.DataFrame(records, columns=load_order)[final_order]

    return [_load_snapshot(path) for path in get_time_disc(date)]
    

In [387]:
def list_to_df(date, df_list, column_names, key_list, increase=None):
    """Join every third snapshot with three earlier snapshots, side by side.

    For each position ``ind`` in ``5, 8, 11, ...`` the frame ``df_list[ind]``
    is inner-merged on ``key_list`` with ``df_list[ind - 5]``,
    ``df_list[ind - 4]`` and ``df_list[ind - 3]`` (in that order).  The merged
    columns are then relabelled positionally with ``column_names`` and only
    rows whose ``"name"`` is contained in ``increase`` are kept.

    Parameters
    ----------
    date : str
        Unused; kept so existing callers keep working.
    df_list : list of pandas.DataFrame
        Snapshots in chronological order, all sharing the key columns.
    column_names : list of str
        Labels for the merged frame: the columns of the current snapshot
        followed by the non-key columns of the three earlier snapshots.
    key_list : list of str
        Columns to merge on.
    increase : iterable, optional
        Names to keep.  Defaults to the notebook-level ``INCREASE`` that the
        caller binds from ``open_files`` before invoking this function.

    Returns
    -------
    pandas.DataFrame
        All merged snapshots stacked vertically (original per-merge index
        kept), or an empty frame with ``column_names`` when ``df_list`` has
        fewer than six entries.

    Raises
    ------
    ValueError
        If ``column_names`` does not match the merged column count.
    """
    if increase is None:
        # Backward-compatible default: fall back to the notebook global.
        increase = INCREASE
    keys = {key_list} if isinstance(key_list, str) else set(key_list)

    def _tagged(frame, tag):
        # Give non-key columns a unique temporary label.  Chaining merges
        # with the default "_x"/"_y" suffixes yields duplicate labels on the
        # third merge, which pandas >= 2.0 rejects with MergeError.
        return frame.rename(
            columns=lambda col: col if col in keys
            else "{}__{}".format(col, tag))

    pieces = []
    # Equivalent to "(ind + 1) % 3 == 0 and ind != 2".
    for ind in range(5, len(df_list), 3):
        merged = _tagged(df_list[ind], 0)
        for tag, offset in enumerate((5, 4, 3), start=1):
            merged = merged.merge(_tagged(df_list[ind - offset], tag),
                                  on=key_list)
        # Positional relabel: column order is left-to-right merge order.
        merged.columns = column_names
        pieces.append(merged[merged["name"].isin(increase)])

    if not pieces:
        return pd.DataFrame(columns=column_names)
    # Concatenate once instead of growing a frame inside the loop; dtypes now
    # come from the merged data rather than from an empty placeholder frame.
    return pd.concat(pieces, axis=0)

In [388]:
# Column labels of the merged discussion frame: the current snapshot's columns
# followed by the non-key columns of the three earlier snapshots (_1, _2, _3).
COLUMN_DISC = ["name", "time", "post_num", "unique_id", "click",
               "like", "dislike",
               "time_1", "post_num_1", "unique_id_1", "click_1",
               "like_1", "dislike_1",
               "time_2", "post_num_2", "unique_id_2", "click_2",
               "like_2", "dislike_2",
               "time_3", "post_num_3", "unique_id_3", "click_3",
               "like_3", "dislike_3"]


def get_discuss_df(date):
    """Build the merged discussion table for one day.

    Parameters
    ----------
    date : str
        Day-of-month text forwarded to ``df_list_dis`` and ``list_to_df``.

    Returns
    -------
    pandas.DataFrame
        Result of ``list_to_df`` over the day's discussion snapshots, merged
        on ``name``, labelled with ``COLUMN_DISC`` and re-indexed from 0.
    """
    discussion_list = df_list_dis(date)
    # Fixed: this previously passed the undefined name COLUMN_NAMES, which
    # raises NameError on a fresh kernel.
    merged = list_to_df(date, discussion_list, COLUMN_DISC, ['name'])
    return merged.reset_index(drop=True)

In [389]:
def df_list_price(date):
    """Split the day's price table into one frame per timestamp.

    Reads the notebook-level ``price`` object, drops its ``index`` column and
    its first row, removes the rows stamped ``2018-02-<date> 09:00`` and
    groups what is left by ``time``.

    Parameters
    ----------
    date : str
        Day-of-month text used to build the 09:00 timestamp to exclude.

    Returns
    -------
    list of pandas.DataFrame
        One frame per distinct ``time`` value, in order of first appearance.
    """
    kept = ["code", "name", "time", "price", "price_dif", "sell", "buy",
            "volume", "variation"]

    table = pd.DataFrame(price, columns=["index"] + kept)
    # Keep the nine data columns and skip the first row.
    table = table[kept].iloc[1:]

    opening = "2018-02-" + date + " 09:00"
    table = table[table["time"] != opening]

    stamps = table["time"].unique().tolist()
    return [table[table["time"] == stamp][kept] for stamp in stamps]

In [390]:
# Per-snapshot price fields; COLUMN_PRICE repeats them for the current
# snapshot (no suffix) and the three earlier ones (_1, _2, _3).
_PRICE_FIELDS = ["time", "price", "price_dif", "sell", "buy", "volume",
                 "variation"]
COLUMN_PRICE = ["code", "name"] + [
    field + suffix
    for suffix in ("", "_1", "_2", "_3")
    for field in _PRICE_FIELDS
]


def get_price_df(date):
    """Build the merged price table for one day.

    Parameters
    ----------
    date : str
        Day-of-month text forwarded to ``df_list_price`` and ``list_to_df``.

    Returns
    -------
    pandas.DataFrame
        Result of ``list_to_df`` over the day's price snapshots, merged on
        ``code`` and ``name``, labelled with ``COLUMN_PRICE`` and re-indexed
        from 0.
    """
    snapshots = df_list_price(date)
    merged = list_to_df(date, snapshots, COLUMN_PRICE, ['code', 'name'])
    return merged.reset_index(drop=True)

In [391]:
# Fields kept for each of the three earlier snapshots, price fields first and
# discussion fields second; COLUMN_TOTAL is the final column order.
_LAGGED_FIELDS = ["time", "price", "price_dif", "sell", "buy", "volume",
                  "variation", "post_num", "unique_id", "click", "like",
                  "dislike"]
COLUMN_TOTAL = ["name", "code", "time", "price"] + [
    "{}_{}".format(field, lag)
    for lag in (1, 2, 3)
    for field in _LAGGED_FIELDS
]


def get_total_df(date):
    """Combine the day's price and discussion tables into one frame.

    Parameters
    ----------
    date : str
        Day-of-month text forwarded to ``get_price_df`` and
        ``get_discuss_df``.

    Returns
    -------
    pandas.DataFrame
        Inner merge of both tables on ``name`` and the four time columns,
        restricted to (and ordered by) ``COLUMN_TOTAL``.
    """
    prices = get_price_df(date)
    discussions = get_discuss_df(date)
    join_keys = ['name', 'time', 'time_1', 'time_2', 'time_3']
    combined = prices.merge(discussions, on=join_keys)
    return combined[COLUMN_TOTAL]

In [392]:
import numpy as np
import re

# Load the company table and derive the lookup structures used by
# add_company(): three lists of codes (one per market bucket) and a
# code -> market-cap mapping.
with open("company_size.json", 'r', encoding='UTF-8') as f:
    company_size = json.load(f)

# The earlier zero-length placeholder frame was overwritten immediately and
# has been removed; this is the only definition of company_df.
company_df = pd.DataFrame(company_size, columns = ["name", "code", "market", "size"])

KOSPI = []
KOSDAQ = []
TRASH = []
MKT_CAP = {}

for idx, row in company_df.iterrows():
    # Strip digits, spaces, commas and "위" so only the market name is left;
    # computed once per row instead of once per comparison.
    market = re.sub('[0-9 ,위]', '', row["market"])
    if market == "코스피":
        KOSPI.append(row["code"])
    elif market == "코스닥":
        KOSDAQ.append(row["code"])
    else:
        # Anything that is neither of the two recognised markets.
        TRASH.append(row["code"])

    # NOTE(review): the unit characters are simply deleted, so the digits on
    # both sides of "조" are concatenated -- confirm the source always pads
    # the lower part, otherwise values may be mis-scaled.
    MKT_CAP[row["code"]] = int(re.sub('[조억원,]', '', row["size"]))

In [374]:
def add_company(date):
    """Attach market-cap and market-membership columns to the day's table.

    Parameters
    ----------
    date : str
        Day-of-month text forwarded to ``get_total_df``.

    Returns
    -------
    pandas.DataFrame
        The frame from ``get_total_df`` with four extra float columns:
        ``mkt_cap`` (looked up in ``MKT_CAP``) and the 1.0/0.0 indicators
        ``kospi``, ``kosdaq`` and ``trash`` (membership of ``code`` in the
        ``KOSPI``, ``KOSDAQ`` and ``TRASH`` lists).

    Raises
    ------
    KeyError
        If a ``code`` in the table is missing from ``MKT_CAP``.
    """
    total = get_total_df(date)
    codes = total["code"]

    # DataFrame.set_value was removed in pandas 1.0, so the columns are
    # assigned whole instead of cell by cell.  Plain dict indexing (rather
    # than Series.map) keeps the KeyError for unknown codes, and dtype=float
    # matches the NaN-initialised columns the cell-wise version produced.
    total["mkt_cap"] = pd.Series([MKT_CAP[code] for code in codes],
                                 index=total.index, dtype=float)

    for column, members in (("kospi", KOSPI), ("kosdaq", KOSDAQ),
                            ("trash", TRASH)):
        total[column] = codes.isin(members).astype(float)

    return total


In [393]:
def transform_df(date):
    """Convert comma-formatted numeric strings in the day's table to ints.

    Parameters
    ----------
    date : str
        Day-of-month text forwarded to ``add_company``.

    Returns
    -------
    pandas.DataFrame
        The frame from ``add_company`` where, in every price/volume column
        listed below, each string value such as ``"83,400"`` has been
        replaced by the integer ``83400``.  Non-string values are left
        untouched.

    Raises
    ------
    ValueError
        If a string value is not an integer once its commas are removed.
    """
    total = add_company(date)
    var_to_transform = ['price', 'price_1', 'price_dif_1', 'sell_1',
                        'buy_1', 'volume_1', 'variation_1', 'price_2',
                        'price_dif_2', 'sell_2', 'buy_2', 'volume_2',
                        'variation_2', 'price_3', 'price_dif_3', 'sell_3', 'buy_3',
                        'volume_3', 'variation_3']

    def _to_int(value):
        # Only strings need parsing; anything else passes through as is.
        if isinstance(value, str):
            return int(value.replace(",", ""))
        return value

    # DataFrame.set_value was removed in pandas 1.0; converting a whole
    # column at a time also avoids the row-by-row iterrows() loop.  A column
    # that ends up all-integer gets an integer dtype instead of object.
    for var in var_to_transform:
        total[var] = total[var].map(_to_int)
    return total
    

In [394]:
# Build one feature table per trading day.  The result of open_files() has to
# be unpacked into these exact module-level names on every pass, because
# list_to_df() reads INCREASE and df_list_price() reads price as globals.
TRADING_DAYS = ('14', '20', '21', '22', '23')
daily_totals = {}
for trading_day in TRADING_DAYS:
    focus, krx_code, INCREASE, price = open_files(trading_day)
    daily_totals[trading_day] = transform_df(trading_day)

# Keep the per-day names that later cells refer to.
total_14, total_20, total_21, total_22, total_23 = (
    daily_totals[trading_day] for trading_day in TRADING_DAYS
)

In [400]:
# Stack the five per-day tables; each keeps its own row index.
daily_frames = [total_14, total_20, total_21, total_22, total_23]
total_df = pd.concat(daily_frames)

In [402]:
# Display only: the result is not assigned, so total_df itself is unchanged.
total_df.reset_index(drop=False)

Unnamed: 0,index,name,code,time,price,time_1,price_1,price_dif_1,sell_1,buy_1,...,variation_3,post_num_3,unique_id_3,click_3,like_3,dislike_3,mkt_cap,kospi,kosdaq,trash
0,0,호텔신라,008770,2018-02-14 10:00,83400,2018-02-14 09:10,84700,1800,84800,84700,...,2036,5,4,571,7,4,32419.0,1.0,0.0,0.0
1,1,대한광통신,010170,2018-02-14 10:00,7050,2018-02-14 09:10,7060,90,7060,7050,...,3467,3,3,142,1,3,4176.0,0.0,1.0,0.0
2,2,포스코 ICT,022100,2018-02-14 10:00,8320,2018-02-14 09:10,8310,0,8330,8310,...,1712,3,3,781,1,5,12953.0,0.0,1.0,0.0
3,3,한화케미칼,009830,2018-02-14 10:00,32350,2018-02-14 09:10,32700,550,32700,32650,...,3102,6,6,706,3,8,55706.0,1.0,0.0,0.0
4,4,지엘팜텍,204840,2018-02-14 10:00,4760,2018-02-14 09:10,4680,50,4700,4680,...,2844,4,4,970,8,12,1494.0,0.0,1.0,0.0
5,5,코웰패션,033290,2018-02-14 10:00,5350,2018-02-14 09:10,5270,90,5280,5270,...,1530,1,1,121,1,0,4661.0,0.0,1.0,0.0
6,6,대한항공,003490,2018-02-14 10:00,35900,2018-02-14 09:10,35700,300,35750,35700,...,1028,2,2,70,0,0,33338.0,1.0,0.0,0.0
7,7,대한뉴팜,054670,2018-02-14 10:00,13750,2018-02-14 09:10,13800,0,13800,13750,...,365,1,1,64,3,3,2117.0,0.0,1.0,0.0
8,8,레드캡투어,038390,2018-02-14 10:00,15350,2018-02-14 09:10,15350,100,15350,15200,...,1,0,0,0,0,0,1357.0,0.0,1.0,0.0
9,9,이마트,139480,2018-02-14 10:00,290000,2018-02-14 09:10,283500,5000,284000,283500,...,75,1,1,23,2,0,86415.0,1.0,0.0,0.0


In [404]:
# Number of distinct company names across all five days.
num = len(total_df['name'].unique())
num

243