# Study H1 ~ H50

In [1]:
%load_ext autoreload
%autoreload 2
# Project root: used to build data paths and appended to sys.path so that
# project-local modules can be imported (presumably the `arum` package used
# by later cells — confirm).
# The default is the original author's checkout; set ANALYST_PERFORMANCE_HOME
# to run the notebook from another location without editing this cell.
import os
import sys
PROJ_HOME = os.environ.get(
    'ANALYST_PERFORMANCE_HOME',
    '/Users/keemsunguk/Projects/analyst_performance/',
)
# Guard the append so re-running this cell does not stack duplicate entries.
if PROJ_HOME not in sys.path:
    sys.path.append(PROJ_HOME)
from datetime import datetime
from pathlib import Path
import matplotlib

## Import

In [2]:
import copy
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from googletrans import Translator

In [4]:
from arum.preprocessing import fix_price, get_author_production, clean_recommendations, change_col_names, build_labels
from arum.preprocessing import check_report_date, group_by_month
from arum.lookups.column_names import new_col_names

In [5]:
# Directory that holds the input pickle and receives the exported spreadsheet.
data_dir = Path(f'{PROJ_HOME}/data')

In [6]:
# Load spaCy's large English pipeline ("en_core_web_lg").
# NOTE(review): `nlp` is not referenced again in the visible cells — confirm
# it is still needed before paying the model-loading cost on every run.
nlp = spacy.load("en_core_web_lg")

In [7]:
# googletrans client restricted to the global and Korean Google Translate hosts.
# NOTE(review): `translator` is not used in the visible cells — confirm it is
# still required.
translator = Translator(
    service_urls=['translate.google.com', 'translate.google.co.kr'],
)

# Preprocessing

In [8]:
# Load the report table from the project's data directory.
# NOTE(review): pickle files execute arbitrary code on load — only read
# pickles produced by this project.
sentiment_df = pd.read_pickle(data_dir / 'sentiment_pub_trade.pkl')

In [9]:
# Per-author production statistics from the project helper.
# NOTE(review): the structure of `author_freq` is defined by
# get_author_production (not visible here), and the variable is not used
# again in the visible cells — confirm it is still needed.
author_freq = get_author_production(sentiment_df)

In [10]:
# Normalise the recommendation field. The displayed return value is a dict of
# recommendation label -> count (see the recorded output).
# The return value is not assigned, so any cleaning presumably happens in place
# on `sentiment_df` — verify against arum.preprocessing.
clean_recommendations(sentiment_df)

{'BUY': 30522,
 'NR': 6317,
 'HOLD': 2333,
 'STRONG_BUY': 67,
 'SELL': 16,
 'REDUCE': 12}

In [11]:
# Rename columns via the project helper. The result is not assigned, so the
# rename presumably mutates `sentiment_df` in place (likely using
# `new_col_names`) — verify against arum.preprocessing.
change_col_names(sentiment_df)

In [12]:
# Run the project's price fixer over every renamed column.
# The column names are snapshotted first so the iteration is unaffected by
# anything the helper does.
for col_name in tuple(new_col_names.values()):
    sentiment_df = fix_price(sentiment_df, col_name)

In [13]:
# Apply the same price fix to the two summary columns, in the original order.
for summary_col in ('H25', 'H50'):
    sentiment_df = fix_price(sentiment_df, summary_col)

In [14]:
# Replace missing 'report_date-2' values with empty strings.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that is chained assignment, which under pandas
# Copy-on-Write updates only a temporary Series and leaves the frame unchanged.
sentiment_df['report_date-2'] = sentiment_df['report_date-2'].fillna('')

In [15]:
# Validate / normalise report dates via the project helper and keep its result.
# NOTE(review): the following sanity-check cell expects every `report_date`
# to be a plain `datetime` afterwards — presumably this helper guarantees that.
sentiment_df = check_report_date(sentiment_df)

In [16]:
# Sanity check: count rows whose report_date is exactly a `datetime` instance
# (subclasses are deliberately not accepted). cnt_no should be 0.
report_dates = sentiment_df['report_date']
cnt_yes = sum(1 for value in report_dates if type(value) == datetime)
cnt_no = len(report_dates) - cnt_yes
print(cnt_yes, cnt_no)

39267 0


# Labeling

In [17]:
# Build labels with the project helper.
# NOTE(review): later cells read '__label*' and '*_ratio' columns from
# `sentiment_df` rather than `labeled_df`, so build_labels presumably adds
# them in place (and may return the same object) — confirm.
labeled_df = build_labels(sentiment_df)

In [25]:
# Rename the report-day high to 'high_0' so it follows the 'high_N' naming
# used by the later first-drop computation.
# Done in place, so any other name bound to this frame sees the rename too.
# NOTE(review): this cell's execution count (25) is out of sequence — confirm
# the notebook still works under Restart & Run All.
sentiment_df.rename(columns={'high': 'high_0'}, inplace=True)

In [48]:
# Show (rows, columns) of the full table; the recorded output is (39267, 75).
# NOTE(review): the execution count (48) shows this was run after later cells.
sentiment_df.shape

(39267, 75)

In [28]:
# Keep only rows where all three ratio columns are strictly greater than -1
# (presumably -1 marks a missing or invalid ratio — confirm), working on an
# independent copy so later column additions do not touch `sentiment_df`.
ratio_cols = ['gap_up_ratio', 'high_profit_ratio', 'closing_profit_ratio']
passes_all = (sentiment_df[ratio_cols] > -1).all(axis=1)
valid_df = sentiment_df[passes_all].copy()
valid_df.shape

(38742, 75)

# Derivatives

In [29]:
# List the column labels available for the derived metrics.
valid_df.columns

Index(['author', 'source', 'ticker_symbol', 'company', 'report_date',
       'report_date-1', 'report_date-2', 'report_title', 'opening', 'closing',
       'high_0', 'closing_1', 'closing_2', 'recommendation', 'high_1',
       'high_2', 'high_3', 'high_4', 'high_5', 'high_6', 'high_7', 'high_8',
       'high_9', 'high_10', 'high_11', 'high_12', 'high_13', 'high_14',
       'high_15', 'high_16', 'high_17', 'high_18', 'high_19', 'high_20',
       'high_21', 'high_22', 'high_23', 'high_24', 'high_25', 'high_26',
       'high_27', 'high_28', 'high_29', 'high_30', 'high_31', 'high_32',
       'high_33', 'high_34', 'high_35', 'high_36', 'high_37', 'high_38',
       'high_39', 'high_40', 'high_41', 'high_42', 'high_43', 'high_44',
       'high_45', 'high_46', 'high_47', 'high_48', 'high_49', 'high_50', 'H25',
       'H50', 'en_report_title', 'sentiment', 'report_month', '__label1',
       '__label2', '__label3', 'gap_up_ratio', 'high_profit_ratio',
       'closing_profit_ratio'],
      dtype='object')

In [30]:
# Names of the daily-high columns, 'high_0' (report day) through 'high_50'.
hi_col_names = [f'high_{day}' for day in range(51)]

In [47]:
# Show (rows, columns) of the filtered table; the recorded output is (38742, 75).
valid_df.shape

(38742, 75)

In [50]:
def first_drop_below_initial(highs):
    """Find, per row, the first later high that is strictly below the initial high.

    Vectorised replacement for a row-by-row ``iterrows`` scan.

    Parameters
    ----------
    highs : array-like of shape (n_rows, n_days + 1)
        Column 0 is the reference high; columns 1..n_days are the later highs.

    Returns
    -------
    days : numpy.ndarray of int
        1-based position of the first later column whose value is strictly
        below column 0. Rows where no later value is below the reference get
        ``n_days`` (the same value as a drop that happens on the last day).
    delta : numpy.ndarray of float
        ``(reference - value_at_days) / reference`` for the column selected
        above. It is positive when a drop was found and <= 0 when the fallback
        to the last column was used. A zero reference yields inf/nan (with a
        NumPy RuntimeWarning) instead of raising.
    """
    highs = np.asarray(highs, dtype=float)
    reference = highs[:, 0]
    later = highs[:, 1:]
    # Comparisons against NaN are False, so NaN highs never count as a drop.
    below = later < reference[:, None]
    last_pos = later.shape[1] - 1
    # argmax on a boolean row returns the first True; rows without any True
    # are redirected to the last column.
    first_pos = np.where(below.any(axis=1), below.argmax(axis=1), last_pos)
    chosen = later[np.arange(len(later)), first_pos]
    return first_pos + 1, (reference - chosen) / reference


high_prefix = 'high_'
# 'high_0' is the report-day high; 'high_1'..'high_50' are the following days.
high_cols = [high_prefix + str(day) for day in range(51)]
days_to_drop, drop_delta = first_drop_below_initial(valid_df[high_cols])
# Keep plain Python lists under the original names for the following cell.
first_time_down = days_to_drop.tolist()
first_time_delta = drop_delta.tolist()
len(first_time_down)

38742

In [51]:
# Attach the per-row results to the frame. Both lists were built by iterating
# `valid_df` in row order, so positional assignment lines up with the rows.
valid_df['1st_time_down'] = first_time_down
valid_df['1st_time_delta'] = first_time_delta

In [59]:
# Non-null count of every column per author. In the recorded output all
# columns agree within a row, so each value is that author's number of reports.
valid_df.groupby('author').count()

Unnamed: 0_level_0,source,ticker_symbol,company,report_date,report_date-1,report_date-2,report_title,opening,closing,high_0,...,sentiment,report_month,__label1,__label2,__label3,gap_up_ratio,high_profit_ratio,closing_profit_ratio,1st_time_down,1st_time_delta
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
강경근,23,23,23,23,23,23,23,23,23,23,...,23,23,23,23,23,23,23,23,23,23
강경태,22,22,22,22,22,22,22,22,22,22,...,22,22,22,22,22,22,22,22,22,22
강동근,31,31,31,31,31,31,31,31,31,31,...,31,31,31,31,31,31,31,31,31,31
강동진,183,183,183,183,183,183,183,183,183,183,...,183,183,183,183,183,183,183,183,183,183
강석오,26,26,26,26,26,26,26,26,26,26,...,26,26,26,26,26,26,26,26,26,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
황성환,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
황승택,60,60,60,60,60,60,60,60,60,60,...,60,60,60,60,60,60,60,60,60,60
황어연,152,152,152,152,152,152,152,152,152,152,...,152,152,152,152,152,152,152,152,152,152
황유식,115,115,115,115,115,115,115,115,115,115,...,115,115,115,115,115,115,115,115,115,115


In [60]:
# Per-author averages of the profit ratio and the first-drop metrics,
# ordered so the authors whose calls stay above the initial high longest
# come first.
mean_cols = ['high_profit_ratio', '1st_time_down', '1st_time_delta']
longterm_df = (
    valid_df.groupby('author')[mean_cols]
    .mean()
    .sort_values('1st_time_down', ascending=False)
    .copy()
)

In [62]:
# Prepend each author's report count (non-null 'source' values) to the
# per-author means. The count column keeps the name 'source' because the
# later cells filter on it.
# Any 'source' column left over from a previous run of this cell is dropped
# first, so re-running the cell does not produce duplicate 'source' columns.
report_counts = valid_df.groupby('author')['source'].count()
longterm_df = pd.concat(
    [report_counts, longterm_df.drop(columns='source', errors='ignore')],
    axis=1,
)

In [67]:
# Restore the ranking by mean first-drop day (largest first) after the concat.
longterm_df = longterm_df.sort_values(by='1st_time_down', ascending=False)

In [75]:
# Ranked authors with more than 10 reports: positions 0-59.
longterm_df.loc[longterm_df['source'] > 10].iloc[:60]

Unnamed: 0_level_0,source,high_profit_ratio,1st_time_down,1st_time_delta
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
손지우,35,0.035469,26.742857,-0.122748
문정윤,23,0.02873,22.695652,-0.096079
고문영,32,0.033416,19.03125,-0.139437
이왕진,63,0.028977,17.698413,0.007383
이새롬,14,0.024186,17.285714,0.086848
유승창,37,0.02657,17.108108,-0.081091
서혜원,28,0.03792,16.321429,-0.041589
윤승현,34,0.017331,16.176471,-0.018182
조태나,28,0.039094,15.714286,-0.016129
정하늘,66,0.024068,15.651515,-1.221004


In [76]:
# Ranked authors with more than 10 reports: positions 60-119.
longterm_df.loc[longterm_df['source'] > 10].iloc[60:120]

Unnamed: 0_level_0,source,high_profit_ratio,1st_time_down,1st_time_delta
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
박찬솔,68,0.045674,11.852941,0.025259
서승연,25,0.017648,11.72,0.072889
한상준,20,0.019376,11.7,-0.053289
신지훈,61,0.033301,11.622951,-0.0155
전우제,157,0.027708,11.617834,0.03519
한경래,135,0.03929,11.592593,-0.006977
배송이,147,0.023285,11.544218,0.014757
한영수,111,0.026581,11.468468,-0.741679
손세훈,50,0.039524,11.46,-0.144203
김한경,65,0.03947,11.446154,-0.032458


In [77]:
# Ranked authors with more than 10 reports: positions 120-179.
longterm_df.loc[longterm_df['source'] > 10].iloc[120:180]

Unnamed: 0_level_0,source,high_profit_ratio,1st_time_down,1st_time_delta
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
이동욱,227,0.033874,10.400881,0.024126
윤을정,88,0.01997,10.340909,0.010631
박일선,43,0.024215,10.325581,0.000986
이봉진,69,0.018512,10.304348,0.048139
임예림,24,0.055733,10.291667,0.043355
나관준,60,0.026457,10.283333,0.040503
임희연,126,0.020918,10.277778,-0.492225
김영건,23,0.016852,10.26087,-0.017334
김진구,104,0.026588,10.259615,-0.013772
함형도,51,0.026175,10.215686,-0.028751


In [78]:
# Ranked authors with more than 10 reports: positions 180-239.
longterm_df.loc[longterm_df['source'] > 10].iloc[180:240]

Unnamed: 0_level_0,source,high_profit_ratio,1st_time_down,1st_time_delta
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
남효지,146,0.02106,9.684932,-0.003115
황성현,102,0.024355,9.647059,0.020288
이선일,51,0.028218,9.647059,0.131083
김경민,271,0.026947,9.642066,0.009324
김동양,120,0.020894,9.641667,-0.05162
김준성,75,0.026584,9.64,0.019894
권순우,97,0.025891,9.639175,0.011836
이민아,116,0.020218,9.62931,0.011636
박세라,192,0.025119,9.619792,0.000373
성준원,103,0.019918,9.61165,0.006927


In [79]:
# Export the full per-author ranking (all authors, not only those with more
# than 10 reports) to the data directory.
# NOTE(review): the legacy .xls format needs the xlwt writer, which newer
# pandas releases no longer support — confirm against the pinned pandas
# version before relying on this export.
longterm_df.to_excel(data_dir.joinpath('longterm_high.xls'))