In [1]:
import pandas as pd
import oop_func as func
import personal_pkg as ref
import os
import re
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

## Description_1
- 한국재무관리학회 : fm_file
- 한국재무학회 : f_file
- 한국증권학회지 : sc_file
- 한국파생상품학회 : dr_file

## Description_2
- 저널 별 텍스트 데이터
- 연도 별 텍스트 데이터

#### 저널 별 데이터를 모으자

In [2]:
def journal_txt_ls(journal):
    file_path = \
    sorted(['../paper_list/{}/'.format(journal) + i for i in [i for i in os.listdir('../paper_list/{}/'.format(journal)) if '20' in i]])

    journal_txt_ls = []

    for year,i in enumerate([os.listdir(i) for i in file_path]) : 
        txt_ls = []
        for j in i : 
            if 'txt' in j :
                txt_ls.append(file_path[year] +'/'+ j)
        journal_txt_ls.append(txt_ls)

    return journal_txt_ls

In [3]:
fm_txt_ls = journal_txt_ls('한국재무관리학회')
sc_txt_ls = journal_txt_ls('한국증권학회지')
dr_txt_ls = journal_txt_ls('한국파생상품학회')
f_txt_ls = journal_txt_ls('한국재무학회')

In [4]:
len(fm_txt_ls) , len(dr_txt_ls) , len(sc_txt_ls) , len(f_txt_ls)

(15, 17, 17, 17)

## 저널별 citation_analysis

In [5]:
fm_path = [refer for i in fm_txt_ls for refer in i]
f_path = [refer for i in f_txt_ls for refer in i]
sc_path = [refer for i in sc_txt_ls for refer in i]
dr_path = [refer for i in dr_txt_ls for refer in i]

In [6]:
def split_to_sent(file_path):
    sent_refer_ls = []
    error_ls = []

    for path in file_path :
        try : 
            ca = func.Slicing_paper(path)
            sent_refer_ls.append(ca.split())
        except Exception as e: error_ls.append((e,path))
    
    return [j for i in sent_refer_ls for j in i] , error_ls

In [7]:
fm_sent_ls , fm_error_ls = split_to_sent(fm_path)
f_sent_ls , f_error_ls = split_to_sent(f_path)
sc_sent_ls , sc_error_ls = split_to_sent(sc_path)
dr_sent_ls , dr_error_ls = split_to_sent(dr_path)

In [8]:
len(fm_sent_ls),len(f_sent_ls),len(sc_sent_ls),len(dr_sent_ls)

(11511, 9057, 13211, 8807)

In [9]:
len(fm_error_ls),len(f_error_ls),len(sc_error_ls),len(dr_error_ls)

(76, 15, 24, 55)

## double quote 가 열리도 닫히는 라인을 double quote line 으로 명명한다.

In [10]:
len(fm_sent_ls),len(f_sent_ls),len(sc_sent_ls),len(dr_sent_ls)

(11511, 9057, 13211, 8807)

In [11]:
double_quote_line = [i for i in fm_sent_ls if re.search("“",i) and re.search("”",i)]
len(double_quote_line) / len([i for i in fm_sent_ls])

0.883155242811224

In [12]:
double_quote_line = [i for i in f_sent_ls if re.search("“",i) and re.search("”",i)]
len(double_quote_line) / len([i for i in f_sent_ls])

0.7302638842883957

In [13]:
double_quote_line = [i for i in sc_sent_ls if re.search("“",i) and re.search("”",i)]
len(double_quote_line) / len([i for i in sc_sent_ls])

0.2276133525092726

In [14]:
double_quote_line = [i for i in dr_sent_ls if re.search("“",i) and re.search("”",i)]
len(double_quote_line) / len([i for i in dr_sent_ls])

0.2505961167253321

In [15]:
def catch_the_wrong_split_line(sent_ls):
    double_quote_line = [i for i in sent_ls if re.search("“.+”",i)]
    catch_ls = []
    double_catch_ls = []

    for i in double_quote_line : 
        index = list(zip([m.end() for m in re.finditer('“',i)] , [m.start() for m in re.finditer('”',i)]))
        if len(index) > 2 :
            catch_ls.append(i)
    return catch_ls

In [16]:
(len(catch_the_wrong_split_line(fm_sent_ls)),
len(catch_the_wrong_split_line(f_sent_ls)),
len(catch_the_wrong_split_line(dr_sent_ls)),
len(catch_the_wrong_split_line(sc_sent_ls)))

(7, 2, 9, 7)

In [17]:
def return_paper_df(sent_ls):
    
    double_quote_line = [i for i in sent_ls if re.search("“.+”",i)]    
    paper_ls = []

    for i in double_quote_line : 
        process_ls = []
        index = list(zip([m.end() for m in re.finditer('“',i)] , [m.start() for m in re.finditer('”',i)]))
        for idx in range(len(index)):
            process_ls.append(i[index[idx][0] : index[idx][1]])
        paper_ls.append(process_ls)
    paper_ls = [j.lower() for i in paper_ls for j in i]
    paper_df = pd.DataFrame.from_dict(Counter(paper_ls),orient='index',columns=['count'])
    paper_df.sort_values('count',ascending=False,inplace=True)
    paper_df['ratio'] = (paper_df['count'] / len(sent_ls)) * 100
    return paper_df

In [18]:
display(return_paper_df(fm_sent_ls).head())
display(return_paper_df(f_sent_ls).head())
display(return_paper_df(dr_sent_ls).head())
display(return_paper_df(sc_sent_ls).head())

Unnamed: 0,count,ratio
common risk factors in the returns on stocks and bonds,27,0.234558
corporate financing and investment decisions when firms have information that investors do not have,22,0.191122
agency costs of free cash flow corporate finance and takeovers,18,0.156372
determinants of corporate borrowing,18,0.156372
the cross-section of expected stock returns,14,0.121623


Unnamed: 0,count,ratio
common risk factors in the returns on stocks and bonds,12,0.132494
risk return and equilibrium: empirical tests,11,0.121453
common risk factors in the returns on stocks and bonds,11,0.121453
on persistence in mutual fund performance,10,0.110412
agency costs of free cash flow corporate finance and takeovers,9,0.099371


Unnamed: 0,count,ratio
,28,0.317929
the pricing of options and corporate liabilities,10,0.113546
common risk factors in the returns on stocks and bonds,8,0.090837
informed trading in stock and option markets,7,0.079482
risk return and equilibrium : empirical tests,7,0.079482


Unnamed: 0,count,ratio
기업규모와 장부가/시가 비율과 주식수익률의 관계,13,0.098403
한국 주식시장에서 유동성 요인을 포함한 3요인 모형의 설명력에 관한 연구,8,0.060556
sharpe의 방법론을 이용한 한국 주식형펀드의 운용스타일 및 성과분석,8,0.060556
ipo 락업 제도가 저가발행현상에 미치는 영향,7,0.052986
기업공개시 공모주 가격결정에 관한 연구,7,0.052986


잡긴 잡지만, double quote 기준으로 paper name 만을 추출한 것이기 때문에, count의 갯수가 만족스럽지 못하다. 이에 따라, 위의 dataframe 의 인덱스를 딕셔너리로 잡아서 전체 데이터에 대해 검색해준다.

In [85]:
fm_sent_ls_df = return_paper_df(fm_sent_ls)
f_sent_ls_df = return_paper_df(f_sent_ls)
sc_sent_ls_df = return_paper_df(sc_sent_ls)
dr_sent_ls_df = return_paper_df(dr_sent_ls)

fm_paper_ls = fm_sent_ls_df[fm_sent_ls_df['count'] > 1].index.tolist()
f_paper_ls = f_sent_ls_df[f_sent_ls_df['count'] > 1].index.tolist()
sc_paper_ls = sc_sent_ls_df[sc_sent_ls_df['count'] > 1].index.tolist()
dr_paper_ls = dr_sent_ls_df[dr_sent_ls_df['count'] > 1].index.tolist()

paper_dict = fm_paper_ls + f_paper_ls + sc_paper_ls + dr_paper_ls
paper_dict = [i.strip() for i in paper_dict if len(i) > 3]
paper_dict = list(set(paper_dict))

In [86]:
def apply_paper_dict(sent_ls,paper_dict = paper_dict):
    paper_ls = []

    for catch in paper_dict : 
        for sent in sent_ls : 
            if catch in sent.lower() : 
                if catch.strip() == 'tunneling' : 
                    if '2000' in sent.lower() :
                        paper_ls.append(catch.strip())
                elif catch.strip() == 'noise' :
                    if '1985' in sent.lower() :
                        paper_ls.append(catch.strip())
                else : paper_ls.append(catch.strip())
    return paper_ls

In [87]:
%%time
fm_paper_dict = apply_paper_dict(fm_sent_ls)
f_paper_dict = apply_paper_dict(f_sent_ls)
sc_paper_dict = apply_paper_dict(sc_sent_ls)
dr_paper_dict = apply_paper_dict(dr_sent_ls)

fm_paper_dict_df = pd.DataFrame.from_dict(Counter(fm_paper_dict),orient='index',columns=['count'])
f_paper_dict_df = pd.DataFrame.from_dict(Counter(f_paper_dict),orient='index',columns=['count'])
sc_paper_dict_df = pd.DataFrame.from_dict(Counter(sc_paper_dict),orient='index',columns=['count'])
dr_paper_dict_df = pd.DataFrame.from_dict(Counter(dr_paper_dict),orient='index',columns=['count'])

fm_paper_dict_df.sort_values('count',ascending=False,inplace=True)
f_paper_dict_df.sort_values('count',ascending=False,inplace=True)
sc_paper_dict_df.sort_values('count',ascending=False,inplace=True)
dr_paper_dict_df.sort_values('count',ascending=False,inplace=True)

CPU times: user 49.1 s, sys: 0 ns, total: 49.1 s
Wall time: 49.1 s


In [89]:
display(fm_paper_dict_df.head())
display(f_paper_dict_df.head())
display(sc_paper_dict_df.head())
display(dr_paper_dict_df.head())

Unnamed: 0,count
mutual fund performance,55
common risk factors in the returns on stocks and bonds,28
corporate financing and investment decisions when firms have information that investors do not have,22
determinants of corporate borrowing,20
comovement,20


Unnamed: 0,count
market microstructure,40
mutual fund performance,33
common risk factors in the returns on stocks and bonds,23
risk return and equilibrium: empirical tests,17
corporate financing and investment decisions when firms have information that investors do not have,15


Unnamed: 0,count
mutual fund performance,63
on persistence in mutual fund performance,25
common risk factors in the returns on stocks and bonds,22
market microstructure,21
the cross-section of expected stock returns,20


Unnamed: 0,count
the pricing of options and corporate liabilities,16
vix futures,13
mutual fund performance,11
generalized autoregressive conditional heteroskedasticity,10
common risk factors in the returns on stocks and bonds,9
