In [1]:
import pandas as pd
import oop_func as func
import personal_pkg as ref
import os
import re
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

## Description_1
- 한국재무관리학회 : fm_file
- 한국재무학회 : f_file
- 한국증권학회지 : sc_file
- 한국파생상품학회 : dr_file

## Description_2
- 저널 별 텍스트 데이터
- 연도 별 텍스트 데이터

#### 저널 별 데이터를 모으자

In [2]:
def journal_txt_ls(journal):
    """Collect the text-file paths of one journal, grouped by directory.

    Looks inside ``../paper_list/<journal>/`` for sub-directories whose name
    contains ``'20'`` and, for each one, gathers the entries whose name
    contains ``'txt'``.

    Parameters
    ----------
    journal : str
        Name of the journal directory under ``../paper_list/``.

    Returns
    -------
    list[list[str]]
        One inner list per matching directory (directories in sorted order),
        each holding the sorted paths of that directory's matching entries.

    Raises
    ------
    OSError
        If the journal directory does not exist or cannot be listed.
    """
    base_dir = '../paper_list/{}/'.format(journal)

    # os.listdir returns entries in arbitrary order, so everything is sorted
    # to make the result reproducible. Plain files whose name merely contains
    # '20' are skipped: listing them as directories would raise.
    year_dirs = sorted(
        base_dir + entry
        for entry in os.listdir(base_dir)
        if '20' in entry and os.path.isdir(base_dir + entry)
    )

    return [
        [year_dir + '/' + name for name in sorted(os.listdir(year_dir)) if 'txt' in name]
        for year_dir in year_dirs
    ]

In [3]:
# Grouped text-file paths for each of the four journals.
fm_txt_ls, sc_txt_ls, dr_txt_ls, f_txt_ls = map(
    journal_txt_ls,
    ('한국재무관리학회', '한국증권학회지', '한국파생상품학회', '한국재무학회'),
)

In [4]:
# Number of directory groups found per journal (order: fm, dr, sc, f).
tuple(map(len, (fm_txt_ls, dr_txt_ls, sc_txt_ls, f_txt_ls)))

(15, 17, 17, 17)

## 저널별 citation_analysis

In [5]:
# Flatten each journal's grouped paths into a single list of file paths.
fm_path = [txt for group in fm_txt_ls for txt in group]
f_path = [txt for group in f_txt_ls for txt in group]
sc_path = [txt for group in sc_txt_ls for txt in group]
dr_path = [txt for group in dr_txt_ls for txt in group]

In [6]:
def split_to_sent(file_path):
    """Slice every paper in ``file_path`` and pool the resulting pieces.

    Each path is handed to ``func.Slicing_paper`` and the value returned by
    its ``split()`` method is kept. A path whose processing raises is not
    fatal: the exception and the path are recorded instead.

    Parameters
    ----------
    file_path : iterable of str
        Paths of the paper files to process.

    Returns
    -------
    tuple[list, list]
        ``(pieces, errors)`` where ``pieces`` is the concatenation of all
        ``split()`` results in input order and ``errors`` is a list of
        ``(exception, path)`` pairs for the papers that failed.
    """
    per_paper = []
    failures = []

    for paper in file_path:
        try:
            per_paper.append(func.Slicing_paper(paper).split())
        except Exception as exc:
            # Best effort: remember what failed and keep going.
            failures.append((exc, paper))

    pooled = []
    for pieces in per_paper:
        for piece in pieces:
            pooled.append(piece)

    return pooled, failures

In [7]:
# Sliced pieces and per-file failures for each journal.
[
    (fm_sent_ls, fm_error_ls),
    (f_sent_ls, f_error_ls),
    (sc_sent_ls, sc_error_ls),
    (dr_sent_ls, dr_error_ls),
] = map(split_to_sent, (fm_path, f_path, sc_path, dr_path))

In [8]:
# Number of pooled pieces per journal (order: fm, f, sc, dr).
tuple(map(len, (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls)))

(11511, 9057, 13211, 8807)

In [9]:
# Number of files that failed to process per journal (order: fm, f, sc, dr).
tuple(map(len, (fm_error_ls, f_error_ls, sc_error_ls, dr_error_ls)))

(76, 15, 24, 55)

여는 큰따옴표(“)와 닫는 큰따옴표(”)가 모두 들어 있는 라인, 즉 double quote 가 열리고 닫히는 라인을 double quote line 으로 명명한다.

In [10]:
# Same per-journal piece counts as shown earlier (order: fm, f, sc, dr).
tuple(map(len, (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls)))

(11511, 9057, 13211, 8807)

In [11]:
# Share of fm lines that contain both an opening “ and a closing ” mark.
double_quote_line = [line for line in fm_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(fm_sent_ls)

0.883155242811224

In [12]:
# Share of f lines that contain both an opening “ and a closing ” mark.
double_quote_line = [line for line in f_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(f_sent_ls)

0.7302638842883957

In [13]:
# Share of sc lines that contain both an opening “ and a closing ” mark.
double_quote_line = [line for line in sc_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(sc_sent_ls)

0.2276133525092726

In [14]:
# Share of dr lines that contain both an opening “ and a closing ” mark.
double_quote_line = [line for line in dr_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(dr_sent_ls)

0.2505961167253321

In [15]:
def catch_the_wrong_split_line(sent_ls):
    """Return the lines that hold more than two “…” quotation pairs.

    Only lines in which an opening “ is followed, on the same line, by at
    least one character and a closing ” are inspected. Opening and closing
    marks are paired off in order, so the number of pairs is the smaller of
    the two mark counts.

    NOTE(review): the threshold is ``> 2``, so a line with exactly two pairs
    is not flagged here -- confirm this is the intended cut-off.

    Parameters
    ----------
    sent_ls : iterable of str
        Lines to inspect.

    Returns
    -------
    list[str]
        The flagged lines, in input order.
    """
    flagged = []
    for line in sent_ls:
        if not re.search("“.+”", line):
            continue
        n_pairs = min(len(re.findall('“', line)), len(re.findall('”', line)))
        if n_pairs > 2:
            flagged.append(line)
    return flagged

In [16]:
# Number of flagged lines per journal (order: fm, f, dr, sc).
tuple(
    len(catch_the_wrong_split_line(sentences))
    for sentences in (fm_sent_ls, f_sent_ls, dr_sent_ls, sc_sent_ls)
)

(7, 2, 9, 7)

In [17]:
def return_journal_df(sent_ls):
    """Count the journal names that follow a quoted title in ``sent_ls``.

    Only lines holding exactly one “…” pair are used. The text after the
    closing ” is taken as the journal candidate and normalised:

    1. parenthesised text and digits are removed,
    2. the literal tokens ``Vol.`` / ``No.`` and the brackets ｢ ｣ are removed,
    3. everything from ``제권`` / ``제집`` onwards is cut off (these are what
       volume markers look like once their digits are gone),
    4. the result is lower-cased and stripped.

    Parameters
    ----------
    sent_ls : list of str
        Lines to scan.

    Returns
    -------
    pandas.DataFrame
        Indexed by normalised journal name, sorted by ``count`` descending,
        with columns ``count`` and ``ratio`` (``count`` as a percentage of
        ``len(sent_ls)``).
    """
    candidates = []
    for sent in sent_ls:
        if not re.search("“.+”", sent):
            continue
        # Opening/closing marks pair off in order -> pairs = smaller count.
        n_pairs = min(len(re.findall('“', sent)), len(re.findall('”', sent)))
        if n_pairs == 1:
            candidates.append(sent[re.search(".+”", sent).end():])

    journal_ls = []
    for name in candidates:
        name = re.sub('[(].+[)]|[0-9]{,4}', '', name)
        # The dots are escaped on purpose: an unescaped 'Vol.' / 'No.' also
        # matches ordinary words ("Volatility", "North") and mangles names.
        name = re.sub(r'Vol\.|No\.|｢|｣', '', name)
        cut = re.search('제권|제집', name)
        if cut:
            name = name[:cut.start()]
        journal_ls.append(name.lower().strip())

    count_df = pd.DataFrame.from_dict(Counter(journal_ls), orient='index', columns=['count'])
    count_df = count_df.sort_values('count', ascending=False)
    count_df['ratio'] = (count_df['count'] / len(sent_ls)) * 100

    return count_df

In [18]:
# Top-10 rows of each journal's table, shown in the order fm, f, sc, dr.
display(*(
    return_journal_df(sentences).head(10)
    for sentences in (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls)
))

Unnamed: 0,count,ratio
journal of finance,1231,10.694119
journal of financial economics,1054,9.156459
재무관리연구,387,3.362002
review of financial studies,340,2.953696
증권학회지,268,2.328208
재무연구,227,1.972027
journal of financial and quantitative analysis,216,1.876466
journal of banking and finance,152,1.320476
american economic review,149,1.294414
econometrica,145,1.259665


Unnamed: 0,count,ratio
journal of finance,851,9.396047
journal of financial economics,705,7.784034
재무연구,230,2.539472
review of financial studies,227,2.506349
증권학회지,214,2.362813
재무관리연구,172,1.899084
econometrica,120,1.324942
journal of financial and quantitative analysis,107,1.181407
journal of political economy,99,1.093077
american economic review,92,1.015789


Unnamed: 0,count,ratio
증권학회지,343,2.596321
한국증권학회지,313,2.369238
재무연구,295,2.232988
재무관리연구,238,1.801529
회계학연구,97,0.734237
선물연구,80,0.605556
경영학연구,54,0.40875
journal of finance,51,0.386042
금융연구,47,0.355764
대한경영학회지,40,0.302778


Unnamed: 0,count,ratio
journal of finance,242,2.747814
선물연구,155,1.759964
journal of financial economics,119,1.351198
review of financial studies,106,1.203588
journal of futures markets,56,0.635858
journal of financial and quantitative analysis,55,0.624503
재무관리연구,50,0.56773
재무연구,45,0.510957
한국증권학회지,41,0.465539
journal of political economy,40,0.454184


In [19]:
# Full journal-count table for each journal.
fm_journal_df, f_journal_df, sc_journal_df, dr_journal_df = map(
    return_journal_df,
    (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls),
)

In [23]:
# Keep, per journal table, the names that were counted more than 10 times.
fm_journal_ls = fm_journal_df[fm_journal_df['count'] > 10].index.tolist()
f_journal_ls = f_journal_df[f_journal_df['count'] > 10].index.tolist()
sc_journal_ls = sc_journal_df[sc_journal_df['count'] > 10].index.tolist()
dr_journal_ls = dr_journal_df[dr_journal_df['count'] > 10].index.tolist()
journal_ls = fm_journal_ls + f_journal_ls + sc_journal_ls + dr_journal_ls
# Drop empty names and names that still contain a '.'.
journal_ls = [i for i in journal_ls if i and '.' not in i]
# De-duplicate. sorted() instead of list(): the iteration order of a set of
# strings can differ between interpreter runs, which would make everything
# derived from this list (e.g. the order of tied counts) non-reproducible.
# Note that despite its name, journal_dict is a list.
journal_dict = sorted(set(journal_ls))

In [25]:
def apply_journal(sent_ls,journal_dict=journal_dict):
    """List every (journal name, line) hit as one occurrence of the name.

    A name is a hit when it is a substring of the lower-cased line. Because
    this is plain substring containment, a name that is contained in a longer
    name (for example '증권학회지' inside '한국증권학회지') is counted whenever
    the longer one appears.

    Parameters
    ----------
    sent_ls : iterable of str
        Lines to search.
    journal_dict : iterable of str, optional
        Journal names to look for. Defaults to the module-level
        ``journal_dict`` as it was when this function was defined.

    Returns
    -------
    list[str]
        One entry per hit, grouped by journal name in ``journal_dict`` order.
    """
    # Lower-case each line once instead of once per journal name.
    lowered = [sent.lower() for sent in sent_ls]

    return [catch for catch in journal_dict for sent in lowered if catch in sent]

In [26]:
# Journal-name hits for each journal, using the default name list.
fm_journal_dict, f_journal_dict, sc_journal_dict, dr_journal_dict = map(
    apply_journal,
    (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls),
)

In [27]:
# Turn each journal's hit list into a one-column table of counts per name.
fm_journal_dict_df, f_journal_dict_df, sc_journal_dict_df, dr_journal_dict_df = (
    pd.DataFrame.from_dict(Counter(hits), orient='index', columns=['count'])
    for hits in (fm_journal_dict, f_journal_dict, sc_journal_dict, dr_journal_dict)
)

In [28]:
# Order every count table from most to least frequent. Rebinding the sorted
# copy (rather than inplace=True) keeps the cell a plain "inputs -> outputs"
# step.
fm_journal_dict_df = fm_journal_dict_df.sort_values('count', ascending=False)
f_journal_dict_df = f_journal_dict_df.sort_values('count', ascending=False)
sc_journal_dict_df = sc_journal_dict_df.sort_values('count', ascending=False)
dr_journal_dict_df = dr_journal_dict_df.sort_values('count', ascending=False)

In [29]:
# Top-10 rows of each count table, shown in the order fm, f, sc, dr.
display(
    fm_journal_dict_df.head(10),
    f_journal_dict_df.head(10),
    sc_journal_dict_df.head(10),
    dr_journal_dict_df.head(10),
)

Unnamed: 0,count
journal of finance,1449
journal of financial economics,1104
재무관리연구,416
증권학회지,413
working paper,402
review of financial studies,402
재무연구,273
journal of business,255
journal of financial and quantitative analysis,228
financial management,198


Unnamed: 0,count
journal of finance,1060
journal of financial economics,777
재무연구,661
증권학회지,353
working paper,347
review of financial studies,261
재무관리연구,208
econometrica,139
journal of business,133
american economic review,116


Unnamed: 0,count
journal of finance,988
증권학회지,876
journal of financial economics,670
working paper,442
재무연구,402
한국증권학회지,390
review of financial studies,349
재무관리연구,268
journal of business,146
accounting review,137


Unnamed: 0,count
journal of finance,349
선물연구,215
working paper,189
journal of financial economics,146
review of financial studies,140
증권학회지,117
econometrica,113
journal of futures markets,98
재무연구,81
재무관리연구,75
