In [1]:
import pandas as pd
import oop_func as func
import personal_pkg as ref
import os
import re
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

## Description_1
- 한국재무관리학회 : fm_file
- 한국재무학회 : f_file
- 한국증권학회지 : sc_file
- 한국파생상품학회 : dr_file

## Description_2
- 저널 별 텍스트 데이터
- 연도 별 텍스트 데이터

#### 저널 별 데이터를 모으자

In [2]:
def journal_txt_ls(journal):
    """Collect the text-file paths of one journal, grouped by sub-folder.

    Entries of '../paper_list/<journal>/' whose name contains '20' are taken
    as the folders to scan (presumably one folder per year -- verify against
    the data layout).  The folders are processed in sorted path order.

    Parameters
    ----------
    journal : str
        Folder name of the journal under '../paper_list/'.

    Returns
    -------
    list[list[str]]
        One inner list per matching folder; each inner list holds
        '<folder>/<entry>' for every entry whose name contains 'txt', in the
        order returned by os.listdir.
    """
    base_dir = '../paper_list/{}/'.format(journal)

    year_dirs = [base_dir + entry for entry in os.listdir(base_dir) if '20' in entry]
    year_dirs.sort()

    grouped_paths = []
    for year_dir in year_dirs:
        grouped_paths.append(
            [year_dir + '/' + name for name in os.listdir(year_dir) if 'txt' in name]
        )

    return grouped_paths

In [3]:
# Nested path lists (one inner list per scanned folder) for each of the four journals.
fm_txt_ls, sc_txt_ls, dr_txt_ls, f_txt_ls = (
    journal_txt_ls(journal_name)
    for journal_name in ('한국재무관리학회', '한국증권학회지', '한국파생상품학회', '한국재무학회')
)

In [4]:
# Number of scanned folders per journal, in the order fm, dr, sc, f.
tuple(len(nested) for nested in (fm_txt_ls, dr_txt_ls, sc_txt_ls, f_txt_ls))

(15, 17, 17, 17)

## 저널별 citation_analysis

In [5]:
# Flatten the per-folder path lists into one flat path list per journal.
fm_path, f_path, sc_path, dr_path = (
    [txt_path for folder_paths in nested for txt_path in folder_paths]
    for nested in (fm_txt_ls, f_txt_ls, sc_txt_ls, dr_txt_ls)
)

In [6]:
def split_to_sent(file_path):
    """Run func.Slicing_paper(...).split() on every path and pool the results.

    Parameters
    ----------
    file_path : iterable of str
        Paths handed one by one to func.Slicing_paper.

    Returns
    -------
    tuple[list, list]
        (items, errors): `items` is the concatenation of every successful
        split() result in input order; `errors` holds an (exception, path)
        pair for each path whose construction or split() raised.
    """
    pooled = []
    failures = []

    for paper_path in file_path:
        try:
            pieces = func.Slicing_paper(paper_path).split()
        except Exception as exc:
            # Best effort: remember the failure and carry on with the next paper.
            failures.append((exc, paper_path))
        else:
            pooled.extend(pieces)

    return pooled, failures

In [7]:
# Split every journal's papers; keep the pooled lines and the failures apart.
(
    (fm_sent_ls, fm_error_ls),
    (f_sent_ls, f_error_ls),
    (sc_sent_ls, sc_error_ls),
    (dr_sent_ls, dr_error_ls),
) = (split_to_sent(paths) for paths in (fm_path, f_path, sc_path, dr_path))

In [8]:
# Pooled line counts per journal, in the order fm, f, sc, dr.
tuple(len(lines) for lines in (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls))

(11511, 9057, 13211, 8807)

In [9]:
# Number of papers that failed to split per journal, in the order fm, f, sc, dr.
tuple(len(failed) for failed in (fm_error_ls, f_error_ls, sc_error_ls, dr_error_ls))

(76, 15, 24, 55)

double quote 가 열리고 닫히는 라인을 double quote line 으로 명명한다.

In [10]:
# Denominators for the quote-line ratios below: pooled line counts (fm, f, sc, dr).
tuple(len(lines) for lines in (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls))

(11511, 9057, 13211, 8807)

In [11]:
# Share of fm lines containing both an opening and a closing curly double quote.
double_quote_line = [line for line in fm_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(fm_sent_ls)

0.883155242811224

In [12]:
# Share of f lines containing both an opening and a closing curly double quote.
double_quote_line = [line for line in f_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(f_sent_ls)

0.7302638842883957

In [13]:
# Share of sc lines containing both an opening and a closing curly double quote.
double_quote_line = [line for line in sc_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(sc_sent_ls)

0.2276133525092726

In [14]:
# Share of dr lines containing both an opening and a closing curly double quote.
double_quote_line = [line for line in dr_sent_ls if "“" in line and "”" in line]
len(double_quote_line) / len(dr_sent_ls)

0.2505961167253321

In [15]:
def catch_the_wrong_split_line(sent_ls):
    """Return the lines that hold more than two curly-quote pairs.

    A line qualifies when it matches “.+” and the smaller of its opening-quote
    and closing-quote counts exceeds 2.

    Parameters
    ----------
    sent_ls : iterable of str
        Lines to inspect.

    Returns
    -------
    list[str]
        The qualifying lines, in input order.
    """
    quoted_span = re.compile("“.+”")
    suspicious = []

    for sent in sent_ls:
        if quoted_span.search(sent) is None:
            continue
        # Opening and closing marks are paired positionally, so the number of
        # pairs is bounded by the rarer of the two marks.
        pair_count = min(sent.count('“'), sent.count('”'))
        if pair_count > 2:
            suspicious.append(sent)
    return suspicious

In [16]:
# How many lines per journal carry more than two quote pairs (order: fm, f, dr, sc).
tuple(
    len(catch_the_wrong_split_line(lines))
    for lines in (fm_sent_ls, f_sent_ls, dr_sent_ls, sc_sent_ls)
)

(7, 2, 9, 7)

In [17]:
def return_author_df(sent_ls):
    """Tally the text that precedes the quoted part of single-quote-pair lines.

    Only lines that match “.+” and contain exactly one opening/closing quote
    pair are used.  The text before the opening quote is taken as the key
    (presumably the author string -- verify against the data); empty keys are
    dropped.

    Parameters
    ----------
    sent_ls : sequence of str
        Lines to scan; its length is also the denominator of `ratio`.

    Returns
    -------
    pandas.DataFrame
        Indexed by key, with columns `count` (occurrences, sorted descending)
        and `ratio` (count as a percentage of len(sent_ls)).
    """
    quoted_span = re.compile("“.+”")
    quote_start = re.compile("“.+")
    tally = Counter()

    for sent in sent_ls:
        if quoted_span.search(sent) is None:
            continue
        if min(sent.count('“'), sent.count('”')) != 1:
            continue
        prefix = sent[:quote_start.search(sent).start()]
        if prefix:
            tally[prefix] += 1

    count_df = pd.DataFrame.from_dict(tally, orient='index', columns=['count'])
    count_df = count_df.sort_values('count', ascending=False)
    count_df['ratio'] = (count_df['count'] / len(sent_ls)) * 100
    return count_df

In [18]:
# Ten most frequent keys per journal, shown in the order fm, f, dr, sc.
display(*(
    return_author_df(lines).head(10)
    for lines in (fm_sent_ls, f_sent_ls, dr_sent_ls, sc_sent_ls)
))

Unnamed: 0,count,ratio
Fama E. F. and K. R. French,59,0.512553
Fama E. and K. French,27,0.234558
Bollerslev T.,25,0.217184
Jensen M.,21,0.182434
Baker M. and J. Wurgler,20,0.173747
Jensen M. and W. Meckling,20,0.173747
Black F.,20,0.173747
Merton R. C.,18,0.156372
신동령,18,0.156372
김창수,17,0.147685


Unnamed: 0,count,ratio
Fama E. F. and K. R. French,47,0.518936
Jensen M. C.,19,0.209782
Amihud Y. and H. Mendelson,13,0.143535
고봉찬 김진우,13,0.143535
Jegadeesh N. and S. Titman,13,0.143535
고봉찬,12,0.132494
최문수,12,0.132494
김창수,12,0.132494
Baker M. and J. Wurgler,11,0.121453
Merton R. C.,11,0.121453


Unnamed: 0,count,ratio
Fama E. F. and K. R. French,20,0.227092
이우백,14,0.158964
Hasbrouck J.,11,0.124901
홍정효,11,0.124901
Black F. and M. Scholes,11,0.124901
고봉찬 김진우,10,0.113546
Merton R. C.,10,0.113546
윤선중,10,0.113546
염명훈 백재승 류두진,9,0.102191
한국거래소,9,0.102191


Unnamed: 0,count,ratio
고봉찬 김진우,32,0.242222
이원흠,18,0.13625
이원흠 최수미,16,0.121111
윤평식,15,0.113542
김창수,15,0.113542
김석진 김지영,14,0.105972
최문수,12,0.090833
정형찬,10,0.075694
고봉찬,10,0.075694
신민식 김수은,10,0.075694


저자를 잡긴 잡지만, double quote 로 묶인 paper name 의 앞부분만을 저자로 추출한 것이기 때문에 count 의 개수가 만족스럽지 못하다. 이에 따라, 위 dataframe 의 인덱스(저자명)를 사전(dictionary)으로 삼아 전체 데이터에 대해 다시 검색해준다.

In [19]:
# Full tally table per journal (same tables as displayed above, kept for reuse).
fm_sent_ls_df, f_sent_ls_df, sc_sent_ls_df, dr_sent_ls_df = (
    return_author_df(lines)
    for lines in (fm_sent_ls, f_sent_ls, sc_sent_ls, dr_sent_ls)
)

In [20]:
# Keep only the keys that were seen more than once in a journal.
fm_author_ls, f_author_ls, sc_author_ls, dr_author_ls = (
    frame.loc[frame['count'] > 1].index.tolist()
    for frame in (fm_sent_ls_df, f_sent_ls_df, sc_sent_ls_df, dr_sent_ls_df)
)

In [21]:
# Merge the four journals' keys into one de-duplicated list without empty entries.
author_dict = [
    name
    for name in set(fm_author_ls + f_author_ls + sc_author_ls + dr_author_ls)
    if name
]

In [22]:
# De-duplicate and fix the order.  A plain list(set(...)) of strings comes out
# in a different order in every kernel session (string hashes are randomised),
# which makes the tie order of the count tables built from this list
# irreproducible; sorting pins it down.
author_dict = sorted(set(author_dict))

In [23]:
# Break every dictionary entry into its individual name fragments.
#
# apply_author_dict looks the original entry up by position, so
# splited_author_dict[k] must always describe author_dict[k].  Entries that
# contain neither Latin nor Hangul letters produce no fragment list; they are
# therefore dropped from author_dict as well, keeping both lists aligned.
splited_author_dict = []
aligned_author_dict = []
for i in author_dict :
    if re.findall('[a-zA-Z]',i) :
        # Split on the standalone word "and" only: a bare substring split would
        # cut names such as "Anderson" or "Brandt" in half.
        splited_author_dict.append(re.split(r'\band\b', i.strip()))
    elif re.findall('[가-힣]',i) :
        splited_author_dict.append(i.strip().split(" "))
    else :
        continue
    aligned_author_dict.append(i)
author_dict = aligned_author_dict

In [24]:
def apply_author_dict(sent_ls,author_dict = splited_author_dict,orgin_data = author_dict):
    """Collect one dictionary entry for every line that mentions all its fragments.

    Fragments are matched as literal substrings.  They come from extracted
    text, so treating them as regular expressions would let '.' match any
    character and would raise re.error on fragments containing '(' or '['.

    Parameters
    ----------
    sent_ls : iterable of str
        Lines to search.  It is traversed once per dictionary entry, so it
        must be re-iterable (e.g. a list).
    author_dict : sequence of sequence of str
        Name fragments per entry; a line matches an entry when it contains
        every fragment of that entry.
    orgin_data : sequence
        Original entries, positionally aligned with `author_dict`;
        orgin_data[k] is what gets recorded for a match on author_dict[k].

    Returns
    -------
    list
        One element of `orgin_data` per (entry, matching line) pair, grouped
        by entry in dictionary order.
    """
    author_ls = []

    for entry_idx, name_parts in enumerate(author_dict):
        origin = orgin_data[entry_idx]
        for sent in sent_ls:
            if all(part in sent for part in name_parts):
                author_ls.append(origin)
    return author_ls

In [None]:
%%time
# Search every journal's lines for the dictionary entries, one journal after
# the other.  The prints act as a progress marker: end='\r' returns the cursor
# to the start of the line so each number overwrites the previous one.
fm_author_dict = apply_author_dict(fm_sent_ls)
print('1',end='\r')
f_author_dict = apply_author_dict(f_sent_ls)
print('2',end='\r')
sc_author_dict = apply_author_dict(sc_sent_ls)
print('3',end='\r')
dr_author_dict = apply_author_dict(dr_sent_ls)

1

In [None]:
# Turn each journal's list of matched entries into a one-column count table.
fm_author_dict_df, f_author_dict_df, sc_author_dict_df, dr_author_dict_df = (
    pd.DataFrame.from_dict(Counter(matches), orient='index', columns=['count'])
    for matches in (fm_author_dict, f_author_dict, sc_author_dict, dr_author_dict)
)

In [None]:
# Order every count table from most to least frequent entry.
fm_author_dict_df, f_author_dict_df, sc_author_dict_df, dr_author_dict_df = (
    frame.sort_values('count', ascending=False)
    for frame in (fm_author_dict_df, f_author_dict_df, sc_author_dict_df, dr_author_dict_df)
)

In [None]:
# First ten rows of each count table, shown in the order fm, f, sc, dr.
display(*(
    frame.head(10)
    for frame in (fm_author_dict_df, f_author_dict_df, sc_author_dict_df, dr_author_dict_df)
))

In [None]:
# Entries matched more than 10 times in a journal, first per journal and then
# merged into one de-duplicated list.
fm_single_author_ls, f_single_author_ls, sc_single_author_ls, dr_single_author_ls = (
    frame.loc[frame['count'] > 10].index.tolist()
    for frame in (fm_author_dict_df, f_author_dict_df, sc_author_dict_df, dr_author_dict_df)
)
single_author_ls = list(set(
    [*fm_single_author_ls, *f_single_author_ls, *sc_single_author_ls, *dr_single_author_ls]
))

In [None]:
# Break every frequent entry into its individual name fragments.  Entries with
# neither Latin nor Hangul letters contribute nothing.
splited_single_author_dict = []
for i in single_author_ls :
    if re.findall('[a-zA-Z]',i) :
        # Split on the standalone word "and" only: a bare substring split would
        # cut names such as "Anderson" or "Brandt" in half.
        splited_single_author_dict.append(re.split(r'\band\b', i.strip()))
    elif re.findall('[가-힣]',i) :
        splited_single_author_dict.append(i.strip().split(" "))

In [None]:
# Flatten the fragments into one de-duplicated dictionary of single names.
# - Fragments of at most 2 characters (measured before stripping) are skipped.
# - Fragments that strip to '' are skipped too: the empty string is contained
#   in every line and would be counted once per line.
# - sorted() fixes the order; a plain list(set(...)) of strings is ordered
#   differently in every kernel session, which makes tie order downstream
#   irreproducible.
single_author_dict = sorted({
    j.strip()
    for i in splited_single_author_dict
    for j in i
    if len(j) > 2 and j.strip()
})

In [None]:
def apply_single_author_dict(sent_ls,author_dict = single_author_dict):
    """Record a dictionary name once for every line that contains it.

    Parameters
    ----------
    sent_ls : iterable of str
        Lines to search.  It is traversed once per name, so it must be
        re-iterable (e.g. a list).
    author_dict : iterable of str
        Names looked up as plain substrings.

    Returns
    -------
    list[str]
        One copy of the name per matching line, grouped by name in
        dictionary order.
    """
    return [
        name
        for name in author_dict
        for sent in sent_ls
        if name in sent
    ]

In [None]:
%%time
# Search every journal's lines for the single-name dictionary, one journal
# after the other.  The prints act as a progress marker: end='\r' returns the
# cursor to the start of the line so each number overwrites the previous one.
fm_single_author_dict = apply_single_author_dict(fm_sent_ls)
print('1',end='\r')
f_single_author_dict = apply_single_author_dict(f_sent_ls)
print('2',end='\r')
sc_single_author_dict = apply_single_author_dict(sc_sent_ls)
print('3',end='\r')
dr_single_author_dict = apply_single_author_dict(dr_sent_ls)

In [None]:
# Turn each journal's list of matched single names into a one-column count table.
(
    fm_single_author_dict_df,
    f_single_author_dict_df,
    sc_single_author_dict_df,
    dr_single_author_dict_df,
) = (
    pd.DataFrame.from_dict(Counter(matches), orient='index', columns=['count'])
    for matches in (
        fm_single_author_dict,
        f_single_author_dict,
        sc_single_author_dict,
        dr_single_author_dict,
    )
)

In [None]:
# Order every single-name count table from most to least frequent.
(
    fm_single_author_dict_df,
    f_single_author_dict_df,
    sc_single_author_dict_df,
    dr_single_author_dict_df,
) = (
    frame.sort_values('count', ascending=False)
    for frame in (
        fm_single_author_dict_df,
        f_single_author_dict_df,
        sc_single_author_dict_df,
        dr_single_author_dict_df,
    )
)

In [None]:
# First ten rows of each single-name count table, in the order fm, f, sc, dr.
display(*(
    frame.head(10)
    for frame in (
        fm_single_author_dict_df,
        f_single_author_dict_df,
        sc_single_author_dict_df,
        dr_single_author_dict_df,
    )
))