In [3]:
import io
import re
 
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
 
def extract_text_from_pdf(pdf_path):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
 
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
 
        text = fake_file_handle.getvalue()
 
    # close open handles
    converter.close()
    fake_file_handle.close()
 
    if text:
        return text

In [4]:
file = extract_text_from_pdf('09800627.pdf')

In [83]:
def make_references(start,end):
    references = file[re.search(start,file).end():re.search(end,file).end()+1]
    return references

In [84]:
references = make_references('참 고 문 헌','SSRN')
len(references)

6222

In [48]:
references = file[re.search('참 고 문 헌',file).end():re.search('SSRN',file).end()+1]

In [49]:
references

'김성민, 장용원, “현금배당과 자사주매입의 선택에 관한 연구 : 현금흐름의 지속성을 중심으로,” ｢선물연구｣, 24, 2, 2016, 591-617.손삼호, 황세운, 빈기범, “한국 주식시장에서 장기소비 위험 모형의 유효성과 경기변동에 대한 시사점,” ｢선물연구｣, 20, 3, 2012, 265-295.윤보현, 최영민, “한국 주식시장에서의 대안 인덱스 투자전략 연구,” ｢선물연구｣, 22, 2, 2014, 285-308.Bansal, R., R. F. Dittmar, and C. T. Lundblad, “Consumption, dividends, and the cross section of equity returns,” The Journal of Finance, 60, 4, 2005, 1639-1672.Bansal, R., R. F. Dittmar, and D. Kiku, “Cointegration and consumption risks in asset returns,” The Review of Financial Studies, 22, 3, 2009, 1343-1375.Bansal, R. and A. Yaron, “Risks for the long run : A potential resolution of asset pricing puzzles,” The Journal of Finance, 59, 4, 2004, 1481-1509.Banz, R. W., “The Relationship Between Return and Market Value of Common Stocks,” Journal of Financial Economics, 9, 1, 1981, 3-18.Basu, S., “Investment performance of common stocks in relation to their price-earnings ratios : A test of the efficient market hypothesis,” The Journal of Finance, 32, 3

In [86]:
def make_new_references(references):
    code_ls = re.findall('[0-9]+[-][0-9]+',references)
    
    start_idx , end_idx =[],[]
    for i,val in enumerate(code_ls):
        start_idx.append(references.find(code_ls[i]))
        end_idx.append(references.find(code_ls[i]) + len(val))
    
    new_references = []
    for idx,val in enumerate(end_idx):
        if idx == 0: new_references.append(references[:val])
        else : new_references.append(references[end_idx[idx-1]:val])
            
    return new_references

In [89]:
new_references = make_new_references(references=references)
len(new_references)

45

In [68]:
code_ls = re.findall('[0-9]+[-][0-9]+',references)

In [69]:
start_idx , end_idx =[],[]
for i,val in enumerate(code_ls):
    start_idx.append(references.find(code_ls[i]))
    end_idx.append(references.find(code_ls[i]) + len(val))
len(start_idx) , len(end_idx)

(45, 45)

In [70]:
new_references = []
for idx,val in enumerate(end_idx):
    if idx == 0: new_references.append(references[:val])
    else : new_references.append(references[end_idx[idx-1]:val])

In [71]:
new_references

['김성민, 장용원, “현금배당과 자사주매입의 선택에 관한 연구 : 현금흐름의 지속성을 중심으로,” ｢선물연구｣, 24, 2, 2016, 591-617',
 '.손삼호, 황세운, 빈기범, “한국 주식시장에서 장기소비 위험 모형의 유효성과 경기변동에 대한 시사점,” ｢선물연구｣, 20, 3, 2012, 265-295',
 '.윤보현, 최영민, “한국 주식시장에서의 대안 인덱스 투자전략 연구,” ｢선물연구｣, 22, 2, 2014, 285-308',
 '.Bansal, R., R. F. Dittmar, and C. T. Lundblad, “Consumption, dividends, and the cross section of equity returns,” The Journal of Finance, 60, 4, 2005, 1639-1672',
 '.Bansal, R., R. F. Dittmar, and D. Kiku, “Cointegration and consumption risks in asset returns,” The Review of Financial Studies, 22, 3, 2009, 1343-1375',
 '.Bansal, R. and A. Yaron, “Risks for the long run : A potential resolution of asset pricing puzzles,” The Journal of Finance, 59, 4, 2004, 1481-1509',
 '.Banz, R. W., “The Relationship Between Return and Market Value of Common Stocks,” Journal of Financial Economics, 9, 1, 1981, 3-18',
 '.Basu, S., “Investment performance of common stocks in relation to their price-earnings ratios : A test of the efficient market hypoth

In [90]:
def make_idx(new_references):
    person_idx = []
    for idx,val in enumerate(new_references):
        person_idx.append(val[:val.find('“')])
    
    journal_idx = []
    for idx,val in enumerate(new_references):
        journal_idx.append(val[val.find('”')+1:])
        
    year_ls = []
    for idx,val in enumerate(new_references):
        if re.findall('20[0-9]{2}',val) or re.findall('19[0-9]{2}',val) != []:
            year_ls.append((re.findall('20[0-9]{2}',val) or re.findall('19[0-9]{2}',val))[0])
        else : year_ls.append('0')
            
    df = pd.DataFrame()
    df['person'] = person_idx
    df['journal'] = journal_idx
    df['year'] = year_ls
    df['code'] = code_ls
    
    return df

In [91]:
df = make_idx(new_references=new_references)

In [72]:
person_idx = []
for idx,val in enumerate(new_references):
    person_idx.append(val[:val.find('“')])
len(person_idx)

45

In [73]:
person_idx

['김성민, 장용원, ',
 '.손삼호, 황세운, 빈기범, ',
 '.윤보현, 최영민, ',
 '.Bansal, R., R. F. Dittmar, and C. T. Lundblad, ',
 '.Bansal, R., R. F. Dittmar, and D. Kiku, ',
 '.Bansal, R. and A. Yaron, ',
 '.Banz, R. W., ',
 '.Basu, S., ',
 '.Breeden, D., ',
 '.Campbell, J. Y., ',
 '.Campbell, J. Y., ',
 '.Campbell, J. Y. and A. S. Kyle, ',
 '.Campbell, J. Y. and R. J. Shiller, ',
 '.Carhart, M. M., ',
 '.\x0c현금흐름 위험 기반 KOSPI 수익률 횡단면 연구341Cohen, R. B., C. Polk, and T. Vuolteenaho, ',
 '.Da, Z., ',
 '.Daniel, K. D. and D. A. Marshall, ',
 '.De Long, J. B., A. Shleifer, L. H. Summers, and R. J. Waldmann, ',
 '.Dechow, P. M., R. G. Sloan, and M. T. Soliman, ',
 ', 2004, 197-22',
 '.Epstein, L. G. and S. E. Zin, ',
 '.Fama, E. F., ',
 '.Fama, E. F. and J. D. MacBeth, ',
 '.Fama, E. F. and K. R. French, ',
 '.Fama, E. F. and K. R. French, ',
 '.Fama, E. F. and K. R. French, ',
 '.Fama, E. F. and K. R. French, ',
 '.Fama, E. F. and K. R. French, ',
 '.Fama, E. F. and K. R. French, ',
 '.Grullon, G. and R. Michaely

In [74]:
journal_idx = []
for idx,val in enumerate(new_references):
    journal_idx.append(val[val.find('”')+1:])
len(journal_idx)

45

In [75]:
year_ls = []
for idx,val in enumerate(new_references):
    if re.findall('20[0-9]{2}',val) or re.findall('19[0-9]{2}',val) != []:
        year_ls.append((re.findall('20[0-9]{2}',val) or re.findall('19[0-9]{2}',val))[0])
    else : year_ls.append('0')
len(year_ls)

45

In [76]:
len(code_ls)

45

In [77]:
len(person_idx) ,len(journal_idx) ,len(year_ls) ,len(code_ls)

(45, 45, 45, 45)

In [78]:
import pandas as pd
df = pd.DataFrame()
df['person'] = person_idx
df['journal'] = journal_idx
df['year'] = year_ls
df['code'] = code_ls

### 사전에 정해진 규칙에 따라 데이터 프레임을 형성해주는데, 최종본을 확인하고 직접 수정해준다.

In [79]:
df['person'][14] = df['person'][14][32:]

In [80]:
df

Unnamed: 0,person,journal,year,code
0,"김성민, 장용원,","｢선물연구｣, 24, 2, 2016, 591-617",2016,591-617
1,".손삼호, 황세운, 빈기범,","｢선물연구｣, 20, 3, 2012, 265-295",2012,265-295
2,".윤보현, 최영민,","｢선물연구｣, 22, 2, 2014, 285-308",2014,285-308
3,".Bansal, R., R. F. Dittmar, and C. T. Lundblad,","The Journal of Finance, 60, 4, 2005, 1639-1672",2005,1639-1672
4,".Bansal, R., R. F. Dittmar, and D. Kiku,","The Review of Financial Studies, 22, 3, 2009,...",2009,1343-1375
5,".Bansal, R. and A. Yaron,","The Journal of Finance, 59, 4, 2004, 1481-1509",2004,1481-1509
6,".Banz, R. W.,","Journal of Financial Economics, 9, 1, 1981, 3-18",1981,3-18
7,".Basu, S.,","The Journal of Finance, 32, 3, 1977, 663-682",1977,663-682
8,".Breeden, D.,","Journal of Financial Economics, 6, 1979, 273-296",1979,273-296
9,".Campbell, J. Y.,","Handbook of Macro-economics, 1, C, 1999, 1231...",1999,1231-1303
