In [6]:
import pandas as pd 
import numpy as np
import lxml
import traceback
import re 
from bs4 import BeautifulSoup as bs
from thefuzz import fuzz


In [7]:
test_download = open('econ_lit_xml/jpe_90-21.xml', 'r', encoding='utf-8')
content = test_download.read()
test_bs_content = bs(content, 'lxml')

In [8]:
candidate_list = test_bs_content.find_all('rec')
jpe_list = []
print(len(candidate_list))
for candidate in candidate_list:
    try:
        journal_title = candidate.find('jtl').text
    except:
        print('failing out on record {}'.format(candidate.get('resultid')))
        continue
    if journal_title == "Journal of Political Economy":
        jpe_list.append(candidate)  
    else:
        print('failing out on record {}'.format(candidate.get('resultid')))

len(jpe_list)

1798


1798

In [9]:
xml_dict = {
    'doi' : 'ui',
    'title' : 'atl',
    'volume' : 'vid',
    'issue' : 'iid',
    'date' : 'dt',
    'abstract' : 'ab',
    'pages' : 'pages',
    'author' : 'au',
    'jel_desc' : 'su',
}


def field_population(temp_df, xml_obj, nav_dict, field):
    try:
        if field in ['jel_desc', 'author']:
            
            list_of = [desc.text for desc in xml_obj.find_all(nav_dict[field])]
            # print(list_of)
            temp_df.at[0,field] = list_of
            return temp_df

        value = xml_obj.find(nav_dict[field]).text
        temp_df[field] = value

        return temp_df
    except:
        # print('---------------------------------------')
        # print('fail on resultID {}, {}'.format(xml_obj.get('resultid'),field))
        value = 'ECONLIT None Found'
        temp_df[field] = value
        return temp_df
        # print(traceback.format_exc())


test_df = pd.DataFrame({})


for article in jpe_list:
    temp_df = pd.DataFrame({
        'doi' : [None],
        'title' : [None],
        'volume' : [None],
        'issue' : [None],
        'date' : [None],
        'abstract' : [None],
        'pages' : [None],
        'author' : [None],
        'jel_desc' : [None]
    })
    for field in xml_dict.keys():
        temp_df = field_population(temp_df, article, xml_dict, field)
    
    test_df = pd.concat([test_df, temp_df], ignore_index=True)




In [10]:
def insert_dash_date(string):
    try:
        listed = list(string)
        listed.insert(6, '-')
        listed.insert(4, '-')
        
        rejoined = ''.join(listed)
        return rejoined
    except:
        return string

test_df.date = test_df.date.apply(lambda x: insert_dash_date(x))
# test_df = test_df.explode('author')


In [11]:
jel_df = pd.read_xml('aea_jel_codes.xml', xpath='classification')

In [12]:
test_merge = test_df
test_merge['id'] = np.arange(len(test_merge))
# test_merge.set_index('id', inplace=True)

test_merge = test_merge.explode('jel_desc')

test_merge = pd.merge(test_merge, jel_df, how='left', left_on='jel_desc', right_on='description')

# test_merge.index.rename("index", inplace=True)
test_merge = test_merge.groupby(by='id').agg(
    {
        'jel_desc' : lambda x: x.tolist(),
        'code' : lambda x: x.tolist(),
        'doi' : lambda x: x.unique(),
        'title' : lambda x: x.unique(),
        'volume' : lambda x: x.unique(),
        'issue' : lambda x: x.unique(),
        'date' : lambda x: x.unique(),
        'pages' : lambda x: x.unique(),
        'author' : lambda x: x,
        'abstract' : lambda x: x,
    }
).rename({'code' : 'jel_code'}, axis=1)


test_merge = test_merge.explode('doi')
test_merge = test_merge.explode('title')
test_merge = test_merge.explode('volume')
test_merge = test_merge.explode('issue')
test_merge = test_merge.explode('date')
test_merge = test_merge.explode('pages')
# test_merge = test_merge.explode('author')

for row in test_merge.index.unique().tolist(): 
    try:
        test_merge.loc[row, 'author'] = test_merge.loc[row, 'author'][0]
        if type(test_merge.loc[row, 'abstract']==list):
            test_merge.loc[row, 'abstract'] = test_merge.loc[row, 'abstract'][0]
    except:
        # print(test_merge.loc[row, 'author'])
        print(test_merge.loc[row,'title'])


test_jpe_codes_years = test_merge[['volume', 'issue', 'date', 'jel_desc', 'jel_code', 'title']]

Index to Volume 128.
Index to Volume 127.
Index to Volume 126.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
2021 Lucas Prize Announcement.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
N.PAG.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times

In [13]:

test_jpe_codes_years['L_code'] = 0
test_jpe_codes_years['K_code'] = 0
test_jpe_codes_years['D4_code'] = 0
test_jpe_codes_years['G34_code'] = 0
test_jpe_codes_years['O3_code'] = 0

test_jpe_codes_years[['year','month','day']] = test_jpe_codes_years['date'].str.split('-', expand=True)
for row in test_jpe_codes_years.index.tolist():
    codes = test_jpe_codes_years.loc[row, 'jel_code']
    for code in codes: 
        if type(code) != str:
            continue
        if re.search(r'L', code):
            test_jpe_codes_years.loc[row, 'L_code'] = 1
        if re.search(r'K', code):
            test_jpe_codes_years.loc[row, 'K_code'] = 1
        if re.search(r'D4', code):
            test_jpe_codes_years.loc[row, 'D4_code'] = 1
        if re.search(r'G34', code):
            test_jpe_codes_years.loc[row, 'G34_code'] = 1
        if re.search(r'O3', code):
            test_jpe_codes_years.loc[row, 'O3_code'] = 1

test_jpe_codes_years

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_jpe_codes_years['L_code'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_jpe_codes_years['K_code'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_jpe_codes_years['D4_code'] = 0


Unnamed: 0_level_0,volume,issue,date,jel_desc,jel_code,title,L_code,K_code,D4_code,G34_code,O3_code,year,month,day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,129,4,2021-04-01,[Trade Policy; International Trade Organizatio...,"[F13, F14, L80]",The Economic Structure of International Trade-...,1,0,0,0,0,2021,04,01
1,129,4,2021-04-01,"[Firm Behavior: Empirical Analysis, Empirical ...","[D22, F14, F23, L14, L25, O31, O33]","Upstream, Downstream: Diffusion and Impacts of...",1,0,0,0,1,2021,04,01
2,129,4,2021-04-01,"[Consumer Economics: Empirical Analysis, Micro...","[D12, O12, Z12, Z13]",How Do We Choose Our Identity? A Revealed Pref...,0,0,0,0,0,2021,04,01
3,129,4,2021-04-01,"[Property Law, Energy, Environmental, Health, ...","[K11, K32, P14, Q53, Q58]",Securing Property Rights,0,1,0,0,0,2021,04,01
4,129,4,2021-04-01,"[Portfolio Choice; Investment Decisions, Equit...","[G11, G12, G41]",Asset Classes,0,0,0,0,0,2021,04,01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793,98,5,1990-10-02,"[Economic Development Models and Theories, Hum...","[nan, nan]",Endogenous Technological Change,0,0,0,0,0,1990,10,02
1794,98,5,1990-10-02,"[Demographic Economics, Human Capital; Value o...","[nan, nan]",Population Growth and Human Capital Investment...,0,0,0,0,0,1990,10,02
1795,98,5,1990-10-02,"[Economic Development Models and Theories, Dem...","[nan, nan, nan]","Human Capital, Fertility, and Economic Growth",0,0,0,0,0,1990,10,02
1796,98,5,1990-10-02,[Economic Development Models and Theories],[nan],The Problem of Development: Introduction,0,0,0,0,0,1990,10,02


In [14]:
test_jpe_codes_years.groupby('year').sum().to_csv('test_jpe_prelim_jel_results.csv', index=False, encoding='utf-8')