In [1]:
import pandas as pd 
import numpy as np
import lxml
import traceback
from bs4 import BeautifulSoup as bs


In [2]:
test_download = open('econ_lit_xml/jpe_90-21.xml', 'r', encoding='utf-8')
content = test_download.read()
test_bs_content = bs(content, 'lxml')

In [3]:
candidate_list = test_bs_content.find_all('rec')
jpe_list = []
for candidate in candidate_list:
    try:
        journal_title = candidate.find('jtl').text
    except:
        print('failing out on record {}'.format(candidate.get('resultid')))
        continue
    if journal_title == "Journal of Political Economy":
        jpe_list.append(candidate)  
    else:
        print('failing out on record {}'.format(candidate.get('resultid')))

len(jpe_list)

1798

In [4]:
xml_dict = {
    'doi' : 'ui',
    'title' : 'atl',
    'volume' : 'vid',
    'issue' : 'iid',
    'date' : 'dt',
    'abstract' : 'ab',
    'pages' : 'pages',
    'author' : 'au',
    'jel_desc' : 'su',
}


def field_population(temp_df, xml_obj, nav_dict, field):
    try:
        if field in ['jel_desc', 'author']:
            
            list_of = [desc.text for desc in xml_obj.find_all(nav_dict[field])]
            # print(list_of)
            temp_df.at[0,field] = list_of
            return temp_df

        value = xml_obj.find(nav_dict[field]).text
        temp_df[field] = value

        return temp_df
    except:
        # print('---------------------------------------')
        # print('fail on resultID {}, {}'.format(xml_obj.get('resultid'),field))
        value = 'ECONLIT None Found'
        temp_df[field] = value
        return temp_df
        # print(traceback.format_exc())


test_df = pd.DataFrame({

    })


for article in jpe_list:
    temp_df = pd.DataFrame({
        'doi' : [None],
        'title' : [None],
        'volume' : [None],
        'issue' : [None],
        'date' : [None],
        'abstract' : [None],
        'pages' : [None],
        'author' : [None],
        'jel_desc' : [None]
    })
    for field in xml_dict.keys():
        temp_df = field_population(temp_df, article, xml_dict, field)
    
    test_df = pd.concat([test_df, temp_df], ignore_index=True)





In [5]:
def insert_dash_date(string):
    try:
        listed = list(string)
        listed.insert(6, '-')
        listed.insert(4, '-')
        
        rejoined = ''.join(listed)
        return rejoined
    except:
        return string

test_df.date = test_df.date.apply(lambda x: insert_dash_date(x))
# test_df = test_df.explode('author')


In [6]:
jel_df = pd.read_xml('aea_jel_codes.xml', xpath='classification')

In [7]:
test_merge = test_df
test_merge['id'] = np.arange(len(test_merge))
# test_merge.set_index('id', inplace=True)

test_merge = test_merge.explode('jel_desc')

test_merge = pd.merge(test_merge, jel_df, how='left', left_on='jel_desc', right_on='description')

# test_merge.index.rename("index", inplace=True)
test_merge = test_merge.groupby(by='id').agg(
    {
        'jel_desc' : lambda x: x.tolist(),
        'code' : lambda x: x.tolist(),
        'doi' : lambda x: x.unique(),
        'title' : lambda x: x.unique(),
        'volume' : lambda x: x.unique(),
        'issue' : lambda x: x.unique(),
        'date' : lambda x: x.unique(),
        'pages' : lambda x: x.unique(),
        'author' : lambda x: x,
        'abstract' : lambda x: x,
    }
).rename({'code' : 'jel_code'}, axis=1)


test_merge = test_merge.explode('doi')
test_merge = test_merge.explode('title')
test_merge = test_merge.explode('volume')
test_merge = test_merge.explode('issue')
test_merge = test_merge.explode('date')
test_merge = test_merge.explode('pages')
# test_merge = test_merge.explode('author')

for row in test_merge.index.unique().tolist(): 
    try:
        test_merge.loc[row, 'author'] = test_merge.loc[row, 'author'][0]
        test_merge.loc[row, 'abstract'] = test_merge.loc[row, 'abstract'][0]
    except:
        # print(test_merge.loc[row, 'author'])
        print(test_merge.loc[row,'title'])


test_merge

Index to Volume 128.
Index to Volume 127.
Index to Volume 126.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
2021 Lucas Prize Announcement.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
N.PAG.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times, Previous Two Years.
Back Cover.
Recent Referees.
Front Matter.
JPE Turnaround Times

Unnamed: 0_level_0,jel_desc,jel_code,doi,title,volume,issue,date,pages,author,abstract
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,[Trade Policy; International Trade Organizatio...,"[F13, F14, L80]",ECONLIT None Found,The Economic Structure of International Trade-...,129,4,2021-04-01,1287-1317,"[Staiger, Robert W., Sykes, Alan O.]",The existing economics literature on internati...
1,"[Firm Behavior: Empirical Analysis, Empirical ...","[D22, F14, F23, L14, L25, O31, O33]",ECONLIT None Found,"Upstream, Downstream: Diffusion and Impacts of...",129,4,2021-04-01,1252-1286,"[Basker, Emek, Simcoe, Timothy]","We study the adoption, diffusion, and impacts ..."
2,"[Consumer Economics: Empirical Analysis, Micro...","[D12, O12, Z12, Z13]",ECONLIT None Found,How Do We Choose Our Identity? A Revealed Pref...,129,4,2021-04-01,1193-1251,"[Atkin, David, Colson-Sihra, Eve, Shayo, Moses]",Are identities fungible? How do people come to...
3,"[Property Law, Energy, Environmental, Health, ...","[K11, K32, P14, Q53, Q58]",ECONLIT None Found,Securing Property Rights,129,4,2021-04-01,1157-1192,"[Behrer, A. Patrick, Glaeser, Edward L., Ponze...",A central challenge in securing property right...
4,"[Portfolio Choice; Investment Decisions, Equit...","[G11, G12, G41]",ECONLIT None Found,Asset Classes,129,4,2021-04-01,1100-1156,"[Jacquet, Nicolas L.]",This paper proposes a theory of endogenous dif...
...,...,...,...,...,...,...,...,...,...,...
1793,"[Economic Development Models and Theories, Hum...","[nan, nan]",ECONLIT None Found,Endogenous Technological Change,98,5,1990-10-02,S71-102,"[Romer, Paul M.]",Growth in this model is driven by technologica...
1794,"[Demographic Economics, Human Capital; Value o...","[nan, nan]",ECONLIT None Found,Population Growth and Human Capital Investment...,98,5,1990-10-02,S38-70,"[Rosenzweig, Mark R.]",This paper presents evidence from empirical st...
1795,"[Economic Development Models and Theories, Dem...","[nan, nan, nan]",ECONLIT None Found,"Human Capital, Fertility, and Economic Growth",98,5,1990-10-02,S12-37,"[Becker, Gary S., Murphy, Kevin M., Tamura, Ro...",The authors' analysis of growth assumes endoge...
1796,[Economic Development Models and Theories],[nan],ECONLIT None Found,The Problem of Development: Introduction,98,5,1990-10-02,S1-11,"Erlich, Isaac",E


In [17]:
%store -r JPE_core_df
test_merge[test_merge.volume == '125']


sc_eho_try = pd.merge(JPE_core_df, test_merge,
     how='left',
      left_on=['sc_vol', 'sc_issue', 'sc_page_range'],
      right_on=['volume', 'issue', 'pages'])



sc_eho_try.to_csv('test_out.csv', index=False, encoding='utf-8')

In [26]:
sc_eho_try['title_match'] = 0
for row in sc_eho_try.index.tolist():
    if sc_eho_try.loc[row, 'sc_title'] == sc_eho_try.loc[row,'title']:
        sc_eho_try.loc[row, 'title_match'] = 1
    else:
        sc_eho_try.loc[row, 'title_match'] = 0

sc_eho_try.query('title_match == 1')

Unnamed: 0,doi_x,sc_title,sc_issn,sc_pub_name,sc_vol,sc_issue,sc_page_range,sc_abstract_api_endpoint,sc_human_url,sc_pub_date,...,jel_code,doi_y,title,volume,issue,date,pages,author,abstract,title_match
130,10.1086/707011,Queens,223808,Journal of Political Economy,128,7,2579-2652,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2020-07-01,...,"[D72, J16, N43, N44]",ECONLIT None Found,Queens,128,7,2020-07-01,2579-2652,"[Dube, Oeindrila, Harish, S. P.]",Do states experience more peace under female l...,1
360,10.1086/694632,Memories of Friedman and Patinkin,223808,Journal of Political Economy,125,6,1831-1834,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2017-12-01,...,"[B22, B31]",ECONLIT None Found,Memories of Friedman and Patinkin,125,6,2017-12-01,1831-1834,"[Lucas, Robert E., Jr.]",ECONLIT None Found,1
372,10.1086/694623,Finance at the University of Chicago,223808,Journal of Political Economy,125,6,1790-1799,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2017-12-01,...,"[B26, D25, nan, G14, G31]",ECONLIT None Found,Finance at the University of Chicago,125,6,2017-12-01,1790-1799,"[Fama, Eugene F.]",ECONLIT None Found,1
499,10.1086/680415,Preface,223808,Journal of Political Economy,123,2,263-265,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2015-04-01,...,[Y20],ECONLIT None Found,Preface,123,2,2015-04-01,263-265,"Cunha, Flavio",E,1
592,10.1086/666746,"The Schooling Decision: Family Preferences, In...",223808,Journal of Political Economy,120,3,359-397,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2012-06-01,...,"[D13, I21, I28]",ECONLIT None Found,"The Schooling Decision: Family Preferences, In...",120,3,2012-06-01,359-397,"[Bursztyn, Leonardo, Coffman, Lucas C.]",This paper experimentally analyzes the schooli...,1
593,10.1086/666588,Moving Back Home: Insurance against Labor Mark...,223808,Journal of Political Economy,120,3,446-512,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2012-06-01,...,"[D13, J22, J64]",ECONLIT None Found,Moving Back Home: Insurance against Labor Mark...,120,3,2012-06-01,446-512,"[Kaplan, Greg]",This paper demonstrates that the option to mov...,1
594,10.1086/666655,A Theory of Rational Jurisprudence,223808,Journal of Political Economy,120,3,513-551,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2012-06-01,...,"[D83, K41]",ECONLIT None Found,A Theory of Rational Jurisprudence,120,3,2012-06-01,513-551,"[Baker, Scott, Mezzetti, Claudio]",We examine a dynamic model of up-or-down probl...,1
595,10.1086/666747,Capital Taxation: Quantitative Explorations of...,223808,Journal of Political Economy,120,3,398-445,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2012-06-01,...,"[D15, H21]",ECONLIT None Found,Capital Taxation: Quantitative Explorations of...,120,3,2012-06-01,398-445,"[Farhi, Emmanuel, Werning, Ivan]",Economies with private information provide a r...,1
596,10.1086/666669,"Illiquid Banks, Financial Stability, and Inter...",223808,Journal of Political Economy,120,3,552-591,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2012-06-01,...,"[E52, E58, G21, G28]",ECONLIT None Found,"Illiquid Banks, Financial Stability, and Inter...",120,3,2012-06-01,552-591,"[Diamond, Douglas W., Rajan, Raghuram G.]",Banks finance illiquid assets with demandable ...,1
686,10.1086/599283,Information and the Skewness of Music Sales,223808,Journal of Political Economy,117,2,324-369,https://api.elsevier.com/content/abstract/scop...,https://www.scopus.com/inward/citedby.uri?part...,2009-04-01,...,"[D12, L81, L82]",ECONLIT None Found,Information and the Skewness of Music Sales,117,2,2009-04-01,324-369,"[Hendricks, Ken, Sorensen, Alan]",This paper studies the role of product discove...,1
