In [10]:
import pandas as pd
import numpy as np
import lxml
import traceback
import re
from bs4 import BeautifulSoup as bs
from thefuzz import fuzz



In [3]:
def read_econlit_xml(pub_code):
    """Read one journal's EconLit XML export and parse it with BeautifulSoup.

    Parameters
    ----------
    pub_code : str
        Publication code; the file read is
        ``econ_lit_xml/<pub_code>_90-21.xml``.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document.

    Raises
    ------
    FileNotFoundError
        If no export exists for ``pub_code``.
    """
    input_path = 'econ_lit_xml/' + pub_code + '_90-21.xml'

    # Context manager guarantees the handle is closed even if read() fails.
    with open(input_path, 'r', encoding='utf-8') as econlit_download:
        content = econlit_download.read()

    # NOTE(review): the rest of this file looks tags up in lower case ('rec',
    # 'jtl', 'resultid'); presumably that relies on the 'lxml' parser choice --
    # confirm before switching parsers.
    return bs(content, 'lxml')

In [12]:
def econlit_obs_counter(pubcode_econlit_bs_content, pub_code, journal_title_list):
    """Select the ``rec`` elements whose journal title is one of the expected ones.

    Parameters
    ----------
    pubcode_econlit_bs_content
        Parsed document exposing ``find_all('rec')``.
    pub_code : str
        Publication code, used only in the progress messages.
    journal_title_list : list of str
        Journal titles that count as belonging to this publication.

    Returns
    -------
    list
        The ``rec`` elements whose ``jtl`` text is in ``journal_title_list``,
        in document order.

    Notes
    -----
    A record without a ``jtl`` element is reported and skipped. It is never
    compared against a title left over from a previous record.
    """
    candidate_article_list = pubcode_econlit_bs_content.find_all('rec')
    pub_list = []
    print("\t\t NUMBER OF {} CANDIDATE ARTICLES FROM ECONLIT XML: {}".format(pub_code, len(candidate_article_list)))

    for candidate in candidate_article_list:
        title_tag = candidate.find('jtl')
        if title_tag is None:
            print('No "Journal Title" found for resultID {}, of {}'.format(candidate.get('resultid'), pub_code))
            # Without a title there is nothing to match on; move to the next record.
            continue

        journal_title = title_tag.text
        if journal_title in journal_title_list:
            pub_list.append(candidate)
        else:
            print("'Journal Title ({}) for resultID {} of {} not as expected. Moving on.".format(journal_title, candidate.get('resultid'), pub_code))

    print("\t\t NUMBER OF {} ACTUAL ARTICLES FROM ECONLIT XML: {}".format(pub_code, len(pub_list)))

    return pub_list

In [14]:
def field_population(temp_df, xml_obj, nav_dict, field):
    """Fill one column of a single-row frame from an article's XML element.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Single-row frame (index label 0) that already has a ``field`` column.
        It is modified in place and also returned.
    xml_obj
        Article element exposing ``find`` and ``find_all``.
    nav_dict : dict
        Maps column names to the XML tag name holding that value.
    field : str
        Column to populate.

    Returns
    -------
    pandas.DataFrame
        ``temp_df`` with ``field`` set. ``'jel_desc'`` and ``'author'`` receive
        a list with the text of every matching tag; every other field receives
        the text of the first matching tag. If the lookup fails the column is
        set to ``'ECONLIT None Found'``.
    """
    try:
        # There are 2 fields that contain multiple entries: the JEL descriptions
        # and authors (since we're discarding affiliations here).
        if field in ('jel_desc', 'author'):
            temp_df.at[0, field] = [node.text for node in xml_obj.find_all(nav_dict[field])]
        else:
            # find() returns None when the tag is absent, so .text raises and
            # control falls through to the sentinel below.
            temp_df[field] = xml_obj.find(nav_dict[field]).text
    except Exception:
        # Best-effort: any ordinary failure marks the field as missing, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        temp_df[field] = 'ECONLIT None Found'

    return temp_df

In [15]:
def pub_field_pop_macro(pub_list):
    """Build a DataFrame with one row per article element.

    Parameters
    ----------
    pub_list : list
        Article elements, as returned by ``econlit_obs_counter``.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns ``doi``, ``title``, ``volume``,
        ``issue``, ``date``, ``abstract``, ``pages``, ``author``, ``jel_desc``
        and ``issn``. Each cell is filled by ``field_population``. An empty
        ``pub_list`` yields an empty frame that still carries those columns.
    """
    # Output column -> XML tag that holds the value.
    xml_dict = {
        'doi' : 'ui',
        'title' : 'atl',
        'volume' : 'vid',
        'issue' : 'iid',
        'date' : 'dt',
        'abstract' : 'ab',
        'pages' : 'pages',
        'author' : 'au',
        'jel_desc' : 'su',
        'issn' : 'issn'
    }

    article_frames = []
    for article in pub_list:
        # Single-row template whose columns mirror xml_dict, so the two cannot
        # drift apart.
        temp_df = pd.DataFrame({field: [None] for field in xml_dict})

        for field in xml_dict:
            temp_df = field_population(temp_df, article, xml_dict, field)

        article_frames.append(temp_df)

    if not article_frames:
        # pd.concat raises on an empty list; return an empty, correctly-shaped frame.
        return pd.DataFrame(columns=list(xml_dict))

    # Concatenate once instead of re-copying the accumulated frame per article.
    return pd.concat(article_frames, ignore_index=True)


In [7]:
def insert_dash_date(string):
    """Insert dashes after the 4th and 6th characters of an all-digit string.

    ``'19900315'`` becomes ``'1990-03-15'``. Any value that is not a string
    made up only of digits is returned unchanged, which covers ``None``/NaN and
    the ``'ECONLIT None Found'`` placeholder that ``field_population`` writes
    for missing fields.

    Parameters
    ----------
    string : object
        Raw date value.

    Returns
    -------
    object
        The dashed string, or the input itself when it is not an all-digit
        string.
    """
    if not isinstance(string, str) or not string.isdigit():
        return string

    # Slicing past the end yields '', so shorter digit strings still get both
    # separators (e.g. '199003' -> '1990-03-').
    return '-'.join((string[:4], string[4:6], string[6:]))

In [8]:
def instantiate_jel_codes():
    """Load ``aea_jel_codes.xml`` into a DataFrame.

    Rows are the nodes selected by the XPath ``classification``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_xml`` produces for that selection.
    """
    return pd.read_xml(
        'aea_jel_codes.xml',
        xpath='classification',
    )

In [16]:
def codes_articles_merge(pub_df, jel_df):
    """Attach JEL codes to each article by matching on JEL description.

    Parameters
    ----------
    pub_df : pandas.DataFrame
        One row per article. ``jel_desc`` and ``author`` hold lists; ``doi``,
        ``title``, ``volume``, ``issue``, ``date``, ``pages``, ``issn`` and
        ``abstract`` hold scalars. The frame is not modified.
    jel_df : pandas.DataFrame
        Lookup table with at least ``description`` and ``code`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per article with columns ``id``, ``jel_desc``, ``jel_code``,
        ``doi``, ``title``, ``volume``, ``issue``, ``date``, ``pages``,
        ``issn``, ``author``, ``abstract``. ``jel_desc`` and ``jel_code`` are
        parallel lists; a description with no match in ``jel_df`` contributes
        NaN to ``jel_code``. ``author`` is de-duplicated and keeps its original
        order.
    """
    def _ordered_unique(series):
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # would scramble the author order.
        return list(dict.fromkeys(series.explode()))

    # Work on a copy so the caller's frame does not grow an 'id' column.
    merge_df = pub_df.copy()
    merge_df['id'] = np.arange(len(merge_df))

    # One row per (article, JEL description) so descriptions can be joined
    # against the lookup table.
    merge_df = merge_df.explode('jel_desc')
    merge_df = pd.merge(merge_df, jel_df,
                        how='left',
                        left_on='jel_desc',
                        right_on='description')

    # Collapse back to one row per article. Scalar fields are identical on
    # every exploded row of an article, so 'first' recovers the original value.
    merge_df = merge_df.groupby(by='id').agg(
        {
            'jel_desc' : lambda x: x.tolist(),
            'code' : lambda x: x.tolist(),
            'doi' : 'first',
            'title' : 'first',
            'volume' : 'first',
            'issue' : 'first',
            'date' : 'first',
            'pages' : 'first',
            'issn' : 'first',
            'author' : _ordered_unique,
            'abstract' : lambda x: ''.join(_ordered_unique(x)),
        }
    ).rename({'code' : 'jel_code'}, axis=1)

    # Turn the 'id' group key back into an ordinary column.
    return merge_df.reset_index()


In [26]:
def jel_code_identification(pub_jel_df, codes_patterns_dict):
    """Add 0/1 indicator columns for JEL-code patterns, plus year/month/day.

    Parameters
    ----------
    pub_jel_df : pandas.DataFrame
        Needs a ``jel_code`` column whose cells are iterables of codes
        (non-string entries such as NaN are ignored) and a string ``date``
        column with '-' separated parts. Modified in place.
    codes_patterns_dict : dict
        Maps new column name -> regular expression. A row gets 1 in that column
        when ``re.search`` matches any of its codes, else 0.

    Returns
    -------
    pandas.DataFrame
        The same ``pub_jel_df`` object, with one integer column per pattern
        followed by ``year``, ``month`` and ``day``. Date parts missing from
        every row come out as NaN instead of raising.
    """
    def _has_match(article_codes, regex):
        return int(any(isinstance(code, str) and regex.search(code)
                       for code in article_codes))

    # Compile each pattern once rather than once per (row, code) pair.
    compiled_patterns = {col: re.compile(pattern)
                         for col, pattern in codes_patterns_dict.items()}

    for code_col, regex in compiled_patterns.items():
        flags = [_has_match(codes, regex) for codes in pub_jel_df['jel_code']]
        # Positional assignment: immune to duplicate index labels.
        pub_jel_df[code_col] = np.array(flags, dtype='int64')

    # n=2 caps the split at three parts and reindex pads absent ones, so there
    # are always exactly three columns to assign from.
    date_parts = (
        pub_jel_df['date']
        .str.split('-', n=2, expand=True)
        .reindex(columns=range(3))
    )
    for position, part_name in enumerate(('year', 'month', 'day')):
        pub_jel_df[part_name] = date_parts[position]

    return pub_jel_df


In [27]:
def df_to_csv(df, out_path):
    """Write ``df`` to ``out_path`` as UTF-8 CSV, omitting the index column."""
    csv_options = {'index': False, 'encoding': 'utf-8'}
    df.to_csv(out_path, **csv_options)

In [41]:
# Per-journal run configuration, keyed by publication code. Each entry supplies
# the code used to locate the EconLit XML export, the journal titles accepted
# for that publication, and the CSV destination. Uncomment an entry to include
# that journal in the run below; with every entry commented out the loop does
# nothing.
instruction_dict = {
    # 'jpe' : {
    #     'pub_code' : 'jpe',
    #     'journal_names' : ['Journal of Political Economy'],
    #     'out_path' : 'econlit_scopus_matching_out/jpe_econlit.csv'
    # },
    # 'qje' : {
    #     'pub_code' : 'qje',
    #     'journal_names' : ['Quarterly Journal of Economics'],
    #     'out_path' : 'econlit_scopus_matching_out/qje_econlit.csv'
    # },
    # 'res' : {
    #     'pub_code' : 'res',
    #     'journal_names' : ['Review of Economic Studies'],
    #     'out_path' : 'econlit_scopus_matching_out/res_econlit.csv'
    # },
    # 'eca' : {
    #     'pub_code' : 'eca',
    #     'journal_names' : ['Econometrica'],
    #     'out_path' : 'econlit_scopus_matching_out/eca_econlit.csv'
    # },
    # 'aer' : {
    #     'pub_code' : 'aer',
    #     'journal_names' : ['American Economic Review'],
    #     'out_path' : 'econlit_scopus_matching_out/aer_econlit.csv'
    # },
    # 'rje' : {
    #     'pub_code' : 'rje',
    #     'journal_names' : ['RAND Journal of Economics (Wiley-Blackwell)', 'RAND Journal of Economics (RAND Journal of Economics)'],
    #     'out_path' : 'econlit_scopus_matching_out/rje_econlit.csv'
    # },
}

# JEL code/description lookup table, loaded once and shared by every journal.
jel_df = instantiate_jel_codes()

for pub, pub_spec in instruction_dict.items():
    pub_code = pub_spec.get('pub_code')
    pub_journal_names = pub_spec.get('journal_names')
    pub_outpath = pub_spec.get('out_path')

    print("--------------------------------------------")
    print(f"STARTING ON {pub_code}")

    # Step 1: read and parse the EconLit XML export for this journal.
    pub_econlit_content = read_econlit_xml(pub_code)
    print('\t 1. ECONLIT XML OBJECT READ-IN')

    # Step 2: reduce the document to the list of article elements that carry
    # one of the expected journal titles.
    pub_article_list = econlit_obs_counter(pub_econlit_content, pub_code, pub_journal_names)
    print('\t 2. CONVERTED ECONLIT XML OBJECT TO LIST OF ARTICLE-XML OBJECTS')

    # Step 3: turn each of those article elements into one DataFrame row
    # (pub_field_pop_macro calls field_population for every field).
    pub_df = pub_field_pop_macro(pub_article_list)
    print('\t 3. POPULATING A DF WITH THOSE ARTICLE-XML OBJECTS')

    # Step 3.1: put dashes into the date strings so they can be split later.
    pub_df.date = pub_df.date.apply(insert_dash_date)
    print('\t\t 3.1 REFORMATTING DF DATE FORMATS')

    # Step 4: pub_df only has JEL descriptions; merge in the matching codes.
    pub_jel_df = codes_articles_merge(pub_df, jel_df)
    print('\t 4. MERGED JEL CODES WITH ARTICLE-LEVEL DF')

    # Step 5: write the merged frame to this journal's CSV.
    df_to_csv(pub_jel_df, pub_outpath)
    print('\t 5. DF OUTPUTTED AS CSV')


--------------------------------------------
STARTING ON rje
	 1. ECONLIT XML OBJECT READ-IN
		 NUMBER OF rje CANDIDATE ARTICLES FROM ECONLIT XML: 1288
'Journal Title' for resultID 503 of rje not as expected. Moving on.
		 NUMBER OF rje ACTUAL ARTICLES FROM ECONLIT XML: 1287
	 2. CONVERTED ECONLIT XML OBJECT TO LIST OF ARTICLE-XML OBJECTS
	 3. POPULATING A DF WITH THOSE ARTICLE-XML OBJECTS
		 3.1 REFORMATTING DF DATE FORMATS
	 4. MERGED JEL CODES WITH ARTICLE-LEVEL DF
	 5. PICKED OUT THE JEL CODES WE CARE ABOUT
	 6. DF OUTPUTTED AS CSV


In [None]:
# Single-journal configuration for a trial run of the pipeline; same layout as
# instruction_dict. This cell stops after the JEL merge and writes no CSV.
test_dict = {
    'eca' : {
        'pub_code' : 'eca',
        'journal_names' : ['Econometrica'],
        'out_path' : 'econlit_scopus_matching_out/eca_econlit_test.csv'
    },
}

jel_df = instantiate_jel_codes()
for pub in test_dict:
    settings = test_dict[pub]
    pub_code = settings.get('pub_code')
    pub_journal_names = settings.get('journal_names')
    pub_outpath = settings.get('out_path')

    print("--------------------------------------------")
    print(f"STARTING ON {pub_code}")

    # Step 1: read and parse the EconLit XML export.
    pub_econlit_content = read_econlit_xml(pub_code)
    print('\t 1. ECONLIT XML OBJECT READ-IN')

    # Step 2: keep only the article elements with an expected journal title.
    pub_article_list = econlit_obs_counter(pub_econlit_content, pub_code, pub_journal_names)
    print('\t 2. CONVERTED ECONLIT XML OBJECT TO LIST OF ARTICLE-XML OBJECTS')

    # Step 3: one DataFrame row per article element (via field_population).
    pub_df = pub_field_pop_macro(pub_article_list)
    print('\t 3. POPULATING A DF WITH THOSE ARTICLE-XML OBJECTS')

    # Step 3.1: add dashes to the date strings for later splitting.
    pub_df.date = pub_df.date.apply(insert_dash_date)
    print('\t\t 3.1 REFORMATTING DF DATE FORMATS')

    # Step 4: merge JEL codes onto the description-only article frame.
    pub_jel_df = codes_articles_merge(pub_df, jel_df)
    print('\t 4. MERGED JEL CODES WITH ARTICLE-LEVEL DF')