In [24]:
from pathlib import Path
import pandas as pd
import string

In [25]:
# Every input and output folder sits under the same project data directory.
# The paths are Windows-style, so the sub-folders are joined onto the root
# with a literal backslash to keep the resulting strings exactly as written.
data_root = r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data"

# Input: one processed identifier workbook (.xlsx) per journal.
xlsxfolder = Path(data_root + r"\identifiers_xlsx_processed")
# Input: paper full texts, read as <journal>/<year>/<issue>/<filename>.txt.
txtfolder = Path(data_root + r"\papers_pdf")
# Output: folder and file name of the compiled macro-paper workbook.
outputfolder = Path(data_root + r"\macro_papers")
outputfile = Path("macro_papers.xlsx")

In [26]:
# Keywords that flag a paper as macro-related. They are matched as plain
# substrings of the lowercased paper text, so "macro" on its own already
# matches every text containing "macroeconomic" or "macroeconomics".
keyword_list = [
    "macro",
    "macroeconomic",
    "macroeconomics",
    "fiscal policy",
    "monetary policy",
]

## Functions

In [27]:
def get_journal_folder(journal):
    """Return the folder name used for a journal in the paper text tree.

    Parameters
    ----------
    journal : str
        Full journal title, e.g. ``'American Economic Review'``.

    Returns
    -------
    str
        Short lowercase folder name, e.g. ``'aer'``.

    Raises
    ------
    KeyError
        If ``journal`` is not one of the five known journal titles. The
        message lists the known titles to make typos easy to spot.
    """
    # Named `journal_folders` rather than `dict` so the builtin is not shadowed.
    journal_folders = {
        'American Economic Review': 'aer',
        'Econometrica': 'econometrica',
        'Journal of Political Economy': 'jpe',
        'Quarterly Journal of Economics': 'qje',
        'Review of Economic Studies': 'res',
    }
    try:
        return journal_folders[journal]
    except KeyError:
        # Same exception type as a plain lookup, but with an actionable message.
        raise KeyError(
            f"Unknown journal {journal!r}; expected one of {sorted(journal_folders)}"
        ) from None

In [28]:
def contain_keyword(keyword_list, text):
    """Report whether ``text`` contains at least one of the given keywords.

    Each keyword is tested with the ``in`` operator, so for string inputs
    this is a case-sensitive substring check.

    Parameters
    ----------
    keyword_list : iterable
        Keywords to look for.
    text : str
        Text to search.

    Returns
    -------
    bool
        True as soon as one keyword is found, False if none is found
        (including when ``keyword_list`` is empty).
    """
    for candidate in keyword_list:
        if candidate in text:
            return True
    return False

In [29]:
def why_macro_paper(row):
    """Label which criterion (or criteria) classified a paper as macro.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row['jel_e']`` and ``row['contain_keyword']``; the
        values are compared against 0 and 1.

    Returns
    -------
    str
        ``"jel_e and contain_keyword"`` when both flags equal 1,
        ``"jel_e"`` or ``"contain_keyword"`` when exactly one flag equals 1
        and the other equals 0, and ``"not_macro_paper"`` for every other
        combination of values.
    """
    jel_flag = row['jel_e']
    keyword_flag = row['contain_keyword']

    if jel_flag == 1 and keyword_flag == 1:
        return "jel_e and contain_keyword"
    if jel_flag == 1 and keyword_flag == 0:
        return "jel_e"
    if jel_flag == 0 and keyword_flag == 1:
        return "contain_keyword"
    # Both flags 0, or a value that is neither 0 nor 1.
    return "not_macro_paper"

## Main Code

In [30]:
# Build the compiled table of macro papers across all journals.
#
# For every identifier workbook in `xlsxfolder`, each listed paper's text file
# is read and searched for the keywords; papers that are either flagged
# `jel_e` in the workbook or contain a keyword are kept, labelled with the
# reason, and finally written to a single Excel file.
#
# One filtered frame per journal is collected here and concatenated once at
# the end: DataFrame.append was removed in pandas 2.0, and appending inside a
# loop re-copies the accumulated frame on every pass.
macro_frames = []

# sorted() fixes the processing order (and so the row order of the output);
# iterdir() yields directory entries in arbitrary order.
for xlsxfile in sorted(xlsxfolder.iterdir()):
    # Skip anything that is not a workbook, and the "~$..." lock files Excel
    # leaves next to a workbook that is currently open (they are not readable).
    if xlsxfile.suffix != '.xlsx' or xlsxfile.name.startswith('~$'):
        continue

    # Print file name, and also see journal name
    print(xlsxfile)

    # Read in identifier xlsx file
    df = pd.read_excel(xlsxfile)

    # One entry per paper, in row order
    list_contain_keyword = []
    list_filepath = []

    for _, row in df.iterrows():
        # Fields needed to locate the paper's text file
        year = row['Publication Year']
        journal = row['Publication Title']
        issue = row['Issue']
        filename = row['filename']

        # Text files are laid out as <journal folder>/<year>/<issue>/<filename>.txt
        txtfilepath = (
            txtfolder
            / get_journal_folder(journal)
            / str(year)
            / str(issue)
            / (str(filename) + ".txt")
        )
        list_filepath.append(txtfilepath)

        # Lowercase the text so the (lowercase) keywords match regardless of case
        text = txtfilepath.read_text(encoding='utf-8').lower()
        list_contain_keyword.append(int(contain_keyword(keyword_list, text)))

    # Add columns to original dataframe
    df = df.assign(contain_keyword=list_contain_keyword, filepath=list_filepath)

    # Determine if paper meets the criteria for a macro paper:
    # either classified under JEL:E, or contains any of the specified keywords.
    # NOTE(review): the element-wise | assumes `jel_e` holds 0/1 integers - confirm.
    df['is_macro_paper'] = df['jel_e'] | df['contain_keyword']
    print("Number of papers: ", df.shape)

    # Only keep macro papers. The explicit copy makes the filtered frame
    # independent of `df`, so adding a column below does not raise
    # SettingWithCopyWarning.
    df = df[df['is_macro_paper'] == 1].copy()
    df['macro_paper_reason'] = df.apply(why_macro_paper, axis=1)
    print("Number of macro papers: ", df.shape)

    # Keep this journal's macro papers for the final concatenation
    macro_frames.append(df)

# Combine all journals; each journal's original row index is preserved.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# workbook was found.
df_macro_papers = pd.concat(macro_frames) if macro_frames else pd.DataFrame()

# Save compiled df to excel
df_macro_papers.to_excel(outputfolder / outputfile, index=False)

C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\AER_2015_2021_processed.xlsx
Number of papers:  (786, 12)
Number of macro papers:  (445, 13)
C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\Econometrica_2015_2021_processed.xlsx
Number of papers:  (390, 12)
Number of macro papers:  (163, 13)
C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\JPE_2015_2021_processed.xlsx
Number of papers:  (465, 12)
Number of macro papers:  (220, 13)
C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\QJE_2015_2021_processed.xlsx
Number of papers:  (288, 12)
Number of macro papers:  (160, 13)
C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\RES_2015_2021_processed.xlsx
Number of papers:  (458, 12)
Number of macro papers:  (241, 13)


In [31]:
# Display the compiled macro-paper table for a visual check
# (the notebook renders a truncated preview of the rows).
df_macro_papers

Unnamed: 0,Publication Year,Author,Title,Publication Title,Date,Issue,Pages,filename,jel_e,contain_keyword,filepath,is_macro_paper,macro_paper_reason
2,2021,"GOEREE, JACOB K.; LOUIS, PHILIPPOS",M Equilibrium: A Theory of Beliefs and Choices...,American Economic Review,2021-12,12,4002-4045,aer.20201683,0,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,contain_keyword
5,2021,"CHAHROUR, RYAN; NIMARK, KRISTOFFER; PITSCHNER,...",Sectoral Media Focus and Aggregate Fluctuations.,American Economic Review,2021-12,12,3872-3922,aer.20191895,1,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,jel_e and contain_keyword
7,2021,"BRAGUINSKY, SERGUEY; ATSUSHI OHYAMA; TETSUJI O...","Product Innovation, Product Diversification, a...",American Economic Review,2021-12,12,3795-3826,aer.20201656,0,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,contain_keyword
8,2021,"Kostøl, Andreas R.; Myhre, Andreas S.",Labor Supply Responses to Learning the Tax and...,American Economic Review,2021-11,11,3733-3766,aer.20201877,0,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,contain_keyword
13,2021,"Almunia, Miguel; Antràs, Pol; Lopez-Rodriguez,...",Venting Out: Exports during a Domestic Slump.,American Economic Review,2021-11,11,3611-3662,aer.20181853,1,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,jel_e and contain_keyword
...,...,...,...,...,...,...,...,...,...,...,...,...,...
447,2015,"Cho, In-Koo; Kasa, Kenneth",Learning and Model Validation.,Review of Economic Studies,2015-01,1,45-82,rdu026,0,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,contain_keyword
452,2015,"Grubb, Michael D.",Consumer Inattention and Bill-Shock Regulation.,Review of Economic Studies,2015-01,1,219-257,rdu024,0,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,contain_keyword
453,2015,"Handbury, Jessie; Weinstein, David E.",Goods Prices and Availability in Cities.,Review of Economic Studies,2015-01,1,258-296,rdu033,0,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,contain_keyword
454,2015,"Herweg, Fabian; Schmidt, Klaus M.",Loss Aversion and Inefficient Renegotiation.,Review of Economic Studies,2015-01,1,297-332,rdu034,0,1,C:\Users\jasonjia\Dropbox\Projects\channels_in...,1,contain_keyword


## Tests

In [104]:
# Test: a full journal title should map to its folder name
# (expected output: 'econometrica').
get_journal_folder("Econometrica")

'econometrica'

In [79]:
# Test to check that | works correctly: it should give the element-wise OR of
# two 0/1 columns, which is how is_macro_paper is built from jel_e and
# contain_keyword. Expected result for the four combinations: 1, 1, 1, 0.
pd.DataFrame([1,1,0,0]) | pd.DataFrame([1,0,1,0])

Unnamed: 0,0
0,1
1,1
2,1
3,0


In [128]:
# Test: apply why_macro_paper row by row and inspect the resulting labels.
# NOTE(review): `dfcopy` is not defined in any cell shown in this notebook, so
# this cell fails on a fresh kernel. It is presumably a copy of one journal's
# filtered frame from the main loop - confirm and define it explicitly.
dfcopy['macro_paper_reason'] = dfcopy.apply(why_macro_paper, axis = 1) 
dfcopy['macro_paper_reason']

2                contain_keyword
5      jel_e and contain_keyword
7                contain_keyword
8                contain_keyword
13     jel_e and contain_keyword
                 ...            
435              contain_keyword
437    jel_e and contain_keyword
442              contain_keyword
443              contain_keyword
444              contain_keyword
Name: macro_paper_reason, Length: 234, dtype: object