In [11]:
from pathlib import Path
import pandas as pd
import string

In [12]:
xlsxfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed")
txtfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\papers_pdf")
outputfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\macro_papers")
outputfile = Path("macro_papers.xlsx")

In [13]:
keyword_list = ["macro", "macroeconomic", "macroeconomics", "fiscal policy", "monetary policy"]

## Functions

In [14]:
def get_journal_folder(journal):
    dict = {'American Economic Review': 'aer', 
            'Econometrica': 'econometrica',
            'Journal of Political Economy': 'jpe',
            'Quarterly Journal of Economics': 'qje',
            'Review of Economic Studies': 'res'}
    return dict[journal]

In [15]:
def contain_keyword(keyword_list, text):
    # Boolean
    contain_keyword = any(keyword in text for keyword in keyword_list)
    return contain_keyword

In [16]:
def why_macro_paper(row):
    # print("jel_e: ")
    # print(row['jel_e'])
    # print("contain_keyword: ")
    # print(row['contain_keyword'])
    
    if row['jel_e'] == 1 and row['contain_keyword'] == 0:
        return "jel_e"
    elif row['jel_e'] == 0 and row['contain_keyword'] == 1:
        return "contain_keyword"
    elif row['jel_e'] == 1 and row['contain_keyword'] == 1:
        return "jel_e and contain_keyword"
    else:
        return "not_macro_paper"

## Main Code

In [25]:
df_macro_papers = pd.DataFrame()

for xlsxfile in xlsxfolder.iterdir():
    if xlsxfile.suffix == '.xlsx':
        print(xlsxfile)
        df = pd.read_excel(Path(xlsxfile))
        print(df.head())
        list_contain_keyword = []
        list_filepath = []

        for index, row in df.iterrows():
            year = row['Publication Year']
            author = row['Author']
            title = row['Title']
            journal = row['Publication Title']
            date = row['Date']
            issue = row['Issue']
            filename = row['filename']

            # Generate path towards txtfile
            txtfilepath = Path(txtfolder / Path(get_journal_folder(journal)) / Path(str(year)) / Path(str(issue)) / Path(str(filename) + ".txt"))
            list_filepath.append(txtfilepath)
            # print(txtfilepath)

            # Read txtfile and check if it contains keywords
            with open(txtfilepath,"r", encoding='utf-8') as txtfile:
                text = txtfile.read()
                text = text.lower()

                if contain_keyword(keyword_list, text):
                    list_contain_keyword.append(1)
                else:
                    list_contain_keyword.append(0)

        # Add columns to original dataframe
        df = df.assign(contain_keyword = list_contain_keyword)
        df = df.assign(filepath = list_filepath)

        # Determine if paper meets the criteria for a macro paper:
        # either classified under JEL:E, or contains any of the specified keywords.

        df['is_macro_paper'] = df['jel_e'] | df['contain_keyword']
        print(df.shape)

        # Only keep macro papers: 
        df = df[df['is_macro_paper'] == 1]
        df['macro_paper_reason'] = df.apply(why_macro_paper, axis = 1)
        print(df.shape)
        df_macro_papers = df_macro_papers.append(df)

df_macro_papers.to_excel(Path(outputfolder / outputfile), index=False)

C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\AER_2015_2021_processed.xlsx
   Publication Year                                             Author  \
0              2021  ARKHANGELSKY, DMITRY; ATHEY, SUSAN; HIRSHBERG,...   
1              2021  KARTIK, NAVIN; KLEINER, ANDREAS; VAN WEELDEN, ...   
2              2021                 GOEREE, JACOB K.; LOUIS, PHILIPPOS   
3              2021     BAILEY, MARTHA J.; SHUQIAO SUN; TIMPE, BRENDEN   
4              2021                                         YIQUN CHEN   

                                               Title  \
0               Synthetic Difference-in-Differences.   
1                     Delegation in Veto Bargaining.   
2  M Equilibrium: A Theory of Beliefs and Choices...   
3  Prep School for Poor Kids: The Long-Run Impact...   
4  Team-Specific Human Capital and Team Performan...   

          Publication Title     Date  Issue      Pages      filename  jel_e  
0  American Economic Revie

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jasonjia\\Dropbox\\Projects\\channels_in_macro\\data\\papers_pdf\\aer\\2017\\12\\aer.20151425.txt'

In [21]:
# Visualization
df_macro_papers

## Tests

In [104]:
# Test
get_journal_folder("Econometrica")

'econometrica'

In [79]:
# Test to check that | works correctly
pd.DataFrame([1,1,0,0]) | pd.DataFrame([1,0,1,0])

Unnamed: 0,0
0,1
1,1
2,1
3,0


In [128]:
dfcopy['macro_paper_reason'] = dfcopy.apply(why_macro_paper, axis = 1) 
dfcopy['macro_paper_reason']

2                contain_keyword
5      jel_e and contain_keyword
7                contain_keyword
8                contain_keyword
13     jel_e and contain_keyword
                 ...            
435              contain_keyword
437    jel_e and contain_keyword
442              contain_keyword
443              contain_keyword
444              contain_keyword
Name: macro_paper_reason, Length: 234, dtype: object