In [1]:
from pathlib import Path
import pandas as pd
import os
from collections import Counter
import re

# Lift pandas' display truncation so that dataframes are shown in full in Jupyter
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
# Input locations. NOTE(review): these are absolute paths on one user's machine;
# they must be edited before the notebook can run anywhere else.
# Folder holding the processed identifier spreadsheets (its .xlsx files are read below)
xlsxfolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed")
# Root folder that is walked recursively for the papers' pdf files
pdffolder = Path(r"C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\papers_pdf")
# The txt files are looked up in the very same folder tree as the pdf files
txtfolder = pdffolder

## Functions

In [3]:
def read_xlsx_identifiers(xlsxfolder):
    """Read every ``.xlsx`` identifier file of a folder into one dataframe.

    Parameters
    ----------
    xlsxfolder : pathlib.Path
        Folder whose direct children are scanned (sub-folders are not visited).
        Only files whose suffix is exactly ``.xlsx`` are read.

    Returns
    -------
    pandas.DataFrame
        The rows of all workbooks stacked on top of each other, in file-name
        order. Each workbook keeps its own row labels, so index values can
        repeat across workbooks. An empty dataframe is returned when the
        folder contains no ``.xlsx`` file.
    """
    # Sort so that the row order does not depend on the directory listing order
    xlsxfiles = sorted(path for path in xlsxfolder.iterdir() if path.suffix == '.xlsx')

    frames = []
    for xlsxfile in xlsxfiles:
        # Print file name, and also see journal name
        print("reading in: ", xlsxfile)
        frames.append(pd.read_excel(xlsxfile))

    # pd.concat raises ValueError on an empty list, so handle "nothing to read" explicitly
    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat is also cheaper
    # than re-copying the accumulated frame once per file.
    return pd.concat(frames)

In [4]:
def print_list(text):
    """Print each element of an iterable next to its position.

    Works for a list of strings as well as for a single string (which is
    walked character by character). Elements are shown through ``repr`` so
    that control characters such as \\n and \\x0c stay visible.
    """
    for position, item in enumerate(text):
        print(f"{position} {item!r}")

In [5]:
def print_files(folder):
    """Print the path of every file found anywhere below ``folder``."""
    for current_dir, _subdirs, filenames in os.walk(folder):
        for filename in filenames:
            full_path = os.path.join(current_dir, filename)
            print("file: ", full_path)

In [6]:
def print_dirs(folder):
    """Print the path of every directory found anywhere below ``folder``."""
    for current_dir, subdirs, _filenames in os.walk(folder):
        for subdir in subdirs:
            full_path = os.path.join(current_dir, subdir)
            print("dir: ", full_path)

In [7]:
def check_txt_exists_for_each_pdf(folder):
    """Check that each pdf below ``folder`` has a txt file with the same stem.

    The folder is walked recursively. For every file whose name ends in
    ``.pdf`` the sibling path ``<stem>.txt`` in the same directory is tested
    for existence. The two counts are printed.

    Returns
    -------
    tuple[list[Path], list[Path]]
        ``(txt paths that exist, txt paths that are missing)``.
    """
    # Expected txt path for every pdf in the tree
    expected_txt_paths = (
        Path(root) / (Path(name).stem + ".txt")
        for root, _dirs, files in os.walk(folder)
        for name in files
        if name.endswith('.pdf')
    )

    list_txt_exist = []
    list_txt_does_not_exist = []
    for txtfile in expected_txt_paths:
        bucket = list_txt_exist if os.path.exists(txtfile) else list_txt_does_not_exist
        bucket.append(txtfile)

    print("number of pdf files with corresponding txt file: ", len(list_txt_exist))
    print("number of pdf files without corresponding txt file: ", len(list_txt_does_not_exist))
    return list_txt_exist, list_txt_does_not_exist

In [8]:
def check_pdf_exists_for_each_txt(folder):
    """Check that each txt below ``folder`` has a pdf file with the same stem.

    The folder is walked recursively. For every file whose name ends in
    ``.txt`` the sibling path ``<stem>.pdf`` in the same directory is tested
    for existence. The two counts are printed.

    Returns
    -------
    tuple[list[Path], list[Path]]
        ``(pdf paths that exist, pdf paths that are missing)``.
    """
    list_pdf_exist = []
    list_pdf_does_not_exist = []

    for root, dirs, files in os.walk(folder):
        for name in files:
            # Get only txt files
            if not name.endswith('.txt'):
                continue

            # Get equivalent pdf filepath
            pdffile = Path(root) / (Path(name).stem + ".pdf")

            if os.path.exists(pdffile):
                list_pdf_exist.append(pdffile)
            else:
                # Record the miss in this function's own list. Appending to
                # `list_txt_does_not_exist` (a name that only exists as a
                # notebook global, if at all) hid every missing pdf.
                list_pdf_does_not_exist.append(pdffile)

    print("number of txt files with corresponding pdf file: ", len(list_pdf_exist))
    print("number of txt files without corresponding pdf file: ", len(list_pdf_does_not_exist))
    return list_pdf_exist, list_pdf_does_not_exist

In [9]:
def get_journal_folder(journal):
    """Return the short folder name used for a journal's full title.

    Parameters
    ----------
    journal : str
        Full journal title, e.g. ``'American Economic Review'``.

    Returns
    -------
    str
        The folder name, e.g. ``'aer'``.

    Raises
    ------
    KeyError
        If ``journal`` is not one of the known titles.
    """
    # Not called `dict`, so the builtin stays usable inside this function
    journal_folders = {'American Economic Review': 'aer',
                       'Econometrica': 'econometrica',
                       'Journal of Political Economy': 'jpe',
                       'Quarterly Journal of Economics': 'qje',
                       'Review of Economic Studies': 'res'}
    try:
        return journal_folders[journal]
    except KeyError:
        # Same exception type as a plain lookup, but says what was expected
        raise KeyError(
            f"unknown journal {journal!r}; expected one of {sorted(journal_folders)}"
        ) from None

In [10]:
def get_list_filepath_towards_txt(df_papers, txtfolder):
    """Build the expected txt path of every paper listed in ``df_papers``.

    Each path has the form
    ``txtfolder / <journal folder> / <year> / <issue> / <filename>.txt``,
    taken from the columns 'Publication Title', 'Publication Year', 'Issue'
    and 'filename'. The journal folder comes from ``get_journal_folder``.

    Returns
    -------
    list[Path]
        One path per row, in row order.
    """
    def expected_txt_path(paper):
        # Read all four fields first, then translate the journal title
        journal = paper['Publication Title']
        year = paper['Publication Year']
        issue = paper['Issue']
        filename = paper['filename']
        return Path(
            txtfolder,
            get_journal_folder(journal),
            str(year),
            str(issue),
            str(filename) + ".txt",
        )

    return [expected_txt_path(paper) for _, paper in df_papers.iterrows()]

In [11]:
def check_txt_exists_for_each_xlsx_filepath(list_filepath):
    """Split xlsx-derived txt paths into those present on disk and those absent.

    The two counts are printed.

    Returns
    -------
    tuple[list, list]
        ``(paths that exist, paths that do not exist)``, each in input order.
    """
    list_txt_exist_xlsx = []
    list_txt_does_not_exist_xlsx = []

    for candidate in list_filepath:
        if not os.path.exists(candidate):
            list_txt_does_not_exist_xlsx.append(candidate)
            continue
        list_txt_exist_xlsx.append(candidate)

    print("number of xlsx filepaths with corresponding txt file: ", len(list_txt_exist_xlsx))
    print("number of xlsx filepaths without corresponding txt file: ", len(list_txt_does_not_exist_xlsx))

    return list_txt_exist_xlsx, list_txt_does_not_exist_xlsx

In [12]:
def check_xlsx_filepath_exists_for_each_txt(folder, list_filepath):
    """Check that each txt file below ``folder`` is listed in ``list_filepath``.

    The folder is walked recursively; every file whose name ends in ``.txt``
    is turned into a ``Path`` and looked up among the xlsx-derived paths.
    The two counts are printed.

    Parameters
    ----------
    folder : path-like
        Root of the tree to walk.
    list_filepath : iterable of Path
        Paths derived from the xlsx identifier files.

    Returns
    -------
    tuple[list[Path], list[Path]]
        ``(txt files that are listed, txt files that are not listed)``.
    """
    # Build the lookup once: a membership test against a list is linear, which
    # made the whole check quadratic in the number of papers.
    known_filepaths = set(list_filepath)

    list_xlsx_filepath_exists = []
    list_xlsx_filepath_does_not_exist = []

    for root, dirs, files in os.walk(folder):
        for name in files:
            # Get only txt files
            if not name.endswith('.txt'):
                continue

            filepath = Path(os.path.join(root, name))

            # Check if this txt file is one of the xlsx-derived filepaths
            if filepath in known_filepaths:
                list_xlsx_filepath_exists.append(filepath)
            else:
                list_xlsx_filepath_does_not_exist.append(filepath)

    print("number of txt files with corresponding xlsx filepath: ", len(list_xlsx_filepath_exists))
    print("number of txt files without corresponding xlsx filepath: ", len(list_xlsx_filepath_does_not_exist))

    return list_xlsx_filepath_exists, list_xlsx_filepath_does_not_exist

In [13]:
def _first_page_alnum(text):
    """Return the first page of ``text`` reduced to ASCII letters and digits."""
    # Everything before the first \x0c (page break). partition() yields the
    # whole text when there is no page break, whereas slicing with the -1 that
    # find() returns in that case would silently drop the last character.
    first_page = text.partition('\x0c')[0]
    # Spaces are not kept, to avoid non-matches due to weird spacing
    return re.sub("[^a-zA-Z0-9]", "", first_page)


def find_title_in_first_page_of_paper(df_papers, verbose=False):
    """Check, paper by paper, that the words of the xlsx title occur on page 1 of the txt.

    For every row the 'Title' is lower-cased and stripped of everything except
    ASCII letters, digits and spaces, then split into words. The txt file at
    'filepath' is read (utf-8), lower-cased, cut at the first page break and
    stripped of everything except ASCII letters and digits. A word counts as
    found when it is a substring of that stripped first page, so very short
    words match easily.

    Parameters
    ----------
    df_papers : pandas.DataFrame
        Needs the columns 'Title', 'Publication Title' and 'filepath'.
        The frame is modified in place: the columns 'count_words_found_title',
        'count_words_not_found_title', 'list_words_found' and
        'list_words_not_found' are added.
    verbose : bool, default False
        When True, the stripped first page is also stored in a 'first_page' column.

    Returns
    -------
    tuple[pandas.DataFrame, int]
        The same (augmented) dataframe and the number of papers for which at
        least one title word was not found. Those papers are also printed.
    """
    list_count_words_found = []
    list_count_words_not_found = []
    list_list_words_found = []
    list_list_words_not_found = []
    list_first_page = []

    for _, row in df_papers.iterrows():
        # Remove weird symbols ([^...] means any character not in the set [...]).
        # Spaces are kept to distinguish between words.
        title = re.sub("[^a-zA-Z0-9 ]", "", row['Title'].lower())

        # split() without an argument drops the empty strings that appear where a
        # removed symbol sat between two spaces; an empty string is "in" every
        # text and used to be counted as a found word.
        title_words = title.split()

        # Read the txt file and keep only its first page
        with open(row['filepath'], "r", encoding='utf-8') as txtfile:
            first_page = _first_page_alnum(txtfile.read().lower())
        list_first_page.append(first_page)

        # For each word of the title, check whether it is contained in the first page
        list_words_found = [word for word in title_words if word in first_page]
        list_words_not_found = [word for word in title_words if word not in first_page]

        list_count_words_found.append(len(list_words_found))
        list_count_words_not_found.append(len(list_words_not_found))
        list_list_words_found.append(list_words_found)
        list_list_words_not_found.append(list_words_not_found)

    # Add the lists to the df as df columns
    df_papers['count_words_found_title'] = list_count_words_found
    df_papers['count_words_not_found_title'] = list_count_words_not_found
    df_papers['list_words_found'] = list_list_words_found
    df_papers['list_words_not_found'] = list_list_words_not_found

    if verbose:
        df_papers['first_page'] = list_first_page

    # Keep only the papers where not all words of the title are found
    df_words_not_found = df_papers[df_papers['count_words_not_found_title'] != 0]
    number_of_papers_without_all_words_in_title_found = df_words_not_found.shape[0]
    print("number of xlsx papers where all words in 'title' column are found in first page of txt: ", df_papers.shape[0] - number_of_papers_without_all_words_in_title_found)
    print("number of xlsx papers where not all words in 'title' column are found in first page of txt: ", number_of_papers_without_all_words_in_title_found)

    if number_of_papers_without_all_words_in_title_found > 0:
        print("Check through the following xlsx papers:")
        print(df_words_not_found[['Title','Publication Title','list_words_found','list_words_not_found']])

    return df_papers, number_of_papers_without_all_words_in_title_found

In [14]:
# Notes
# Path(root) # 'C:/Users/jasonjia/Dropbox/Projects/channels_in_macro/data/papers_pdf/res/2021/6'
# Path(name) # "rdab0168.txt"
# Path(name).stem # "rdab0168"
# Path(name).suffix # ".txt"

## Code

In [15]:
# Compile the identifier spreadsheets found in xlsxfolder into a single dataframe
# (the name of each file is printed as it is read)
df_papers = read_xlsx_identifiers(xlsxfolder)

reading in:  C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\AER_2015_2021_processed.xlsx
reading in:  C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\Econometrica_2015_2021_processed.xlsx
reading in:  C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\JPE_2015_2021_processed.xlsx
reading in:  C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\QJE_2015_2021_processed.xlsx
reading in:  C:\Users\jasonjia\Dropbox\Projects\channels_in_macro\data\identifiers_xlsx_processed\RES_2015_2021_processed.xlsx


In [16]:
# Consistency checks between the pdf/txt files on disk and the xlsx identifier sheets.
# Each test announces itself, runs its check and reports "passed" or "failed".
print("--- Start of Tests ---\n")

# Test 1: pdf -> txt
print("Test 1: Check that for every pdf in the folder, there exists a txt file with the same name")
list_txt_exist, list_txt_does_not_exist = check_txt_exists_for_each_pdf(pdffolder)
print("Test 1 failed\n" if list_txt_does_not_exist else "Test 1 passed\n")

# Test 2: txt -> pdf
print("Test 2: Check that for every txt in the folder, there exists a pdf file with the same name")
list_pdf_exist, list_pdf_does_not_exist = check_pdf_exists_for_each_txt(txtfolder)
print("Test 2 failed\n" if list_pdf_does_not_exist else "Test 2 passed\n")

# Test 3: the formula-based filepath of each xlsx paper must be unique
print("Test 3: Get the filepath for each paper listed in xlsx based on a formula, and check that it is unique")

# The filepaths are also stored on the dataframe; Test 6 reads them from there
list_filepath = get_list_filepath_towards_txt(df_papers, txtfolder)
df_papers['filepath'] = list_filepath

print("number of entries in list_filepath: ", len(list_filepath))
print("number of unique entries in list_filepath: ", len(set(list_filepath)))

# A set can never be larger than the list it was built from, so equal sizes mean no duplicates
if len(set(list_filepath)) == len(list_filepath):
    print("Test 3 passed\n")
else:
    print("Warning: there are duplicate entries! Duplicate entries:")
    # Keep only the filepaths that occur more than once, with their counts
    counts = dict(Counter(list_filepath))
    duplicates = {path: occurrences for path, occurrences in counts.items() if occurrences > 1}
    print(duplicates)
    print("Test 3 failed\n")

# Test 4: xlsx filepath -> txt
print("Test 4: Check that for every paper listed in xlsx, there is a corresponding txt paper based on a formula for the file path")
list_txt_exist_xlsx, list_txt_does_not_exist_xlsx = check_txt_exists_for_each_xlsx_filepath(list_filepath)
print("Test 4 failed\n" if list_txt_does_not_exist_xlsx else "Test 4 passed\n")

# Test 5: txt -> xlsx filepath
print("Test 5: Check that for every txt file, the same filepath appears in the xlsx")
list_xlsx_filepath_exists, list_xlsx_filepath_does_not_exist = check_xlsx_filepath_exists_for_each_txt(txtfolder, list_filepath)
print("Test 5 failed\n" if list_xlsx_filepath_does_not_exist else "Test 5 passed\n")

# Test 6: title in xlsx vs first page of the txt
print("Test 6: Check that the corresponding txt paper has the same title as in the xlsx file")
df_papers, number_of_papers_without_all_words_in_title_found = find_title_in_first_page_of_paper(df_papers)
print("Test 6 failed\n" if number_of_papers_without_all_words_in_title_found > 0 else "Test 6 passed\n")

print("--- End of Tests ---")

--- Start of Tests ---

Test 1: Check that for every pdf in the folder, there exists a txt file with the same name
number of pdf files with corresponding txt file:  2387
number of pdf files without corresponding txt file:  0
Test 1 passed

Test 2: Check that for every txt in the folder, there exists a pdf file with the same name
number of txt files with corresponding pdf file:  2387
number of txt files without corresponding pdf file:  0
Test 2 passed

Test 3: Get the filepath for each paper listed in xlsx based on a formula, and check that it is unique
number of entries in list_filepath:  2387
number of unique entries in list_filepath:  2387
Test 3 passed

Test 4: Check that for every paper listed in xlsx, there is a corresponding txt paper based on a formula for the file path
number of xlsx filepaths with corresponding txt file:  2387
number of xlsx filepaths without corresponding txt file:  0
Test 4 passed

Test 5: Check that for every txt file, the same filepath appears in the xlsx
