In [64]:
import pandas as pd
import datetime as dt
import re
import os

In [4]:
def does_fuzzy_report_exist(pub_code):
    """Tell whether the fuzzy-matching report for ``pub_code`` is on disk.

    The report is looked up (relative to the current working directory) at
    ``econlit_scopus_matching_out/<pub_code>_fuzzy_results/<pub_code>_fuzzy_matching_report.txt``.

    Returns
    -------
    bool
        ``True`` if that path exists, ``False`` otherwise.
    """
    return os.path.exists(
        'econlit_scopus_matching_out/{}_fuzzy_results/{}_fuzzy_matching_report.txt'.format(pub_code, pub_code)
    )


In [48]:
def get_report_timestamp(pub_code, report_type: str):
    """Read the timestamp stored on the first line of a report file.

    Parameters
    ----------
    pub_code
        Journal code used to build the report path (e.g. ``'RES'``).
    report_type : str
        ``'fuzzy'`` for the fuzzy-matching report, or ``'hand_selection'``
        for the hand-selection report.

    Returns
    -------
    datetime.datetime
        The timestamp parsed from the report's first line.

    Raises
    ------
    ValueError
        If ``report_type`` is not one of the supported values, or if the
        first line is not a ``YYYY-MM-DD HH:MM:SS[.ffffff]`` timestamp.
    OSError
        If the report file cannot be opened (e.g. it does not exist).
    """
    report_file_templates = {
        'fuzzy': '{}_fuzzy_matching_report.txt',
        'hand_selection': '{}_hand_selection_report.txt',
    }
    if report_type not in report_file_templates:
        # Fail loudly here instead of hitting an unbound `report_path` below.
        raise ValueError(
            "Unknown report_type {!r}; expected one of {}".format(
                report_type, sorted(report_file_templates)
            )
        )

    report_path = (
        'econlit_scopus_matching_out/{}_fuzzy_results/'.format(pub_code)
        + report_file_templates[report_type].format(pub_code)
    )

    # Only the first line is needed, so do not load the whole report.
    with open(report_path, 'r') as report:
        timestamp_line = report.readline().strip()

    # Explicit %H:%M:%S instead of the locale-dependent %X. The second format
    # covers timestamps written without a fractional part (str(datetime)
    # omits it when microsecond == 0, and a hand-edited stamp may too).
    for timestamp_format in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'):
        try:
            return dt.datetime.strptime(timestamp_line, timestamp_format)
        except ValueError:
            continue

    raise ValueError(
        "First line of {} is not a 'YYYY-MM-DD HH:MM:SS[.ffffff]' timestamp: {!r}".format(
            report_path, timestamp_line
        )
    )

In [46]:
def does_hand_selection_report_exist(pub_code):
    """Tell whether the hand-selection report for ``pub_code`` is on disk.

    The report is looked up (relative to the current working directory) at
    ``econlit_scopus_matching_out/<pub_code>_fuzzy_results/<pub_code>_hand_selection_report.txt``.

    Returns
    -------
    bool
        ``True`` if that path exists, ``False`` otherwise.
    """
    return os.path.exists(
        'econlit_scopus_matching_out/{}_fuzzy_results/{}_hand_selection_report.txt'.format(pub_code, pub_code)
    )

In [40]:
def generate_hand_selection_report(pub_code):
    """Create the hand-selection report for ``pub_code`` from its fuzzy report.

    The new file is a copy of the fuzzy-matching report in which the first
    two lines are replaced by the current timestamp and the title line
    ``HAND SELECTION_REPORT``. If a hand-selection report already exists at
    the target path it is overwritten.

    Parameters
    ----------
    pub_code
        Journal code used to build both report paths (e.g. ``'RES'``).

    Returns
    -------
    None

    Raises
    ------
    OSError
        If the fuzzy-matching report cannot be read or the hand-selection
        report cannot be written.
    """
    results_dir = 'econlit_scopus_matching_out/{}_fuzzy_results/'.format(pub_code)
    fuzzy_report_path = results_dir + '{}_fuzzy_matching_report.txt'.format(pub_code)
    hand_selection_report_path = results_dir + '{}_hand_selection_report.txt'.format(pub_code)

    with open(fuzzy_report_path, 'r') as fuzzy_report:
        fuzzy_report_lines = fuzzy_report.readlines()

    # str(datetime) drops the fractional part when microsecond == 0, which a
    # '%f'-based parser rejects; always write all six digits.
    current_timestamp = dt.datetime.now().isoformat(sep=' ', timespec='microseconds') + '\n'
    report_title = 'HAND SELECTION_REPORT\n'

    # Swap out the two header lines and keep the rest of the fuzzy report.
    # Slicing (rather than index assignment) also copes with a fuzzy report
    # that has fewer than two lines.
    hand_selection_report_lines = [current_timestamp, report_title] + fuzzy_report_lines[2:]

    with open(hand_selection_report_path, 'w') as hand_selection_report:
        hand_selection_report.writelines(hand_selection_report_lines)

    return


In [138]:
def identify_append_hand_selections(pub_code):
    """Append hand-selected unmatched articles to the fuzzy-matched table.

    Steps:

    1. Read the hand-selection report for ``pub_code`` and split it at the
       ``SCOPUS ARTICLES REMAINING UNMATCHED (n)`` and
       ``ECONLIT ARTICLES REMAINING UNMATCHED (n)`` headings.
    2. In each section, collect the title of every entry of the form
       ``***<title> (VOL. <digits>``.
    3. Select the rows of the unmatched-Scopus CSV whose ``sc_title`` is one
       of the collected Scopus titles, and the rows of the unmatched-EconLit
       CSV whose ``title`` is one of the collected EconLit titles.
    4. Concatenate those rows below the fuzzy-matches CSV.

    Progress information is printed along the way.

    Parameters
    ----------
    pub_code
        Journal code used to build all input paths (e.g. ``'RES'``).

    Returns
    -------
    pandas.DataFrame
        Fuzzy matches followed by the hand-selected Scopus rows and then the
        hand-selected EconLit rows, with a fresh integer index.

    Raises
    ------
    ValueError
        If either section heading is missing from the hand-selection report.
    OSError
        If the report or one of the CSV files cannot be read.
    """
    path_prepend = 'econlit_scopus_matching_out/'
    results_dir = path_prepend + '{}_fuzzy_results/'.format(pub_code)

    hand_selection_report_path = results_dir + '{}_hand_selection_report.txt'.format(pub_code)
    with open(hand_selection_report_path, 'r') as report_file:
        hand_selection_report = report_file.read()

    scopus_heading = re.search(r'SCOPUS ARTICLES REMAINING UNMATCHED \(\d*\)', hand_selection_report)
    econlit_heading = re.search(r'ECONLIT ARTICLES REMAINING UNMATCHED \(\d*\)', hand_selection_report)
    if scopus_heading is None or econlit_heading is None:
        raise ValueError(
            "Could not find both 'SCOPUS ARTICLES REMAINING UNMATCHED (n)' and "
            "'ECONLIT ARTICLES REMAINING UNMATCHED (n)' headings in {}".format(hand_selection_report_path)
        )

    # The Scopus section runs from its heading up to the EconLit heading; the
    # EconLit section runs from its heading to the end of the report.
    # NOTE(review): this assumes the Scopus heading comes first -- confirm
    # against the report generator.
    unmatched_scopus_articles_string = hand_selection_report[scopus_heading.start():econlit_heading.start()]
    unmatched_econlit_articles_string = hand_selection_report[econlit_heading.start():]

    # Group 1 is the title: everything between '***' and ' (VOL. <digits>'.
    hand_selection_pattern = r'\*\*\*(.*) \(VOL\. \d+'
    hand_selected_scopus_titles = re.findall(hand_selection_pattern, unmatched_scopus_articles_string)
    hand_selected_econlit_titles = re.findall(hand_selection_pattern, unmatched_econlit_articles_string)

    print('Hand-selected Scopus titles', hand_selected_scopus_titles)
    print('Hand-selected EconLit titles', hand_selected_econlit_titles)

    fuzzy_matched_path = results_dir + '{}_fuzzy_matches.csv'.format(pub_code)
    scopus_unmatched_path = results_dir + '{}_fuzzy_unmatched_scopus.csv'.format(pub_code)
    econlit_unmatched_path = results_dir + '{}_fuzzy_unmatched_econlit.csv'.format(pub_code)

    fuzzy_matched_df = pd.read_csv(fuzzy_matched_path)
    print('No. observations prior to appending hand-selected: ', len(fuzzy_matched_df))

    scopus_unmatched_df = pd.read_csv(scopus_unmatched_path)
    econlit_unmatched_df = pd.read_csv(econlit_unmatched_path)

    scopus_hand_selected_df = scopus_unmatched_df[scopus_unmatched_df['sc_title'].isin(hand_selected_scopus_titles)]
    econlit_hand_selected_df = econlit_unmatched_df[econlit_unmatched_df['title'].isin(hand_selected_econlit_titles)]

    print('Trying to append {} hand-selected Scopus article(s)'.format(len(scopus_hand_selected_df)))
    print('Trying to append {} hand-selected EconLit article(s)'.format(len(econlit_hand_selected_df)))

    # Column dtypes of the three frames, printed to help diagnose dtype
    # changes introduced by the concatenation below.
    print(fuzzy_matched_df.dtypes)
    print(scopus_hand_selected_df.dtypes)
    print(econlit_hand_selected_df.dtypes)

    # A pandas FutureWarning may be emitted here for the
    # 'sc_open_access_Status' column, which is all-NaN (numeric dtype) in
    # econlit_hand_selected_df. It is coerced to object dtype, which keeps
    # the information, so the warning can be ignored.
    hand_selected_df = pd.concat(
        [fuzzy_matched_df, scopus_hand_selected_df, econlit_hand_selected_df],
        ignore_index=True,
    )

    print('No. observations after to appending hand-selected: ', len(hand_selected_df))

    return hand_selected_df

In [110]:
def output_hand_appended(pub_code, hand_selected_df):
    """Save the hand-appended table for ``pub_code`` as a CSV file.

    The frame is written, UTF-8 encoded and without its index, to
    ``econlit_scopus_matching_out/<pub_code>_hand_selected_appended.csv``
    (relative to the current working directory).

    Parameters
    ----------
    pub_code
        Journal code used to build the output file name.
    hand_selected_df
        Object whose ``to_csv`` method is called to write the file.

    Returns
    -------
    None
    """
    destination = 'econlit_scopus_matching_out/' + '{}_hand_selected_appended.csv'.format(pub_code)
    hand_selected_df.to_csv(destination, index=False, encoding='utf-8')

In [142]:
# Journal codes to process in the driver cell below. Flip a flag to True to
# include that journal in the run.
run_list = [
    journal_code
    for journal_code, is_enabled in (
        ('AER', False),
        ('ECA', False),
        ('JPE', False),
        ('QJE', False),
        ('RES', True),
        ('RJE', False),
    )
    if is_enabled
]

In [146]:
# Drive the hand-selection workflow for every journal code in run_list:
#   * no fuzzy report yet          -> ask the user to run fuzzy matching first;
#   * no hand-selection report yet -> generate one for the user to annotate;
#   * hand-selection report newer than the fuzzy report
#                                  -> append the hand-selected rows and save;
#   * otherwise                    -> warn that the hand-selection report may
#                                     be stale.
for pub_code in run_list:
    if not does_fuzzy_report_exist(pub_code):
        print("It looks like a fuzzy matching report for {} doesn't exist yet. Please first run the fuzzy-matching algorithm".format(pub_code))
        continue

    if not does_hand_selection_report_exist(pub_code):
        print("It looks like a hand-selection report for {} doesn't exist yet. Generating one now.".format(pub_code))
        generate_hand_selection_report(pub_code)
        print("Please examine the newly generated hand-selection report for {} and identify articles as you see fit".format(pub_code))
        continue

    # Both reports exist: only trust the hand selections if they were made
    # against the current fuzzy-matching results.
    if get_report_timestamp(pub_code, 'fuzzy') < get_report_timestamp(pub_code, 'hand_selection'):
        print('Adding hand-selected observations')
        hand_selected_df = identify_append_hand_selections(pub_code)
        output_hand_appended(pub_code, hand_selected_df)
    else:
        print(
            "It looks like your Fuzzy-matching report for {} has been generated more recently than your "
            "hand-selection report. That means the latter could be out of date. Please either update the "
            "timestamp on the first line of the hand-selection report manually, or delete that report and "
            "re-run this cell to regenerate it (any hand selections made in it will be lost).".format(pub_code)
        )




It looks like a hand-selection report for RES doesn't exist yet. Generating one now.
Please examine the newly generated hand-selection report for RES and identify articles as you see fit
