In [64]:
import pandas as pd
import datetime as dt
import re
import os

In [4]:
def does_fuzzy_report_exist(pub_code):
    """Tell whether the fuzzy-matching report for ``pub_code`` is on disk.

    The report is looked up, relative to the current working directory, at
    ``econlit_scopus_matching_out/<pub_code>_fuzzy_results/<pub_code>_fuzzy_matching_report.txt``.

    Returns
    -------
    bool
        ``True`` when that path exists, ``False`` otherwise.
    """
    path_template = 'econlit_scopus_matching_out/{}_fuzzy_results/{}_fuzzy_matching_report.txt'
    return os.path.exists(path_template.format(pub_code, pub_code))


In [48]:
def get_report_timestamp(pub_code, report_type: str):
    """Read the timestamp stored on the first line of a matching report.

    Parameters
    ----------
    pub_code
        Publication code used to build the report path (e.g. ``'RES'``).
    report_type : str
        Which report to read: ``'fuzzy'`` for ``<pub_code>_fuzzy_matching_report.txt``
        or ``'hand_selection'`` for ``<pub_code>_hand_selection_report.txt``.
        Both live in ``econlit_scopus_matching_out/<pub_code>_fuzzy_results/``.

    Returns
    -------
    datetime.datetime
        The timestamp parsed from the report's first line.

    Raises
    ------
    ValueError
        If ``report_type`` is not one of the two supported values, or if the
        first line of the report is not a ``YYYY-MM-DD HH:MM:SS[.ffffff]``
        timestamp.
    FileNotFoundError
        If the report does not exist (raised by ``open``).
    """
    report_file_names = {
        'fuzzy': '{}_fuzzy_matching_report.txt',
        'hand_selection': '{}_hand_selection_report.txt',
    }
    if report_type not in report_file_names:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError(
            "report_type must be 'fuzzy' or 'hand_selection', got {!r}".format(report_type)
        )

    report_path = (
        'econlit_scopus_matching_out/{}_fuzzy_results/'.format(pub_code)
        + report_file_names[report_type].format(pub_code)
    )

    # Only the first line is needed, so do not load the whole report.
    with open(report_path, 'r') as report:
        timestamp_text = report.readline().strip()

    # str(datetime) omits the fractional part when microsecond == 0, so accept
    # both spellings. Explicit %H:%M:%S is used because %X depends on the locale.
    for timestamp_format in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'):
        try:
            return dt.datetime.strptime(timestamp_text, timestamp_format)
        except ValueError:
            continue

    raise ValueError(
        'First line of {} is not a timestamp: {!r}'.format(report_path, timestamp_text)
    )

In [46]:
def does_hand_selection_report_exist(pub_code):
    """Tell whether the hand-selection report for ``pub_code`` is on disk.

    The report is looked up, relative to the current working directory, at
    ``econlit_scopus_matching_out/<pub_code>_fuzzy_results/<pub_code>_hand_selection_report.txt``.

    Returns
    -------
    bool
        ``True`` when that path exists, ``False`` otherwise.
    """
    path_template = 'econlit_scopus_matching_out/{}_fuzzy_results/{}_hand_selection_report.txt'
    return os.path.exists(path_template.format(pub_code, pub_code))

In [40]:
def generate_hand_selection_report(pub_code):
    """Create the hand-selection report as a re-headed copy of the fuzzy report.

    The fuzzy-matching report for ``pub_code`` is read, its first two lines
    are replaced by the current timestamp and the hand-selection title, and
    the result is written next to it as ``<pub_code>_hand_selection_report.txt``
    (overwriting any existing file). The fuzzy report itself is not modified.

    Parameters
    ----------
    pub_code
        Publication code used to build both report paths (e.g. ``'RES'``).

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If the fuzzy-matching report does not exist (raised by ``open``).
    """
    results_dir = 'econlit_scopus_matching_out/{}_fuzzy_results/'.format(pub_code)
    fuzzy_report_path = results_dir + '{}_fuzzy_matching_report.txt'.format(pub_code)
    hand_selection_report_path = results_dir + '{}_hand_selection_report.txt'.format(pub_code)

    with open(fuzzy_report_path, 'r') as fuzzy_report:
        fuzzy_report_lines = fuzzy_report.readlines()

    # strftime always emits the six-digit fractional part; str(datetime) drops
    # it when microsecond == 0, which a '%f'-based parser would then reject.
    current_timestamp = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + '\n'
    report_title = 'HAND SELECTION_REPORT\n'

    # Slicing (rather than assigning to indexes 0 and 1) also works when the
    # fuzzy report has fewer than two lines.
    hand_selection_report_lines = [current_timestamp, report_title] + fuzzy_report_lines[2:]

    with open(hand_selection_report_path, 'w') as hand_selection_report:
        hand_selection_report.writelines(hand_selection_report_lines)

    return


In [107]:
def _hand_selected_titles(report_section):
    """Return the text following each ``***`` marker, up to the end of its line."""
    # No trailing '\n' in the pattern, so a starred final line that lacks a
    # newline is still picked up; '.' never crosses a line break.
    return re.findall(r'\*\*\*(.*)', report_section)


def identify_append_hand_selections(pub_code):
    """Append hand-selected unmatched Scopus rows to the fuzzy matches.

    The hand-selection report for ``pub_code`` is split into its
    "SCOPUS ARTICLES REMAINING UNMATCHED (n)" and
    "ECONLIT ARTICLES REMAINING UNMATCHED (n)" sections. In each section,
    lines carrying a ``***`` marker are treated as hand-selected titles.
    Those titles are looked up in ``<pub_code>_fuzzy_unmatched_scopus.csv``
    (column ``sc_title``) and ``<pub_code>_fuzzy_unmatched_econlit.csv``
    (column ``title``). Row counts and column indexes are printed along the
    way as progress output.

    Parameters
    ----------
    pub_code
        Publication code used to build every input path (e.g. ``'RES'``).

    Returns
    -------
    pandas.DataFrame
        The rows of ``<pub_code>_fuzzy_matches.csv`` followed by the
        hand-selected Scopus rows, with a fresh integer index.

    Raises
    ------
    ValueError
        If either section header is missing from the hand-selection report.
    FileNotFoundError
        If the report or any of the three CSV files does not exist.
    """
    path_prepend = 'econlit_scopus_matching_out/'
    results_dir = path_prepend + '{}_fuzzy_results/'.format(pub_code)

    hand_selection_report_path = results_dir + '{}_hand_selection_report.txt'.format(pub_code)
    with open(hand_selection_report_path, 'r') as report_file:
        hand_selection_report = report_file.read()

    scopus_header = re.search(r'SCOPUS ARTICLES REMAINING UNMATCHED \(\d*\)', hand_selection_report)
    econlit_header = re.search(r'ECONLIT ARTICLES REMAINING UNMATCHED \(\d*\)', hand_selection_report)
    if scopus_header is None or econlit_header is None:
        raise ValueError(
            'Could not find both unmatched-article sections in {}'.format(hand_selection_report_path)
        )

    # The slicing relies on the Scopus section preceding the EconLit section
    # in the report; if the order were reversed the Scopus slice would be empty.
    unmatched_scopus_articles_string = hand_selection_report[scopus_header.start():econlit_header.start()]
    unmatched_econlit_articles_string = hand_selection_report[econlit_header.start():]

    hand_selected_scopus_titles = _hand_selected_titles(unmatched_scopus_articles_string)
    # Collect the captured title text here as well; storing the match objects
    # themselves made the EconLit ``isin`` filter below always come back empty.
    hand_selected_econlit_titles = _hand_selected_titles(unmatched_econlit_articles_string)

    fuzzy_matched_df = pd.read_csv(results_dir + '{}_fuzzy_matches.csv'.format(pub_code))
    print(len(fuzzy_matched_df))
    print(fuzzy_matched_df.columns)
    scopus_unmatched_df = pd.read_csv(results_dir + '{}_fuzzy_unmatched_scopus.csv'.format(pub_code))
    econlit_unmatched_df = pd.read_csv(results_dir + '{}_fuzzy_unmatched_econlit.csv'.format(pub_code))

    scopus_hand_selected_df = scopus_unmatched_df[
        scopus_unmatched_df['sc_title'].isin(hand_selected_scopus_titles)
    ]
    econlit_hand_selected_df = econlit_unmatched_df[
        econlit_unmatched_df['title'].isin(hand_selected_econlit_titles)
    ]

    print(len(scopus_hand_selected_df))
    print(len(econlit_hand_selected_df))

    # NOTE(review): only the Scopus selections are appended; the EconLit
    # selections are identified and counted but not merged into the result.
    # Confirm whether they should be paired with the Scopus rows.
    hand_selected_df = pd.concat([fuzzy_matched_df, scopus_hand_selected_df], ignore_index=True)
    print(hand_selected_df.columns)

    print(len(hand_selected_df))

    return hand_selected_df

In [109]:
def output_hand_appended(pub_code, hand_selected_df):
    """Save the combined matches for ``pub_code`` as a UTF-8 CSV file.

    The frame is written, without its index, to
    ``econlit_scopus_matching_out/<pub_code>_hand_selected_appended.csv``
    relative to the current working directory.

    Parameters
    ----------
    pub_code
        Publication code used in the output file name.
    hand_selected_df
        Object exposing ``to_csv`` (a pandas DataFrame in this notebook).

    Returns
    -------
    None
    """
    csv_name = '{}_hand_selected_appended.csv'.format(pub_code)
    destination = 'econlit_scopus_matching_out/' + csv_name
    hand_selected_df.to_csv(destination, index=False, encoding='utf-8')

In [108]:
# Publication code processed by this cell; change it here to run another journal.
PUB_CODE = 'RES'

if not does_fuzzy_report_exist(PUB_CODE):
    # Nothing to work from until the fuzzy-matching step has produced its report.
    print("It looks like a fuzzy matching report for {} doesn't exist yet. Please first run the fuzzy-matching algorithm".format(PUB_CODE))
elif not does_hand_selection_report_exist(PUB_CODE):
    # First run for this publication: create the report the user will mark up.
    print("It looks like a hand-selection report for {} doesn't exist yet. Generating one now.".format(PUB_CODE))
    generate_hand_selection_report(PUB_CODE)
elif get_report_timestamp(PUB_CODE, 'fuzzy') < get_report_timestamp(PUB_CODE, 'hand_selection'):
    # The hand-selection report is newer than the fuzzy report it was copied
    # from, so its selections can be applied.
    print('Adding hand-selected observations')
    hand_selected_df = identify_append_hand_selections(PUB_CODE)
    output_hand_appended(PUB_CODE, hand_selected_df)
else:
    print(
        "It looks like your fuzzy-matching report for {} has been generated more recently "
        "than your hand-selection report. That means the latter could be out of date. "
        "Please either update the timestamp on the first line of the hand-selection report "
        "manually, or delete that report and re-run this cell to generate a fresh one.".format(PUB_CODE)
    )




Adding hand-selected observations
1423
Index(['doi_x', 'sc_title', 'sc_issn', 'sc_pub_name', 'sc_vol', 'sc_issue',
       'sc_page_range', 'sc_abstract_api_endpoint', 'sc_human_url',
       'sc_pub_date', 'sc_open_access_status', 'sc_query_used',
       'sc_title_upper', 'index', 'id', 'jel_desc', 'jel_code', 'doi_y',
       'title', 'volume', 'issue', 'date', 'pages', 'issn', 'author',
       'abstract', 'title_upper'],
      dtype='object')
2
0
Index(['doi_x', 'sc_title', 'sc_issn', 'sc_pub_name', 'sc_vol', 'sc_issue',
       'sc_page_range', 'sc_abstract_api_endpoint', 'sc_human_url',
       'sc_pub_date', 'sc_open_access_status', 'sc_query_used',
       'sc_title_upper', 'index', 'id', 'jel_desc', 'jel_code', 'doi_y',
       'title', 'volume', 'issue', 'date', 'pages', 'issn', 'author',
       'abstract', 'title_upper'],
      dtype='object')
1425
