In [38]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from thefuzz import fuzz

In [36]:
def load_input_csvs(pub_code):
    """Read the Scopus and EconLit CSV files for one publication.

    Parameters
    ----------
    pub_code : str
        Publication code used as the file-name prefix (the notebook calls
        this with 'RES').

    Returns
    -------
    tuple
        ``(scopus_df, econlit_df)``: the frame read from
        ``scopus_data/<pub_code>_scopus_core.csv`` followed by the frame read
        from ``econlit_xml_csv/<pub_code>_econlit.csv``.
    """
    print('READING IN {}'.format(pub_code))

    # Both files are read as UTF-8; the paths are relative to the working directory.
    scopus_df = pd.read_csv('scopus_data/' + pub_code + '_scopus_core.csv', encoding='utf-8')
    econlit_df = pd.read_csv('econlit_xml_csv/' + pub_code + '_econlit.csv', encoding='utf-8')

    # Report how many observations each source contributed.
    count_lines = (
        ('{} ------ NUMBER OF SCOPUS OBSERVATIONS: {}', scopus_df),
        ('{} ----- NUMBER OF ECONLIT OBSERVATIONS: {}', econlit_df),
    )
    for template, frame in count_lines:
        print(template.format(pub_code, len(frame)))

    return scopus_df, econlit_df

In [3]:
def naive_match(pub_code, scopus_df, econlit_df):
    """Outer-merge Scopus and EconLit rows on volume, issue and upper-cased title.

    Side effect: adds an upper-cased title column to each input frame in
    place (``sc_title_upper`` on ``scopus_df``, ``title_upper`` on
    ``econlit_df``); those columns are used as merge keys.

    Parameters
    ----------
    pub_code : str
        Not used by this function; kept so existing calls keep working.
    scopus_df : pandas.DataFrame
        Must contain ``sc_vol``, ``sc_issue`` and ``sc_title``.
    econlit_df : pandas.DataFrame
        Must contain ``volume``, ``issue`` and ``title``.

    Returns
    -------
    pandas.DataFrame
        The outer merge, with a ``_merge`` indicator column taking the values
        'both', 'left_only' (Scopus only) or 'right_only' (EconLit only).
    """
    # .str.upper() leaves a missing title as NaN; calling x.upper() through
    # apply() raised AttributeError on the first missing title.
    scopus_df['sc_title_upper'] = scopus_df['sc_title'].str.upper()
    econlit_df['title_upper'] = econlit_df['title'].str.upper()

    naive_match_df = pd.merge(
        scopus_df,
        econlit_df,
        how='outer',
        left_on=['sc_vol', 'sc_issue', 'sc_title_upper'],
        right_on=['volume', 'issue', 'title_upper'],
        indicator=True,
    )

    return naive_match_df

In [4]:
def left_right_onlys(naive_match_df):
    """Split the merged frame into its Scopus-only and EconLit-only rows.

    Parameters
    ----------
    naive_match_df : pandas.DataFrame
        Merge result carrying a ``_merge`` indicator column.

    Returns
    -------
    tuple
        ``(nm_scopus_only, nm_econlit_only)``: the 'left_only' and
        'right_only' rows respectively, each renumbered 0..n-1 and with the
        ``_merge`` column removed.
    """
    def _one_side(indicator_value):
        # reset_index(drop=True) renumbers without adding a helper column.
        # The previous in-place reset followed by dropping 'level_0' only
        # worked when the frame already had an 'index' column, and raised
        # KeyError otherwise.
        return (
            naive_match_df[naive_match_df._merge == indicator_value]
            .reset_index(drop=True)
            .drop(columns=['_merge'])
        )

    nm_scopus_only = _one_side('left_only')
    nm_econlit_only = _one_side('right_only')

    print('Number of SCOPUS-only observations: {}'.format(len(nm_scopus_only)))
    print('Number of ECONLIT-only observations: {}'.format(len(nm_econlit_only)))

    return nm_scopus_only, nm_econlit_only

In [5]:
def custom_score_compute(econlit_row, scopus_volume, scopus_issue, scopus_pagerange, scopus_doi, scopus_title_upper):
    """Score how well one EconLit row matches one Scopus article.

    Scoring, as implemented:
      * volume and issue (compared after ``int()`` conversion) must agree,
        otherwise the score is 0; agreement is worth 10 points;
      * an identical page-range value adds 10 points;
      * the mean of three thefuzz title ratios, divided by 10, is added.

    Parameters
    ----------
    econlit_row : row-like object
        Must expose ``volume``, ``issue``, ``pages`` and ``title_upper``.
    scopus_volume, scopus_issue : int-convertible
        Volume and issue of the Scopus article.
    scopus_pagerange :
        Page range of the Scopus article, compared with ``==``.
    scopus_doi :
        Not used yet (DOI scoring is not implemented).
    scopus_title_upper : str
        Upper-cased Scopus title.

    Returns
    -------
    int or float
        0 when volume/issue differ or cannot be converted to int, otherwise
        the accumulated score.
    """
    try:
        econlit_vol_issue = (int(econlit_row.volume), int(econlit_row.issue))
        scopus_vol_issue = (int(scopus_volume), int(scopus_issue))
    except (TypeError, ValueError, OverflowError):
        # A missing (None/NaN) or non-numeric volume/issue cannot be confirmed
        # as the same issue; score it 0 like a mismatch instead of aborting
        # the whole scoring pass.
        return 0

    if econlit_vol_issue != scopus_vol_issue:
        return 0

    ### 10 points for the same volume and issue, 10 more for the same page range
    score = 10
    if econlit_row.pages == scopus_pagerange:
        score += 10

    ### Up to 10 points for Title fuzzy-match edit-distance
    econlit_title_upper = econlit_row.title_upper
    set_edit_distance_ratio = fuzz.token_set_ratio(econlit_title_upper, scopus_title_upper)
    sort_edit_distance_ratio = fuzz.token_sort_ratio(econlit_title_upper, scopus_title_upper)
    gen_edit_distance_ratio = fuzz.ratio(econlit_title_upper, scopus_title_upper)

    # Each ratio is expected on the interval [0,100] (original author's note),
    # so their average is on that interval too.
    average_edit_distance_ratio = np.mean([set_edit_distance_ratio, sort_edit_distance_ratio, gen_edit_distance_ratio])

    print('\t\u251d{}\n\t|\t\t\u251d-AVERAGE SCORE: {}\n\t|\t\t\u251d-set score: {}\n\t|\t\t\u251d-sort score: {}\n\t|\t\t\u2517- gen score: {}'.format(econlit_title_upper, average_edit_distance_ratio, set_edit_distance_ratio, sort_edit_distance_ratio, gen_edit_distance_ratio))

    # Divide the [0,100] by 10 to rescale to [0,10] points towards the custom score
    score += average_edit_distance_ratio/10

    # TODO: up to 2 points for DOI fuzzy-match edit-distance (not implemented;
    # scopus_doi is accepted but unused).

    return score

def custom_scorer(scopus_row, econlit_only_df):
    """Score every EconLit-only row against a single Scopus row.

    Side effect: the scores are written into (or overwrite) the
    ``custom_match_score`` column of ``econlit_only_df`` in place.

    Parameters
    ----------
    scopus_row : row-like object
        Must expose ``sc_vol``, ``sc_issue``, ``sc_page_range``, ``doi_x``
        and ``sc_title_upper``.
    econlit_only_df : pandas.DataFrame
        Candidate EconLit rows; each is passed to ``custom_score_compute``.

    Returns
    -------
    list
        One score per row of ``econlit_only_df``, in row order.
    """
    # Read the Scopus fields once, up front, in the positional order
    # custom_score_compute expects after the EconLit row.
    scopus_fields = (
        scopus_row.sc_vol,
        scopus_row.sc_issue,
        scopus_row.sc_page_range,
        scopus_row.doi_x,
        scopus_row.sc_title_upper,
    )

    def _score_row(econlit_row):
        return custom_score_compute(econlit_row, *scopus_fields)

    econlit_only_df.loc[:, 'custom_match_score'] = econlit_only_df.apply(_score_row, axis=1)

    return econlit_only_df['custom_match_score'].tolist()


In [15]:
def index_matching(score_matrix, min_score=17):
    """Pick mutually-best (Scopus row, EconLit column) pairs from a score matrix.

    A pair is kept only when the column is the row's highest score, the row
    is that column's highest score, and the score is strictly greater than
    ``min_score``. When a row's best column prefers another row, a diagnostic
    line is printed and no pair is recorded for that row.

    Parameters
    ----------
    score_matrix : numpy.ndarray
        2-D array; rows index Scopus articles, columns index EconLit articles.
    min_score : number, optional
        Scores at or below this value are rejected. The default of 17 is the
        threshold that used to be hard-coded here.

    Returns
    -------
    list of tuple
        ``(scopus_index, econlit_index)`` pairs as plain ints, in row order.
    """
    rows = score_matrix.shape[0]
    cols = score_matrix.shape[1]
    print("SCORE MATRIX DIMENSIONS: ({} ROWS, {} COLS)".format(rows, cols))

    matched_pairs = []

    # With no candidates on either side there is nothing to pair; taking the
    # max/argmax of an empty row would raise ValueError.
    if rows == 0 or cols == 0:
        return matched_pairs

    for scopus_index in range(rows):
        # Column (EconLit article) with the highest score for this Scopus row.
        row_scores = score_matrix[scopus_index]
        best_match_index = np.argmax(row_scores)
        best_match_score = row_scores[best_match_index]

        # Row (Scopus article) that this EconLit column scores highest with.
        best_matchs_match_index = np.argmax(score_matrix[:, best_match_index], axis=0)
        best_matchs_match_score = score_matrix[best_matchs_match_index][best_match_index]

        if scopus_index != best_matchs_match_index:
            print('On Scopus index {}, the best match appears to be Econlit index {} (match score: {}), but that Econlit matches best with Scopus index {} (match score: {})'.format(scopus_index, best_match_index, best_match_score, best_matchs_match_index, best_matchs_match_score))
            continue

        if best_match_score > min_score:
            matched_pairs.append((int(scopus_index), int(best_match_index)))

    return matched_pairs
        


In [7]:
def interpret_matches(matched_pairs, nm_scopus_only, nm_econlit_only, score_matrix=None):
    """Print a human-readable line for every proposed fuzzy match.

    Parameters
    ----------
    matched_pairs : iterable of (scopus_index, econlit_index)
        Used as ``.loc`` labels into the two frames and as positions into
        the score matrix.
    nm_scopus_only : pandas.DataFrame
        Must contain ``sc_title_upper``.
    nm_econlit_only : pandas.DataFrame
        Must contain ``title_upper``.
    score_matrix : 2-D array-like, optional
        Scores indexed ``[scopus_index][econlit_index]``. When omitted, the
        notebook-level variable ``unmatched_score_matrix`` is used, which is
        what this function always read before the parameter existed.

    Returns
    -------
    None
    """
    if score_matrix is None:
        # Legacy behaviour: rely on the matrix computed in the driver cell.
        # Prefer passing score_matrix explicitly so the function does not
        # depend on cell execution order.
        score_matrix = unmatched_score_matrix

    print('Interpreting {} matches'.format(len(matched_pairs)))

    for scopus_index, econlit_index in matched_pairs:
        scopus_title = nm_scopus_only.loc[scopus_index, 'sc_title_upper']
        econlit_title = nm_econlit_only.loc[econlit_index, 'title_upper']

        match_score = score_matrix[scopus_index][econlit_index]

        print('Based on scores({}), want to match (({})) with (({}))'.format(match_score, scopus_title, econlit_title))



In [8]:
def matrix_instantiation(scopus_only, econlit_only):
    """Build the Scopus-by-EconLit matrix of custom match scores.

    Each Scopus row is scored against every EconLit row via
    ``custom_scorer``; the Scopus title is printed before each row is scored
    and a blank-line separator after it.

    Parameters
    ----------
    scopus_only : pandas.DataFrame
        One matrix row per entry; must contain ``sc_title_upper``.
    econlit_only : pandas.DataFrame
        One matrix column per entry; passed to ``custom_scorer`` as is.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(scopus_only), len(econlit_only))``.
    """
    n_scopus, n_econlit = len(scopus_only), len(econlit_only)
    score_matrix = np.zeros((n_scopus, n_econlit))

    for row_position in range(n_scopus):
        scopus_row = scopus_only.iloc[row_position]
        print(scopus_row.sc_title_upper)
        # Fill the whole row with this Scopus article's scores.
        score_matrix[row_position, :] = custom_scorer(scopus_row, econlit_only)
        print('\n\n')

    return score_matrix

In [17]:
def append_fuzzy_matches(fuzzy_matches_indices, scopus_only, econlit_only, naive_matched_df):
    """Append the fuzzy-matched Scopus/EconLit row pairs to the naive matches.

    Parameters
    ----------
    fuzzy_matches_indices : iterable of (scopus_index, econlit_index)
        Index labels into ``scopus_only`` and ``econlit_only``.
    scopus_only : pandas.DataFrame
        Must contain every name in ``scopus_columns`` below.
    econlit_only : pandas.DataFrame
        Must contain every name in ``econlit_columns`` below.
    naive_matched_df : pandas.DataFrame
        Rows already matched; the fuzzy matches are appended after them.

    Returns
    -------
    pandas.DataFrame
        ``naive_matched_df`` followed by one row per pair, renumbered from 0.
    """
    scopus_columns = ['doi_x', 'sc_title', 'sc_issn', 'sc_pub_name', 'sc_vol', 'sc_issue', 'sc_page_range', 'sc_abstract_api_endpoint', 'sc_human_url', 'sc_pub_date', 'sc_open_access_status', 'sc_query_used', 'sc_title_upper']
    econlit_columns = ['jel_desc', 'jel_code', 'doi_y', 'title', 'volume', 'issue', 'date', 'pages', 'issn', 'author', 'abstract', 'title_upper']

    scopus_labels = [pair[0] for pair in fuzzy_matches_indices]
    econlit_labels = [pair[1] for pair in fuzzy_matches_indices]

    # Select all matched rows at once and place the two halves side by side.
    # Growing an all-NaN float frame one .loc row at a time re-allocates on
    # every match and loses the source columns' dtypes.
    scopus_half = scopus_only.loc[scopus_labels, scopus_columns].reset_index(drop=True)
    econlit_half = econlit_only.loc[econlit_labels, econlit_columns].reset_index(drop=True)
    fuzzy_matched_df = pd.concat([scopus_half, econlit_half], axis=1)

    fuzzy_matches_appended = pd.concat([naive_matched_df, fuzzy_matched_df], ignore_index=True)
    return fuzzy_matches_appended

In [11]:
def fuzzy_unmatched_remainders(matched_indices, nm_scopus_only_df, nm_econlit_only_df):
    """Return the rows of each frame that did not take part in a fuzzy match.

    Parameters
    ----------
    matched_indices : iterable of (scopus_index, econlit_index)
        Index labels of the matched rows in the two frames.
    nm_scopus_only_df : pandas.DataFrame
        Scopus-only rows before fuzzy matching.
    nm_econlit_only_df : pandas.DataFrame
        EconLit-only rows before fuzzy matching.

    Returns
    -------
    tuple
        ``(fuzzy_unmatched_scopus, fuzzy_unmatched_econlit)``: new frames with
        the matched labels removed; the inputs are not modified.

    Raises
    ------
    KeyError
        If a matched label is not present in the corresponding frame.
    """
    matched_scopus_labels = [pair[0] for pair in matched_indices]
    matched_econlit_labels = [pair[1] for pair in matched_indices]

    # One drop per frame instead of one full-frame copy per matched pair.
    fuzzy_unmatched_scopus = nm_scopus_only_df.drop(index=matched_scopus_labels)
    fuzzy_unmatched_econlit = nm_econlit_only_df.drop(index=matched_econlit_labels)

    return fuzzy_unmatched_scopus, fuzzy_unmatched_econlit

In [57]:
def output_fuzzy_matched_etc(pub_code: str, fuzzy_matches_appended: pd.DataFrame, fuzzy_unmatched_scopus: pd.DataFrame, fuzzy_unmatched_econlit: pd.DataFrame):
    """Write the matched frame and the two unmatched remainders to CSV.

    The files go to ``econlit_scopus_matching_out/<pub_code>_fuzzy_results/``
    (relative to the working directory). If that folder does not exist, a
    notice is printed and the folder is created before writing.

    Parameters
    ----------
    pub_code : str
        Publication code used in the folder and file names.
    fuzzy_matches_appended : pandas.DataFrame
        Written to ``<pub_code>_fuzzy_matches.csv``.
    fuzzy_unmatched_scopus : pandas.DataFrame
        Written to ``<pub_code>_fuzzy_unmatched_scopus.csv``.
    fuzzy_unmatched_econlit : pandas.DataFrame
        Written to ``<pub_code>_fuzzy_unmatched_econlit.csv``.

    Returns
    -------
    None
    """
    fuzzy_folder_path = 'econlit_scopus_matching_out/{}_fuzzy_results/'.format(pub_code)

    # Create the output folder first when it is missing, announcing it.
    if not os.path.exists(fuzzy_folder_path):
        print('{} folder/path does not exist'.format(fuzzy_folder_path))
        print('Creating path now')
        os.makedirs(fuzzy_folder_path)

    outputs = (
        (fuzzy_matches_appended, '{}_fuzzy_matches.csv'),
        (fuzzy_unmatched_scopus, '{}_fuzzy_unmatched_scopus.csv'),
        (fuzzy_unmatched_econlit, '{}_fuzzy_unmatched_econlit.csv'),
    )
    for frame, name_template in outputs:
        frame.to_csv(fuzzy_folder_path + name_template.format(pub_code), encoding='utf-8', index=False)

    return

In [65]:
def generate_matching_report(pub_code, scopus_df, econlit_df, naive_match_df, nm_scopus_only, nm_econlit_only, fuzzy_matches_appended, fuzzy_unmatched_scopus, fuzzy_unmatched_econlit):
    """Write a plain-text summary of the matching pipeline for one publication.

    The report is written to
    ``econlit_scopus_matching_out/<pub_code>_fuzzy_results/<pub_code>_fuzzy_matching_report.txt``
    (relative to the working directory) and holds a timestamp, the row counts
    of every frame passed in, and the titles still unmatched on each side.

    Parameters
    ----------
    pub_code : str
        Publication code used in the path and in every report line.
    scopus_df, econlit_df : pandas.DataFrame
        Original inputs; only their lengths are reported.
    naive_match_df : pandas.DataFrame
        Naively matched rows; only its length is reported.
    nm_scopus_only, nm_econlit_only : pandas.DataFrame
        Unmatched rows after the naive match; only their lengths are reported.
    fuzzy_matches_appended : pandas.DataFrame
        Naive plus fuzzy matches; the number of fuzzy matches added is its
        length minus ``len(naive_match_df)``.
    fuzzy_unmatched_scopus : pandas.DataFrame
        Must contain ``sc_title``; those titles are listed in the report.
    fuzzy_unmatched_econlit : pandas.DataFrame
        Must contain ``title``; those titles are listed in the report.

    Returns
    -------
    None
    """
    bars = '------------------------------------'

    # ORIGINALS
    scopus_original = 'Original {} Scopus-collected observations: {}'.format(pub_code, len(scopus_df))
    econlit_original = 'Original {} EconLit-collected observations: {}'.format(pub_code, len(econlit_df))
    total_original = 'Original {} collection TOTAL observations: {}'.format(pub_code, len(scopus_df) + len(econlit_df))

    # POST-NAIVE-MATCH
    naive_match = 'Number of {} naively-matched observations: {}'.format(pub_code, len(naive_match_df))
    post_naive_scopus_only = 'Post-naive match {} Scopus-only observations: {}'.format(pub_code, len(nm_scopus_only))
    post_naive_econlit_only = 'Post-naive match {} EconLit-only observations: {}'.format(pub_code, len(nm_econlit_only))

    ### POST-FUZZY-MATCH
    fuzzy_matches_added = len(fuzzy_matches_appended) - len(naive_match_df)
    fuzzy_match = 'Number of {} fuzzy-matched observations {} ({} added)'.format(pub_code, len(fuzzy_matches_appended), fuzzy_matches_added)
    post_fuzzy_scopus_only = 'Number of {} post-fuzzy-match Scopus-only remainder observations: {}'.format(pub_code, len(fuzzy_unmatched_scopus))
    post_fuzzy_econlit_only = 'Number of {} post-fuzzy-match EconLit-only remainder observations: {}'.format(pub_code, len(fuzzy_unmatched_econlit))

    post_fuzzy_scopus_titles = fuzzy_unmatched_scopus.sc_title.tolist()
    post_fuzzy_econlit_titles = fuzzy_unmatched_econlit.title.tolist()

    lines = ['FUZZY MATCHING REPORT', bars, scopus_original, econlit_original, total_original, bars, naive_match, post_naive_scopus_only, post_naive_econlit_only, bars, fuzzy_match, post_fuzzy_scopus_only, post_fuzzy_econlit_only, bars]

    matching_report_path = 'econlit_scopus_matching_out/{}_fuzzy_results/{}_fuzzy_matching_report.txt'.format(pub_code, pub_code)

    # Do not rely on another step having created the output folder.
    os.makedirs(os.path.dirname(matching_report_path), exist_ok=True)

    # Explicit UTF-8, like the CSV I/O elsewhere in this notebook: titles can
    # contain non-ASCII characters that the platform default encoding may reject.
    with open(matching_report_path, 'w', encoding='utf-8') as report:
        report.write(str(datetime.now()))
        for line in lines:
            report.write('\n{}'.format(line))

        report.write('\nSCOPUS ARTICLES REMAINING UNMATCHED ({})\n'.format(len(fuzzy_unmatched_scopus)))
        for title in post_fuzzy_scopus_titles:
            report.write('\n\t{}'.format(title))

        report.write('\nECONLIT ARTICLES REMAINING UNMATCHED ({})\n'.format(len(fuzzy_unmatched_econlit)))
        for title in post_fuzzy_econlit_titles:
            report.write('\n\t{}'.format(title))

    return


In [66]:
# Driver cell: runs the whole Scopus/EconLit matching pipeline for one publication.
# Change PUB_CODE to process a different publication.
PUB_CODE = 'RES'

# 1. Load both sources.
scopus_df, econlit_df = load_input_csvs(PUB_CODE)
print('----------------------------------------')

# 2. Exact ("naive") match on volume, issue and upper-cased title.
naive_match_df = naive_match(PUB_CODE, scopus_df, econlit_df)
print('----------------------------------------')

# 3. Separate the one-sided rows, then keep only the two-sided matches.
nm_scopus_only, nm_econlit_only = left_right_onlys(naive_match_df)
# reset_index(drop=True) renumbers without adding a helper column; the earlier
# in-place reset on the filtered slice, followed by dropping 'level_0', only
# worked when the merged frame already had an 'index' column.
naive_match_df = (
    naive_match_df[naive_match_df._merge == 'both']
    .reset_index(drop=True)
    .drop(columns=['_merge'])
)
print('----------------------------------------')

# 4. Score every Scopus-only row against every EconLit-only row and pair them up.
unmatched_score_matrix = matrix_instantiation(nm_scopus_only, nm_econlit_only)
print(unmatched_score_matrix.shape)
matched_indices_list = index_matching(unmatched_score_matrix)
interpret_matches(matched_indices_list, nm_scopus_only, nm_econlit_only)
print('----------------------------------------')

# 5. Add the fuzzy matches to the naive ones and compute what is still unmatched.
fuzzy_matches_appended = append_fuzzy_matches(matched_indices_list, nm_scopus_only, nm_econlit_only, naive_match_df)
print(len(naive_match_df))
print(len(fuzzy_matches_appended))
fuzzy_unmatched_scopus, fuzzy_unmatched_econlit = fuzzy_unmatched_remainders(matched_indices_list, nm_scopus_only, nm_econlit_only)
print('----------------------------------------')

# 6. Write the CSV outputs and the text report.
output_fuzzy_matched_etc(PUB_CODE, fuzzy_matches_appended, fuzzy_unmatched_scopus, fuzzy_unmatched_econlit)
generate_matching_report(PUB_CODE, scopus_df, econlit_df, naive_match_df, nm_scopus_only, nm_econlit_only, fuzzy_matches_appended, fuzzy_unmatched_scopus, fuzzy_unmatched_econlit)

READING IN RES
RES ------ NUMBER OF SCOPUS OBSERVATIONS: 1461
RES ----- NUMBER OF ECONLIT OBSERVATIONS: 1540
----------------------------------------
----------------------------------------
Number of SCOPUS-only observations: 146
Number of ECONLIT-only observations: 225
----------------------------------------
MONOPOLY WITHOUT A MONOPOLIST: AN ECONOMIC ANALYSIS OF THE BITCOIN PAYMENT SYSTEM



OPTIMAL TAXATION WITH PRIVATE INSURANCE



MEASURING BIAS IN CONSUMER LENDING



DEFAULT EFFECTS AND FOLLOW-ON BEHAVIOUR: EVIDENCE FROM AN ELECTRICITY PRICING PROGRAM



THE VALUE OF UNEMPLOYMENT INSURANCE



WORKFORCE COMPOSITION, PRODUCTIVITY, AND LABOUR REGULATIONS IN A COMPENSATING DIFFERENTIALS THEORY OF INFORMALITY



RULES WITHOUT COMMITMENT: REPUTATION AND INCENTIVES



SHADOW BANKING AND THE FOUR PILLARS OF TRADITIONAL FINANCIAL INTERMEDIATION



NON-PARAMETRIC ANALYSIS OF TIME-INCONSISTENT PREFERENCES



VARIATION MARGINS, FIRE SALES, AND INFORMATION-CONSTRAINED OPTIMALITY



A THEORY 