In [25]:
import pandas as pd
import numpy as np
from thefuzz import fuzz

In [18]:
def load_input_csvs(pub_code):
    print('READING IN {}'.format(pub_code))
    scopus_core_path = 'econlit_scopus_matching_out/' + pub_code + '_scopus_core.csv'
    econlit_path = 'econlit_scopus_matching_out/' + pub_code.lower() + '_econlit.csv'

    scopus_df = pd.read_csv(scopus_core_path, encoding='utf-8')
    econlit_df = pd.read_csv(econlit_path, encoding='utf-8')

    print('{} ------ NUMBER OF SCOPUS OBSERVATIONS: {}'.format(pub_code, len(scopus_df)))
    print('{} ----- NUMBER OF ECONLIT OBSERVATIONS: {}'.format(pub_code, len(econlit_df)))


    return scopus_df, econlit_df

In [19]:
def naive_match(pub_code, scopus_df, econlit_df):
    scopus_df['sc_title_upper'] = scopus_df['sc_title'].apply(lambda x: x.upper())
    econlit_df['title_upper'] = econlit_df['title'].apply(lambda x: x.upper())

    naive_match_df = pd.merge(scopus_df, econlit_df,
        how='outer',
        left_on=['sc_vol', 'sc_issue', 'sc_title_upper'],
        right_on=['volume', 'issue', 'title_upper'],
        indicator=True)

    scopus_only_count = len(naive_match_df[naive_match_df._merge == 'left_only'])
    econlit_only_count = len(naive_match_df[naive_match_df._merge == 'right_only'])

    print('{} ------ NUMBER OF SCOPUS-ONLY OBSERVATIONS: {}'.format(pub_code, scopus_only_count))
    print('{} ----- NUMBER OF ECONLIT-ONLY OBSERVATIONS: {}'.format(pub_code, econlit_only_count))

    return naive_match_df

In [26]:
def left_right_onlys(naive_match_df):
    nm_scopus_only = naive_match_df[naive_match_df._merge == 'left_only']
    nm_econlit_only = naive_match_df[naive_match_df._merge == 'right_only']

    nm_scopus_only.reset_index(inplace=True)
    nm_econlit_only.reset_index(inplace=True)

    print('Number of SCOPUS-only observations: {}'.format(len(nm_scopus_only)))
    print('Number of ECONLIT-only observations: {}'.format(len(nm_econlit_only)))

    return nm_scopus_only, nm_econlit_only

In [None]:
def custom_scorer(scopus_row, econlit_only_df):
    scopus_volume = scopus_row.sc_volume
    scopus_issue = scopus_row.sc_issue
    scopus_pagerange = scopus_row.sc_page_range
    scopus_doi = scopus_row.doi
    scopus_title_upper = scopus_row.sc_title_upper
    


In [38]:
def scores_instantiation(scopus_only, econlit_only):
    # Rows (number of scopus-only articles)
    n = len(scopus_only)
    # Columns (number of econlit-only articles)
    m = len(econlit_only)

    score_matrix = np.zeros((n,m))

    for i in range(0,n):
        # for j in range(0,m):
            # score_matrix[i][j] = round(float(str(i) + '.' + str (j)), 4)
        scopus_row = scopus_only.loc[i, ;]
        temp_econlit_df = econlit_only
        score_matrix[i][0:m] = custom_scorer(scopus_row, temp_econlit_df)
    
    return score_matrix

In [39]:
scopus_df, econlit_df = load_input_csvs('ECA')
print(scopus_df.columns)
print(econlit_df.columns)
print('----------------------------------------')
naive_match_df = naive_match('ECA', scopus_df, econlit_df)
print('----------------------------------------')
nm_scopus_only, nm_econlit_only = left_right_onlys(naive_match_df)
print('----------------------------------------')
unmatched_score_matrix = scores_instantiation(nm_scopus_only, nm_econlit_only)
print(unmatched_score_matrix.shape)
print(unmatched_score_matrix)


READING IN ECA
ECA ------ NUMBER OF SCOPUS OBSERVATIONS: 1668
ECA ----- NUMBER OF ECONLIT OBSERVATIONS: 2533
Index(['doi', 'sc_title', 'sc_issn', 'sc_pub_name', 'sc_vol', 'sc_issue',
       'sc_page_range', 'sc_abstract_api_endpoint', 'sc_human_url',
       'sc_pub_date', 'sc_open_access_status', 'sc_query_used'],
      dtype='object')
Index(['volume', 'issue', 'date', 'abstract', 'author', 'jel_desc', 'jel_code',
       'title', 'L_code', 'K_code', 'D4_code', 'O3_code', 'G34_code', 'year',
       'month', 'day'],
      dtype='object')
----------------------------------------
ECA ------ NUMBER OF SCOPUS-ONLY OBSERVATIONS: 140
ECA ----- NUMBER OF ECONLIT-ONLY OBSERVATIONS: 1005
----------------------------------------
Number of SCOPUS-only observations: 140
Number of ECONLIT-only observations: 1005
----------------------------------------
(140, 1005)
[[0.000000e+00 1.000000e-01 2.000000e-01 ... 1.002000e-01 1.003000e-01
  1.004000e-01]
 [1.000000e+00 1.100000e+00 1.200000e+00 ... 1.1002

In [44]:
matrix = np.zeros((2,3))
matrix[0,1] = 1
print(matrix)
print(matrix.shape)

matrix[1][0:3] = [1,2,3]
matrix

[[0. 1. 0.]
 [0. 0. 0.]]
(2, 3)


array([[0., 1., 0.],
       [1., 2., 3.]])