In [None]:
import re
import traceback
from collections import Counter
from itertools import combinations

import lxml
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from thefuzz import fuzz

In [None]:
# Load the JPE author/abstract/funding CSV.
# NOTE(review): `authors_df` is rebuilt further down from the ECA/JPE/QJE/RES/RJE author files,
# so this early JPE-only load looks like an exploration leftover — confirm before relying on it.
authors_df = pd.read_csv('econlit_scopus_matching_out/JPE_author_abstract_funding.csv')

In [None]:
# Journal code -> EconLit extract for that journal.
econlit_paths = {
    'aer': 'econlit_scopus_matching_out/aer_econlit.csv',
    'eca': 'econlit_scopus_matching_out/eca_econlit.csv',
    'jpe': 'econlit_scopus_matching_out/jpe_econlit.csv',
    'qje': 'econlit_scopus_matching_out/qje_econlit.csv',
    'res': 'econlit_scopus_matching_out/res_econlit.csv',
    'rje': 'econlit_scopus_matching_out/rje_econlit.csv',
}

# Read every extract and stamp each row with the journal code it came from.
aer_df, eca_df, jpe_df, qje_df, res_df, rje_df = (
    pd.read_csv(path).assign(publication=journal)
    for journal, path in econlit_paths.items()
)

# Stack the six journals into one frame with a fresh 0..n-1 index.
econlit_df = pd.concat(
    [aer_df, eca_df, jpe_df, qje_df, res_df, rje_df],
    ignore_index=True,
)

In [None]:
# abstracts_df = authors_df[['doi', 'sc_title', 'sc_abstract_text']]

# Working frame for the keyword / JEL indicators.
# `.assign` returns a new, independent frame, so the indicator columns are not written onto a
# slice of `econlit_df` (which raises SettingWithCopyWarning and may silently not stick).
# All indicators start at 0 and are switched to 1 by the flagging cell below.
abstracts_df = econlit_df[
    ['title', 'date', 'year', 'publication', 'abstract', 'jel_code',
     'L_code', 'K_code', 'D4_code', 'O3_code', 'G34_code']
].assign(
    anti_trust_indicator=0,
    market_power_indicator=0,
    L4_code=0,
    K21_code=0,
    J3_code=0,
)

# Case-insensitive keyword patterns searched for in the abstracts.
# Kept as plain strings because they are passed to `re.search(..., flags=re.I)`.
anti_trust_pattern = r'anti trust|anti-trust|antitrust'
market_power_pattern = r'market power|market-power'

In [None]:
def _text_matches(values, pattern):
    """Return a boolean Series marking the string entries of `values` that contain `pattern`.

    Matching is case-insensitive (`re.I`). Non-string entries, such as missing values,
    are never a match.
    """
    return values.map(
        lambda value: isinstance(value, str) and re.search(pattern, value, flags=re.I) is not None
    ).astype(bool)


# --- Keyword indicators from the abstract text -------------------------------------------
anti_trust_hits = _text_matches(abstracts_df['abstract'], anti_trust_pattern)
market_power_hits = _text_matches(abstracts_df['abstract'], market_power_pattern)
abstracts_df.loc[anti_trust_hits, 'anti_trust_indicator'] = 1
abstracts_df.loc[market_power_hits, 'market_power_indicator'] = 1

# Report every hit in row order (same messages as before).
for row, publication, is_anti_trust, is_market_power in zip(
    abstracts_df.index, abstracts_df['publication'], anti_trust_hits, market_power_hits
):
    if is_anti_trust:
        print('ANTI-TRUST--{} in abstract: {}'.format(publication, row))
    if is_market_power:
        print('MARKET-POWER--{} in abstract: {}'.format(publication, row))

# Further keyword patterns that were explored but are currently disabled:
#   r'merger'
#   r'cartel'
#   r'monopol'
#   r'deadweight loss|deadweight|dead-weight|dead weight'
#   r'anti compet|anti-compet|anticompet'

# --- JEL-code indicators -----------------------------------------------------------------
# These are evaluated for every row. Previously a row without an abstract hit `continue`
# before reaching this step, so its JEL flags were never set even when `jel_code` was present.
for jel_pattern, flag_column in ((r'L4', 'L4_code'), (r'K21', 'K21_code'), (r'J3', 'J3_code')):
    abstracts_df.loc[_text_matches(abstracts_df['jel_code'], jel_pattern), flag_column] = 1


    


        
# print('ANTI-TRUST COUNT: {}'.format(len(abstracts_df[abstracts_df.anti_trust_indicator==1])))
# print('MARKET POWER COUNT: {}'.format(len(abstracts_df[abstracts_df.market_power_indicator==1])))
# print('L4-AER CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.L4_code==1) & (abstracts_df.publication=='aer')])))
# print('K21-AER CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.K21_code==1) & (abstracts_df.publication=='aer')])))
# print('L4-ECA CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.L4_code==1) & (abstracts_df.publication=='eca')])))
# print('K21-ECA CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.K21_code==1) & (abstracts_df.publication=='eca')])))
# print('L4-jpe CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.L4_code==1) & (abstracts_df.publication=='jpe')])))
# print('K21-jpe CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.K21_code==1) & (abstracts_df.publication=='jpe')])))
# print('L4-qje CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.L4_code==1) & (abstracts_df.publication=='qje')])))
# print('K21-qje CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.K21_code==1) & (abstracts_df.publication=='qje')])))
# print('L4-res CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.L4_code==1) & (abstracts_df.publication=='res')])))
# print('K21-res CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.K21_code==1) & (abstracts_df.publication=='res')])))
# print('L4-rje CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.L4_code==1) & (abstracts_df.publication=='rje')])))
# print('K21-rje CODE COUNT: {}'.format(len(abstracts_df[(abstracts_df.K21_code==1) & (abstracts_df.publication=='rje')])))


In [None]:
# Number of papers per journal and year, written out for the 04-21 report.
publications_year_df = (
    abstracts_df
    .groupby(by=['publication', 'year'])
    .agg({'title': lambda titles: len(titles)})
    .reset_index()
    .rename(columns={'title': 'count'})
)
publications_year_df.to_csv(
    '../adhoc_material/Reports/04-21/publications_year.csv',
    index=False,
    encoding='utf-8',
)


In [None]:
# JEL codes treated as "theory" when they appear alongside an L code.
theory_codes_pattern = r'C7|D11|D5|D21|D85|D86'

# Papers flagged with an L code. `reset_index()` returns a new frame (the old index is kept as
# an `index` column, as before), so the columns added below are not written onto a slice of
# `abstracts_df`.
L_papers_df = abstracts_df[abstracts_df.L_code == 1].reset_index()
L_papers_df['contains_theory'] = 0

# Search pattern -> indicator column, for the two-character codes L0..L9 and for the L4x codes.
l_group_columns = {'L' + str(i): 'L' + str(i) + '_code' for i in range(0, 10)}
l4_detail_columns = {'L4' + str(j): 'L4' + str(j) + '_code' for j in [0, 1, 2, 3, 4, 9]}

# Start every indicator at 0 (this also resets the `L4_code` inherited from `abstracts_df`).
for indicator_column in list(l_group_columns.values()) + list(l4_detail_columns.values()):
    L_papers_df[indicator_column] = 0

for row in range(0, len(L_papers_df)):
    jel_code = L_papers_df.loc[row, 'jel_code']
    if not isinstance(jel_code, str):
        # No usable code string for this paper; leave all of its indicators at 0
        # instead of letting `re.search` raise a TypeError.
        continue

    if re.search(theory_codes_pattern, jel_code, flags=re.I):
        L_papers_df.loc[row, 'contains_theory'] = 1

    for generated_pattern, generated_column in l_group_columns.items():
        if re.search(generated_pattern, jel_code, flags=re.I):
            L_papers_df.loc[row, generated_column] = 1

    for generated_pattern, generated_column in l4_detail_columns.items():
        if re.search(generated_pattern, jel_code, flags=re.I):
            L_papers_df.loc[row, generated_column] = 1
            print(generated_pattern)
            # Print the codes of the paper that matched. `row` indexes `L_papers_df`, so
            # looking it up in `abstracts_df` (as before) showed an unrelated paper.
            print(jel_code)


print("TOTAL IO: {}".format(len(L_papers_df)))
print("TOTAL ANTITRUST: {}".format(len(L_papers_df[L_papers_df.L4_code ==1])))
print("IO and at least one theory : {}".format(len(L_papers_df[L_papers_df.contains_theory==1])))
print("ANTITRUST AND THEORY: {}".format(len(L_papers_df[(L_papers_df.contains_theory==1) & (L_papers_df.L4_code == 1)])))
## Plot these counts and shares by year

L_papers_df.to_csv('../adhoc_material/Reports/04-21/L_papers_indicators.csv', index=False, encoding='utf-8')
# L_papers_df

In [194]:
# Scratch check of the "nan" -> "np.nan" rewrite on sample stringified JEL lists.
str_1 = "[nan, 'E13', 'E32', 'E44', 'E52', nan, 'L44', nan]"
str_2 = '[nan]'
str_3 = 'random'

nan_test = re.search(r'nan', str_1)
nan_test_iter = re.finditer(r'nan', str_1)
modified_str = str_1

# Collect every match first so the string can be edited from its end towards its start;
# inserting text right-to-left keeps the earlier match offsets valid.
reversing_nans = list(nan_test_iter)

for group in reversing_nans[::-1]:
    print(group)
    if group.span() == (1, 4):
        # The very first list element is left untouched in this experiment.
        continue
    insert_at = group.start()
    modified_str = ''.join([modified_str[:insert_at], "np.", modified_str[insert_at:]])
    print('found a dead one')
print(modified_str)

# Second experiment: stripping the np.nan singleton out of a list.
test_list = ['a', np.nan, 'b', 'a', np.nan]
if len(test_list) == 1 and test_list[0] is np.nan:
    print('here')
test_list = [item for item in test_list if item is not np.nan]
test_list

<re.Match object; span=(46, 49), match='nan'>
found a dead one
<re.Match object; span=(34, 37), match='nan'>
found a dead one
<re.Match object; span=(1, 4), match='nan'>
[nan, 'E13', 'E32', 'E44', 'E52', np.nan, 'L44', np.nan]


['a', 'b', 'a']

In [None]:
def _jel_letters(raw_codes):
    """Return the leading letter of every quoted JEL code in a stringified code list.

    `raw_codes` is expected to look like "[nan, 'E13', 'L44']". The quoted codes are pulled out
    with a regular expression, so the unquoted `nan` placeholders are simply ignored and the
    cell contents never have to be executed as Python. Anything that is not a string
    (for example a missing `jel_code`) yields an empty list.
    """
    if not isinstance(raw_codes, str):
        return []
    return [code[0] for code in re.findall(r"""['"]([^'"]+)['"]""", raw_codes)]


def _predominant_letter(letters):
    """Return the most frequent letter in `letters`, or NaN when the list is empty.

    Ties go to the letter that appears first in the list, which keeps the result
    reproducible from run to run.
    """
    if not letters:
        return np.nan
    return Counter(letters).most_common(1)[0][0]


# Top-five journals only (RJE excluded). `.copy()` makes the frame independent of `abstracts_df`
# so the derived columns below are not written onto a slice.
top_five_jel = abstracts_df[abstracts_df.publication != 'rje'].copy()

# One list of JEL category letters per paper, derived column-wise rather than via positional
# `.loc[row, ...]` lookups (which only work while the index happens to be 0..n-1).
jel_letters = top_five_jel['jel_code'].map(_jel_letters)

top_five_jel['jel_list_pre'] = jel_letters
# Each listed code carries an equal share of the paper; papers without codes get weight 0.
top_five_jel['jel_list_elem_weight'] = jel_letters.map(
    lambda letters: 1 / len(letters) if letters else 0.0
)
top_five_jel['first_jel'] = jel_letters.map(lambda letters: letters[0] if letters else np.nan)
top_five_jel['predom_jel'] = jel_letters.map(_predominant_letter)

# Per-paper breakdown.
top_five_jel = top_five_jel[['title', 'year', 'publication', 'predom_jel', 'first_jel', 'jel_list_pre', 'jel_list_elem_weight']]
top_five_jel.to_csv('../adhoc_material/Reports/04-21/top_five_jel_breakdown.csv', index=False, encoding='utf8')

# Long format: one row per (paper, JEL letter); papers without codes keep a single NaN row.
top_five_jel = top_five_jel[['title', 'year', 'publication', 'jel_list_pre', 'jel_list_elem_weight']]
top_five_jel = top_five_jel.explode(column='jel_list_pre')
top_five_jel.to_csv('../adhoc_material/Reports/04-21/top_five_jel_unnested.csv', index=False, encoding='utf8')
top_five_jel

In [None]:
# Grouping keys shared by the paper counts and the indicator totals.
year_pub_keys = ['year', 'publication']

# Indicator columns that are totalled per (year, publication).
indicator_columns = [
    'L_code',
    'K_code',
    'L4_code',
    'K21_code',
    'D4_code',
    'O3_code',
    'G34_code',
    'J3_code',
    'anti_trust_indicator',
    'market_power_indicator',
]

# Papers per (year, publication).
year_pub_count = (
    abstracts_df
    .groupby(by=year_pub_keys)
    .size()
    .reset_index(name='count')
)

# Indicator totals per (year, publication), with the paper count attached for computing shares.
indicator_count_df = (
    abstracts_df
    .groupby(by=year_pub_keys)
    .agg({column: (lambda values: values.sum()) for column in indicator_columns})
    .reset_index()
    .merge(year_pub_count, how='left', on=year_pub_keys)
)
indicator_count_df.to_csv('indicator_count_df.csv', index=False, encoding='utf-8')

In [None]:
# Per-journal author files. The AER file is currently left out:
# aer_authors_df = pd.read_csv('econlit_scopus_matching_out/AER_author_abstract_funding.csv')
eca_authors_df, jpe_authors_df, qje_authors_df, res_authors_df, rje_authors_df = map(
    pd.read_csv,
    (
        'econlit_scopus_matching_out/ECA_author_abstract_funding.csv',
        'econlit_scopus_matching_out/JPE_author_abstract_funding.csv',
        'econlit_scopus_matching_out/QJE_author_abstract_funding.csv',
        'econlit_scopus_matching_out/RES_author_abstract_funding.csv',
        'econlit_scopus_matching_out/RJE_author_abstract_funding.csv',
    ),
)

# Stack the journals, keeping each file's own row numbers for now ...
stacked_authors = pd.concat(
    [eca_authors_df, jpe_authors_df, qje_authors_df, res_authors_df, rje_authors_df],
    ignore_index=False,
)
# ... keep only the identifying columns, then move the per-file row numbers into an
# `index` column so the frame gets a clean 0..n-1 index.
authors_df = stacked_authors[
    ['doi', 'sc_title', 'sc_pub_name', 'sc_author_given_name', 'sc_author_last_name', 'sc_author_id']
].reset_index()

In [None]:
# Long-form journal names as they appear in `sc_pub_name` -> the short name used here.
canonical_pub_names = {
    'Econometrica : journal of the Econometric Society': 'Econometrica',
    'The Rand journal of economics': 'RAND Journal of Economics',
}

# Exact-match replacement; non-string entries (e.g. missing names) pass through unchanged.
authors_df['sc_pub_name'] = authors_df['sc_pub_name'].map(
    lambda pub_name: canonical_pub_names.get(pub_name, pub_name)
    if isinstance(pub_name, str)
    else pub_name
)

In [None]:
# Keep a single row per (paper, journal, author); the first occurrence wins.
authors_df = authors_df.drop_duplicates(
    subset=['doi', 'sc_pub_name', 'sc_author_id'],
    keep='first',
)

In [None]:
def unique_author_pairs(sub_df):
    """Build the unordered author pairs for one paper.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Rows belonging to a single paper. Only the `sc_author_id` column is read, so the
        function also works when the grouping column has been excluded from the group frame.

    Returns
    -------
    pandas.DataFrame
        One column, `pairs`, holding a 2-tuple per unique pair of author ids. A paper with a
        single unique author id yields one row `(author_id, nan)` so the author is not dropped.
    """
    author_ids_list = sub_df['sc_author_id'].unique().tolist()

    if len(author_ids_list) == 1:
        author_pairs = [(author_ids_list[0], np.nan)]
    else:
        author_pairs = list(combinations(author_ids_list, 2))

    return pd.DataFrame({
        'pairs': pd.Series(author_pairs, dtype='object')
    })


# The helper is defined above its first use so this cell runs on a fresh kernel
# (previously the call came one cell before the definition and raised NameError).
# test_df = authors_df.head(100)
# NOTE(review): only the first 1000 author rows are used here — presumably a development
# limit to keep the graph small; confirm before drawing conclusions from the network.
coauthors_df = (
    authors_df.head(1000)
    .groupby(by=['doi'])[['sc_author_id']]   # pass only the column the helper reads
    .apply(unique_author_pairs)
    .reset_index()
)
# Split each pair tuple into separate `author` / `coauthor` columns.
coauthors_df[['author', 'coauthor']] = pd.DataFrame(
    coauthors_df['pairs'].tolist(),
    index=coauthors_df.index,
    columns=['author', 'coauthor'],
)
# coauthors_df

In [None]:
# Build the co-authorship graph from complete pairs only. Single-author papers carry a missing
# coauthor; dropping those rows up front avoids NaN nodes entirely. (Removing `np.nan` after
# the fact raises when that exact node is absent, and an identity test such as
# `np.nan in edge` does not recognise NaN values that are not the `np.nan` object itself.)
coauthor_edges = coauthors_df.dropna(subset=['author', 'coauthor'])
G = nx.from_pandas_edgelist(coauthor_edges, 'author', 'coauthor')
# G = nx.from_pandas_adjacency(coauthors_df[['author', 'coauthor']])

# Authors who only appear on single-author papers stay in the graph as isolated nodes.
G.add_nodes_from(coauthors_df['author'].dropna())

# Sanity check: no edge should touch a missing author id (expected output: []).
nan_edges = [edge for edge in G.edges if any(pd.isna(node) for node in edge)]
print(nan_edges)

# Alternative layouts tried earlier: nx.draw_shell(G, node_size=2), nx.draw_random(G, node_size=5).
fig, ax = plt.subplots(figsize=(20, 15))
pos = nx.kamada_kawai_layout(G)
nx.draw(G, pos=pos, ax=ax)  # Draw the original graph