In [1]:
import os, re, copy, json, pprint, argparse, warnings, xmltojson, shutil
import pandas as pd
from pathlib import Path
import bibtexparser
from collections import Counter
warnings.filterwarnings('ignore')

`PURPOSE`: build a list of authors ranked by how often their papers are cited in the financial-anomalies literature.

Steps:
1. Get the titles from `references.bib`.
2. Match each cited title against the reference file to get its metadata.
3. Count the number of times each title is cited by the financial-anomalies literature collected in the file `financial_anomalies_cited_network_raw`.

In [2]:
# Parse the BibTeX bibliography once; `match_title` reads its entries
# through the module-level `bib_database`.
bib_path = Path('../financialanomalies/references.bib')
with bib_path.open() as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

def match_title(title_str, entries=None):
    """Find the bibliography entry whose title matches ``title_str``.

    Titles are compared after lower-casing, replacing every
    non-alphanumeric character with a space, and collapsing runs of
    whitespace, so BibTeX decorations such as ``{Braces}`` or differing
    punctuation do not prevent a match.

    Parameters
    ----------
    title_str : str
        The title to look up.
    entries : iterable of dict, optional
        BibTeX entries to search. Defaults to ``bib_database.entries``.

    Returns
    -------
    tuple of (str, str, str) or None
        ``(title, journal, author)``, all lower-cased, for the first
        matching entry. A missing ``journal`` or ``author`` field is
        returned as ``''``. ``None`` is returned when nothing matches.
    """
    if entries is None:
        entries = bib_database.entries

    def _normalize(text):
        # Lower-case, blank out punctuation, then collapse/strip whitespace.
        return ' '.join(re.sub('[^a-zA-Z0-9]', ' ', text.lower()).split())

    # Normalise the query once instead of on every loop iteration.
    wanted = _normalize(title_str)
    if not wanted:
        return None

    for item in entries:
        title = item.get('title')
        if title is None:
            # Entries without a title cannot match; skip instead of crashing.
            continue
        if _normalize(title) == wanted:
            journal = item.get('journal') or ''
            author = item.get('author') or ''
            return title.lower(), journal.lower(), author.lower()
    return None
        
# Raw citation network (JSON); the next cell iterates it as a sequence of
# records and reads their 'main_title' and 'cited_titles' keys.
with Path("financial_anomalies_cited_network_raw").open("r") as fp:
    academic_networks = json.loads(fp.read())

In [3]:
# Collect, per citing paper, the normalised titles of everything it cites.
cited_title_lists = []
for paper in academic_networks:
    cited = paper.get('cited_titles')
    # Skip records that lack either the citing title or its reference list.
    if paper.get('main_title') is None or cited is None:
        continue
    cited_title_lists.append([
        re.sub('[^a-zA-Z0-9]', ' ', ref[0].lower().strip())
        for ref in cited
        if ref[0] is not None
    ])

# Count how often each title is cited and order as (count, title) pairs,
# highest count first (ties broken by title, descending).
title_counts = Counter(title for titles in cited_title_lists for title in titles)
all_titles = sorted(((count, title) for title, count in title_counts.items()), reverse=True)

# Keep only the titles that can be resolved in the bibliography.
dfs = []
for count, title in all_titles:
    try:
        matched = match_title(title)
    except AttributeError:
        # A bibliography entry is missing a field the lookup needs; skip it.
        continue
    if matched is None:
        # Title is not present in the bibliography.
        continue
    dfs.append({
        'title': str(title),
        'authors': matched[2],
        'citation_count': count,
    })

df = pd.DataFrame(dfs)
df.to_csv('citations_count_titles.csv')


In [10]:
# Load the scraped citation network, tolerating malformed rows.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='warn'` is the equivalent (skip the row, emit a warning).
try:
    df = pd.read_csv('academic_citation_networks.csv', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3 does not accept `on_bad_lines`.
    df = pd.read_csv('academic_citation_networks.csv', error_bad_lines=False)
from ast import literal_eval
import numpy as np
def parse_dicct(strr, min_length=30):
    """Parse the textual representation of a dict back into a dict.

    The text is first evaluated as a Python literal as-is. Only if that
    fails is the legacy form tried, in which every single quote is
    replaced by a double quote. Trying the untouched text first keeps
    values that contain apostrophes or embedded double quotes parseable.

    Parameters
    ----------
    strr : object
        Value to parse, normally a string. Values without a length
        (for example a float NaN from an empty CSV cell) yield ``None``.
    min_length : int, optional
        Values shorter than this are treated as empty placeholders and
        yield ``None``. Defaults to 30.

    Returns
    -------
    dict or None
        The parsed dictionary, or ``None`` when the value is too short,
        has no length, or cannot be parsed into a dict.
    """
    try:
        if len(strr) < min_length:
            return None
    except TypeError:
        # Unsized values such as NaN carry no parseable payload.
        return None

    text = str(strr)
    for candidate in (text, text.replace("'", '"')):
        try:
            return dict(literal_eval(candidate))
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best effort: fall through to the next candidate, then None.
            continue
    return None

# Turn the stringified 'cited_by' cells into dicts (None where unparseable).
df['cited_by'] = df['cited_by'].apply(parse_dicct)

# Keep only the rows that parsed successfully.
parsed_records = [record for record in df['cited_by'].values.tolist() if record is not None]

# Map each article title to its citation count; later duplicates overwrite earlier ones.
cited_by = {record.get('article_title'): record.get('citations_count') for record in parsed_records}

df = pd.DataFrame(parsed_records)
df = df[['citations_count', 'article_title', 'auhtors']]

# One tuple per distinct title: (title, number of rows, citation count,
# de-duplicated authors gathered from all of that title's rows).
all_google_scholar_citations = []
for title, row_labels in dict(df.groupby('article_title').groups).items():
    positions = list(row_labels)
    author_lists = list(df['auhtors'].iloc[positions].values)
    unique_authors = list(set([name for authors in author_lists for name in authors]))
    all_google_scholar_citations.append((title, len(positions), cited_by[title], unique_authors))

df = pd.DataFrame(
    all_google_scholar_citations,
    columns=["title", "search_hit_count", "google_scholar_citation_count", "authors"],
)
df['google_scholar_citation_count'] = df['google_scholar_citation_count'].astype('int')
df = df.sort_values('google_scholar_citation_count', ascending=False)

df.to_csv('google_scholar_citation_count.csv')
df

Unnamed: 0,title,search_hit_count,google_scholar_citation_count,authors
328,Efficient capital markets,4,38502,[]
164,Common risk factors in the returns on stocks a...,7,36708,[KR French]
139,Capital asset prices: A theory of market equil...,2,34308,[WF Sharpe]
662,On persistence in mutual fund performance,1,23662,[]
727,Principles of corporate finance,1,22032,[F Allen]
...,...,...,...,...
377,Financial constraints and security issuance,1,1,[]
436,Has increased prominence of translation result...,1,1,[B Marshall]
744,R&D Expenditures and Idiosyncratic Risk: Good ...,1,1,[]
299,Dont Get Carried Away: Uncovering Macro Charac...,1,1,[]


In [6]:
# Publish the current state of the working directory: stage, show status,
# commit with a fixed message, and push. The `!clear` calls wipe the terminal.
# NOTE(review): `git add .` stages everything under the current directory,
# which may include the CSV files written by earlier cells — confirm intended.
!clear
!git add .
!git status
!git commit -m "added  frequency of cited titles search count hit and google scholar citation report"
!git push
!clear 


[H[2JOn branch main
Your branch is up to date with 'origin/main'.

Changes to be committed:
  (use "git restore --staged <file>..." to unstage)
	[32mmodified:   academic_graph_analysis.ipynb[m

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	[31mmodified:   academic_graph_analysis.ipynb[m

[main 1e66dda] added  frequency of cited titles search count hit and google scholar citation report
 1 file changed, 37 insertions(+), 4 deletions(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 16 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 1004 bytes | 502.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/krishpn/academic_graphs.git
   51d7138..1e66dda  main -> main
[H[2J