# Step 7: Other In-paper Analysis
It runs the remaining analyses reported in the paper, using the Retraction Indexing Agreement (RIA) scores already attached to the union list in Step 6.

Input File: 
   - Retracted publications Union list with RIA score (from Step 6)
       - unionlist_completed_ria_{date}.csv
   - STI2023 Union list
       - 2023-09-03_journalcategory_knownretractionlist_updated.csv
  
###### To save analysis results to your local directory, uncomment the lines of code of the following forms:
       - (i)  "....to_csv(..)" 
       - (ii) "plt.savefig(...)" 

In [None]:
import os
from collections import Counter
import seaborn as sns
import pandas as pd
import dataframe_image as dfi
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from datetime import date, datetime as dt
import time,datetime
import re
import unicodedata
import ast  # Module to handle literal_eval function

In [None]:
from upsetplot import generate_counts
from upsetplot import plot
from upsetplot import from_memberships
from upsetplot import UpSet
from matplotlib import pyplot
from itertools import combinations

In [None]:
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from matplotlib.ticker import MaxNLocator

%matplotlib notebook

In [None]:
#!pip install dfstyle

In [None]:
"""
Set up & defining file directories: Targeting the retraction_index_path
"""
# Project root: the absolute path of the current working directory
retraction_index_path = os.path.abspath(os.curdir)

# Both directory strings end with '/' so that file names can be appended
# to them by plain string concatenation.
data_dir = f'{retraction_index_path}/data/'
result_dir = f'{retraction_index_path}/result/'

In [None]:
# Date stamp (YYYY-MM-DD) of the union list to analyse; it is part of the
# union list file name that is loaded from the data directory.
getdate = dict(unionlist='2024-07-09')

In [None]:
unionlist3_adjusted = pd.read_csv(
    data_dir + 'unionlist/unionlist_completed_ria_' + getdate['unionlist'] + '.csv'
)

# Reformat the 'PubMedID' column to strings: numeric IDs become integer-like
# strings and missing IDs become ''.
# The placeholder must be replaced as the STRING '0': after astype(str) the
# column no longer contains the integer 0, so replacing 0 would be a no-op.
# The result is assigned back instead of using chained `inplace=True` calls,
# which do not reliably update the parent DataFrame.
unionlist3_adjusted['PubMedID'] = (
    unionlist3_adjusted['PubMedID']
    .fillna(0)
    .astype(int)
    .astype(str)
    .replace('0', '')
)

unionlist3_adjusted

In [None]:
unionlist3_adjusted.info()

## Section 7: Investigating 100% RetractionIndexingAgreement

In [None]:
"""
Investigating number source across 100% RetractionIndexingAgreement
"""

def ria100_distribution(source, data=None):
    """
    Summarise, for the items at 100% RetractionIndexingAgreement_ITEM(%) whose
    `source_new` value mentions `source`, how many comma-separated entries
    their `source_old` value lists.

    :param source: database to check (substring match against `source_new`)
    :param data: union-list DataFrame to analyse; when omitted, the
                 notebook-level `unionlist3_adjusted` is used
    :return: dataframe with one row per number of listed databases
             ('Category') and the columns 'Count', 'LogCount',
             'LogCountPercentage' and 'Percentage'
    """
    if data is None:
        data = unionlist3_adjusted

    df100 = data[data['RetractionIndexingAgreement_ITEM(%)'] == 100]

    # astype(bool) keeps the mask boolean even when no row is at 100%
    mentions_source = df100['source_new'].apply(lambda x: source in x).astype(bool)
    s100 = df100[mentions_source]

    # Number of comma-separated entries in each `source_old` string
    size_source_old = s100['source_old'].apply(lambda x: len(x.split(',')))
    s_count = Counter(size_source_old)

    df = pd.DataFrame(list(s_count.items()), columns=['Category', 'Count'])

    # log(Count + 1) dampens the dominance of the largest category
    df['LogCount'] = np.log(df['Count'] + 1)

    # Share of each category in the summed log counts
    total_log_count = df['LogCount'].sum()
    df['LogCountPercentage'] = (df['LogCount'] / total_log_count) * 100

    # Share of each category in the raw counts
    total_count = df['Count'].sum()
    df['Percentage'] = (df['Count'] / total_count) * 100

    return df.reset_index(drop=True)
    

In [None]:
"""
Confirmation count at 100% of Retraction Indexing Agreement
"""
# Show the 100%-RIA distribution table for WoS_Core; the commented-out suffix
# would reduce the table to the total number of such items.
ria100_distribution('WoS_Core') #['Count'].sum()


In [None]:
"""
Items with Percentage of RIA < 50%
"""
# Count the rows below 50% directly from the boolean mask. This avoids the
# positional `[0]` lookup on a label-indexed Series and does not depend on
# the first column being free of missing values.
((unionlist3_adjusted['RetractionIndexingAgreement_ITEM(%)'] < 50).sum() / len(unionlist3_adjusted)) * 100


In [None]:
"""
Percentage of items that their RIA score is not 100%  
"""
# Items whose RIA score is exactly 100; every other item counts as "not 100%"
df100 = unionlist3_adjusted.loc[
    unionlist3_adjusted['RetractionIndexingAgreement_ITEM(%)'].eq(100)
].copy()
not_RIA100 = len(unionlist3_adjusted) - len(df100)
not_RIA100 / len(unionlist3_adjusted) * 100

In [None]:
"""
Distribution of 100 RetractionIndexingAgreement by source
"""

# `source_old` is stored as text (presumably the string form of a Python
# list); parse it with ast.literal_eval, which only accepts literals, rather
# than eval, which would execute arbitrary expressions found in the CSV.
list_ria100 = df100['source_old'].apply(ast.literal_eval).tolist()
print(f"Total items with 100% RIA is {len(list_ria100)}")
# Tally the items by the first source listed for each of them
source_ria100 = [x[0] for x in list_ria100]
Counter(source_ria100)

In [None]:
"""
Distribution of 100% RetractionIndexingAgreement items by the number of databases listed in source_old
"""
# Parse the stored text with ast.literal_eval (literals only) instead of eval
list_ria100 = df100['source_old'].apply(ast.literal_eval).tolist()
# Number of sources listed for each 100%-RIA item
length_ria100 = [len(x) for x in list_ria100]
print("Numbers of databases (k) with numbers of retracted items indexed (v)")
x100ria = Counter(length_ria100)
print(x100ria)
# One line per number of databases: count and its share of all 100%-RIA items
for k, v in x100ria.items():
    print(k,':\t',v,'\t-',((v/len(list_ria100))*100))


In [None]:
"""
Investigating number intersection among indexing sources 
"""

all_inter_idx = unionlist3_adjusted['source_old'].copy()
# Number of comma-separated entries in each `source_old` string
list_all_inter_idx = all_inter_idx.apply(lambda x: len(list(x.split(','))))
# Maps "number of entries" -> "number of items with that many entries"
result_all_inter_idx =Counter(list_all_inter_idx)
print(result_all_inter_idx)

# % of items in the union list that cut across all the databases
# NOTE(review): 11 is hard-coded as the number of databases; a Counter returns
# 0 for an absent key, so this silently yields 0 if no item lists 11 entries.
result_all_inter_idx[11]/sum(result_all_inter_idx.values())*100

In [None]:
"""
Percentage missed out by Retraction Watch
"""
# `no_rw` is the NUMBER of rows whose `source_old` mentions 'Retraction Watch'
# (taken as the non-null count of the first column of the filtered frame).
no_rw = unionlist3_adjusted[unionlist3_adjusted['source_old'].str.contains('Retraction Watch')].count().iloc[0]
# The backslash inside the f-string joins the two source lines without
# inserting a newline into the printed text.
print(f'RetractionWatch is {no_rw} items, which only accounts for {no_rw/len(unionlist3_adjusted)*100}% of \
{len(unionlist3_adjusted)}-item unionlist')

# Complement: rows that do not mention Retraction Watch, as a count and a percentage
print(f'The RetractionWatch source misses about {len(unionlist3_adjusted)- no_rw} items ({100 - (no_rw/len(unionlist3_adjusted)*100)}%)')


In [None]:
def what_is_missed(source, data=None):
    """
    Print how many items mention `source` in `source_new` ("Covered"), how
    many mention it in `source_old` ("Indexed"), and the difference between
    the two ("missed").

    :param source: the given database; it is passed to `str.contains`, so it
                   is interpreted as a regular expression
    :param data: union-list DataFrame with `source_new` and `source_old`
                 columns; when omitted, the notebook-level
                 `unionlist3_adjusted` is used
    """
    if data is None:
        data = unionlist3_adjusted

    # Count the matching rows from the boolean mask itself: this does not
    # depend on the first column being fully populated, and na=False treats
    # rows with a missing source value as non-matching instead of failing.
    source_new = int(data['source_new'].str.contains(source, na=False).sum())
    print("Total # Covered: ",source_new)

    source_old = int(data['source_old'].str.contains(source, na=False).sum())
    print("Total # Indexed: ",source_old)

    print(f" The total items missed by {source} is {source_new - source_old}")

In [None]:
"""
Indexed items missed out by Sources
"""
# Names of the databases to report on, spelled as they are searched for in
# the `source_new` / `source_old` columns
sources_indexed= ['BCI','BIOABS','CCC','Compendex','Crossref', 'GEOBASE', 'Medline',
                   'PubMed', 'Retraction Watch', 'Scopus', 'WoS_Core']
# Run the report with pandas' chained-assignment warning switched off
with pd.option_context('mode.chained_assignment', None):
    for source in sources_indexed:
        what_is_missed(source)
        # Visual separator between two sources' reports
        print("*"*30)

In [None]:
"""
Knowing the oldest indexed paper
"""
# All rows whose 'Year' equals the smallest year present in the union list
unionlist3_adjusted.loc[
    unionlist3_adjusted['Year'].eq(unionlist3_adjusted['Year'].min())
]

In [None]:
# NOTE(review): 4915 is a hard-coded row label (presumably the oldest item
# shown by the previous cell); re-check it whenever the input file changes.
unionlist3_adjusted.loc[4915, 'Title']

In [None]:
"""
Import STI2023 unionlist
"""
# Load the STI2023 union list and drop the saved index column ('Unnamed: 0')
unionlist_sti = pd.read_csv(retraction_index_path+"/STI2023/2023-09-03_journalcategory_knownretractionlist_updated.csv").drop(['Unnamed: 0'],axis=1)
# Lower-case the DOIs for matching.
# NOTE(review): astype(str) turns a missing DOI into the string 'nan', which
# then takes part in the DOI set comparisons as if it were a DOI - confirm.
unionlist_sti['DOI']= unionlist_sti['DOI'].str.lower().astype(str)
unionlist_sti.head()

In [None]:
"""
unionlist_crws: Items from Crossref, Retraction Watch, Web of Science Core, Scopus in present union list
"""
# Keep the rows whose `source_old` mentions at least one of the four sources
unionlist_crws = unionlist3_adjusted.loc[
    unionlist3_adjusted['source_old']
    .str.strip()
    .str.contains(r'WoS_Core|Retraction Watch|Scopus|Crossref')
].copy()
len(unionlist_crws)

In [None]:
"""
STI2023 present in unionlist_crws
"""
unionlist_crws[unionlist_crws['DOI'].isin(unionlist_sti['DOI'])]

In [None]:
"""
# newly added DOIs from STI2023 period to Unionlist_CRWS in Jul 2024 
"""
# Estimate based on row counts only: it subtracts the sizes of the two lists
# and does not check which DOIs they actually share.
print(f'Estimating the # of items newly added and indexed in Crossref, Scopus, Retraction Watch, Web of Science Core:\n {(len(unionlist_crws) - len(unionlist_sti))}')

# Growth relative to the STI2023 list, kept as a fraction and scaled to a
# percentage only when printed
crws_per_added= ((len(unionlist_crws) - len(unionlist_sti))/len(unionlist_sti)) #*100
print(f'The estimated percentage of newly added DOIs: {crws_per_added*100}')

In [None]:
"""
Examining the Actual DOIs of STI2023 not in Unionlist_crws 2024
"""
pass

In [None]:
"""
Finding DOIs in Unionlist CRWS 2024 and not in STI2023 unionlist:
Actual newly added DOIs in Unionlist_crws
"""
# Unique DOIs of the CRWS subset that are absent from the STI2023 union list
diff_notin_sti_doi = set(unionlist_crws['DOI']).difference(unionlist_sti['DOI'])
print(f"The total DOIs in Unionlist CRWS 2024 and not in STI2023 is: {len(diff_notin_sti_doi)}")
# Expressed relative to the number of rows in the STI2023 union list
print(f"{(len(diff_notin_sti_doi)/len(unionlist_sti))*100}% increase")

In [None]:
"""
Finding DOIs in STI unionlist and not in Unionlist CRWS 2024
"""
# Unique STI2023 DOIs that are absent from the CRWS subset of the 2024 list
diff_in_ul24_doi = set(unionlist_sti['DOI']).difference(unionlist_crws['DOI'])

print(f"The total DOIs in STI2023 and not in Unionlist CRWS 2024 is: {len(diff_in_ul24_doi)}")

In [None]:
def get_doi_count(df,column, source):
    """
    Count the rows of `df` whose `column` value matches `source`.

    :param df: DataFrame to search from
    :param column: name of the string column to search in
    :param source: the source name to search; it is passed to
                   `str.contains`, so it is interpreted as a regular expression

    :return: number of matching rows (int); rows whose `column` value is
             missing are counted as non-matching
    """
    # Sum the boolean mask rather than counting non-null cells of the first
    # column of the filtered frame: the result then does not depend on another
    # column, and na=False stops missing values from breaking the match.
    matches = df[column].str.contains(source, na=False)
    return int(matches.sum())

In [None]:
"""
Finding the breakdown of DOIs that are missing in CRWS databases of present union list 2024
"""
# STI2023 rows whose DOI is absent from the CRWS subset of the 2024 union list
df_diff_notin_ul24= unionlist_sti[unionlist_sti['DOI'].isin(diff_in_ul24_doi)]

# Per-source counts of those rows, matched on the STI 'source' column.
# The trailing numbers are values noted from an earlier run.
ndoi_not_in_Crossref= get_doi_count(df_diff_notin_ul24, 'source', 'Crossref') #190
ndoi_not_in_RW= get_doi_count(df_diff_notin_ul24, 'source', 'Retraction Watch') #45
ndoi_not_in_Scopus= get_doi_count(df_diff_notin_ul24, 'source', 'Scopus') #69
ndoi_not_in_WoS= get_doi_count(df_diff_notin_ul24, 'source', 'Web of Science') #2011

print(f'The total numbers of DOIs in STI2023 and not in Unionlist CRWS 2024 is: {len(df_diff_notin_ul24)}')

print('Here is the breakdown:')
print(f" Crossref: {ndoi_not_in_Crossref}\n Retraction Watch: {ndoi_not_in_RW}\n Web of Science: {ndoi_not_in_WoS}\n Scopus: {ndoi_not_in_Scopus}")
    

In [None]:
"""
Finding 425 DOIs [diff_in_ul24_doi] not found in CRWS unionlist in the remaining databases that are in unionlist 2024
"""
# Rows of the full 2024 union list whose DOI is one of the STI DOIs that the
# CRWS subset lacks
x=unionlist3_adjusted[unionlist3_adjusted['DOI'].isin(diff_in_ul24_doi)].copy()

print(f"Of the {len(diff_in_ul24_doi)} STI DOIs not in the current CRWS subset of unionlist 2024,\n\
{len(x)} DOIs are found in the remaining subset of the unionlist 2024.\n\
Remaining {len(diff_in_ul24_doi) - len(x)} DOIs are missing")

# Parse the stored text with ast.literal_eval, which only accepts literals,
# rather than eval, which would execute arbitrary expressions from the CSV.
x['source_old']= x['source_old'].apply(ast.literal_eval)
x_s= x['source_old'].tolist()
# Flatten the per-item source lists and tally how often each source occurs
s_ = [name for sources in x_s for name in sources]
Counter(s_)

In [None]:
unionlist_sti[unionlist_sti['DOI'].isin(diff_in_ul24_doi)]

In [None]:
"""
Comparing Total # retracted item in Crossref, Scopus, Retraction Watch, Web of Science Core compared with STI2023
"""

# CRWS rows whose DOI also occurs in the STI2023 union list
dois_inCommon_crws = unionlist_crws.loc[
    unionlist_crws['DOI'].isin(unionlist_sti['DOI'])
].copy()
dois_inCommon_crws

In [None]:
"""
Comparing Total # retracted item in Crossref, Scopus, Retraction Watch, Web of Science Core compared with STI2023
Newly added DOIs from STI2023 to unionlist_crws 2024
"""

# CRWS rows whose DOI does not occur in the STI2023 union list
newly_added_crws = unionlist_crws.loc[
    ~unionlist_crws['DOI'].isin(unionlist_sti['DOI'])
].copy()
newly_added_crws

In [None]:
"""
Loading PMIDs of records with no DOI from STI: Crossref, Web of Science (276), Retraction Watch (682), Scopus (8)
"""
# Read the PMID file as one string and split it on newlines, giving a list of
# PMID strings; a trailing newline in the file produces an empty '' entry.
with open (retraction_index_path+"/STI2023/nodoi_sti_pmids.txt",'r') as fn:
    sti_nodoi_pmids = fn.read().split('\n') 

In [None]:
len(set(sti_nodoi_pmids))

In [None]:
"""
Filtering PMIDs in STI2023 NoDOI that are not in STI Unionlist
    Because some PMIDs in STI2023 NoDOI also exist in STI unionlist, that is, 
    PMIDs are messy and the same PMID can occur for multiple items in the unionlist
"""
# Normalise STI PubMedIDs to integer-like strings; missing values become '0'
unionlist_sti['PubMedID']=unionlist_sti['PubMedID'].fillna('0').astype(int).astype(str)

# Getting STI NoDOI PMIDs that appear in STI unionlist
sti_nodoi_pmids_in_ul= unionlist_sti[unionlist_sti['PubMedID'].isin(sti_nodoi_pmids)]['PubMedID'].tolist()

# STI NoDOI PMIDs that do not appear in STI unionlist.
# Subtracting {''} drops the empty entry left by splitting the PMID file on
# newlines, without raising when the file produced no empty entry.
sti_nodoi_pmids_notin_ul = list(set(sti_nodoi_pmids) - set(sti_nodoi_pmids_in_ul) - {''})

print(f"# of PMIDs with No DOI that were discarded during STI2023 unionlist formation, i.e.,")
print(f"The total # PMIDs without DOI in STI2023 unionlist is {len(sti_nodoi_pmids_notin_ul)}")


In [None]:
# These PMIDs are filtered out in the previous step since they occur for multiple items in the STI unionlist
unionlist_sti[unionlist_sti['PubMedID'].isin(sti_nodoi_pmids)]

In [None]:
'0' in sti_nodoi_pmids_notin_ul

In [None]:
"""
Finding items with PMIDs and no DOIs during STI2023 unionlist formation that are now present Unionlist 2024
i.e. Filtering PMIDs of STI NoDOI in Unionlist 2024
"""
# Rows of the 2024 union list that carry a non-empty PubMedID which is one of
# the STI NoDOI PMIDs missing from the STI union list
df_sti_nodoi_pmid_inUnionlist = unionlist3_adjusted[
    unionlist3_adjusted['PubMedID'].ne('')
    & unionlist3_adjusted['PubMedID'].isin(sti_nodoi_pmids_notin_ul)
]

df_sti_nodoi_pmid_inUnionlist

In [None]:
"""
Finding STI2023 PMIDs with noDOI found in Unionlist 2024 & of which that have DOIs
"""

# Number of matched rows whose 'DOI' value starts with 'nodoi' (taken as the
# non-null count of the first column of the filtered frame)
with_nodoi_ID= df_sti_nodoi_pmid_inUnionlist[df_sti_nodoi_pmid_inUnionlist['DOI']\
                                             .str.startswith('nodoi')].count().iloc[0]

# The remaining matched rows are reported as having a DOI
print(f"Of the total {len(df_sti_nodoi_pmid_inUnionlist)} STI NoDOI PMIDs items found in the new Unionlist 2024:\
        \n{with_nodoi_ID} have 'nodoi' IDs and {len(df_sti_nodoi_pmid_inUnionlist) - with_nodoi_ID} have DOI IDs")

In [None]:
df_sti_nodoi_pmid_inUnionlist

In [None]:
"""
Finding STI2023 PMIDs with noDOI found in unionlist_crws 2024 & of which that have DOIs 
"""
# Number of the matched rows whose `source_old` mentions at least one of the
# four CRWS sources (non-null count of the first column after filtering)
nfoundIn_unionlist_crws=df_sti_nodoi_pmid_inUnionlist[df_sti_nodoi_pmid_inUnionlist['source_old'].str.strip().str.\
                                                      contains(r'WoS_Core|Retraction Watch|Scopus|Crossref')].count().iloc[0]

print(f"# of STI2023 records with noDOI but PMID found in unionlist_crws 2024 is {nfoundIn_unionlist_crws}")

In [None]:
"""
Difference between the numbers of unique DOIs in unionlist CRWS 2024 and in the STI unionlist (a net change in size, not the set of newly added DOIs)
"""
# Subtracts the two unique-DOI counts; this is a net size difference and does
# not identify which DOIs are new.
len(set(unionlist_crws['DOI']))- len(set(unionlist_sti['DOI']))


In [None]:
# DOIs of the newly added CRWS rows, minus the DOIs of the rows that were
# matched through the STI NoDOI PMIDs
d=set(newly_added_crws['DOI'])- set(df_sti_nodoi_pmid_inUnionlist['DOI'])

# Per-column non-null counts of the newly added rows that remain
newly_added_crws[newly_added_crws['DOI'].isin(d)].count()

In [None]:
"""
Finding the DOIs published in Year 2024 (to Jul) in newly added DOIs to unionlist_crws 2024

newly_added_crws = Newly added DOIs from STI2023 to unionlist_crws 2024
df_sti_nodoi_pmid_inUnionlist = unionlist 2024 items where PMID was previously dropped from STI2023

newly_added_crws= unionlist_crws[~unionlist_crws['DOI'].isin(unionlist_sti['DOI'])].copy()
df_sti_nodoi_pmid_inUnionlist = unionlist3_adjusted[(unionlist3_adjusted['PubMedID']!='') &\
                      (unionlist3_adjusted['PubMedID'].isin(sti_nodoi_pmids_notin_ul))]
"""

# Newly added CRWS rows with Year == 2024 whose DOI is NOT among the rows
# matched through the STI NoDOI PMIDs (`~` negates only the isin() mask).
newly_added_crws[~newly_added_crws['DOI'].isin(df_sti_nodoi_pmid_inUnionlist['DOI'])\
                 & (newly_added_crws['Year']==2024)]#.count()

In [None]:
"""
Finding the DOIs published in the remaining Year 2023 (from Feb) after STI2023 unionlist

newly_added_crws = Newly added DOIs from STI2023 to unionlist_crws 2024
df_sti_nodoi_pmid_inUnionlist = unionlist 2024 items where PMID was previously dropped from STI2023

newly_added_crws= unionlist_crws[~unionlist_crws['DOI'].isin(unionlist_sti['DOI'])].copy()
df_sti_nodoi_pmid_inUnionlist = unionlist3_adjusted[(unionlist3_adjusted['PubMedID']!='') &\
                      (unionlist3_adjusted['PubMedID'].isin(sti_nodoi_pmids_notin_ul))]
"""

# Newly added CRWS rows with Year == 2023 whose DOI is NOT among the rows
# matched through the STI NoDOI PMIDs (`~` negates only the isin() mask).
newly_added_crws[~newly_added_crws['DOI'].isin(df_sti_nodoi_pmid_inUnionlist['DOI'])\
                 & (newly_added_crws['Year']==2023)]

In [None]:
# all new items in Unionlist_crws 2024

unionlist_crws[~unionlist_crws['DOI'].isin(unionlist_sti['DOI'])]

In [None]:
"""
Investigating number intersection among covered sources 
"""
all_inter_cov = unionlist3_adjusted['source_new'].copy()
# Number of comma-separated entries in each `source_new` string
list_all_inter_cov = all_inter_cov.apply(lambda x: len(x.split(',')))
# Maps "number of entries" -> "number of items with that many entries"
result_all_inter_cov = Counter(list_all_inter_cov)

print(result_all_inter_cov)

# Use ONE key for both the count and the percentage. The message names the
# largest number of sources found, so the percentage must be taken from that
# same entry rather than from a hard-coded 11.
n_sources_max = max(result_all_inter_cov.keys())
n_items_max = result_all_inter_cov[n_sources_max]
print(f'The total # of items covered by all the {n_sources_max} sources is: \
{n_items_max}, which is \
{n_items_max/sum(result_all_inter_cov.values())*100}%')

In [None]:
"""
% Items that has Retraction Year
"""
# An item counts as having a retraction year when 'RetractionYear' is > 0
rt = int((unionlist3_adjusted['RetractionYear'] > 0).sum())
print(f"The total number of items in the unionlist that have RetractionYear: {rt} of {len(unionlist3_adjusted)} i.e. \n {rt/len(unionlist3_adjusted)*100}%")


In [None]:
"""
Calculate items retracted within 2 years (TimetoRetraction <= 2)
"""
# Items that have a retraction year (> 0) and a TimetoRetraction of at most 2
rt_2yrs= unionlist3_adjusted[(unionlist3_adjusted['RetractionYear']>0) & (unionlist3_adjusted['TimetoRetraction']<=2)] #.count()[0]

# Share relative to `rt`, the number of items that have a retraction year
print(f"Retracted items retracted within 2 years is {len(rt_2yrs)} of {rt}  i.e. {(len(rt_2yrs)/rt)*100}%  ")

In [None]:
sources_indexed

In [None]:
"""
Calculating # of databases co-indexing retracted paper at 100% RIA
"""

# For every source, print the (number of databases, number of items) pairs of
# its 100%-RIA distribution, largest item count first.
for s in sources_indexed:
    print(s)
    print("(Database#, DOIs#)")
    result_100_df = ria100_distribution(s)
    print(f" Total items with 100% RIA: {result_100_df['Count'].sum()}")
    s_result = sorted(
        zip(result_100_df['Category'], result_100_df['Count']),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print(s_result)
    print("*"*20)
    
"""
Sample of analysis:
BCI 83.1% (732/881) in with 8 other databases, no items uniquely indexed as retracted
BIOABS 83.4% (732/878) in with 8 other databases, no items uniquely indexed as retracted 
CCC 59.5% (732/1230) in with 8 other databases, no items uniquely indexed as retracted 
Compendex 30.1% (52/173) with 9 others, no items uniquely indexed as retracted
Crossref 14.9% (732/4897) with 8 others and 3091 i.e. (63.1%, 3091/4897) items uniquely indexed as retracted
GEOBASE 83.5% (344/412) with 3 others, no items uniquely indexed
MEDLINE  59.1% (732/1238) with 8 others,  no items uniquely indexed
PubMed 39.6% (732/1847) with 8 others,  607 i.e (32.9%, 607/1847) items uniquely indexed
Retraction Watch 30.7% (731/2380) with 8 other databases, 566 i.e. (23.8%, 566/2380) items uniquely indexed
Scopus 44.3% (732/1651) with 8 other databases, 4 i.e. (0.24%, 4/1651) items uniquely indexed
Web of Science Core 58.7% (732/1247) with 8 other databases, 1 item uniquely indexed
"""
pass

In [None]:
# Number of STI2023 union-list rows whose 'Year' is 2023
n_sti_2023items = len(unionlist_sti.loc[unionlist_sti['Year'].eq(2023)])
n_sti_2023items

In [None]:
# Rows of the current union list whose 'Year' is 2023 or later
ul_2023 = unionlist3_adjusted.loc[unionlist3_adjusted['Year'].ge(2023)]

In [None]:
# Number of rows in the current union list with Year >= 2023
now_ul_2023items= len(unionlist3_adjusted[unionlist3_adjusted['Year']>=2023])
print(f'The total number of retracted items from 2023 to date in our present unionlist is {now_ul_2023items}')
now_ul_2023items

# % increase between 2023 and now using all items in present UList
# NOTE(review): the baseline counts STI rows with Year == 2023 only, while the
# current count uses Year >= 2023 - confirm the two windows are meant to differ.
((now_ul_2023items - n_sti_2023items)/n_sti_2023items)*100

In [None]:
unionlist3_adjusted.columns

In [None]:
# Find # of items without Retraction Year
# The filter keeps the Year >= 2023 rows whose RetractionYear is neither 2023
# nor 2024; a missing RetractionYear satisfies both inequalities as well.
no_withRY= len(ul_2023[(ul_2023['RetractionYear']!=2023) & (ul_2023['RetractionYear']!=2024)])

print(f'Numbers of items without retraction year is {no_withRY} items between 2023 & now 2024')

In [None]:
# Estimating # of conference|proceeding|workshop|symposium in union list
items= r'conference|proceeding|workshop|symposium'
# na=False treats rows with a missing 'Journal' as non-matching; without it
# the mask contains NaN and cannot be used for boolean indexing.
est_conf= unionlist3_adjusted[unionlist3_adjusted['Journal'].str.contains(items, case=False, na=False)]
print(f'The total number of conference in the union list is {len(est_conf)}')

In [None]:
"""
Investigating items indexed in PubMed and not in Medline
"""

# DOIs of the rows whose `source_old` text mentions each of the two sources
doi_pubmed = unionlist3_adjusted.loc[
    unionlist3_adjusted['source_old'].map(lambda listed: 'PubMed' in listed), 'DOI'
]
doi_medline = unionlist3_adjusted.loc[
    unionlist3_adjusted['source_old'].map(lambda listed: 'Medline' in listed), 'DOI'
]

# DOIs listed for PubMed but not for Medline
diff_pubmed_medline = set(doi_pubmed).difference(doi_medline)

unionlist3_adjusted.loc[unionlist3_adjusted['DOI'].isin(diff_pubmed_medline)]
# To save the table, append to the expression above:
#     .to_csv(data_dir+'items_inpubmed_notin_medline.csv')

In [None]:
"""
Investigating items indexed in Medline and not in PubMed
"""

# DOIs listed for Medline but not for PubMed
diff_medline_pubmed = set(doi_medline).difference(doi_pubmed)
unionlist3_adjusted.loc[unionlist3_adjusted['DOI'].isin(diff_medline_pubmed)]
# To save the table, append to the expression above:
#     .to_csv(data_dir+'items_inmedline_notin_pubmed.csv')

### END