# Step 6: Data Analysis

This notebook computes the Retraction Indexing Agreement (RIA) scores and performs related analyses.

Input File: 
   - Retracted publications Union list (from Step 5)
       - unionlist_completed_{date}.csv 
   - Union list coverage check result (coverednotindexed) files for all sources (from Step 3):
       - ads_coverednotindexed_{date_coverage}.csv
       - bci_coverednotindexed_{date_coverage}.csv
       - bioabs_coverednotindexed_{date_coverage}.csv
       - ccc_coverednotindexed_{date_coverage}.csv
       - compendex_coverednotindexed_{date_coverage}.csv
       - crossref_coverednotindexed_{date_coverage}.csv
       - geobase_coverednotindexed_{date_coverage}.csv
       - georef_coverednotindexed_{date_coverage}.csv
       - ieee_coverednotindexed_{date_coverage}.csv
       - inspec_coverednotindexed_{date_coverage}.csv
       - medline_coverednotindexed_{date_coverage}.csv
       - pubmed_coverednotindexed_{date_coverage}.csv
       - sciencedirect_coverednotindexed_{date_coverage}.csv
       - scopus_coverednotindexed_{date_coverage}.csv
       - webofsciencecore_coverednotindexed_{date_coverage}.csv
       - zoorec_coverednotindexed_{date_coverage}.csv

Output File: 
   - unionlist_completed_ria_{getdate}.csv
  

###### To save the analysis results to your local directory, uncomment the following lines wherever they appear in the code cells below:
       - (i)  "....to_csv(..)" 
       - (ii) "plt.savefig(...)" 

In [None]:
# !pip install dataframe_image
# !pip install upsetplot
# !pip install dfstyle

In [None]:
import os
from collections import Counter
import seaborn as sns
import pandas as pd
import dataframe_image as dfi
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from datetime import date, datetime as dt
import time,datetime
import re
import unicodedata
import ast  # Module to handle literal_eval function

In [None]:
from upsetplot import generate_counts
from upsetplot import plot
from upsetplot import from_memberships
from upsetplot import UpSet
from matplotlib import pyplot
from itertools import combinations

In [None]:
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from matplotlib.ticker import MaxNLocator

%matplotlib notebook

In [None]:
"""
Set up the file directories, all derived from retraction_index_path (the current working directory)
"""
retraction_index_path = os.path.abspath('./.')

# Both directory strings keep their trailing slash because later cells
# build file paths by plain string concatenation.
data_dir = f"{retraction_index_path}/data/"  # data directory
result_dir = f"{retraction_index_path}/result/"

In [None]:
# Create 'result/RetractionIndexingAgreement_ITEM' directory to store RetractionIndexingAgreement Score.
# os.makedirs also creates a missing 'result/' parent (os.mkdir would raise FileNotFoundError there),
# and exist_ok=True makes the cell safe to re-run.
os.makedirs(result_dir+'RetractionIndexingAgreement_ITEM/', exist_ok=True)

In [None]:
def convert_unicode(string: str) -> str:
    """
    Normalize a string and drop every character that cannot be encoded in
    the single-byte encodings applied below (iso-8859-1, latin1, cp1252).
    E.g. '10.\u200b1105/\u200btpc.\u200b010357' ->  '10.1105/tpc.010357'

    :param string: text to clean
    :return: the text after NFKD normalization, without the unencodable characters
    """

    cleaned = string
    # Same three passes, in the same order: NFKD-normalize, then encode
    # (silently ignoring unencodable characters) and decode back.
    for codec in ('iso-8859-1', 'latin1', 'cp1252'):
        decomposed = unicodedata.normalize('NFKD', cleaned)
        cleaned = decomposed.encode(codec, 'ignore').decode(codec)
    return cleaned


In [None]:
# Supply the date of the coverage check for every source, taken from the Step 3 pipeline
# output files ({source}_coverednotindexed_{date}.csv)
date_coverage = dict(
    ads='2024-08-15',
    bci='2024-08-02',
    bioabs='2024-08-13',
    ccc='2024-08-05',
    compendex='2024-07-29',
    crossref='2024-08-06',
    geobase='2024-07-29',
    georef='2024-07-30',
    ieee='2024-08-11',
    inspec='2024-08-02',
    medline='2024-08-05',
    pubmed='2024-07-26',
    sciencedirect='2024-08-05',
    scopus='2024-08-02',
    webofsciencecore='2024-07-30',
    zoorec='2024-08-05',

    retractionwatch='2024-07-03',
)


# Supply the date the retracted items were collected from each source,
# plus the date the unionlist was created
getdate = dict(
    scopus='2024-07-05',
    crossref='2024-07-03',
    retractionwatch='2024-07-03',
    pubmed='2024-07-03',
    geobase='2024-07-05',
    compendex='2024-07-09',
    bci='2024-07-03',
    bioabs='2024-07-03',
    ccc='2024-07-03',
    medline='2024-07-03',
    webofsciencecore='2024-07-03',

    unionlist='2024-07-09',
)

## 1.  Generate New Unionlist For Items Coverage in Sources
- For each source, we import the items that were found in that source (i.e. covered) but are not indexed there as retracted publications  <- {source}_notindexed

In [None]:
"""
Load the unionlist of retracted items together with their retraction years <- unionlist_indexed
"""

unionlist_path = data_dir+'unionlist/unionlist_completed_'+getdate['unionlist']+'.csv'
unionlist_indexed = pd.read_csv(unionlist_path)
unionlist_indexed = unionlist_indexed.drop(columns='Unnamed: 0')

# Collapse repeated source names inside each '; '-separated value and sort them
unionlist_indexed['source'] = unionlist_indexed['source'].map(
    lambda joined: '; '.join(sorted(set(joined.split('; '))))
)

unionlist_indexed #.head(5)

### Loading coverage items not indexed for each source

In [None]:
"""
Load, for every source, the items that are covered by the source but not indexed
there as retracted publications
<- {source}_notindexed
"""


def _read_coverednotindexed(file_key, source_label, drop_index_column):
    """
    Read one '{file_key}_coverednotindexed_{date}.csv' file and tag its rows with a source label.

    :param file_key: file-name prefix, also used as the key into date_coverage
    :param source_label: value written into the 'source' column of every row
    :param drop_index_column: when True, drop the 'Unnamed: 0' column after reading
    :return: the loaded DataFrame
    """
    frame = pd.read_csv(
        data_dir+'coverednotindexed/'+file_key+'_coverednotindexed_'+date_coverage[file_key]+'.csv'
    )
    if drop_index_column:
        frame = frame.drop('Unnamed: 0',axis=1)
    frame['source'] = source_label
    return frame


# Files are read in the same order as before; only some of them have 'Unnamed: 0' dropped.
compendex_notindexed = _read_coverednotindexed('compendex', 'Compendex', True)
crossref_notindexed = _read_coverednotindexed('crossref', 'Crossref', True)
geobase_notindexed = _read_coverednotindexed('geobase', 'GEOBASE', True)
ieee_notindexed = _read_coverednotindexed('ieee', 'IEEE', True)
inspec_notindexed = _read_coverednotindexed('inspec', 'Inspec', True)
pubmed_notindexed = _read_coverednotindexed('pubmed', 'PubMed', True)
scopus_notindexed = _read_coverednotindexed('scopus', 'Scopus', True)
sciencedirect_notindexed = _read_coverednotindexed('sciencedirect', 'ScienceDirect', False)
ads_notindexed = _read_coverednotindexed('ads', 'ADS', False)
bci_notindexed = _read_coverednotindexed('bci', 'BCI', False)
bioabs_notindexed = _read_coverednotindexed('bioabs', 'BIOABS', False)
ccc_notindexed = _read_coverednotindexed('ccc', 'CCC', True)
georef_notindexed = _read_coverednotindexed('georef', 'GeoRef', False)
medline_notindexed = _read_coverednotindexed('medline', 'Medline', False)
zoorec_notindexed = _read_coverednotindexed('zoorec', 'ZOOREC', True)
woscore_notindexed = _read_coverednotindexed('webofsciencecore', 'WoS_Core', False)

In [None]:
"""
Stack the unionlist of indexed items with every source's covered-but-not-indexed items,
then collapse the result to one row per DOI <- unionlist_covers
"""

frames_to_merge = [
    unionlist_indexed,
    crossref_notindexed,
    pubmed_notindexed,
    scopus_notindexed,
    geobase_notindexed,
    compendex_notindexed,
    sciencedirect_notindexed,
    ieee_notindexed,
    georef_notindexed,
    inspec_notindexed,
    bci_notindexed,
    bioabs_notindexed,
    medline_notindexed,
    woscore_notindexed,
    ads_notindexed,
    zoorec_notindexed,
    ccc_notindexed,
]
merged_withdoi = pd.concat(frames_to_merge)

# Bibliographic fields take their first value per DOI;
# the source labels of all rows sharing a DOI are joined with '; '.
aggregation = dict.fromkeys(['Author', 'Title', 'Year', 'Journal', 'PubMedID'], 'first')
aggregation['source'] = '; '.join

unionlist_covers = merged_withdoi.groupby('DOI').agg(aggregation).reset_index()

unionlist_covers.fillna('', inplace=True)


In [None]:
"""
Deduplicate the 'source' values of unionlist_covers (the dataframe of all covered items)
and rename that column to 'source_new'.
"""

# Deduplicate and sort the '; '-separated source names
deduplicated_sources = unionlist_covers['source'].map(
    lambda joined: '; '.join(sorted(set(joined.split('; '))))
)
unionlist_covers['source'] = deduplicated_sources

# Rename the column
unionlist_covers = unionlist_covers.rename(columns={'source': 'source_new'})
unionlist_covers


## Section 2: Calculation of RetractionIndexAgreement by Source

In [None]:
"""
Merge unionlist_covers (covered items) with unionlist_indexed (items indexed as retracted).
The 'source' column of unionlist_indexed is carried over as 'source_old'.
"""

indexed_columns = unionlist_indexed[['DOI','source','MainCategory','RetractionYear']]
indexed_columns = indexed_columns.rename(columns={'source':'source_old'})
unionlist3 = unionlist_covers.merge(indexed_columns, on='DOI')

# Store 'source_old' & 'source_new' values as sorted lists
for source_column in ('source_old', 'source_new'):
    unionlist3[source_column] = unionlist3[source_column].apply(lambda x: sorted(x.split('; ')))

# Convert Year to int
unionlist3['Year'] = unionlist3['Year'].astype(int)

# Missing RetractionYear becomes 0, then convert to int
unionlist3['RetractionYear'] = unionlist3['RetractionYear'].fillna(0).astype(int)

# Convert PubMedID to a string; empty values go through 0 and end up as ''
unionlist3['PubMedID']=unionlist3['PubMedID'].replace('','0').astype(int).replace(0,'').astype(str).str.strip()

# Years elapsed between publication and retraction
unionlist3['TimetoRetraction'] = unionlist3['RetractionYear'].sub(unionlist3['Year'])
unionlist3.head()

In [None]:
"""
Drop from source_new the sources that have no indexed items, ahead of the initial calculations
"""
unionlist3_adjusted = unionlist3.copy(deep=True)

# 'ADS','GeoRef','IEEE','Inspec','ScienceDirect', 'ZOOREC' are removed since they have no indexed retracted items
removed_sources_new= ['ADS','GeoRef','IEEE','Inspec','ScienceDirect', 'ZOOREC']
unionlist3_adjusted['source_new'] = unionlist3_adjusted['source_new'].map(
    lambda sources: [item for item in sources if item not in removed_sources_new]
)

# Store both source lists as their string representation
for list_column in ('source_new', 'source_old'):
    unionlist3_adjusted[list_column] = unionlist3_adjusted[list_column].astype(str)

unionlist3_adjusted.head()

In [None]:
# Starting Calculations
# Item score = (# sources indexing the item as retracted) / (# sources covering the item).
# Both source columns hold stringified lists, so each value is parsed back before counting.
count_sources_covering_item = unionlist3_adjusted['source_new'].map(
    lambda text: len(ast.literal_eval(text))
)

count_sources_indexing_item_as_retracted = unionlist3_adjusted['source_old'].map(
    lambda text: len(ast.literal_eval(text))
)

RetractionIndexingDiscrepancy_ITEM = count_sources_indexing_item_as_retracted.div(count_sources_covering_item)

# Add the score to the dataframe as an integer percentage (astype(int) truncates the fraction)
unionlist3_adjusted['RetractionIndexingAgreement_ITEM(%)'] = RetractionIndexingDiscrepancy_ITEM.mul(100).astype(int)
unionlist3_adjusted = unionlist3_adjusted.sort_index()

# Saving file to unionlist folder
# unionlist3_adjusted.to_csv(data_dir+f"unionlist/unionlist_completed_ria_{getdate['unionlist']}.csv",index=False)

unionlist3_adjusted.head()

In [None]:
"""
Export of the Retraction Indexing Agreement scores into the 'RetractionIndexingAgreement_ITEM' folder
v = unique retraction indexing agreement scores from unionlist3_adjusted, stored as an array
c = number of items having each unique score, stored as an array
s = v converted to a list
"""

score_column = unionlist3_adjusted['RetractionIndexingAgreement_ITEM(%)']
v, c = np.unique(score_column, return_counts=True)
s = v.tolist()

# One subset (and, once uncommented, one CSV file) per distinct score
for i in s:
    exp = unionlist3_adjusted.loc[score_column.eq(i)]
#     exp.to_csv(result_dir+'/RetractionIndexingAgreement_ITEM/RetractionIndexingAgreement_ITEM_' + str(i) + '.csv' )

# unionlist3_adjusted.to_csv(result_dir+'/RetractionIndexingAgreement_ITEM/RetractionIndexingAgreement_ITEM_all.csv')

In [None]:
# Display the unique item-level scores (v) alongside the number of items having each score (c)
v,c

### 2.1  Calculation of RetractionIndexAgreement for Each Source

In [None]:
def count_source(df: pd.DataFrame, source_name: str) -> tuple:
    """
    Count the distinct DOIs whose 'source_old' and 'source_new' values mention a source.

    The source name is matched as a literal substring (regex=False), so a name
    containing regular-expression metacharacters such as '.', '(' or '+' is
    matched as written instead of being interpreted as a pattern.
    Missing values in either column count as "no match".

    :param df: DataFrame with 'DOI', 'source_old' and 'source_new' columns
    :param source_name: source to look up

    :return: tuple of no_inOLD, no_inNEW
    """

    def _distinct_dois(column: str) -> int:
        # Number of distinct DOIs among the rows whose `column` text contains the source name
        mask = df[column].str.contains(source_name, na=False, regex=False)
        return len(set(df.loc[mask, 'DOI']))

    no_inOLD = _distinct_dois('source_old')
    no_inNEW = _distinct_dois('source_new')

    return no_inOLD, no_inNEW

In [None]:
"""
Count the # of items indexed and covered in each source
Each *_count is a tuple: (# indexed as retracted, # covered)
"""

# Sources with retraction indexing: counted from the source_old / source_new columns
(bci_count,
 bioabs_count,
 ccc_count,
 crossref_count,
 compendex_count,
 geobase_count,
 pubmed_count,
 medline_count,
 scopus_count,
 webofsciencecore_count,
 retractionwatch_count) = (
    count_source(unionlist3_adjusted, label)
    for label in ('BCI', 'BIOABS', 'CCC', 'Crossref', 'Compendex', 'GEOBASE',
                  'PubMed', 'Medline', 'Scopus', 'WoS_Core', 'Retraction Watch')
)

# Sources without retraction indexing: 0 indexed, covered = size of their covered-not-indexed file
(ads_count,
 georef_count,
 ieee_count,
 inspec_count,
 sciencedirect_count,
 zoorec_count) = (
    (0, len(frame))
    for frame in (ads_notindexed, georef_notindexed, ieee_notindexed,
                  inspec_notindexed, sciencedirect_notindexed, zoorec_notindexed)
)


In [None]:
"""
Source name lists: display labels for all sources, and the unionlist names of the
sources that have retraction indexing
"""

# Display labels, one per source
all_sources = [
    'ADS',
    'BCI',
    'BIOABS',
    'CCC',
    'Compendex',
    'Crossref',
    'GEOBASE',
    'GeoRef',
    'IEEE',
    'INSPEC',
    'MEDLINE',
    'PubMed',
    'Retraction Watch',
    'ScienceDirect',
    'Scopus',
    'Web of Science Core',
    'ZOOREC',
]

# Names as they appear in the unionlist:
#   Web of Science Core is listed as WoS_Core
#   Biological Abstracts is listed as BIOABS
sources_indexed = [
    'BCI',
    'BIOABS',
    'CCC',
    'Compendex',
    'Crossref',
    'GEOBASE',
    'Medline',
    'PubMed',
    'Retraction Watch',
    'Scopus',
    'WoS_Core',
]


In [None]:
"""
Per source: unionlist size minus the rows of its covered-not-indexed file minus its
indexed count -> not_covered list
"""
unionlist_size = len(unionlist3_adjusted)

# Same source order as the summary table built in the next cell
not_covered = [
    unionlist_size - len(notindexed_frame) - counts[0]
    for notindexed_frame, counts in (
        (bci_notindexed, bci_count),
        (bioabs_notindexed, bioabs_count),
        (ccc_notindexed, ccc_count),
        (compendex_notindexed, compendex_count),
        (crossref_notindexed, crossref_count),
        (geobase_notindexed, geobase_count),
        (medline_notindexed, medline_count),
        (pubmed_notindexed, pubmed_count),
        (scopus_notindexed, scopus_count),
        (woscore_notindexed, webofsciencecore_count),
    )
]

In [None]:
"""
- Compute the Retraction Indexing Agreement score per source (sources with retraction indexing only)
- Tabulate the # of items indexed, covered but not indexed, and not covered
"""

table = pd.DataFrame({
    'source': ['BCI', 'BIOABS', 'CCC','Compendex', 'Crossref', 'GEOBASE', 'MEDLINE', 'PubMed', 'Scopus', 'Web of Science Core'],
    'indexed_as_retracted': [
        counts[0]
        for counts in (bci_count, bioabs_count, ccc_count, compendex_count, crossref_count,
                       geobase_count, medline_count, pubmed_count, scopus_count,
                       webofsciencecore_count)
    ],
    'covered_but_not_indexed_as_retracted': [
        len(frame)
        for frame in (bci_notindexed, bioabs_notindexed, ccc_notindexed, compendex_notindexed,
                      crossref_notindexed, geobase_notindexed, medline_notindexed,
                      pubmed_notindexed, scopus_notindexed, woscore_notindexed)
    ],
    'not_covered': not_covered,
})

# Retraction agreement per source: indexed / (indexed + covered but not indexed), as a percentage
ria_source=[]

for i in range(len(table)):
    D = table.loc[i, 'indexed_as_retracted'] + table.loc[i, 'covered_but_not_indexed_as_retracted']
    ria_source.append(round(((table.loc[i, 'indexed_as_retracted']/D)*100), 2))

table['RetractionIndexingAgreement_SOURCE(%)'] = list(ria_source)

# Order rows by source name and renumber the index
table = table.sort_values(by='source').reset_index(drop=True)
table


In [None]:
"""
Compute the Retraction Indexing Agreement score for all sources
"""
overview_RIA_source= pd.DataFrame()
overview_RIA_source['Source']= all_sources

# (indexed, covered) tuples, in the same order as all_sources
sources_count= [
    ads_count, bci_count, bioabs_count, ccc_count, compendex_count, crossref_count,
    geobase_count, georef_count, ieee_count, inspec_count, medline_count, pubmed_count,
    retractionwatch_count, sciencedirect_count, scopus_count, webofsciencecore_count,
    zoorec_count,
]

total_unionlist = len(unionlist3_adjusted)


indexed_as_retracted=[]
covered_notindexed=[]
not_coveredinUnionlist=[]

for source in sources_count:
    n_indexed, n_covered = source
    indexed_as_retracted.append(n_indexed)
    covered_notindexed.append(n_covered - n_indexed)
    not_coveredinUnionlist.append(total_unionlist - n_covered)

overview_RIA_source['Indexed_as_retracted'] = indexed_as_retracted
overview_RIA_source['Covered_notindexed'] = covered_notindexed
overview_RIA_source['Not_coveredinUnionlist'] = not_coveredinUnionlist


# RIA score per source: indexed / (indexed + covered but not indexed), as a percentage
covered_total = overview_RIA_source['Indexed_as_retracted'] + overview_RIA_source['Covered_notindexed']
overview_RIA_source['RetractionIndexingAgreement_SOURCE(%)'] = (
    (overview_RIA_source['Indexed_as_retracted'] / covered_total) * 100
).round(2)


# Uncomment for confirmation -  all the rows must be equal to the total records of the Unionlist
# overview_RIA_source['Total']= overview_RIA_source['Indexed_as_retracted']+\
#                                 overview_RIA_source['Covered_notindexed'] + overview_RIA_source['Not_coveredinUnionlist']

overview_RIA_source

In [None]:
"""
Save overview_RIA_source result to folder
- uncomment last line to save 
"""

# overview_RIA_source.to_csv(result_dir+'RIA_score_sources_data.csv')

In [None]:
"""
Double-checking result in RIA_score for 'CCC'
"""

# Confirming the calculation.
# NOTE: this cell rebinds `c` (previously the score counts from np.unique) and `i`.
covers_ccc = unionlist3_adjusted['source_new'].apply(lambda x: 'CCC' in x)
indexes_ccc = unionlist3_adjusted['source_old'].apply(lambda x: 'CCC' in x)

# .count() returns one non-null count per column, labelled by column name.
# .iloc[0] takes the first column's count by position; the previous `[0]` relied on
# integer-position fallback of Series.__getitem__, which pandas has deprecated.
c = unionlist3_adjusted[covers_ccc].count().iloc[0]   # rows whose source_new mentions CCC
i = unionlist3_adjusted[indexes_ccc].count().iloc[0]  # rows whose source_old mentions CCC
print(c,'\t',i)
print((i/c)*100)
print(c - i)

### 2.2  Calculation of RetractionIndexAgreement by Sources

In [None]:
"""
Partition the sources into groups (rows are selected by position in overview_RIA_source)
"""

# ADS, GeoRef, IEEE, INSPEC, ScienceDirect, ZOOREC
rows_without_indexing = [0,7,8,9,13,16]
overview_RIA_source_1 = overview_RIA_source.iloc[rows_without_indexing,:]

# Sources that index retracted publications, excluding Retraction Watch
# (the result of this expression is not stored)
overview_RIA_source.iloc[[1,2,3,4,5,6,10,11,14,15],:].copy()

# BCI, BIOABS, CCC, GEOBASE
rows_group_2_1 = [1,2,3,6]
overview_RIA_source_2_1 = overview_RIA_source.iloc[rows_group_2_1,:].copy()

# Compendex,  Crossref, MEDLINE, PubMed, Retraction Watch, Scopus, Web of Science Core
rows_group_2_2 = [4,5,10,11,12,14,15]
overview_RIA_source_2_2 = overview_RIA_source.iloc[rows_group_2_2,:].copy()
overview_RIA_source_2_2

##### Uncomment the last line of each plotting cell, "plt.savefig(...)", to save the figure produced by that analysis

In [None]:
"""
Overall Plot of "Indexed as Retracted", "Covered but Not Indexed as Retracted", "Not Covered"
"""

fig0, ax0 = plt.subplots(figsize=(18, 8))
fig0.patch.set_facecolor('#ccd9ff')  # Set the background color of the figure

table = overview_RIA_source.copy()

# Bar positions (one per source) and the three stacked series
r1 = np.arange(len(table))
x = np.arange(len(table))
y1 = table['Indexed_as_retracted']
y2 = table['Covered_notindexed']
y3 = table['Not_coveredinUnionlist']

# Width of the bars
bar_width = 0.7

# Bottom layer, then each following layer stacked on the ones below it
ax0.bar(r1, y1, width=bar_width, color='#377eb8', hatch='/', edgecolor='white',
        label='Indexed as Retracted')
ax0.bar(r1, y2, width=bar_width, bottom=y1, color='#f781bf', hatch='o', edgecolor='white',
        label='Covered but Not Indexed as Retracted')
ax0.bar(r1, y3, width=bar_width, bottom=y1 + y2, color='#ff7f00', hatch='+', edgecolor='white',
        label='Not Covered')

# Axis labels; tick labels are rotated for better visibility
sources_adjusted = all_sources

ax0.set_xticks(r1)
ax0.set_xticklabels(sources_adjusted, rotation=45, ha="right")
ax0.set_xlabel("Source", fontsize=14)
ax0.set_ylabel("Number of DOIs", fontsize=14)

ax0.tick_params(axis='x', labelsize=14)
ax0.set_yticks(range(0, 90000, 10000)) #70900

ax0.legend(["Indexed as Retracted", "Covered but Not Indexed as Retracted", "Not Covered"],fontsize=15, loc='upper left')

# Show the plot
fig0.tight_layout()
plt.show()

# Save result to folder
# plt.savefig(result_dir+'recordsineachsource.png')

In [None]:
"""
Subplot: Overall Plot of "Indexed as Retracted", "Covered but Not Indexed as Retracted", "Not Covered"
"""

# Three side-by-side panels that share one y-axis (sharey=True)
fig0_1, ax0_1 = plt.subplots(1, 3, figsize=(12, 7), sharey=True)
fig0_1.subplots_adjust(hspace=0.5, wspace=0.05) # Space between charts

# Set the width of the bars
bar_width = 0.8


# First Subplot: rows of overview_RIA_source_2_1
table001 = overview_RIA_source_2_1.copy()

# Set the positions of the bars on the x-axis
r1_0 = np.arange(len(table001['Source']))

# Define data for plotting
y1_0 = table001['Indexed_as_retracted']
y2_0 = table001['Covered_notindexed']
y3_0 = table001['Not_coveredinUnionlist']

# Stacked bars: each series sits on top of the previous ones via `bottom`
ax0_1[0].bar(r1_0, y1_0, color='#377eb8', hatch='/', width=bar_width, edgecolor='white', label='Indexed as Retracted')
ax0_1[0].bar(r1_0, y2_0, bottom=y1_0, color='#f781bf', hatch='o', width=bar_width, edgecolor='white', label='Covered but Not Indexed as Retracted')
ax0_1[0].bar(r1_0, y3_0, bottom=y1_0 + y2_0, color='#ff7f00', hatch='+', width=bar_width, edgecolor='white', label='Not Covered')

# Set x-axis ticks with labels from 'Source' column
ax0_1[0].set_xticks(r1_0)
ax0_1[0].set_xticklabels(table001['Source'], rotation=45, ha="right")


# Second Subplot: rows of overview_RIA_source_2_2
table002 = overview_RIA_source_2_2.copy()
r1_1 = np.arange(len(table002['Source']))

# Define data for plotting
y1_1 = table002['Indexed_as_retracted']
y2_1 = table002['Covered_notindexed']
y3_1 = table002['Not_coveredinUnionlist']

ax0_1[1].bar(r1_1, y1_1, color='#377eb8', hatch='/', width=bar_width, edgecolor='white', label='Indexed as Retracted')
ax0_1[1].bar(r1_1, y2_1, bottom=y1_1, color='#f781bf', hatch='o', width=bar_width, edgecolor='white', label='Covered but Not Indexed as Retracted')
ax0_1[1].bar(r1_1, y3_1, bottom=y1_1 + y2_1, color='#ff7f00', hatch='+', width=bar_width, edgecolor='white', label='Not Covered')

# Set x-axis ticks with labels from 'Source' column
ax0_1[1].set_xticks(r1_1)
ax0_1[1].set_xticklabels(table002['Source'], rotation=45, ha="right")


# Third Subplot: rows 0, 7, 8, 9, 13 and 16 of overview_RIA_source, selected by position
table003 = overview_RIA_source.iloc[[0,7,8,9,13,16],:].copy()
r1_2 = np.arange(len(table003['Source']))

# Define data for plotting
y1_2 = table003['Indexed_as_retracted']
y2_2 = table003['Covered_notindexed']
y3_2 = table003['Not_coveredinUnionlist']

ax0_1[2].bar(r1_2, y1_2, color='#377eb8', hatch='/', width=bar_width, edgecolor='white', label='Indexed as Retracted')
ax0_1[2].bar(r1_2, y2_2, bottom=y1_2, color='#f781bf', hatch='o', width=bar_width, edgecolor='white', label='Covered but Not Indexed as Retracted')
ax0_1[2].bar(r1_2, y3_2, bottom=y1_2 + y2_2, color='#ff7f00', hatch='+', width=bar_width, edgecolor='white', label='Not Covered')

# Set x-axis ticks with labels from 'Source' column
ax0_1[2].set_xticks(r1_2)
ax0_1[2].set_xticklabels(table003['Source'], rotation=45, ha="right")

# Set the background color of the figure
fig0_1.patch.set_facecolor('#ccd9ff')

# Only the first panel gets a y-axis label
ax0_1[0].set_ylabel("Number of Items", fontsize=14)
plt.subplots_adjust(left=0.1, right=.95, top=0.87, bottom=0.2) #top=0.95

# Change outline colors
axes = [ax0_1[0], ax0_1[1], ax0_1[2]]
for ax in axes:
    for spine in ax.spines.values():
        spine.set_edgecolor('black')
        spine.set_visible(True)


# Add legend above the graphs at the center
# NOTE(review): plt.legend acts on pyplot's current axes (presumably the third panel), and
# bbox_to_anchor=(-0.6, 1.18) then shifts the legend left and above that panel - confirm.
plt.legend(["Indexed as Retracted", "Covered but Not Indexed as Retracted", "Not Covered"], fontsize=10, loc='upper center', bbox_to_anchor=(-0.6, 1.18), ncol=1)
plt.show()

# Save result to folder
# plt.savefig(result_dir+'recordsineachsource_subplot.png')

In [None]:
"""
Retraction Indexing Agreement Score by Source 
"""

sns.set_theme()
fig01_1, ax01_1 = plt.subplots(figsize=(11, 6))
fig01_1.patch.set_facecolor('#ccd9ff')  # Set the background color of the figure

# Color -> upper bound of its score band (checked in this order)
cmap01_1 = {
    'red': 25,
    'orange': 50,
    'blue': 75,
    'green': 100}

def get_color(value):
    # First color whose threshold is >= value; None when no band matches
    return next((color for color, threshold in cmap01_1.items() if value <= threshold), None)

# One color per source, chosen from that source's score
mycolor_01_1 = dict(zip(overview_RIA_source['Source'],
                        overview_RIA_source['RetractionIndexingAgreement_SOURCE(%)'].apply(get_color)))

sns.scatterplot(data=overview_RIA_source, x="Source", y="RetractionIndexingAgreement_SOURCE(%)", hue="Source",
                palette=mycolor_01_1,
                size="RetractionIndexingAgreement_SOURCE(%)", legend=False, sizes=(20, 1000), ax=ax01_1)

ax01_1.set_yticks(range(0, 115, 10))
# ax01_1.set_title('Source RetractionIndexingAgreement Score', size=15)
plt.xticks(rotation=45, ha="right")  # Rotate labels for better visibility

ax01_1.tick_params(axis='x', labelsize=13)
ax01_1.set_xlabel("Source", fontsize=14)


# Annotate every point with its percentage on a white background
for i, point in enumerate(overview_RIA_source.iterrows()):
    x = point[1]["Source"]
    y = point[1]["RetractionIndexingAgreement_SOURCE(%)"]
    label = f"{y:.2f}%"  # Format percentage label
    ax01_1.annotate(label, (x, y), textcoords="offset points", xytext=(0, 7), ha='left', fontsize=11, color='black',
                    bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.1'))

# Remove 'Source Label'
ax01_1.set_xlabel('')

fig01_1.tight_layout()
plt.show()

# Save results to folder
# plt.savefig(result_dir+'ria_scores_colored1.png')

In [None]:
"""
Retraction Indexing Agreement Score by source (excluding non-indexed source only & Retraction Watch )
"""

# Selecting source that indexed retracted publication only (excluding RetractionWatch)
overview_RIA_source_2= overview_RIA_source.iloc[[1,2,3,4,5,6,10,11,14,15],:].copy()

sns.set_theme()
fig01_2, ax01_2 = plt.subplots(figsize=(11, 6))

# Color -> upper bound of its score band (checked in this order)
cmap01_1 = {
    'red': 25,
    'orange': 50,
    '#1f77b4': 75,
    'green': 100}

def get_color(value):
    # First color whose threshold is >= value; falls through (returns None) when no band matches
    for color, threshold in cmap01_1.items():
        if value <= threshold:
            return color

# Use the colormap to assign colors based on the percentages
overview_RIA_source_2['Color'] = overview_RIA_source_2['RetractionIndexingAgreement_SOURCE(%)'].apply(get_color)

# Assigning color to Sources
mycolor_01_1 = dict(zip(overview_RIA_source_2['Source'], overview_RIA_source_2['Color']))

# The helper column is only needed to build the palette
overview_RIA_source_2.drop(['Color'],axis=1, inplace=True)

sns.scatterplot(data=overview_RIA_source_2, x="Source", y="RetractionIndexingAgreement_SOURCE(%)", hue="Source",
                palette=mycolor_01_1,
                size="RetractionIndexingAgreement_SOURCE(%)", legend=False, sizes=(20, 1000), ax=ax01_2)

ax01_2.set_yticks(range(0, 105, 10))
fig01_2.patch.set_facecolor('#ccd9ff')  # Set the background color of the figure
# ax01_2.set_title('Source RetractionIndexingAgreement Score', size=15)
plt.xticks(rotation=45, ha="right")  # Rotate labels for better visibility

# plt.xlabel("Source", fontsize=14)

ax01_2.tick_params(axis='x', labelsize=13)
plt.xlabel("Source", fontsize=14)


# Annotate points with percentages and add a background.
# Iterate over the plotted subset (overview_RIA_source_2); looping over the full
# overview_RIA_source would also annotate sources that are not drawn in this figure.
for i, point in enumerate(overview_RIA_source_2.iterrows()):
    x = point[1]["Source"]
    y = point[1]["RetractionIndexingAgreement_SOURCE(%)"]
    label = f"{y:.2f}%"  # Format percentage label
    plt.annotate(label, (x, y), textcoords="offset points", xytext=(0, 7), ha='left', fontsize=11, color='black',
                 bbox=dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.1'))


fig01_2.patch.set_facecolor('#ccd9ff')

# Remove 'Source Label'
ax01_2.set_xlabel('') 

plt.tight_layout() 
plt.show()

# Save results to folder
# plt.savefig(result_dir+'ria_scores_colored2.png')

### 2.3  Pairwise Calculation of RetractionIndexAgreement Between Two Sources

In [None]:
"""
Calculating pairwise RetractionIndexAgreement between two sources
(i) Pair up sources to check their pairwise RetractionIndexAgreement -> unique_pairs
(ii) Calculating pairwise RetractionIndexAgreement:
     (# DOIs both sources index as retracted) / (# DOIs both sources cover), as a percentage
"""

from itertools import combinations

sources_indexed= ['BCI', 'BIOABS', 'CCC','Compendex','Crossref', 'GEOBASE', 'Medline',
                   'PubMed', 'Retraction Watch', 'Scopus', 'WoS_Core']


pairswise2_RIA_Scores = []

# combinations() already yields every unordered pair exactly once, in list order,
# so no extra de-duplication pass is needed
unique_pairs = list(combinations(sources_indexed, 2))

for pair in unique_pairs:

    source1, source2 = pair

    # DOIs whose coverage column (source_new) / indexing column (source_old) mentions each source.
    # regex=False: source names are literal text, not patterns.
    source1_covered= unionlist3_adjusted[unionlist3_adjusted['source_new'].str.contains(source1, regex=False)]['DOI']
    source2_covered= unionlist3_adjusted[unionlist3_adjusted['source_new'].str.contains(source2, regex=False)]['DOI']

    source1_indexed= unionlist3_adjusted[unionlist3_adjusted['source_old'].str.contains(source1, regex=False)]['DOI']
    source2_indexed= unionlist3_adjusted[unionlist3_adjusted['source_old'].str.contains(source2, regex=False)]['DOI']

    pairwise_covered= len(set(source1_covered) & set(source2_covered))
    pairwise_indexed= len(set(source1_indexed) & set(source2_indexed))

    # Guard the division on its denominator: with no DOI covered by both sources the
    # score is defined as 0. (A zero numerator needs no special case - it yields 0 anyway.)
    if pairwise_covered == 0:
        indexingAgreement_score = 0
    else:
        # Pairwise RIA score between the two sources
        indexingAgreement_score = round((pairwise_indexed/pairwise_covered) *100,2)

#     print(indexingAgreement_score)
#     print('********')

    pairswise2_RIA_Scores.append([source1,source2, indexingAgreement_score])

# pairswise2_RIA_Scores
    

In [None]:
# Tabulate the pairwise scores, then switch to the display names used in the figures
pairwise_ria = (
    pd.DataFrame(
        pairswise2_RIA_Scores,
        columns=['source1', 'source2', 'RetractionIndexingAgreement_SOURCE(%)'],
    )
    .replace('Medline', 'MEDLINE')
    .replace('WoS_Core', 'Web of Science Core')
)

# Append the commented call to save the table to the result folder
pairwise_ria  # .to_csv(result_dir+'pairwise_ria_score_sources_data.csv')

In [None]:
"""
HeatMap for Pairwise Retraction Indexing Agreement Between Pair Sources
"""
from matplotlib.colors import ListedColormap
# Assuming pairwise_ria has the required data

# Wide matrix: rows = source1, columns = source2, cell = pairwise score
heatmap_data = pairwise_ria.pivot(index='source1', columns='source2', values='RetractionIndexingAgreement_SOURCE(%)')

# Create a figure and axis
fig02, ax02 = plt.subplots(figsize=(12, 13))

# Four discrete colours; with vmin=0 and vmax=100 below this gives four equal-width bands
cmap = ListedColormap(['red', 'orange', '#1f77b4', 'green'])

# Create the heatmap using Seaborn and the ax object
cax = sns.heatmap(heatmap_data, annot=True, cmap=cmap, fmt='.2f', linewidths=0.0, vmin=0, vmax=100, 
                  ax=ax02, annot_kws={"size": 18})

# Set labels and title
ax02.set_xlabel('')
ax02.set_ylabel('')

# ax02.set_title('Pairwise RetractionIndexingAgreement Between Sources', size=15)

# Set the font size of x-axis and y-axis labels
ax02.tick_params(axis='x', labelsize=13)  # Font size for x-axis labels
ax02.tick_params(axis='y', labelsize=13)  # Font size for y-axis labels

# Hide the default color bar
cax.collections[0].colorbar.remove()

# Show the colorbar and set its label
# (re-created so the ticks sit on the band boundaries 0/25/50/75/100)
cbar = cax.figure.colorbar(cax.collections[0], ax=ax02, ticks=[0, 25, 50, 75, 100])
cbar.set_label('RetractionIndexingAgreement (%)',fontsize= 13)

# Set the background color of the figure
fig02.patch.set_facecolor('#ccd9ff')

# NOTE(review): tight_layout() below recomputes the subplot parameters, so these manual
# margins are presumably overridden - confirm which of the two is wanted.
plt.subplots_adjust(left=0.05, right=1.05, top=0.95, bottom=0.05)  # Remove space outside the plot

ax02.grid(False)

plt.tight_layout() 
plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'pairwise_RetractionIndexingAgreement_sources.png')


## Section 3: Calculation of RetractionIndexingAgreement by Item

In [None]:
"""
Distinct item-level scores and how often each one occurs:
v = sorted unique values of 'RetractionIndexingAgreement_ITEM(%)' in unionlist3_adjusted (array)
c = number of items carrying each value in v (array, same order)
"""
v, c = np.unique(
    unionlist3_adjusted['RetractionIndexingAgreement_ITEM(%)'].to_numpy(),
    return_counts=True,
)
v, c

In [None]:
"""
Overall Retraction Indexing Agreement by Items
"""

sns.set_theme()

# Create a figure and axis
fig3, ax3 = plt.subplots(figsize = (10, 10))

# Use Seaborn's barplot with 'counts' on the y-axis and 'unique RetractionIndexingAgreement scores' on the x-axis
# NOTE(review): with orient='h' the bar length follows x (the score v) and every count in c
# becomes a category on the y-axis - confirm this is the intended reading of the chart.
# NOTE(review): `ci` is deprecated in newer seaborn releases in favour of errorbar=None.
sns.barplot(x=v, y=c,  ax=ax3, orient='h', color="#1f77b4", ci= None) #errorbar=None

ax3.set_xticks(range(0, 101,5))

# Display the numbers on the bars
# (the label is the bar length truncated to an integer, placed at the bar end)
for p in ax3.patches:
    ax3.text(p.get_width(), p.get_y() + p.get_height() / 2, f'{int(p.get_width())}',
            ha='left',va='center')

# Define the positions for vertical dashed lines
vertical_lines = [25,50,75]

# Plot vertical dashed lines at the specified positions
for line in vertical_lines:
    ax3.axvline(x=line, color='red', linestyle='--', linewidth=0.7)

# Set plot labels and title
ax3.set_xlabel('RetractionIndexingAgreement_ITEM(%)')
ax3.set_ylabel('Number of Items') #Number of DOIs
# ax3.set_title('RetractionIndexingAgreement Score By Item')

plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'indexing_agreement_score_by_items.png')

In [None]:
"""
Mapping Color to the Count of Items in Retraction Indexing Agreement by Items
"""

# One row per distinct item-level score: number of items with that score, then the score itself
items_score_df = pd.DataFrame({'noDOIs': c, 'RIA_Score(%)': v})

# Colour bands, keyed by colour; the value is the last whole percent that belongs to the band
# (red: <25%, orange: 25% to <50%, blue: 50% to <75%, green: 75% to 100%)
cmap03_0 = {
    'red': 24,
    'orange': 49,
    '#1f77b4': 74, #blue
    'green': 100}

def get_color2(value):
    """
    Return the band colour for a score given in percent.

    The score is floored to a whole percent before it is compared with the
    inclusive band limits in cmap03_0, so fractional scores stay in the band
    the legend describes (e.g. 24.5 is '<25%' -> 'red', not 'orange').

    :param value: score in percent
    :return: colour name / hex string, or None if no band matches (e.g. NaN)
    """
    whole_percent = np.floor(value)
    for color, threshold in cmap03_0.items():
        if whole_percent <= threshold:
            return color
    return None

# Share of each score among all records of unionlist_indexed, in percent
items_score_df['Fraction_in_Unionlist(%)'] = round((items_score_df['noDOIs'] / len(unionlist_indexed)) * 100, 2)

# Use the colormap to assign colors based on the percentages
items_score_df['Color'] = items_score_df['RIA_Score(%)'].apply(get_color2)

items_score_df

In [None]:
"""
Count, for every source, how many of its indexed items fall on each item-level score
"""
# Start from the full list of distinct scores so every source is aligned on the same rows
sources_ria_df = pd.DataFrame({'RIA_Score': v})

for source in sources_indexed:
    # Item scores of the records whose 'source_old' mentions this source
    is_indexed_here = unionlist3_adjusted['source_old'].str.contains(source)
    scores_here = unionlist3_adjusted.loc[is_indexed_here, 'RetractionIndexingAgreement_ITEM(%)']

    ria_score, nDOI = np.unique(scores_here, return_counts=True)
    per_source_counts = pd.DataFrame({'RIA_Score': ria_score, source: nDOI})

    # Left merge keeps all scores; scores the source never reaches become NaN for now
    sources_ria_df = sources_ria_df.merge(per_source_counts, on='RIA_Score', how='left')

# Missing combinations mean "no items": turn them into 0 and index the table by score
sources_ria_df = sources_ria_df.fillna(0.0).set_index('RIA_Score')

# Sources as rows, scores as columns
sources_ria_df_ = sources_ria_df.T
sources_ria_df_

In [None]:
# Move the source names out of the index into a regular 'Source' column
sources_ria_df_ = sources_ria_df_.reset_index().rename(columns={'index': 'Source'})

# Normalise the raw counts of each source (row) into percentages of that row's total
score_counts = sources_ria_df_.iloc[:, 1:]          # every column except 'Source'
row_sums = score_counts.sum(axis=1)
numeric_df = score_counts.apply(pd.to_numeric, errors='coerce')
row_percentages = numeric_df.div(row_sums, axis=0) * 100

result_item_score = pd.concat([sources_ria_df_['Source'], row_percentages], axis=1)

result_item_score

In [None]:
"""
Retraction Indexing Agreement by Items: Fraction of Items in the Union list 
"""
fig3_0, ax3_0 = plt.subplots(figsize=(12, 8))

# Create the bar plot
# One bar per distinct score; bar height = share of items with that score (single colour)
sns.barplot(x=items_score_df['RIA_Score(%)'], y=items_score_df['Fraction_in_Unionlist(%)'],
            ax=ax3_0, color="#1f77b4", ci=None)

# Add text labels to the bars
# for index, row in items_score_df.iterrows():
#     ax3_0.text(row['RIA_Score(%)'], row['Fraction_in_Unionlist(%)'], f"{row['Fraction_in_Unionlist(%)']:.2f}%",
#                ha='center', va='top', fontsize=9, color='black')

# Set labels and title
ax3_0.set_xlabel("RIA Score (%)")
ax3_0.set_ylabel("Fraction in Unionlist (%)")
# ax3_0.set_title("Percentage of DOIs with their RIA_Score in Unionlist", size=13)

plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'indexing_agreement_score_by_items_inUnionlist.png')


In [None]:
"""
Retraction Indexing Agreement by Items: Fraction of Items in the Union list  (Coloring)
"""

plt.style.use('bmh')
fig3_01_1, ax3_01_1 = plt.subplots(figsize=(14, 8))

# Score (as string) -> band colour taken from items_score_df['Color']
# NOTE(review): the keys are strings while x below is numeric; presumably this matches how the
# installed seaborn version labels the categories - confirm when upgrading or downgrading seaborn.
mycolor_3_01_1 = dict(zip(items_score_df['RIA_Score(%)'].astype(str), items_score_df['Color']))

# Create the bar plot
# Both `color` and `palette` are passed; the per-score palette is what the legend below describes
sns.barplot(x=items_score_df['RIA_Score(%)'],
            y=items_score_df['Fraction_in_Unionlist(%)'],
            ax=ax3_01_1, 
            color="#1f77b4", 
            palette=mycolor_3_01_1,
            ci=None)

#Add custom legend
# colour -> human readable score range
legend_labels = {
    'red': '<25%',
    'orange': '25% to <50%',
    '#1f77b4': '50% to <75%',
    'green': '75% to 100%'
}


# One coloured patch per band, used as legend handles instead of the bars themselves
legend_handles = [mpatches.Patch(color=color, label=label) for color, label in legend_labels.items()]

ax3_01_1.legend(handles=legend_handles, title="RIA Score Range", loc="upper left")


# Set labels and title
ax3_01_1.set_xlabel("RetractionIndexingAgreement_Score(%)")
ax3_01_1.set_ylabel("Fraction of Items in Unionlist(%)")

fig3_01_1.patch.set_facecolor('#ccd9ff') 
plt.tight_layout(pad=2.0)
plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'indexing_agreement_score_by_items_inUnionlist_colored.png')


In [None]:
"""
Collapse the per-score table into five score bands:
'<25%', '25% to <50%', '50% to <75%', '75% to <100%' and '100%'
"""
_score = items_score_df['RIA_Score(%)']
_without_last_col = items_score_df.iloc[:, :-1]   # drop the last column before summing

# Rows of each band (lower bound inclusive, upper bound exclusive; 100 is its own band)
less_than_25 = _without_last_col[_score < 25]
btw_25_50 = _without_last_col[(_score >= 25) & (_score < 50)]
btw_50_75 = _without_last_col[(_score >= 50) & (_score < 75)]
greater_than_75 = _without_last_col[(_score >= 75) & (_score < 100)]
is_100 = _without_last_col[_score == 100]

# One row of column sums per band; the summed score column is meaningless, so drop it
band_totals = [band.sum() for band in (less_than_25, btw_25_50, btw_50_75, greater_than_75, is_100)]
table3_01_2 = pd.DataFrame(band_totals).drop(['RIA_Score(%)'], axis=1)

table3_01_2['noDOIs'] = table3_01_2['noDOIs'].astype(int)

# Band label and display colour, in the same order as the rows above
table3_01_2['RIA_Score(%)'] = ["<25%", '25% to <50%', '50% to <75%', '75% to <100%', '100%']
table3_01_2['Color'] = ['red', 'orange', '#1f77b4', 'green', '#00FF00']
table3_01_2

In [None]:
"""
Retraction Indexing Agreement by Items (in Group)
"""

plt.style.use('bmh')
fig3_01_2, ax3_01_2 = plt.subplots(figsize=(14, 8))

# Band label -> band colour, both taken from table3_01_2
mycolor_3_01_2 = dict(zip(table3_01_2['RIA_Score(%)'], table3_01_2['Color']))

# Create the bar plot
# One bar per score band; height = summed share of items in that band
sns.barplot(y=table3_01_2['Fraction_in_Unionlist(%)'], x=table3_01_2['RIA_Score(%)'],
            ax=ax3_01_2, color="#1f77b4",palette=mycolor_3_01_2, ci=None)

#Add custom legend
# colour -> human readable score range
legend_labels = {
    'red': '<25%',
    'orange': '25% to <50%',
    '#1f77b4': '50% to <75%',
    'green': '75% to <100%',
    '#00FF00':'100%'
}

legend_handles = [mpatches.Patch(color=color, label=label) for color, label in legend_labels.items()]

ax3_01_2.legend(handles=legend_handles, title="RIA Score Range", loc="upper left")

# Label bars with noDOIs values
# (pairs the drawn patches with the table rows by position - assumes both are in the same order)
for bar, noDOIs in zip(ax3_01_2.patches, table3_01_2['noDOIs']):
    ax3_01_2.annotate(f"{noDOIs} Items", #f"{noDOIs} / {len(unionlist3)}"
                      xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                      xytext=(0, 3),  # 3 points vertical offset
                      textcoords="offset points",
                      ha='center', va='bottom')

# Set labels and title
ax3_01_2.set_xlabel("RetractionIndexingAgreement_Score(%)")
ax3_01_2.set_ylabel("Fraction of Items in Unionlist(%)")

fig3_01_2.patch.set_facecolor('#ccd9ff') 
plt.tight_layout(pad=2.0)
plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'indexing_agreement_score_by_items_inUnionlist_colored2.png')


In [None]:
"""
RIA by Items: fold the per-score percentages of every source into four score ranges
"""
# Score values used as column labels (everything after the leading 'Source' column)
col_index = result_item_score.columns[1:]

# Column labels belonging to each range
index_0_to_49 = col_index[col_index < 50].tolist()
index_50_to_74 = col_index[(col_index >= 50) & (col_index < 75)].tolist()
index_75_to_99 = col_index[(col_index >= 75) & (col_index < 100)].tolist()
index_100 = col_index[col_index == 100].tolist()

# Row-wise sum of the percentages inside each range
range_columns = {
    'Source': result_item_score['Source'],
    '< 50%': result_item_score[index_0_to_49].sum(axis=1),
    '50% to <75%': result_item_score[index_50_to_74].sum(axis=1),
    '75% to <100%': result_item_score[index_75_to_99].sum(axis=1),
    '100%': result_item_score[index_100].sum(axis=1),
}

# Sort on the raw source names first, then round and switch to the display names
result_item_category = (
    pd.DataFrame(range_columns)
    .sort_values(by='Source', ascending=True)
    .round(2)
    .replace('Medline', 'MEDLINE')
    .replace('WoS_Core', 'Web of Science Core')
)
result_item_category

In [None]:
"""
Retraction Indexing Agreement by Items per Source (horizontal)
"""

plt.style.use('bmh')
fig3_1, ax3_1 =  plt.subplots(figsize=(14, 10))
# plt.subplots_adjust(left=0.8)

# Plot the horizontal stacked bar graph
# One bar per source; the segments are the four score-range columns of result_item_category
# NOTE(review): the colour order here differs from the vertical version of this chart
# (orange / blue / green / bright green) - confirm which mapping is intended.
result_item_category.plot(
    x='Source',
    kind='barh',
    stacked=True,
    #title='Percentage of RetractionIndexingAgreement_DOI in Sources',
    mark_right=True,
    legend=True,
    fontsize=14,
    color=['#ff0000', 'orange', 'green', "#1f77b4"],   # '#ff8000 -red
    ax=ax3_1)

# Loop through each bar to remove white lines
for patch in ax3_1.patches:
    patch.set_edgecolor('none')

# Place the legend outside the plot
ax3_1.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Agreement Score', fontsize=8)

# Write each segment's width (its percentage) at the centre of the segment
for p in ax3_1.patches:
    width = p.get_width()
    label_x = p.get_x() + width / 2
    label_y = p.get_y() + p.get_height() / 2
    ax3_1.text(label_x, label_y, f'{width:.1f}%', ha='center', va='center', color='black', fontsize=11)

# Set the background color of the figure
fig3_1.patch.set_facecolor('#ccd9ff')

# Rotate the y-axis labels
plt.setp(ax3_1.get_yticklabels(), rotation=45)

# Set the background color of the figure
fig3_1.patch.set_facecolor('#ccd9ff')

# Adjust layout to add space between y-axis labels and the graph
plt.tight_layout(pad=3.0)


# Add space between y-axis scale label and the graph
plt.subplots_adjust(left=0.12,bottom=0.03)

# Remove the number scale on the x-axis
plt.xticks([])

# Remove 'Source Label'
ax3_1.set_ylabel('') 

plt.show()

# Save to folder    
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'perecent_DOIs_RIA_in_sources.png')


In [None]:
"""
Retraction Indexing Agreement by Items per Source (Vertical)
"""

plt.style.use('bmh')
fig3_2, ax3_2= plt.subplots(figsize=(10, 8))

"""
<25%: Light red or pink shades. Hexadecimal color code: #FFCCCC or #FFB6C1.
25% to <50%: Light orange or peach shades. Hexadecimal color code: #FFDAB9 or #FFA07A.
50% to <75%: Light yellow or gold shades. Hexadecimal color code: #FFFFCC or #FFD700.
75% to <100%: Light green or lime shades. Hexadecimal color code: #BDFCC9 or #ADFF2F.
100%: Bright green. Hexadecimal color code: #00FF00 or #7CFC00

"""

# Stack bars
# Each layer starts (bottom=) at the running total of the layers drawn before it
bar1=ax3_2.bar(result_item_category['Source'], result_item_category['< 50%'], label='< 50%',color= 'orange' ) #orange
bar2= ax3_2.bar(result_item_category['Source'], result_item_category['50% to <75%'], bottom=result_item_category['< 50%'], label='50% to <75%',color='#1f77b4')
bar3= ax3_2.bar(result_item_category['Source'], result_item_category['75% to <100%'], bottom=result_item_category['< 50%'] + result_item_category['50% to <75%'], label='75% to <100%', color='green')
bar4= ax3_2.bar(result_item_category['Source'], result_item_category['100%'], bottom=result_item_category['< 50%'] + result_item_category['50% to <75%'] + result_item_category['75% to <100%'], label='100%',color='#00FF00')

# Rotate x-axis labels for better visibility
plt.xticks(rotation=45, ha='right')

# # Add text annotations
# for bar in [bar1, bar2, bar3, bar4]:
#     for rect in bar:
#         height = rect.get_height()
#         ax3_2.annotate(f'{height:.2f}%', xy=(rect.get_x() + rect.get_width() / 2, height), xytext=(0, 2),
#                        textcoords="offset points", ha='center', va='bottom')

# Loop through each bar to remove white lines
for patch in ax3_2.patches:
    patch.set_edgecolor('none')

# Add legend
ax3_2.legend(title='Agreement Score', bbox_to_anchor=(1, 1), fontsize=8)

# Set labels and title
ax3_2.set_ylabel('Percentage of Items')
# ax3_2.set_xlabel('Source')
# ax3_2.set_title('Stacked Percentage by Source')

# Set the background color of the figure
fig3_2.patch.set_facecolor('#ccd9ff')

plt.tight_layout()
plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'perecent_DOIs_RIA_in_sources2.png')


## Section 4: Calculation of Publication and Retraction Year Distribution

In [None]:
"""
Calculating the cumulative percentage of items per publication year
"""
# Number of DOIs per publication year
cummlative_totalpubs = (unionlist3.groupby(['Year'])['DOI'].agg('count').reset_index()
                        .rename(columns={'DOI': 'No_of_DOIs'}))

# Running total over the years (groupby returns the years in ascending order)
cummlative_totalpubs['DOICumulativeFreq'] = cummlative_totalpubs['No_of_DOIs'].cumsum()

# Keep the grand total in a plain variable: assigning it to `cummlative_totalpubs.max`
# would shadow the DataFrame.max method on this object
total_dois = cummlative_totalpubs['DOICumulativeFreq'].max()

cummlative_totalpubs['CumulativePercentage'] = (cummlative_totalpubs['DOICumulativeFreq'] / total_dois * 100).round(2)

cummlative_totalpubs

In [None]:
# Calculating the cumulative frequency of the time to retraction

# Only non-negative gaps are considered
mask1 = (unionlist3['TimetoRetraction'] >= 0)

# Items per gap value, ordered by gap, accumulated
cumulative_retractiongap = unionlist3[mask1]['TimetoRetraction'].value_counts().sort_index().cumsum()
print(cumulative_retractiongap)

# Name the index and the values explicitly before flattening, so the resulting columns are
# 'TimetoRetraction' and 'TimetoRetractionCumFreq' regardless of how the installed pandas
# version labels the output of value_counts()
cumulative_retractiongap = (cumulative_retractiongap
                            .rename_axis('TimetoRetraction')
                            .rename('TimetoRetractionCumFreq')
                            .reset_index())

# Keep the grand total in a plain variable: assigning it to `cumulative_retractiongap.max`
# would shadow the DataFrame.max method on this object
total_retraction_gaps = cumulative_retractiongap['TimetoRetractionCumFreq'].max()

cumulative_retractiongap['CumulativePercentage'] = (cumulative_retractiongap['TimetoRetractionCumFreq'] / total_retraction_gaps * 100).round(2)

cumulative_retractiongap

In [None]:
"""
Count indexed items per publication year and source
(one row per item and source after exploding 'source_old')
"""
# Work on a copy so unionlist3 itself stays untouched
df = unionlist3.copy()

# One row per (item, source) pair taken from 'source_old'
explode_source_old = df.explode('source_old')

# DOIs per year and source, with the display names used in the figures
df_explode_source_old = (
    explode_source_old.groupby(['Year', 'source_old'])['DOI']
    .count()
    .reset_index()
    .replace('Medline', 'MEDLINE')
    .replace('WoS_Core', 'Web of Science Core')
)

df_explode_source_old

In [None]:
"""
Plotting retraction indexing distribution in sources using HeatMap
"""

# Drop early years with few retracted publications: keep publication years from 2000 onward
# (no upper bound is applied here)
df_explode_source_old_rs = df_explode_source_old[(df_explode_source_old['Year']>=2000)] 
df_explode_source_old_rs

sns.set_theme()

# Pivot the data
# rows = source, columns = publication year, cell = number of DOIs (0 where a pair is absent)
pivot_data03 = df_explode_source_old_rs.pivot_table(index='source_old', columns='Year', values='DOI', aggfunc='sum', fill_value=0)
pivot_data03.sort_values(by='source_old',ascending=False, inplace=True)

# Create the heatmap using seaborn
fig04= plt.figure(figsize=(18, 7))

# Update the heatmap with the new color bar
# The colour scale is fixed to 0..5000.
# NOTE(review): robust=True presumably has no effect once vmin and vmax are both given, and
# fmt='d' is unused while annot=False - confirm against the seaborn.heatmap documentation.
ax04= sns.heatmap(pivot_data03, annot=False, fmt='d', cmap='gist_heat_r', linewidths=0.05,vmin=0, vmax=5000, robust=True, )

ax04.set_xlabel('')
ax04.set_ylabel('')

# Set the font size of x-axis and y-axis labels
ax04.tick_params(axis='x', labelsize=12.)  # Font size for x-axis labels
ax04.tick_params(axis='y', labelsize=12.)  # Font size for y-axis labels

ax04.set_title('Distribution of Indexed Retracted Publications in the Sources', size=13)

# Set labels and title
ax04.set_xlabel('Publication Year', size=13)

# Remove the space outside the plot
plt.subplots_adjust(left=0.1, right=1.09, top=0.9, bottom=0.1)

# Keep both the x- and y-axis tick labels horizontal
plt.setp(ax04.get_xticklabels(), rotation=360)
plt.setp(ax04.get_yticklabels(), rotation=360)

fig04.patch.set_facecolor('#ccd9ff')  # Set the background color of the figure

# Save to folder
# plt.savefig(result_dir+'indexed_per_sources_distribution_heatmap.png')

In [None]:
"""
Plotting Distribution of Total Retracted Publication
"""
fig04_0, ax04_0 = plt.subplots(figsize=(14, 10))

# Set the font size of x-axis and y-axis labels
ax04_0.tick_params(axis='x', labelsize=12.)  # Font size for x-axis labels
ax04_0.tick_params(axis='y', labelsize=12.) 

# Main chart: items per publication year, drawn through the pyplot interface on the current axes
plt.bar(cummlative_totalpubs.Year, cummlative_totalpubs.No_of_DOIs) #(values, counts)
ax04_0.set_xticks(range(1940, 2024, 10))
plt.xlabel("Publication Year")
plt.ylabel("No. of Items")
plt.title("Publication Year Distribution of Retracted Items", size=14)

 # Add subplot inside highlighting retracted publication before 1991
sumDOIYear1= cummlative_totalpubs[cummlative_totalpubs['Year']<1991]
# From here on ax04_0 refers to the inset axes, no longer to the main axes created above
# ([left, bottom, width, height] in figure coordinates)
ax04_0 = fig04_0.add_axes([0.2, 0.4, 0.5, 0.35],  frameon=True,)  
for spine in ax04_0.spines.values():
    spine.set_edgecolor('black')
    
ax04_0.bar(sumDOIYear1.Year, sumDOIYear1.No_of_DOIs)
ax04_0.set_xticks(range(1940, 1991,10))
#ax0.xaxis.set_major_locator(MaxNLocator(integer=True))

# # Add labels and title
ax04_0.set_xlabel("Publication Year")
ax04_0.set_ylabel("No. of Items")

# Set the font size of x-axis and y-axis labels
ax04_0.tick_params(axis='x', labelsize=12.)  # Font size for x-axis labels
ax04_0.tick_params(axis='y', labelsize=12.) 

ax04_0.grid(False)

fig04_0.patch.set_facecolor('#ccd9ff') 

# Remove the space outside the plot
plt.subplots_adjust(left=0.1, right=0.9, top=0.95, bottom=0.1)

# Display the plot
plt.show()

# Saving
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'distribution_all_indexed_retracted_pubyear.png')

In [None]:
# Using this adjusted one
"""
Plotting Cumulative Time Between Publication Year and Retraction Year, 
And Plotting Distribution of Publication Year
"""

from matplotlib.ticker import PercentFormatter
fig05, ax05 = plt.subplots(figsize=(15,8))

"""
Plotting Cummulative data of Retraction Year
"""
# Background line chart: cumulative share of items against years between publication and retraction
ax05.plot(cumulative_retractiongap['TimetoRetraction'],cumulative_retractiongap['CumulativePercentage'], marker='o', linestyle='-')

#Set x- & y-axes ticks 
# (yearly ticks up to 10, then one tick per decade)
ax05.set_yticks(range(30, 110,10))
ax05.set_xticks([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80])

# Show the y tick values as percentages without decimals
ax05.yaxis.set_major_formatter(PercentFormatter(xmax=100, decimals=0))

#Customize the plot
# ax05.set_title('Cumulative Plot of Years Between Publication and Retraction', size=14)
ax05.set_xlabel('Years Between Publication and Retraction')
ax05.set_ylabel('Cumulative Percentage')

# Set the font size of x-axis and y-axis labels
ax05.tick_params(axis='x', labelsize=12.)  # Font size for x-axis labels
ax05.tick_params(axis='y', labelsize=12.) 


"""
The Larger Bar Chart
"""
#[0.3, 0.2, 0.55, 0.6]
# Inset axes placed on top of the line chart
ax05_0= fig05.add_axes([0.32, 0.2, 0.48, 0.58],  frameon=True,)  # [left, bottom, width, height]
for spine in ax05_0.spines.values():
    spine.set_edgecolor('black')

#Plotting bar for the Large Bar Chart
ax05_0.bar(cummlative_totalpubs.Year, cummlative_totalpubs.No_of_DOIs)
ax05_0.grid(False)

ax05_0.set_xticks(range(1940, 2024, 10))

# Add Title
ax05_0.set_title("Publication Year Distribution of Retracted Items", size=14)

# Add labels and title
ax05_0.set_xlabel("Publication Year")
ax05_0.set_ylabel("No. of Items")



fig05.patch.set_facecolor('#ccd9ff') 
ax05_0.set_xticks(range(1940, 2024, 10))  # repeats the tick setting made above

"""
The Smaller Bar Chart
"""
 # First new axes
# Second inset, nested inside the larger bar chart: only publication years before 1991
sumDOIYear2= cummlative_totalpubs[cummlative_totalpubs['Year']<1991]
ax05_1 = fig05.add_axes([0.38, 0.38, 0.3, 0.32],  frameon=True)  
for spine1 in ax05_1.spines.values():
    spine1.set_edgecolor('black')
    
ax05_1.bar(sumDOIYear2.Year, sumDOIYear2.No_of_DOIs)
ax05_1.set_xticks(range(1940, 1991,10))

# Add labels and title
ax05_1.set_xlabel("Publication Year")
ax05_1.set_ylabel("No. of Items")

plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'distribution_all_indexed_pubyear_and_retractedyear.png')

In [None]:
def check_source(sources_col: pd.Series, source: str) -> pd.Series:
    """
    Flag, for every item record, whether a given source appears in a sources column.

    Works on either the indexed column (source_old) or the covered column (source_new).
    Presence is tested with Python's ``in`` operator on each cell, i.e. list membership
    when the cell is a list and substring search when the cell is a string.

    :param sources_col: the column to search in
    :param source: name of the source to look for in each cell of 'sources_col'
    :return: Series with the same index as 'sources_col', holding 1 where the
             source is present and 0 otherwise
    """
    return sources_col.apply(lambda sources: int(source in sources))

In [None]:
"""
Build the per-item table behind the "indexing & coverage by year" charts:
DOI and Year plus one 0/1 indexed column and one 0/1 covered column per source
"""
# Start from DOI and Year; the per-source flag columns are appended below
sources_stats = unionlist3_adjusted.loc[:, ['DOI', 'Year']].copy()

for source in sources_indexed:
    # Retraction Watch is left out since it indexes retracted papers only
    if source != 'Retraction Watch':
        # Column prefix: source name without whitespace, lower-cased
        source_ = re.sub(r'\s', '', source).lower()
        indexed = f'{source_}_idx'   # flag column for indexed items
        covered = f'{source_}_cov'   # flag column for covered items

        # 1 where the source is listed for the item, else 0
        sources_stats[indexed] = check_source(unionlist3_adjusted['source_old'], source)
        sources_stats[covered] = check_source(unionlist3_adjusted['source_new'], source)

# sources_stats= sources_stats[sources_stats['Year']<2024] # Filtered out year 2024

sources_stats

In [None]:
"""
Plotting Distribution of indexing & coverage of retracted publications by year -1 
"""
sns.set_theme()
fig06, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))

# The year filter and the x positions are the same for every panel, so compute them once
f_sources_stats = sources_stats[sources_stats['Year'] > 1990]
x_data = f_sources_stats.groupby(['Year']).agg('count').reset_index()['Year']

# For visibility only 6 subplots are shown per figure
for i, source_ in enumerate(sources_indexed[:6]):
    # Column prefix: source name with whitespace removed, lower-cased
    # (previously the whitespace-stripped name was overwritten by a plain .lower())
    source = ''.join(source_.split()).lower()
    indexed = source + '_idx'  # column with indexed items
    covered = source + '_cov'  # column with covered items

    # Display names used in the panel titles
    if source_ == 'Medline':
        source_ = 'MEDLINE'
    elif source_ == 'WoS_Core':
        source_ = 'Web of Science Core'

    # Yearly totals of the 0/1 flags
    i_data = f_sources_stats.groupby(['Year'])[indexed].agg('sum').reset_index()[indexed]
    c_data = f_sources_stats.groupby(['Year'])[covered].agg('sum').reset_index()[covered]

    y1 = i_data
    y2 = c_data

    ax = axes[i // 2, i % 2]  # Get the appropriate subplot (row-major, two per row)

    # Stacked bars: indexed at the bottom, covered on top
    ax.bar(x_data, y1, label='Items indexed as retracted', color='red')
    ax.bar(x_data, y2, bottom=y1, label='Items covered \n(not indexed as retracted)', alpha=0.8)

    ax.set_ylabel('Count')
    ax.set_title(f'{source_}')

    # Set x-axis limits
    ax.set_xlim(1990, 2025)
    ax.legend(loc='upper left', fontsize=10)

    # Only set x-axis label for subplots in the last row
    if i > 3:
        ax.set_xlabel('Publication Year')

# fig06.suptitle('Publication year and retraction year distribution of retracted publications')
fig06.patch.set_facecolor('#ccd9ff')

plt.tight_layout()
plt.show()

# Save to folder
# plt.savefig(result_dir+'distribution_of_indexed_and_covered_each_source1.png')

In [None]:
"""
Plotting Distribution of indexing & coverage of retracted publications by year -2
"""
sns.set_theme()
fig06_1, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# Remaining sources (from position 6 on) minus Retraction Watch
sources_indexed2= sources_indexed[6:].copy()
sources_indexed2.remove('Retraction Watch') # Since all items in Retraction Watch are indexed as retracted

for i in range(len(sources_indexed2)):
    
    # Retraction Watch was already removed from the list above; this check is kept as a safeguard
    if sources_indexed2[i] == 'Retraction Watch':
        continue # We do not need Retraction Watch since it indexes retracted papers only. 
    
    # Column prefix: source name with whitespace removed, lower-cased
    source= sources_indexed2[i].split()
        
    source= ''.join(source)
    source = source.lower()
#    print(source)
    indexed = source + '_idx'  # Adding 'idx' to source column with indexed items
    covered = source + '_cov'
    
    # Display name used in the panel title
    source_= sources_indexed2[i]
    
    if source_ == 'Medline':
        source_= 'MEDLINE'
        
    if source_ == 'WoS_Core':
        source_= 'Web of Science Core'

    # Publication years after 1990 only; yearly totals of the 0/1 flags
    f_sources_stats= sources_stats[sources_stats['Year'] > 1990]
    i_data = f_sources_stats.groupby(['Year'])[indexed].agg('sum').reset_index()[indexed]
    c_data = f_sources_stats.groupby(['Year'])[covered].agg('sum').reset_index()[covered]
    x_data = f_sources_stats.groupby(['Year']).agg('count').reset_index()['Year']

    y1 = i_data
    y2 = c_data

    ax = axes[i//2, i%2]  # Get the appropriate subplot

    # Stacked bars: indexed at the bottom, covered on top
    ax.bar(x_data, y1, label=f'Items indexed as retracted', color='red')  # crimson {source_}
    ax.bar(x_data, y2, bottom=y1, label=f'Items covered \n(not indexed as retracted)', alpha=0.8) # in {source_} 

#     ax.set_xlabel('Publication Year')
    ax.set_ylabel('Count')
    ax.set_title(f'{source_}')
    
    # Only set x-axis label for subplots in the last row
    if i > 1:
        ax.set_xlabel('Publication Year')

    
    # Set x-axis limits
    ax.set_xlim(1990, 2025)
    ax.legend(loc='upper left',  fontsize=10)

# fig06_1.suptitle('Publication year and retraction year distribution of retracted publications')

#ax05.grid(True)
fig06_1.patch.set_facecolor('#ccd9ff') 

plt.tight_layout()
plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'distribution_of_indexed_and_covered_each_source2.png')

In [None]:
"""
Plotting Distribution of indexing & coverage of retracted publications by year -3
"""
sns.set_theme()
# Create a figure and axis
fig06_2, ax06_2 = plt.subplots(figsize=(6, 4))

# Single-panel version for one source; the column names are hard-coded for it
source_= 'Web of Science Core'
indexed= 'wos_core_idx'
covered= 'wos_core_cov'

# Publication years after 1990 only; yearly totals of the 0/1 flags
f_sources_stats= sources_stats[sources_stats['Year'] > 1990]
i_data = f_sources_stats.groupby(['Year'])[indexed].agg('sum').reset_index()[indexed]
c_data = f_sources_stats.groupby(['Year'])[covered].agg('sum').reset_index()[covered]
x_data = f_sources_stats.groupby(['Year']).agg('count').reset_index()['Year']

y1 = i_data
y2 = c_data

# Create a stacked bar plot
ax06_2.bar(x_data, y1, label=f'Items indexed as retracted', color='red')  # crimson {source_}
ax06_2.bar(x_data, y2, bottom=y1, label=f'Items covered \n(not indexed as retracted) ', alpha=0.8) # in {source_}


ax06_2.set_xlabel('Publication Year')
ax06_2.set_ylabel('Count')
ax06_2.set_title(f'{source_}')

# Set x-axis limits
ax06_2.set_xlim(1990, 2025)
ax06_2.legend(loc='upper left',  fontsize=10)
fig06_2.subplots_adjust(bottom=0.15)  # extra room at the bottom of the figure

# ax06_2.set_xticks(range(x.min(), x.max(), 5))

fig06_2.patch.set_facecolor('#ccd9ff') 
plt.show()

# Save to folder
# NOTE(review): if enabled, saving probably has to happen before plt.show() - confirm.
# plt.savefig(result_dir+'distribution_of_indexed_and_covered_each_source3.png')


In [None]:
# DOI and Year are copied, then one coverage flag column per source is appended
"""
Build the per-item coverage table for the sources without an indexing mechanism
"""

sources_covered_only = ['ADS', 'GeoRef', 'IEEE', 'Inspec', 'ScienceDirect', 'ZOOREC']

sources_stats2 = unionlist3.loc[:, ['DOI', 'Year']].copy()

for source in sources_covered_only:
    # Column name: source name without whitespace, lower-cased, plus '_cov'
    source_ = re.sub(r'\s', '', source).lower()
    covered = f'{source_}_cov'

    # 1 where the source is listed in 'source_new' for the item, else 0
    sources_stats2[covered] = check_source(unionlist3['source_new'], source)

# sources_stats= sources_stats[sources_stats['Year']<2024] # Filtered out year 2024

sources_stats2

In [None]:
"""
Plotting Distribution of coverage of retracted publications by year -4
"""
sns.set_theme()
fig06_3, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 12))

# Filter sources_stats2 with its own Year column (the mask used to be built from
# sources_stats, a different table); the filter and the x positions are the same
# for every panel, so compute them once
f_sources_stats = sources_stats2[sources_stats2['Year'] > 1990]
x_data = f_sources_stats.groupby(['Year']).agg('count').reset_index()['Year']

for i, source_ in enumerate(sources_covered_only):
    # Column name: source name with whitespace removed, lower-cased, plus '_cov'
    source = ''.join(source_.split()).lower()
    covered = source + '_cov'

    # Yearly total of the 0/1 coverage flag
    c_data = f_sources_stats.groupby(['Year'])[covered].agg('sum').reset_index()[covered]
    y2 = c_data

    ax = axes[i // 2, i % 2]  # Get the appropriate subplot (row-major, two per row)

    # These sources have no indexed series, so only the coverage bars are drawn
    ax.bar(x_data, y2, label='Items covered \n(not indexed as retracted)', alpha=0.8)

    ax.set_ylabel('Count')
    ax.set_title(f'{source_}')

    # Set x-axis limits
    ax.set_xlim(1990, 2025)
    ax.legend(loc='upper left', fontsize=10)

    # Only set x-axis label for subplots in the last row
    if i > 3:
        ax.set_xlabel('Publication Year')

# fig06_3.suptitle('Publication year and retraction year distribution of retracted publications')

fig06_3.patch.set_facecolor('#ccd9ff')

fig06_3.subplots_adjust(hspace=0.4, wspace=0.7)
plt.tight_layout()
plt.show()

# Save to folder
# plt.savefig(result_dir+'distribution_of_covered_only_each_source.png')


## Section 5: Intersection of Indexed Retracted Publications

In [None]:
"""
Computing the indexing status for sources
"""
# Cast both source columns to str (non-string values become their string
# form) before .str.contains is applied to them
for _column in ('source_new', 'source_old'):
    unionlist3[_column] = unionlist3[_column].apply(str)

# For each source: DOIs of the rows whose 'source_old' value contains the source name
_source_old = unionlist3['source_old']

compendex_doi = unionlist3.loc[_source_old.str.contains('Compendex'), 'DOI'].tolist()
crossref_doi = unionlist3.loc[_source_old.str.contains('Crossref'), 'DOI'].tolist()
geobase_doi = unionlist3.loc[_source_old.str.contains('GEOBASE'), 'DOI'].tolist()
pubmed_doi = unionlist3.loc[_source_old.str.contains('PubMed'), 'DOI'].tolist()
rw_doi = unionlist3.loc[_source_old.str.contains('Retraction Watch'), 'DOI'].tolist()
scopus_doi = unionlist3.loc[_source_old.str.contains('Scopus'), 'DOI'].tolist()
# wos_doi= unionlist3[unionlist3['source_old'].str.contains('Web of Science')]['DOI'].tolist()

# Databases on the Web of Science platform
bci_doi = unionlist3.loc[_source_old.str.contains('BCI'), 'DOI'].tolist()
bioabs_doi = unionlist3.loc[_source_old.str.contains('BIOABS'), 'DOI'].tolist()
medline_doi = unionlist3.loc[_source_old.str.contains('Medline'), 'DOI'].tolist()
ccc_doi = unionlist3.loc[_source_old.str.contains('CCC'), 'DOI'].tolist()
woscore_doi = unionlist3.loc[_source_old.str.contains('WoS_Core'), 'DOI'].tolist()


In [None]:
"""
Computing the coverage status for sources
"""

# For each source: DOIs of the rows whose 'source_new' value contains the source name.
# 'source_new' must already hold strings here (it is cast with apply(str) in the
# indexing-status cell).
_source_new = unionlist3['source_new']

compendex_doi_cov = unionlist3.loc[_source_new.str.contains('Compendex'), 'DOI'].tolist()
crossref_doi_cov = unionlist3.loc[_source_new.str.contains('Crossref'), 'DOI'].tolist()
geobase_doi_cov = unionlist3.loc[_source_new.str.contains('GEOBASE'), 'DOI'].tolist()
pubmed_doi_cov = unionlist3.loc[_source_new.str.contains('PubMed'), 'DOI'].tolist()
rw_doi_cov = unionlist3.loc[_source_new.str.contains('Retraction Watch'), 'DOI'].tolist()
scopus_doi_cov = unionlist3.loc[_source_new.str.contains('Scopus'), 'DOI'].tolist()
# wos_doi_cov= unionlist3[unionlist3['source_new'].str.contains('Web of Science')]['DOI'].tolist()

# Databases on the Web of Science platform
bci_doi_cov = unionlist3.loc[_source_new.str.contains('BCI'), 'DOI'].tolist()
bioabs_doi_cov = unionlist3.loc[_source_new.str.contains('BIOABS'), 'DOI'].tolist()
medline_doi_cov = unionlist3.loc[_source_new.str.contains('Medline'), 'DOI'].tolist()
ccc_doi_cov = unionlist3.loc[_source_new.str.contains('CCC'), 'DOI'].tolist()
woscore_doi_cov = unionlist3.loc[_source_new.str.contains('WoS_Core'), 'DOI'].tolist()


In [None]:
# Current indexing status of the sources compared in the STI2023 study
from upsetplot import from_contents, plot

sns.set_theme()

# Category label -> DOIs indexed by that source
_contents = {
    'Crossref': crossref_doi,
    'Retraction Watch': rw_doi,
    'Scopus': scopus_doi,
    'Web of Science Core': woscore_doi,
}
upset_sourceindexed_STI2023 = from_contents(_contents)

fig = plt.figure(figsize=(13, 6))
plot(
    upset_sourceindexed_STI2023,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Current indexing status of overlap of retracted items across databases in Schneider et al. (2023)', size=13)

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'indexed_sources_intersection_status_of_STI2023.png')

plt.show()

In [None]:
# Current coverage status of the sources compared in the STI2023 study
sns.set_theme()

# Category label -> DOIs covered by that source
_contents = {
    'Crossref': crossref_doi_cov,
    'Retraction Watch': rw_doi_cov,
    'Scopus': scopus_doi_cov,
    'Web of Science Core': woscore_doi_cov,
}
upset_sourcecovered_STI2023 = from_contents(_contents)

fig = plt.figure(figsize=(13, 6))
plot(
    upset_sourcecovered_STI2023,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="green",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Current coverage (supposed indexed) status of overlap of retracted items across databases in Schneider et al. (2023)', size=13)

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'coverage_sources_intersection_status_of_STI2023.png')

plt.show()

In [None]:
# Overlap of indexed retracted publications across the Web of Science platform databases

# plt.style.use('bmh')
sns.set_theme()

# Category label -> DOIs indexed by that database
_contents = {
    'BCI': bci_doi,
    'BIOABS': bioabs_doi,
    'CCC': ccc_doi,
    'MEDLINE': medline_doi,
    'Web of Science Core': woscore_doi,
}
upset_sourceindexed1 = from_contents(_contents)

fig = plt.figure(figsize=(13, 6))
plot(
    upset_sourceindexed1,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across databases on the Web of Science Platform', size=13)

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'indexed_sources_intersection_in_WoS.png')

plt.show()

In [None]:
# Overlap of the coverage status of retracted publications across the
# Web of Science platform databases

# plt.style.use('bmh')
sns.set_theme()

# Category label -> DOIs covered by that database
_contents = {
    'BCI': bci_doi_cov,
    'BIOABS': bioabs_doi_cov,
    'CCC': ccc_doi_cov,
    'MEDLINE': medline_doi_cov,
    'Web of Science Core': woscore_doi_cov,
}
upset_sourcecovereded1 = from_contents(_contents)

fig = plt.figure(figsize=(16, 6))
plot(
    upset_sourcecovereded1,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="green",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of coverage status of retracted items across databases on the Web of Science Platform', size=13)

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'coverage_sources_intersection_in_WoS.png')

plt.show()


In [None]:
# Overlap of indexed retracted publications across the subject-specific databases

sns.set_theme()

# Category label -> DOIs indexed by that database
_contents = {
    'BCI': bci_doi,
    'BIOABS': bioabs_doi,
    'CCC': ccc_doi,
    'Compendex': compendex_doi,
    'GEOBASE': geobase_doi,
    'MEDLINE': medline_doi,
}
upset_sourceindexed2 = from_contents(_contents)

fig = plt.figure(figsize=(13, 6))
plot(
    upset_sourceindexed2,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across subject-specific databases', size=13)

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'overlap_between-subject_sources.png')

plt.show()

In [None]:
# Overlap of indexed retracted items across the Elsevier databases
sns.set_theme()

# Category label -> DOIs indexed by that database
_contents = {
    'Compendex': compendex_doi,
    'GEOBASE': geobase_doi,
    'Scopus': scopus_doi,
}
upset_sourceindexed3 = from_contents(_contents)

fig = plt.figure(figsize=(12, 6))
plot(
    upset_sourceindexed3,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across Elsevier databases', size=13)

# fig.patch.set_facecolor('#ccd9ff')

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'indexed_sources_intersection_in_Elsevier.png')

plt.show()

In [None]:
# Overlap of the coverage status of retracted items across the Elsevier databases
sns.set_theme()

# Category label -> DOIs covered by that database
_contents = {
    'Compendex': compendex_doi_cov,
    'GEOBASE': geobase_doi_cov,
    'Scopus': scopus_doi_cov,
}
# NOTE(review): this rebinds `upset_sourceindexed3`, which the Elsevier
# indexing cell assigned from the indexed (not covered) DOI lists.
upset_sourceindexed3 = from_contents(_contents)

fig = plt.figure(figsize=(12, 6))
plot(
    upset_sourceindexed3,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="green",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of coverage status of retracted items across Elsevier databases', size=13)

# fig.patch.set_facecolor('#ccd9ff')

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'coverage_sources_intersection_in_Elsevier.png')

plt.show()

In [None]:
# Overlap of the coverage status of retracted items across the Elsevier
# databases and Crossref
sns.set_theme()

# Category label -> DOIs covered by that database
_contents = {
    'Compendex': compendex_doi_cov,
    'Crossref': crossref_doi_cov,
    'GEOBASE': geobase_doi_cov,
    'Scopus': scopus_doi_cov,
}
upset_sourcecovered3 = from_contents(_contents)

fig = plt.figure(figsize=(12, 6))
plot(
    upset_sourcecovered3,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="green",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across databases of Elsevier and Crossref', size=13)

# fig.patch.set_facecolor('#ccd9ff')

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'coverage_sources_intersection_in_Elsevier_and_Crossref.png')

plt.show()

In [None]:
# Overlap of indexed retracted items across the Elsevier databases and PubMed

sns.set_theme()

# Category label -> DOIs indexed by that database
_contents = {
    'Compendex': compendex_doi,
    'GEOBASE': geobase_doi,
    'PubMed': pubmed_doi,
    'Scopus': scopus_doi,
}
upset_sourceindexed4 = from_contents(_contents)

fig = plt.figure(figsize=(12, 6))
plot(
    upset_sourceindexed4,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across databases of Elsevier and PubMed', size=13)

# fig.patch.set_facecolor('#ccd9ff')

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'indexed_sources_intersection_in_Elsevier_and_PubMed.png')

plt.show()

In [None]:
# Overlap of the coverage status of retracted items across the Elsevier
# databases and PubMed

sns.set_theme()

# Category label -> DOIs covered by that database
_contents = {
    'Compendex': compendex_doi_cov,
    'GEOBASE': geobase_doi_cov,
    'PubMed': pubmed_doi_cov,
    'Scopus': scopus_doi_cov,
}
upset_sourcecovered4 = from_contents(_contents)

fig = plt.figure(figsize=(12, 6))
plot(
    upset_sourcecovered4,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="green",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across databases of Elsevier and PubMed', size=13)

# fig.patch.set_facecolor('#ccd9ff')

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'covered_sources_intersection_in_Elsevier_and_PubMed.png')

plt.show()

In [None]:
# Overlap of indexed retracted items across the Elsevier databases and
# Web of Science Core
sns.set_theme()

# Category label -> DOIs indexed by that database
_contents = {
    'Compendex': compendex_doi,
    'GEOBASE': geobase_doi,
    'Scopus': scopus_doi,
    'Web of Science Core': woscore_doi,
}
upset_sourceindexed5 = from_contents(_contents)

fig = plt.figure(figsize=(12, 6))
plot(
    upset_sourceindexed5,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across databases of Elsevier and Web of Science Core', size=13)

# fig.patch.set_facecolor('#ccd9ff')

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'indexed_sources_intersection_in_Elsevier_and_WoS.png')

plt.show()

In [None]:
# Overlap of indexed retracted items: multidisciplinary sources vs. MEDLINE and PubMed
sns.set_theme()

# Category label -> DOIs indexed by that source
_contents = {
    'MEDLINE': medline_doi,
    'PubMed': pubmed_doi,
    'Retraction Watch': rw_doi,
    'Scopus': scopus_doi,
    'Web of Science Core': woscore_doi,
}
# NOTE(review): `upset_sourceindexed2` is also assigned by the
# subject-specific databases cell; this statement rebinds it.
upset_sourceindexed2 = from_contents(_contents)

fig = plt.figure(figsize=(17, 6))
plot(
    upset_sourceindexed2,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across multidisciplinary and PubMed sources', size=13)

# fig.patch.set_facecolor('#ccd9ff')

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'indexed_sources_intersection_multidisc+pubmed.png')

plt.show()

In [None]:
# Overlap of indexed retracted items: Crossref and Retraction Watch vs. MEDLINE and PubMed
sns.set_theme()

# Category label -> DOIs indexed by that source
_contents = {
    'Crossref': crossref_doi,
    'MEDLINE': medline_doi,
    'Retraction Watch': rw_doi,
    'PubMed': pubmed_doi,
}
# NOTE(review): `upset_sourceindexed2` is also assigned by earlier cells;
# this statement rebinds it.
upset_sourceindexed2 = from_contents(_contents)

fig = plt.figure(figsize=(14, 6))
plot(
    upset_sourceindexed2,
    fig=fig,
    subset_size='count',
    show_counts=True,
    facecolor="#1f77b4",
    sort_categories_by='-input',
    element_size=None,
)

# plt.suptitle('Overlap of retracted items across multidisciplinary, MEDLINE and PubMed sources', size=13)

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'indexed_sources_intersection_multidisc_and_Medline+PubMed.png')

plt.show()


In [None]:
# Overlap of indexed retracted items across the major multidisciplinary sources
sns.set_theme()

# Category label -> DOIs indexed by that source.
# The last set is built from 'WoS_Core' entries (woscore_doi), so it is
# labelled 'Web of Science Core' like in every other plot of this notebook
# rather than with the name of the whole Web of Science platform.
upset_sourceindexed2 = from_contents({
                                    'Crossref': crossref_doi,
                                    'Retraction Watch': rw_doi,
                                    'Scopus': scopus_doi,
                                    'Web of Science Core': woscore_doi})

fig = plt.figure(figsize=(14, 6))
plot(upset_sourceindexed2, fig=fig, subset_size='count', show_counts=True,
     facecolor="#1f77b4", sort_categories_by='-input', element_size=None)


# plt.suptitle('Overlap of retracted items across major multidisciplinary sources', size=13)

# Saving (uncomment to use). This has to run before plt.show(): once the
# figure has been shown, plt.savefig may write an empty image.
# plt.savefig(result_dir+'indexed_sources_intersection_multidisc.png')

plt.show()

## Section 6: Journal Categorization Distribution

In [None]:
# Give items without a MainCategory value the placeholder "notcategorized"
_main_category = unionlist3_adjusted['MainCategory']
unionlist3_adjusted['MainCategory'] = _main_category.fillna('notcategorized')

In [None]:
# Category names as searched for in 'MainCategory' ...
cat = ['General', 'Health Science', 'Life Science', 'Physical Science', 'Social Science','notcategorized']
# ... and the matching line-wrapped labels shown in the table and plot
cat_3 = ['General', 'Health \nScience', 'Life \nScience', 'Physical \nScience', 'Social \nScience', 'Not \nCategorized']

# Total number of times each category name occurs across all MainCategory values
_main_category = unionlist3_adjusted.MainCategory
count = [_main_category.str.count(name).sum() for name in cat]

# Table of labels and totals, largest total first
df_field = pd.DataFrame({'Fields of Study': cat_3, 'Count': count})
df_field = df_field.sort_values('Count', ascending=False)
df_field

In [None]:
"""
Journal & Conference paper MainCategory Counts plot
"""
sns.set_theme()

# Figure with a single axes for the field-of-study totals
fig07, ax07 = plt.subplots(figsize=(18, 7))

# One bar per row of df_field, in df_field's row order
_labels = df_field['Fields of Study']
_totals = df_field['Count']
bars = ax07.bar(_labels, _totals, color='#1f77b4')

# Axis labels
ax07.set_xlabel('Fields of Study', fontsize=13)
ax07.set_ylabel('Count')
# ax07.set_title('Count of Fields of Study')

# Keep the x tick labels horizontal
plt.xticks(rotation=0)

# Print each bar's height, as an integer, centred directly above the bar
for rect in bars:
    height = rect.get_height()
    x_centre = rect.get_x() + rect.get_width()/2
    ax07.text(x_centre, height, int(height), va='bottom', ha='center')


fig07.patch.set_facecolor('#ccd9ff')

# Shrink the margins around the axes
plt.subplots_adjust(left=0.05, right=0.98, top=0.9, bottom=0.1)

# Saving (uncomment to use; kept ahead of plt.show(), after which there may
# be no current figure left to save)
# plt.savefig(result_dir+'journalandconferencecategory_count.png')

# Show plot
plt.show()

## Section 7: Investigating 100% RetractionIndexingAgreement

In [None]:
"""
Investigating the number of sources across 100% RetractionIndexingAgreement
"""

def ria100_distribution(source):
    """
    It finds the distribution of 100% RetractionIndexingAgreement_ITEM(%) for a given source: finding out
    how many databases also index retracted IDs alongside.

    Rows of the global ``unionlist3_adjusted`` are kept when their
    'RetractionIndexingAgreement_ITEM(%)' equals 100 and ``source`` occurs
    (substring test) in their 'source_new' value. For every kept row the number
    of comma-separated entries in 'source_old' is computed, and the rows are
    tallied by that number.

    :param source: database to check
    :return: dataframe of indexed sources distribution, one row per distinct
             number of entries, with the columns 'Category' (number of
             comma-separated entries in 'source_old'), 'Count' (rows with that
             number), 'LogCount' (natural log of Count + 1),
             'LogCountPercentage' and 'Percentage' (share of the respective
             column total, in percent)
    """
    # Items on which the score is exactly 100
    full_agreement = unionlist3_adjusted[
        unionlist3_adjusted['RetractionIndexingAgreement_ITEM(%)'] == 100
    ]
    # ... restricted to the items whose 'source_new' mentions the source
    in_source = full_agreement[full_agreement['source_new'].apply(lambda names: source in names)]

    # Number of comma-separated entries in 'source_old' per item, then a tally of
    # those numbers (Counter keeps them in order of first appearance)
    n_entries = in_source['source_old'].apply(lambda names: len(names.split(',')))
    s_count = Counter(n_entries)

    df = pd.DataFrame(list(s_count.items()), columns=['Category', 'Count'])

    # +1 keeps the logarithm of a count of 1 above zero
    df['LogCount'] = np.log(df['Count'] + 1)

    # Calculate the percentage of log count
    df['LogCountPercentage'] = (df['LogCount'] / df['LogCount'].sum()) * 100

    # Compute the percentage
    df['Percentage'] = (df['Count'] / df['Count'].sum()) * 100

    # Discard the old index instead of keeping it as a column
    df.reset_index(drop=True, inplace=True)
    return df
    

In [None]:
"""
Plotting shared # of IDs per source across 100% RetractionIndexingAgreement -All
"""

fig08, ax08 = plt.subplots(nrows=6, ncols=2,figsize=(15, 15))

# Names shown on the plot for sources whose stored name differs
_display_names = {'Medline': 'MEDLINE', 'WoS_Core': 'Web of Science Core'}

for i, ax in enumerate(ax08.flat):  # walk the 6x2 grid of axes row by row
    # Grid cells beyond the last source are removed from the figure
    if i >= len(sources_indexed):
        fig08.delaxes(ax)
        continue

    source = sources_indexed[i]
    df = ria100_distribution(source)

    # Horizontal bars: one per 'Category' value, bar length = 'Count'
    bars = ax.barh(df['Category'], df['Count'], align='center')

    # Write the count next to the end of each bar
    for bar, noDOIs in zip(bars, df['Count']):
        bar_end = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.annotate(f'{noDOIs} Items',
                    xy=(bar_end, bar_middle),
                    xytext=(3, 0),  # 3 points horizontal offset
                    textcoords='offset points', fontsize=8,
                    ha='left', va='center')

    source = _display_names.get(source, source)

    ax.set_title(f'{source}')
    ax.set_ylabel(f"{source} with \nnumbers of other databases", fontsize=10)

    # Put a tick, labelled with its value, at every 'Category' value
    ax.set_yticks(df['Category'])
    ax.set_yticklabels(df['Category'])
    ax.margins(y=0.01, x=0.1)

    # x-axis label only on the panels with index 9 and above
    if i > 8:
        ax.set_xlabel('Number of retracted publications')

# Adjust margins to fit the annotations
plt.subplots_adjust(left=0.2, right=0.3,) # bottom=0.1, wspace=0.2, hspace=1

# fig08.suptitle('100% Retraction Indexing Agreement: Number of IDs co-indexed with other databases')
fig08.patch.set_facecolor('#ccd9ff')

plt.tight_layout()

# Save to folder (uncomment to use; kept ahead of plt.show(), after which
# there may be no current figure left to save)
# plt.savefig(result_dir+'distribution_of_RIA100_by_source_all.png')

# Show the plot
plt.show()

In [None]:
"""
Plotting shared # of IDs per source across 100% RetractionIndexingAgreement -1
"""

fig08_0, ax08_0 = plt.subplots(nrows=5, ncols=2, figsize=(14, 14))

for i, ax in enumerate(ax08_0.flat):  # walk the 5x2 grid of axes row by row
    # Only the first ten sources are plotted; other grid cells are left untouched
    if i >= len(sources_indexed[:10]):
        continue

    source = sources_indexed[i]
    df = ria100_distribution(source)

    # Horizontal bars: one per 'Category' value, bar length = 'Count'
    bars = ax.barh(df['Category'], df['Count'], align='center')

    # Write the count next to the end of each bar
    for bar, noDOIs in zip(bars, df['Count']):
        bar_end = bar.get_width()
        bar_middle = bar.get_y() + bar.get_height() / 2
        ax.annotate(f'{noDOIs} Items',
                    xy=(bar_end, bar_middle),
                    xytext=(3, 0),  # 3 points horizontal offset
                    textcoords='offset points', fontsize=8,
                    ha='left', va='center')

    # 'Medline' is displayed in upper case
    source = source.upper() if source == 'Medline' else source

    ax.set_title(f'{source}')
    ax.set_ylabel(f"{source} with \nnumbers of other databases", fontsize=10)

    # Put a tick, labelled with its value, at every 'Category' value
    ax.set_yticks(df['Category'])
    ax.set_yticklabels(df['Category'])
    ax.margins(y=0.01, x=0.1)

    # Only set x-axis label for subplots in the last row (indices 8 and 9)
    if i > 7:
        ax.set_xlabel('Number of retracted publications')

# Adjust margins to fit the annotations
plt.subplots_adjust(left=0.2, right=0.3,) # bottom=0.1, wspace=0.2, hspace=1

# fig08_0.suptitle('100% Retraction Indexing Agreement: Number of IDs co-indexed with other databases')
fig08_0.patch.set_facecolor('#ccd9ff')

plt.tight_layout()

# Save to folder (uncomment to use; kept ahead of plt.show(), after which
# there may be no current figure left to save)
# plt.savefig(result_dir+'distribution_of_RIA100_by_source1.png')

# Show the plot
plt.show()

In [None]:
"""
Plotting shared # of IDs per source across 100% RetractionIndexingAgreement -2
"""

fig08_1, ax08_1 = plt.subplots(nrows=1, ncols=1, figsize=(6, 4))

# Distribution for the 'WoS_Core' source only
df_ws = ria100_distribution('WoS_Core')

# Horizontal bars: one per 'Category' value, bar length = 'Count'
bars = ax08_1.barh(df_ws['Category'], df_ws['Count'], align='center')

# Annotate each bar with the count
for bar, noDOIs in zip(bars, df_ws['Count']):
    ax08_1.annotate(f'{noDOIs} Items',
                    xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
                    xytext=(3, 0),  # 3 points horizontal offset
                    textcoords='offset points', fontsize=8,
                    ha='left', va='center')

source = 'Web of Science Core'
ax08_1.set_title(f'{source}')
ax08_1.set_ylabel(f"{source} with \nnumbers of other databases", fontsize=10)

# Tick positions and labels come from df_ws, the frame that is plotted here
# (not from `df`, which still holds another source from an earlier cell)
ax08_1.set_yticks(df_ws['Category'])
ax08_1.set_yticklabels(df_ws['Category'])
ax08_1.margins(y=0.01, x=0.1)
ax08_1.set_xlabel('Number of retracted publications')

# Adjust margins to fit the annotations
plt.subplots_adjust(left=0.2, right=0.3,) # bottom=0.1, wspace=0.2, hspace=1

# fig08_1.suptitle('100% Retraction Indexing Agreement: Number of IDs Co-Indexed with Other Databases')
fig08_1.patch.set_facecolor('#ccd9ff')

plt.tight_layout()

# Save to folder (uncomment to use). This has to run before plt.show():
# once the figure has been shown, plt.savefig may write an empty image.
# plt.savefig(result_dir+'distribution_of_RIA100_by_source2.png')

plt.show()

### END