# Step 3: Perform Data Analysis and Visualizations

In [None]:
!pip install dataframe-image

In [None]:
import os
from datetime import date, datetime as dt

import dataframe_image as dfi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
from matplotlib.ticker import MaxNLocator

%matplotlib notebook

In [None]:
# Run date as an ISO string (YYYY-MM-DD); used as a prefix for output file names.
today = date.today().isoformat()

In [None]:
# Set path --- point `box_root` at the Box folder that carries your name.
# (Box Desktop lets you copy the full pathname.)
box_root = '/Users/lirou/Library/CloudStorage/Box-Box/RetractionWatch2023-03-28/testing/Laura/'

# Input folders
box_path = box_root + 'step1-inputfile/'       # Folder name: step1-inputfile
box_path_1 = box_root + 'step1-outputfile/'    # Folder name: step1-outputfile
box_path_2 = box_root + 'step2-outputfile/'    # Folder name: step2-outputfile
# NOTE(review): this entry is labelled "step3-inputfile", yet the path is the
# same step2-outputfile folder as box_path_2 -- confirm which one is intended.
box_path_3 = box_root + 'step2-outputfile/'

# Output folder
box_path_4 = box_root + 'step3-outputfile/'    # Folder name: step3-outputfile

## 1. Concatenate files and generate a new list for data analysis 
We import the output files from the previous notebook and combine them with the Known Retraction List. When records are discovered to be present in (“covered by”) a source, the source is added to the records, and a new list is created. 

In [None]:
# Input files:
#   - one CSV file with the known retraction list
#   - three CSV files (one per source) with the items from the known retraction
#     list that are covered by that source but not indexed as retracted in it


def _read_sorted_by_year(csv_path):
    """Read a CSV, drop the saved index column and sort by 'Year' (newest first)."""
    return (
        pd.read_csv(csv_path)
        .drop(columns=['Unnamed: 0'])
        .sort_values('Year', ascending=False)
        .reset_index(drop=True)
    )


def _read_not_indexed(csv_path, source_label):
    """Load one 'not indexed as retracted' file and set its 'source' column to *source_label*."""
    frame = _read_sorted_by_year(csv_path).rename(columns={'Database': 'source'})
    frame['source'] = source_label
    return frame


def _summarize(frame):
    """Print the shape, the column summary and the first rows of *frame*."""
    print(frame.shape)
    frame.info()  # info() prints by itself; wrapping it in print() adds a stray "None"
    print(frame.head())


# Known Retraction List
knownretraction = _read_sorted_by_year(box_path_1 + '2023-04-12-knownretractionlist-1.csv')
_summarize(knownretraction)

# Scopus
scopus = _read_not_indexed(box_path_2 + '2023-04-09-notindexedasretracted-scopus.csv', 'Scopus')
_summarize(scopus)

# Web of Science
wos = _read_not_indexed(box_path_2 + '2023-04-08-notindexedasretracted-webofscience.csv', 'Web of Science')
_summarize(wos)

# Crossref
crossref = _read_not_indexed(box_path_2 + '2023-04-09-notindexedasretracted-crossref.csv', 'Crossref')
_summarize(crossref)

In [None]:
# Stack the known retraction list and the three per-source frames,
# then collapse the result to one row per DOI.
merged_withdoi = pd.concat([knownretraction, scopus, wos, crossref])

# Bibliographic fields keep the first value seen for a DOI; the 'source'
# values of all rows are joined into one "; "-separated string.
per_doi_aggregation = {
    'Author': 'first',
    'Title': 'first',
    'Year': 'first',
    'Journal': 'first',
    'source': '; '.join,
    'PubMedID': 'first',
}

knownretraction_2 = (
    merged_withdoi
    .groupby('DOI')
    .agg(per_doi_aggregation)
    .reset_index()
    .fillna('')
)

knownretraction_2.head()

In [None]:
# Clean the dataframe: turn the "; "-joined 'source' string of every DOI into
# a sorted list without duplicates, stored as 'source_new'.
dedup_source = [
    sorted(set(joined_sources.split("; ")))
    for joined_sources in knownretraction_2['source']
]

knownretraction_2 = (
    knownretraction_2
    .assign(source_new=dedup_source)
    .drop(columns=['source'])
)

# Build the new Known Retraction List: attach the 'source' value of the
# original list as 'source_old' (inner join on DOI).
original_sources = knownretraction[['DOI', 'source']].rename(columns={'source': 'source_old'})
knownretraction_3 = knownretraction_2.merge(original_sources, on='DOI')

In [None]:
# Convert the "; "-separated 'source_old' string into a list, one entry per source
source_old = [joined_sources.split("; ") for joined_sources in knownretraction_3['source_old']]

knownretraction_3 = (
    knownretraction_3
    .assign(source_old=source_old)
    .sort_values(by='source_old')
)

knownretraction_3

#changesourcenew/old 

In [None]:
# Output file for Step 3.5: the known retraction list with 'source_new' / 'source_old'
step35_csv_path = box_path_4 + today + '-knownretractionlist-2.csv'
knownretraction_3.to_csv(step35_csv_path)

**Run the Step 3.5 notebook, then return here and continue with the cells below.**

In [None]:
# Add retraction year to known retraction list
# Import files

# Crossref retraction years; the 'retraction-date' column is renamed to 'RetractionYear'.
# NOTE(review): read_table() parses tab-separated text although the file is named
# .csv, and the file name has no date prefix -- confirm both against the actual file.
cr_retractionyear = pd.read_table(box_path_3 + '-retractionyear-crossref.csv').rename(columns={'retraction-date':'RetractionYear'})

# Retraction Watch records: reduce the retraction date to its year (0 when the date is missing)
retractionwatch_retracted = pd.read_csv(box_path_1 + '2023-04-09-recordswithdoi-retractionwatch.csv')
retractionwatch_retracted["RetractionYear"] = pd.to_datetime(retractionwatch_retracted["RetractionDate"]).dt.strftime("%Y").fillna(0).astype(int)
rw_retractionyear = retractionwatch_retracted[['DOI', 'RetractionYear']]

# NOTE(review): `journalcategory` is not created anywhere in the visible part of
# this notebook; it has to be loaded before this cell, otherwise the next line
# raises NameError on a fresh kernel.
journalcategory.info()  # info() prints by itself; print(...) would add a stray "None"
cr_retractionyear.info()
rw_retractionyear.info()

In [None]:
# Add retraction year to known retraction list
# Merge dataframes

# Outer join keeps every DOI that has a retraction year in either source.
# After the merge, *_x is the Crossref year and *_y the Retraction Watch year.
retractionyear_merged = pd.merge(cr_retractionyear, rw_retractionyear, on='DOI', how='outer')
year_crossref = retractionyear_merged['RetractionYear_x'].fillna(0).astype(int)
year_retractionwatch = retractionyear_merged['RetractionYear_y'].fillna(0).astype(int)

# Coalesce instead of adding: take the Crossref year when there is one and fall
# back to the Retraction Watch year otherwise. Summing the two columns is only
# correct when at most one of them is non-zero; a DOI dated in both sources
# would end up with a year around 4040.
retractionyear_merged['RetractionYear'] = year_crossref.where(year_crossref > 0, year_retractionwatch)

retractionyear_merged = retractionyear_merged[['DOI', 'RetractionYear']]
retractionyear_merged.info()

In [None]:
# Add retraction year to known retraction list (left join; DOIs without a year get 0)
knownretraction_4 = pd.merge(knownretraction_3, retractionyear_merged, how='left', on='DOI')
knownretraction_4 = knownretraction_4.assign(
    RetractionYear=knownretraction_4['RetractionYear'].fillna(0).astype(int)
)

knownretraction_4.info()

In [None]:
# Create new column: TimetoRetraction = RetractionYear - Year,
# computed only for items that have a retraction year (> 0).
has_retraction_year = knownretraction_4['RetractionYear'] > 0
# .copy() makes the filtered rows an independent frame, so adding a column does
# not write into a slice of knownretraction_4 (avoids SettingWithCopyWarning).
retractionyearfiltered = knownretraction_4.loc[has_retraction_year].copy()
retractionyearfiltered['TimetoRetraction'] = retractionyearfiltered['RetractionYear'] - retractionyearfiltered['Year']

retractionyearfiltered = retractionyearfiltered[['DOI', 'TimetoRetraction']]
retractionyearfiltered.info()

In [None]:
# Attach TimetoRetraction to the known retraction list (items without a retraction year get 0).
# A TimetoRetraction column left over from a previous run of this cell is dropped
# first; otherwise the merge would create TimetoRetraction_x / _y and the next
# line would fail with a KeyError.
knownretraction_4 = pd.merge(
    knownretraction_4.drop(columns=['TimetoRetraction'], errors='ignore'),
    retractionyearfiltered,
    on='DOI',
    how='left',
)
knownretraction_4['TimetoRetraction'] = knownretraction_4['TimetoRetraction'].fillna(0).astype(int)

# Create DOI link (a plain string prefix is applied to every element of the column)
knownretraction_4['DOILink'] = 'http://doi.org/' + knownretraction_4['DOI']

knownretraction_4.info()

In [None]:
# Attach the main category of each journal, matching the lower-cased journal name
# against 'JournalandConferenceProceedings' in `journalcategory`.
# NOTE(review): `journalcategory` is not defined in the visible cells -- confirm where it is loaded.
knownretraction_4['Journal_lower'] = knownretraction_4['Journal'].str.lower()
knownretraction_4 = pd.merge(
    knownretraction_4,
    journalcategory,
    how='left',
    left_on='Journal_lower',
    right_on='JournalandConferenceProceedings',
)

# Remove the helper/join columns again
helper_columns = ['Unnamed: 0', 'JournalandConferenceProceedings', 'Journal_lower']
knownretraction_4 = knownretraction_4.drop(columns=helper_columns)

# Journals without a match are labelled 'notcategorized'; 'Sciences' is normalised to 'Science'
main_category = knownretraction_4['MainCategory'].fillna('notcategorized')
knownretraction_4['MainCategory'] = main_category.str.replace('Sciences', 'Science')

print(knownretraction_4.info())
knownretraction_4.head()

In [None]:
# Output File
# One CSV file with new source added to items  

knownretraction_4.to_csv(box_path_4 + today + '-knownretractionlist-3.csv')

## 2. Calculate the index score for each item and source

Then, we introduce our calculations for retraction indexing and use visualizations to enhance the analysis outcome. This part is divided into two sections: retraction index scores by publication and retraction index scores by source.

### Section 1: Calculation by item

In [None]:
# Per-item score: number of sources that index the item as retracted ('source_old')
# divided by the number of sources that cover the item ('source_new').
# Both columns hold lists (built in earlier cells), so .str.len() counts their entries.
count_sources_covering_item = knownretraction_4['source_new'].str.len()
count_sources_indexing_item_as_retracted = knownretraction_4['source_old'].str.len()
RetractionIndexingDiscrepancy_ITEM = count_sources_indexing_item_as_retracted.div(count_sources_covering_item)

# Store the score as a whole-number percentage (astype(int) truncates, e.g. 66.67 -> 66)
# and restore the row order by index.
knownretraction_4 = (
    knownretraction_4
    .assign(**{'RetractionIndexingAgreement_ITEM(%)': RetractionIndexingDiscrepancy_ITEM.mul(100).astype(int)})
    .sort_index()
)
knownretraction_4.head()

In [None]:
# Export result to folder: one CSV per distinct score value, plus one CSV with all items
score_column = 'RetractionIndexingAgreement_ITEM(%)'
export_dir = box_path_4 + 'RetractionIndexingAgreement_ITEM/'

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(export_dir, exist_ok=True)

for i in sorted(knownretraction_4[score_column].unique().tolist()):
    exp = knownretraction_4[knownretraction_4[score_column] == i]
    exp.to_csv(export_dir + today + '-RetractionIndexingAgreement_ITEM-' + str(i) + '.csv')

knownretraction_4.to_csv(export_dir + today + '-RetractionIndexingAgreement_ITEM-all.csv')

### Section 2: Calculation by source

In [None]:
# Total number of items in the known retraction list
total_count = len(knownretraction)


def _count_items_naming(source_name):
    """Number of known-retraction rows whose 'source' text contains *source_name*."""
    return int(knownretraction['source'].str.contains(source_name, na=False).sum())


# Items that appear in a source
cr_rtr = _count_items_naming('Crossref')
rw_rtr = _count_items_naming('Retraction Watch')
sp_rtr = _count_items_naming('Scopus')
wos_rtr = _count_items_naming('Web of Science')

# Items that do not appear in a source (the complement of the counts above)
cr_ntc = total_count - cr_rtr
rw_ntc = total_count - rw_rtr
sp_ntc = total_count - sp_rtr
wos_ntc = total_count - wos_rtr

In [None]:
# Table with one row per source: the counts of the three indexing statuses
# (indexed_as_retracted, covered_but_not_indexed_as_retracted, not_covered)
# and the resulting agreement score of the source.
table = pd.DataFrame({
    'source': ['Scopus', 'Web of Science', 'Retraction Watch', 'Crossref'],
    'indexed_as_retracted': [sp_rtr, wos_rtr, rw_rtr, cr_rtr],
    'covered_but_not_indexed_as_retracted': [len(scopus), len(wos), 0, len(crossref)],
    'not_covered': [sp_ntc - len(scopus), wos_ntc - len(wos), rw_ntc, cr_ntc - len(crossref)],
})

# Score = indexed_as_retracted / (indexed_as_retracted + covered_but_not_indexed_as_retracted), in percent
covered_total = table['indexed_as_retracted'] + table['covered_but_not_indexed_as_retracted']
cal = (table['indexed_as_retracted'] / covered_total * 100).round(2).tolist()
table['RetractionIndexingAgreement_SOURCE(%)'] = cal

# Order the rows alphabetically by source name
table = table.sort_values(by='source')
table

In [None]:
# Export the table as a PNG image.
# The file name uses the same "<date>-<name>" pattern as the other outputs of this notebook.
dfi.export(table, box_path_4 + today + '-Numberofretractionsineachsource.png', table_conversion="matplotlib")

In [None]:
# Create a grouped bar plot for the table: two bars per source
fig, ax = plt.subplots(layout='constrained')

x = np.arange(len(table['source']))
y1 = table['indexed_as_retracted']
y2 = table['covered_but_not_indexed_as_retracted']
width = 0.4

# Draw the two series side by side, centred on each source position
rects1 = ax.bar(x - width / 2, y1, width, color='#377eb8', hatch='/', edgecolor='white')
rects2 = ax.bar(x + width / 2, y2, width, color='#f781bf', hatch='o', edgecolor='white')

# Labels, legend and custom x-axis tick labels
ax.set_xticks(x)
ax.set_xticklabels(table['source'])
ax.set_xlabel("Source")
ax.set_ylabel("Number of DOIs")
ax.legend(["indexed_as_retracted", "covered_but_not_indexed_as_retracted"], loc='upper left')
ax.margins(y=0.25)  # head room for the value labels above the bars


def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')


autolabel(rects1)
autolabel(rects2)

# No fig.tight_layout() here: the figure already uses the constrained layout
# engine and the two mechanisms are not meant to be combined.

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig.savefig(box_path_4 + today + '-recordsineachsource.png')
plt.show()

In [None]:
# Create one pie chart per source showing the share of the three indexing statuses
status_columns = ['indexed_as_retracted', 'not_covered', 'covered_but_not_indexed_as_retracted']
# Colour and hatch are tied to the status name, so a status keeps its look even
# when another slice of the same pie is filtered out below.
status_colors = dict(zip(status_columns, ['#377eb8', '#ff7f00', '#f781bf']))
status_hatches = dict(zip(status_columns, ["/", "+", "o"]))

# Index by source name so that every chart is titled with its source
# (the plain table index only holds row numbers).
new = table.set_index('source')[status_columns]

fig, axes = plt.subplots(2, 2, figsize=(10, 10), facecolor='#FFFFFF')

legend_wedges = {}  # first wedge drawn for each status, reused as legend handle

for position, (source_name, row) in enumerate(new.iterrows()):
    ax = axes[position // 2, position % 2]
    # Drop slices that make up 1% or less of the source's total
    row = row[row.gt(row.sum() * .01)]
    row_total = row.sum()

    wedges, _, _ = ax.pie(
        row,
        pctdistance=1.3,
        labeldistance=1,
        startangle=90 * row.iloc[0],
        # Show the percentage and, in brackets, the absolute number of items
        autopct=lambda pct, row_total=row_total: f'{pct:.2f}%\n({(pct/100)*row_total:.0f})',
        wedgeprops={'edgecolor': 'white'},
        textprops={'size': 'x-large'},
        colors=[status_colors[status] for status in row.index],
    )

    for wedge, status in zip(wedges, row.index):
        wedge.set_hatch(status_hatches[status])
        legend_wedges.setdefault(status, wedge)

    ax.set_title(source_name, fontsize=16)

# One legend (on the last chart) that lists every status drawn in any of the pies
legend_statuses = [status for status in status_columns if status in legend_wedges]
legend = axes[-1, -1].legend(
    [legend_wedges[status] for status in legend_statuses],
    legend_statuses,
    loc='lower left',
    ncol=1,
)

fig.subplots_adjust(wspace=.2)  # Space between charts

# Save before showing so the file is written while the figure still exists
fig.savefig(box_path_4 + today + '-percentagebysource.png')
plt.show()

## 3. Create Visualizations
Last, we create visualizations to support the outcome of our analysis of the known retraction list.

### Visualization 1: Publication Year - Overview

In [None]:
# Overview table: number of items per publication year
y, yc = np.unique(knownretraction_4['Year'], return_counts=True)

year_count = pd.DataFrame({'Year': y, 'Count': yc}).set_index('Year')

# Show the five years with the most items
year_count.sort_values(by='Count', ascending=False).head()

In [None]:
# Bar plot: number of items per publication year
fig, ax_years = plt.subplots(figsize=(10, 10))
ax_years.bar(y, yc)

ax_years.set_xlabel("Publication Year")
ax_years.set_ylabel("No. of Items")

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig.savefig(box_path_4 + today + '-itembyyear.png')
plt.show()

In [None]:
# Overview table limited to the publication years 1940-1990 (both bounds included)
published_1940_to_1990 = knownretraction_3['Year'].between(1940, 1990)
limityear = knownretraction_3.loc[published_1940_to_1990]

l_values, l_counts = np.unique(limityear['Year'], return_counts=True)

limityear_count = pd.DataFrame({'Year': l_values, 'Count': l_counts}).set_index('Year')

# Show the five years with the most items
limityear_count.sort_values(by='Count', ascending=False).head()

In [None]:
# Bar plot limited to the publication years 1940-1990
fig, ax_limited = plt.subplots(figsize=(10, 10))
ax_limited.bar(l_values, l_counts)

ax_limited.set_xlabel("Publication Year")
ax_limited.set_ylabel("No. of Items")

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig.savefig(box_path_4 + today + '-itembyyearlimited.png')
plt.show()

In [None]:
# Merge the two plots into one figure: all years as the main plot,
# the 1940-1990 range as an inset.
fig, ax_main = plt.subplots(figsize=(10, 10))

# Main plot uses the per-year counts of the overview table (y, yc).
# `values` / `counts` are only created by later cells, so they must not be used here.
ax_main.bar(y, yc)
ax_main.set_xlabel("Publication Year")
ax_main.set_ylabel("No. of Items")

# Inset axes: [left, bottom, width, height] as fractions of the figure
ax1 = fig.add_axes([0.2, 0.45, 0.4, 0.4])
ax1.bar(l_values, l_counts)
ax1.set_xlabel("Publication Year")
ax1.set_ylabel("No. of Items")

fig.savefig(box_path_4 + today + '-itembyyear-twoinone.png')
plt.show()

### Visualization 2: PublicationYear

In [None]:
# Table with the number of items for each RetractionIndexingAgreement_ITEM(%) value.
# The score column is added to knownretraction_4 (not knownretraction_3),
# so it has to be read from there.
score = knownretraction_4['RetractionIndexingAgreement_ITEM(%)']
s_values, s_counts = np.unique(score, return_counts=True)

score_count = pd.DataFrame({
    'RetractionIndexingAgreement_ITEM(%)': s_values,
    'Count': s_counts,
}).set_index('RetractionIndexingAgreement_ITEM(%)')

score_count

In [None]:
# Bar plots: publication-year distribution of the items, one panel per score value
itemscore = knownretraction_4[['RetractionIndexingAgreement_ITEM(%)', 'Year']]

fig_2, axes_2 = plt.subplots(3, 2, figsize=(10, 10), facecolor='#FFFFFF')
fig_2.subplots_adjust(hspace=0.5, wspace=0.25)  # Space between charts

# Panels are filled row by row: 25 | 33, 50 | 66, 75 | 100
for panel, score_value in zip(axes_2.flat, [25, 33, 50, 66, 75, 100]):
    panel_items = itemscore[itemscore['RetractionIndexingAgreement_ITEM(%)'] == score_value]
    years, n_items = np.unique(panel_items['Year'], return_counts=True)

    panel.bar(years, n_items)
    panel.set_title("RetractionIndexingAgreement_DOI = {}%".format(score_value))
    panel.set_xlabel("Publication Year")
    panel.set_ylabel("No. of Items")
    # Years and counts are whole numbers: keep the automatic tick density but
    # only allow integer tick positions (no hard-coded tick for a single year).
    panel.xaxis.set_major_locator(MaxNLocator(nbins='auto', integer=True))
    panel.yaxis.set_major_locator(MaxNLocator(nbins='auto', integer=True))

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig_2.savefig(box_path_4 + today + '-pubyearbyscore.png')
plt.show()

### Visualization 3: RetractionYear

In [None]:
# Table with the number of items with and without a known retraction year
# (a RetractionYear of 0 marks a missing year)
has_year = knownretraction_4['RetractionYear'] > 0
withretractiondate = knownretraction_4[has_year]
withoutretractiondate = knownretraction_4[knownretraction_4['RetractionYear'] == 0]

retractionyeardata = pd.DataFrame({
    'RetractionYear': ['withretractiondate', 'withoutretractiondate'],
    'Count': [len(withretractiondate), len(withoutretractiondate)],
})

retractionyeardata.set_index('RetractionYear')

In [None]:
# Count the source combinations among the items without a retraction year.
# 'source_new' holds lists, which value_counts() cannot hash, so each list is
# first joined into a single "; "-separated string.
withoutretractiondate['source_new'].str.join('; ').value_counts()

In [None]:
# Bar plots: retraction-year distribution of the items, one panel per score value
fig_3, axes_3 = plt.subplots(3, 2, figsize=(10, 10), facecolor='#FFFFFF')
fig_3.subplots_adjust(hspace=0.5, wspace=0.25)  # Space between charts

# Only items with a known retraction year (0 marks a missing year)
dated_items = knownretraction_4[knownretraction_4['RetractionYear'] > 0]

# Panels are filled row by row: 25 | 33, 50 | 66, 75 | 100
for panel, score_value in zip(axes_3.flat, [25, 33, 50, 66, 75, 100]):
    panel_items = dated_items[dated_items['RetractionIndexingAgreement_ITEM(%)'] == score_value]
    years, n_items = np.unique(panel_items['RetractionYear'], return_counts=True)

    panel.bar(years, n_items)
    panel.set_title("RetractionIndexingAgreement_DOI = {}%".format(score_value))
    panel.set_xlabel("Retraction Year")
    panel.set_ylabel("No. of Items")
    # Years are whole numbers: keep the automatic tick density, integer positions only
    panel.xaxis.set_major_locator(MaxNLocator(nbins='auto', integer=True))

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig_3.savefig(box_path_4 + today + '-retractionyearbyscore.png')
plt.show()

### Visualization 4: TimetoRetraction

In [None]:
# Keep only the items whose retraction year is known (0 marks a missing year)
retraction_year_known = knownretraction_4['RetractionYear'].ne(0)
timetoretraction_filtered = knownretraction_4.loc[retraction_year_known]
timetoretraction_filtered.info()

In [None]:
# Bar plots: distribution of the time to retraction, one panel per score value
fig_4, axes_4 = plt.subplots(3, 2, figsize=(10, 10), facecolor='#FFFFFF')
fig_4.subplots_adjust(hspace=0.5, wspace=0.25)  # Space between charts

# Panels are filled row by row: 25 | 33, 50 | 66, 75 | 100
for panel, score_value in zip(axes_4.flat, [25, 33, 50, 66, 75, 100]):
    panel_items = timetoretraction_filtered[
        timetoretraction_filtered['RetractionIndexingAgreement_ITEM(%)'] == score_value
    ]
    durations, n_items = np.unique(panel_items['TimetoRetraction'], return_counts=True)

    panel.bar(durations, n_items)
    panel.set_title("RetractionIndexingAgreement_DOI = {}%".format(score_value))
    panel.set_xlabel("Time to Retraction")
    panel.set_ylabel("No. of Items")
    # Durations are whole years: keep the automatic tick density, integer positions only
    panel.xaxis.set_major_locator(MaxNLocator(nbins='auto', integer=True))

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig_4.savefig(box_path_4 + today + '-timetoretraction.png')
plt.show()

### Visualization 5: Field of Study - Overview

In [None]:
# Main categories used for the field-of-study charts. `cat` is defined here
# because this cell runs before the later cell that also defines it; without
# this line a fresh kernel fails with NameError.
cat = ['General', 'Health Science', 'Life Science', 'Physical Science', 'Social Science']

# Same list plus the label given to journals without a category
cat_2 = cat + ['notcategorized']
print(cat)
print(cat_2)

In [None]:
# Bar plot: number of items per field of study, largest field first
fig, ax = plt.subplots(figsize=(8, 6))

# Display labels, in the same order as the categories in cat_2
cat_3 = ['General', 'Health \nScience', 'Life \nScience', 'Physical \nScience', 'Social \nScience', 'Not \nCategorized']

# Number of times each category name occurs in the MainCategory column
count = [knownretraction_4.MainCategory.str.count(i).sum() for i in cat_2]

df = pd.DataFrame({'Fields of Study': cat_3, 'Count': count}).sort_values('Count', ascending=False)

x = np.arange(len(df['Fields of Study']))
bar_width = 0.4  # defined locally so the cell does not depend on an earlier plot cell

ax.set_xlabel("Fields of Study")
ax.set_ylabel("Number of DOIs")

pps = ax.bar(x / 2, df['Count'], bar_width, label='No. of Items')

# Put one tick under every bar and label it from the sorted table, so the
# labels always follow the data instead of a hard-coded order.
ax.set_xticks(x / 2)
ax.set_xticklabels(df['Fields of Study'])

# Write the count above each bar
for p in pps:
    height = p.get_height()
    ax.text(x=p.get_x() + p.get_width() / 2, y=height+0.1,
            s="{}".format(height),
            ha='center')

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig.savefig(box_path_4 + today + '-fieldofstudytotal.png')
plt.show()

### Visualization 6: Field of Study

In [None]:
# Keep only the items whose journal has a main category.
# .copy() makes the filtered rows an independent frame, so the column assignment
# below does not write into a slice of knownretraction_4 (avoids SettingWithCopyWarning).
knownretraction_5 = knownretraction_4[knownretraction_4['MainCategory'] != 'notcategorized'].copy()
knownretraction_5['MainCategory'] = knownretraction_5['MainCategory'].str.replace('Sciences', 'Science')
knownretraction_5.head()

In [None]:
# Main categories (fields of study) shown in the charts below
cat = [
    'General',
    'Health Science',
    'Life Science',
    'Physical Science',
    'Social Science',
]

In [None]:
# Bar plots: number of items per field of study, one panel per score value
fig_5, axes_5 = plt.subplots(3, 2, figsize=(10, 10), facecolor='#FFFFFF')
fig_5.subplots_adjust(hspace=0.5, wspace=0.3)  # Space between charts

# Display labels, in the same order as the categories in `cat`
cat_1 = ['General', 'Health \nScience', 'Life \nScience', 'Physical \nScience', 'Social \nScience']

# Panels are filled row by row: 25 | 33, 50 | 66, 75 | 100
for panel, score_value in zip(axes_5.flat, [25, 33, 50, 66, 75, 100]):
    panel_items = knownretraction_5[knownretraction_5['RetractionIndexingAgreement_ITEM(%)'] == score_value]
    # Number of times each category name occurs in the MainCategory column
    items_per_category = [panel_items.MainCategory.str.count(name).sum() for name in cat]

    panel.bar(cat_1, items_per_category)
    panel.set_title("RetractionIndexingAgreement_DOI = {}%".format(score_value))
    panel.set_xlabel("Field of Study")
    panel.set_ylabel("No. of Items")

# Save before showing: some notebook backends close the figure on show(),
# which would leave an empty image on disk.
fig_5.savefig(box_path_4 + today + '-fieldofstudybyscore.png')
plt.show()