Connect to drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# project folder used by the next cell is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd '/content/drive/MyDrive/Applied Data Science/Thesis/Code'

/content/drive/MyDrive/Applied Data Science/Thesis/Code


Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re

Load the CSV file of retracted and non-retracted articles

In [None]:
# Read preprocessing_part_1.csv (path relative to the working directory).
# The "utf-8-sig" codec decodes UTF-8 and skips a leading byte-order mark if present.
papers = pd.read_csv('Data (CSV)/preprocessing_part_1.csv', encoding="utf-8-sig")

In [None]:
#papers[papers['Discussion / Conclusion PP L'].isnull()]['Discussion / Conclusion PP L']

## Preprocessing (V1) for all models except BERT: lowercasing, retraction-word removal, number removal, journal removal, author removal

#### Retracted word removal

In [None]:
# Remove every retraction / withdrawal cue word from the four text sections
# (presumably so a model cannot read the label straight from the wording of a
# retraction notice - confirm against the modelling notebooks).
#
# The previous case-sensitive alternation was incomplete: "Withdrawn" was cut
# down to a stray "n", while "RETRACTED", "withdraw" or "withdrawal" were left
# in place, and inflected forms such as "retracting" left their suffix behind.
# Matching the stem case-insensitively together with any letters that follow
# it removes the whole cue word in every spelling.
retraction_cue = r'(?:retract|withdraw)[a-z]*'
for text_column in ['Title + Abstract', 'Main content', 'Discussion / Conclusion', 'References']:
  papers[text_column] = papers[text_column].str.replace(
      retraction_cue, '', flags=re.IGNORECASE, regex=True
  )

#### Number removal

In [None]:
# Delete every digit character from each of the four text sections.
digit_pattern = r'\d'
for text_column in ('Title + Abstract', 'Main content', 'Discussion / Conclusion', 'References'):
  papers[text_column] = papers[text_column].str.replace(digit_pattern, '', regex=True)

#### Author, Journal, Space, Stop word, Punctuation removal + lowercasing & lemmatization
Note: only abbreviations remain 

In [None]:
import spacy 
import re

# Load spaCy's "en_core_web_sm" pipeline; the cells below use it for
# part-of-speech tags, lemmas and the stop-word / punctuation / space flags.
nlp = spacy.load("en_core_web_sm")

Preprocessed Large

In [None]:
# Build the "PP L" (large preprocessing) column for every text section:
# lowercased lemmas with stop words, punctuation, proper nouns and whitespace
# tokens removed, stored as one space-separated string per article.
#
# The tokens are joined directly with ' '.join(...). The earlier approach
# (store the list, cast it to str, then regex-strip "," "'" "[" "]") left
# double-quote artefacts around any token containing an apostrophe, escaped
# backslashes, relied on the invalid escape sequences '\[' / '\]' and copied
# the whole frame on every iteration.
sections = ['Title + Abstract', 'Main content', 'Discussion / Conclusion', 'References']
for section in sections:
  # nlp.pipe only accepts strings, so missing sections are processed as empty text.
  texts = papers[section].fillna('').astype(str).tolist()

  newcolumn = section + ' ' + 'PP L'

  # Consume the pipe lazily: each Doc is reduced to its string right away
  # instead of keeping every parsed document in memory at once.
  papers[newcolumn] = [
      ' '.join(
          token.lemma_.lower()
          for token in doc
          if not (token.is_stop or token.is_punct or token.is_space or token.pos_ == "PROPN")
      )
      for doc in nlp.pipe(texts, disable=["ner", "parser"])
  ]

  # This is added back by InteractiveShellApp.init_path()


Preprocessed Small - only removal of proper nouns to remove all author names, journal titles and white spaces

In [None]:
# Build the "PP S" (small preprocessing) column for every text section: the
# original token text with only proper nouns and whitespace tokens removed,
# stored as one space-separated string per article.
#
# The tokens are joined directly with ' '.join(...). The earlier approach
# (store the list of tokens, cast it to str, then regex-strip "," "'" "[" "]")
# also deleted every comma, apostrophe and square bracket that belonged to the
# text itself and left double spaces behind, although this variant is meant to
# keep punctuation. It also relied on the invalid escape sequences '\[' / '\]'.
sections = ['Title + Abstract', 'Main content', 'Discussion / Conclusion', 'References']
for section in sections:
  # nlp.pipe only accepts strings, so missing sections are processed as empty text.
  texts = papers[section].fillna('').astype(str).tolist()

  newcolumn = section + ' ' + 'PP S'

  # Consume the pipe lazily so parsed documents are not all held in memory.
  papers[newcolumn] = [
      ' '.join(
          token.text
          for token in doc
          if not (token.is_space or token.pos_ == "PROPN")
      )
      for doc in nlp.pipe(texts, disable=["ner", "parser"])
  ]

  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Preview the first two rows, including the new "PP L" / "PP S" columns.
papers.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,DOI_x,Retracted,ID,Unnamed: 0.1.1,Unnamed: 0.1.1.1,Publication Type,Authors,Book Authors,...,Main content,Title + Abstract,Title + Abstract PP L,Main content PP L,Discussion / Conclusion PP L,References PP L,Title + Abstract PP S,Main content PP S,Discussion / Conclusion PP S,References PP S
0,0,15,nr580.pdf,0,nr580,579,580.0,J,"Juahir, Y; Morris, BM; Lim, D; Supian, FL",,...,Schiff bases are aldehyde - or ketone-like c...,Journal of Fundamental and Applied S...,sensor publish online novel ligand work amphip...,schiff basis aldehyde ketone like compound car...,isotherm surface pressure area isotherm float ...,n highly sensitive selective fluorescent chemo...,of and A OF FOR SENSOR of of and of of an...,Schiff bases are aldehyde - or ketone - like c...,.. - ( - A ) Isotherms of - - The surface pres...,N . A highly sensitive and selective fluo...
1,1,195,nr601.pdf,0,nr601,600,601.0,J,"Umar, R; Abidin, ZZ; Ibrahim, ZA; Kamarudin, M...",,...,Radio astronomical observation is affected b...,Journal of Fundamental and Applied S...,library criterion decision analysis integrate ...,radio astronomical observation affect source t...,datum parameter combine form layer technique p...,radio frequency interference profile determina...,of and . Libraries - CRITERIA DECISION ANALYSI...,Radio astronomical observation is affected by ...,Each data for each parameter was combined to f...,of radio frequency interference ( ) profile...


#### Creating common column for journal name

In [None]:
# One journal column for all rows: use 'Journal' and fall back to
# 'Source Title' wherever 'Journal' is missing.
papers['Journal_Name'] = papers['Journal'].fillna(papers['Source Title'])

  """Entry point for launching an IPython kernel.


In [None]:
# Lowercase the journal names; the replacements and filters in the following
# cells are all written against lowercase spellings.
papers['Journal_Name']  = papers['Journal_Name'].str.lower()

In [None]:
# Rewrite alternative spellings of a journal title to a single spelling so the
# same journal is counted once.
journal_spelling_fixes = {
    'acta crystallographica section e-crystallographic communications': 'acta crystallographica. section e, crystallographic communications',
    'international journal of electrical engineering & education': 'international journal of electrical engineering education',
}
for variant, canonical in journal_spelling_fixes.items():
  papers['Journal_Name'] = papers['Journal_Name'].str.replace(variant, canonical, regex=True)

In [None]:
# Number of articles per journal name.
papers['Journal_Name'].value_counts()

arabian journal of geosciences                               279
journal of cellular biochemistry                             249
plos one                                                     239
rsc advances                                                 147
journal of fundamental and applied sciences                   59
oncotargets and therapy                                       44
international journal of electrical engineering education     32
journal of clinical anesthesia                                19
Name: Journal_Name, dtype: int64

In [None]:
# Article counts per journal, split by the value of the 'Retracted' column.
papers.groupby(['Journal_Name', 'Retracted', ]).size()

Journal_Name                                               Retracted
arabian journal of geosciences                             0            196
                                                           1             83
international journal of electrical engineering education  0             32
journal of cellular biochemistry                           0            171
                                                           1             78
journal of clinical anesthesia                             0             19
journal of fundamental and applied sciences                0             47
                                                           1             12
oncotargets and therapy                                    0             31
                                                           1             13
plos one                                                   0            179
                                                           1             60
rsc advances       

#### Balance data based on journal

In [None]:
def balanced_journal_subset(frame, journal):
  """Return equally sized retracted / non-retracted subsets for one journal.

  Args:
    frame: DataFrame with a 'Journal_Name' column and a 'Retracted' column
      holding 1 for retracted and 0 for non-retracted articles.
    journal: journal name to select. It is matched as a literal substring of
      'Journal_Name' (no regex interpretation); rows with a missing journal
      name never match.

  Returns:
    Tuple `(retracted, non_retracted)` with the first `n` rows of each class,
    where `n` is the size of the smaller class for this journal. Both frames
    are empty when the journal has no articles in one of the classes.
  """
  in_journal = frame['Journal_Name'].str.contains(journal, regex=False, na=False)
  retracted = frame[in_journal & (frame['Retracted'] == 1)]
  non_retracted = frame[in_journal & (frame['Retracted'] == 0)]
  # The cap is derived from the data instead of being hard-coded per journal,
  # so the two classes stay balanced if the article counts change.
  n = min(len(retracted), len(non_retracted))
  # NOTE(review): rows are taken in file order, not sampled at random - if the
  # file order is not arbitrary, consider .sample(n, random_state=...) instead.
  return retracted.iloc[:n], non_retracted.iloc[:n]


# Journals that only have non-retracted articles in the per-journal counts
# shown earlier ("international journal of electrical engineering education",
# "journal of clinical anesthesia") cannot be balanced and are not selected.
r_arabian, nr_arabian = balanced_journal_subset(papers, "arabian journal of geosciences")
r_cellular_biochem, nr_cellular_biochem = balanced_journal_subset(papers, "journal of cellular biochemistry")
r_fund_app, nr_fund_app = balanced_journal_subset(papers, "journal of fundamental and applied sciences")
r_onco, nr_onco = balanced_journal_subset(papers, "oncotargets and therapy")
r_plos, nr_plos = balanced_journal_subset(papers, "plos one")
r_rsc, nr_rsc = balanced_journal_subset(papers, "rsc advances")


#### Creating subset (7/2) 

Note: two journals are held out entirely so we can also test whether the classifiers detect retractions in journals they were not trained on.

In [None]:
# The balanced PLOS ONE and RSC Advances subsets form the test set; the
# balanced subsets of the other four journals form the training set.
held_out_frames = [r_plos, nr_plos, r_rsc, nr_rsc]
training_frames = [
    r_cellular_biochem, nr_cellular_biochem,
    r_fund_app, nr_fund_app,
    r_onco, nr_onco,
    r_arabian, nr_arabian,
]
two_journal_test_data_set = pd.concat(held_out_frames)
four_journal_train_data_set = pd.concat(training_frames)

In [None]:
# Check that each test-set journal has the same number of rows for both 'Retracted' values.
two_journal_test_data_set.groupby(['Journal_Name', 'Retracted', ]).size()

Journal_Name  Retracted
plos one      0            60
              1            60
rsc advances  0            72
              1            72
dtype: int64

In [None]:
# Check that each training-set journal has the same number of rows for both 'Retracted' values.
four_journal_train_data_set.groupby(['Journal_Name', 'Retracted', ]).size()

Journal_Name                                 Retracted
arabian journal of geosciences               0            83
                                             1            83
journal of cellular biochemistry             0            78
                                             1            78
journal of fundamental and applied sciences  0            12
                                             1            12
oncotargets and therapy                      0            13
                                             1            13
dtype: int64

In [None]:
# Write both subsets to the project's "Data (CSV)" folder on Google Drive.
# NOTE(review): to_csv writes the row index by default, which adds another
# "Unnamed: 0"-style column each time a file is re-read (several are already
# present in `papers`). Consider index=False once the notebooks that read
# these files are confirmed not to rely on that column.
two_journal_test_data_set.to_csv('/content/drive/MyDrive/Applied Data Science/Thesis/Code/Data (CSV)/two_journal_test_data_set.csv')
four_journal_train_data_set.to_csv('/content/drive/MyDrive/Applied Data Science/Thesis/Code/Data (CSV)/four_journal_train_data_set.csv')