Connect to drive

In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [64]:
cd '/content/drive/MyDrive/Applied Data Science/Thesis/Code'

/content/drive/MyDrive/Applied Data Science/Thesis/Code


Import Libraries

In [65]:
import pandas as pd
import numpy as np
import re

Load csv files of retracted and non-retracted articles

In [66]:
non_retracted = pd.read_csv('Data (CSV)/balanced_non_retracted_11_journals.csv', encoding="utf-8-sig")

In [67]:
retracted = pd.read_csv('Data (CSV)/balanced_retracted_11_journals.csv', encoding="utf-8-sig")

Some words have been cut off with a - and new line. To get the original words, the -\n is removed

In [68]:
non_retracted = non_retracted.replace({'-\n': ''}, regex = True)

In [69]:
retracted = retracted.replace({'-\n': ''}, regex = True)

Remove \n for both retracted and non-retracted papers

In [70]:
non_retracted = non_retracted.replace({'\n': ' '}, regex = True)

In [71]:
retracted = retracted.replace({'\n': ' '}, regex = True)

Concat to one dataframe

In [72]:
papers = pd.concat([non_retracted, retracted], ignore_index=True)

Remove rows with missing values

In [73]:
papers['Content'].dropna(inplace=True)

Original scraped content is kept as OG_content.

In [74]:
papers['All content'] = papers['Content']

Try to remove retraction notice

In [75]:
len(papers[papers['All content'].str.contains('Notice of Retraction')==True])

0

In [76]:
papers.Content = papers.Content.str.replace(r'^Notice of Retraction.+ieee\.org\.', '', regex = True)

Remove papers from non-retracted that are actually retracted

In [77]:
len(papers)

2575

In [78]:
non_retracted = papers[papers.Retracted == 0]
doi_toremove = non_retracted[non_retracted["Content"].str.contains('Retraction|retraction|Retracted|retracted|retract|Retract|withdrawn|Withdraw')==True].DOI_x
papers = papers[~papers.DOI_x.isin(doi_toremove)]

In [79]:
len(papers)

2528

Splitting the content into 4 parts: 1) Abstract, 2) main content (Introduction-Results), 3) Discussion/Conclusion and 4) References. 

In [80]:
papers['References'] = papers['Content'].str.split(r'References|R E F E R E N C E S|REFERENCES|r e f e r e n c e s|Reference List', 2, expand = True)[1]
papers['Content'] = papers['Content'].str.split(r'References|R E F E R E N C E S|REFERENCES|r e f e r e n c e s|Reference List', 2, expand = True)[0]

In [81]:
papers['Discussion / Conclusion'] = papers['Content'].str.split(r'Discussion|D I S C U S S I O N|DISCUSSION|d i s c u s s i o n|Conclusion|C O N C L U S I O N|CONCLUSION|c o n c l u s i o n', 2, expand = True)[1]
papers['Content'] = papers['Content'].str.split(r'Discussion|D I S C U S S I O N|DISCUSSION|d i s c u s s i o n|Conclusion|C O N C L U S I O N|CONCLUSION|c o n c l u s i o n', 2, expand = True)[0]

In [82]:
papers['Main content'] = papers['Content'].str.split(r'Introduction|I N T R O D U C T I O N|INTRODUCTION|i n t r o d u c t i o n', 2, expand = True)[1]
papers['Title + Abstract'] = papers['Content'].str.split(r'Introduction|I N T R O D U C T I O N|INTRODUCTION|i n t r o d u c t i o n', 2, expand = True)[0]
papers.drop('Content', axis=1, inplace=True)

In [83]:
papers

Unnamed: 0.2,Unnamed: 0,DOI_x,Retracted,ID,Unnamed: 0.1,Unnamed: 0.1.1,Publication Type,Authors,Book Authors,Book Editors,...,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes,All content,References,Discussion / Conclusion,Main content,Title + Abstract
0,15,nr580.pdf,0,nr580,579,580.0,J,"Juahir, Y; Morris, BM; Lim, D; Supian, FL",,,...,,,,,,Journal of Fundamental and Applied S...,"[1] Roy N, Pramanik HA, Paul PC, Singh TS. A...",3.1. Surface Pressure-Area (Π-A) Isotherms ...,Schiff bases are aldehyde - or ketone-like c...,Journal of Fundamental and Applied S...
1,195,nr601.pdf,0,nr601,600,601.0,J,"Umar, R; Abidin, ZZ; Ibrahim, ZA; Kamarudin, M...",,,...,,,,,,Journal of Fundamental and Applied S...,"[1] Hamidi Z, Shariff N. Investigation of ra...",Each data for each parameter was combined t...,Radio astronomical observation is affected b...,Journal of Fundamental and Applied S...
2,199,nr577.pdf,0,nr577,576,577.0,J,"Salehi, A; Goljah, MN; Baladehi, AS",,,...,,,,,,THE STUDY OF DECORATIVE ELEMENTS OF SIAMIA...,Guide of Historical-Cultural Works of Mazand...,"Essentially, architecture deals with somethi...",If one presumes a body and a soul for arch...,THE STUDY OF DECORATIVE ELEMENTS OF SIAMIA...
3,205,nr589.pdf,0,nr589,588,589.0,J,"Yong, SK; Annuar, NAK; Ariff, MJM",,,...,,,,,,Journal of Fundamental and Applied ...,"[1] Yong S K, Mohd Z S N, Mad A M J. Effects...",2.1. Physico-chemical Properties for SMS an...,Shooting range contains high content of lead...,Journal of Fundamental and Applied ...
4,239,nr543.pdf,0,nr543,542,543.0,J,"Gabsalikhova, LM; Sadygova, GR; Makarova, IV; ...",,,...,,,,,,THE PROSPECTS OF USE OF ALTERNATIVE TY...,1. Chris Woodford. Air pollution. URL: http:...,"The implementation of scientific approach, w...","In the Russian Federation, as in other devel...",THE PROSPECTS OF USE OF ALTERNATIVE TY...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2570,488,,1,r937,937,,,,,,...,0.0,Retraction,+Concerns/Issues about Referencing/Attribution...,No,,Original Article Prefabricated construction pr...,"1. Zhiqiang B, Kejian M and Wenfeng L. Random...",s On the basis of the research on the construc...,The large span and high load of traditional m...,Original Article Prefabricated construction pr...
2571,531,,1,r926,926,,,,,,...,0.0,Retraction,+Concerns/Issues about Referencing/Attribution...,No,,Original Article Landscape interaction design ...,"1. Hugo L, Marleen G, Ronnie B, et al. Engine...","s Like ordinary people, children with sensory ...","In the past half century, the career of speci...",Original Article Landscape interaction design ...
2572,584,,1,r928,928,,,,,,...,0.0,Retraction,+Concerns/Issues about Referencing/Attribution...,No,,Original Article Mechanism analysis on ground ...,"1. Wu YD, Diao HG, Liu J, et al. Field study ...","s In this study, a stress-controlled triaxial ...",Soft clayey soils often possess poor modulus ...,Original Article Mechanism analysis on ground ...
2573,604,,1,r938,938,,,,,,...,0.0,Retraction,+Concerns/Issues about Referencing/Attribution...,No,,Original Article Proportional-resonant control...,"1. Bindeshwar S, Singh SP, Singh J, et al. Pe...",The open loop boost and reboost converter–inv...,Solar system presented a structure and equipm...,Original Article Proportional-resonant control...


Check for null values (NaN/None) in References: if references are missing, it is an indicator that not the original article was scraped but some text relating to the article. Thus, we decide to exclude those rows with missing values for references.

In [84]:
print("Total papers:", len(papers))
print("Missing main content:", papers['Main content'].isnull().sum())
print("Missing discussion/conclusion:", papers['Discussion / Conclusion'].isnull().sum())
print("Missing references:", papers['References'].isnull().sum())

Total papers: 2528
Missing main content: 1199
Missing discussion/conclusion: 665
Missing references: 305


In [85]:
no_references = papers[papers['References'].isnull()]
papers = papers[~papers.ID.isin(no_references.ID)]

In [86]:
print("Total papers:", len(papers))
print("Missing main content:", papers['Main content'].isnull().sum())
print("Missing discussion/conclusion:", papers['Discussion / Conclusion'].isnull().sum())
print("Missing references:", papers['References'].isnull().sum())

Total papers: 2223
Missing main content: 968
Missing discussion/conclusion: 453
Missing references: 0


In [87]:
no_introduction = papers[papers['Main content'].isnull()]
papers = papers[~papers.ID.isin(no_introduction.ID)]

In [88]:
print("Total papers:", len(papers))
print("Missing main content:", papers['Main content'].isnull().sum())
print("Missing discussion/conclusion:", papers['Discussion / Conclusion'].isnull().sum())
print("Missing references:", papers['References'].isnull().sum())

Total papers: 1255
Missing main content: 0
Missing discussion/conclusion: 109
Missing references: 0


In [89]:
no_discussion = papers[papers['Discussion / Conclusion'].isnull()]
papers = papers[~papers.ID.isin(no_discussion.ID)]

In [90]:
print("Total papers:", len(papers))
print("Missing main content:", papers['Main content'].isnull().sum())
print("Missing discussion/conclusion:", papers['Discussion / Conclusion'].isnull().sum())
print("Missing references:", papers['References'].isnull().sum())

Total papers: 1146
Missing main content: 0
Missing discussion/conclusion: 0
Missing references: 0


Remove duplicate entries

In [91]:
duplicates = []
count = 0
for i in range(len(papers['Main content'])):
  count += 1
  for j in range(len(papers['Main content'])):
    try:
      if papers['Main content'][i] == papers['Main content'][j] and i < j and str(papers['Main content'][i]) != 'None':
        duplicates.append(papers['ID'][i])
    except:
      pass

papers = papers[~papers.ID.isin(duplicates)]

In [92]:
print("Total papers after duplicate removal:", len(papers))

Total papers after duplicate removal: 884


Investigate the final dataframe

In [93]:
papers['Journal_Name'] = papers['Journal'].fillna(papers['Source Title'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [94]:
papers['Journal_Name']  = papers['Journal_Name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [95]:
papers['Journal_Name'] = papers['Journal_Name'].str.replace('acta crystallographica section e-crystallographic communications', 'acta crystallographica. section e, crystallographic communications', regex = True)
papers['Journal_Name'] = papers['Journal_Name'].str.replace('international journal of electrical engineering & education', 'international journal of electrical engineering education', regex = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [96]:
papers['Journal_Name'].value_counts()

plos one                                                     218
journal of cellular biochemistry                             192
rsc advances                                                 147
arabian journal of geosciences                               130
international journal of electrical engineering education    110
oncotargets and therapy                                       44
journal of fundamental and applied sciences                   24
journal of clinical anesthesia                                19
Name: Journal_Name, dtype: int64

In [97]:
papers.groupby(['Journal_Name', 'Retracted', ]).size()

Journal_Name                                               Retracted
arabian journal of geosciences                             0             47
                                                           1             83
international journal of electrical engineering education  0             32
                                                           1             78
journal of cellular biochemistry                           0            114
                                                           1             78
journal of clinical anesthesia                             0             19
journal of fundamental and applied sciences                0             12
                                                           1             12
oncotargets and therapy                                    0             31
                                                           1             13
plos one                                                   0            158
                   

Save the output to CSV

In [98]:
papers.to_csv('/content/drive/MyDrive/Applied Data Science/Thesis/Code/Data (CSV)/preprocessing_part_1.csv')