Connect to drive

Sources:
https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34
https://www.analyticsvidhya.com/blog/2021/01/a-guide-to-the-naive-bayes-algorithm/

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
cd '/content/drive/MyDrive/Applied Data Science/Thesis/Code'

/content/drive/MyDrive/Applied Data Science/Thesis/Code


Import libraries

In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score,KFold
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer

Load csv files of retracted and non-retracted articles

In [17]:
four_journal_train_data_set = pd.read_csv('/content/drive/MyDrive/Applied Data Science/Thesis/Code/Data (CSV)/four_journal_train_data_set.csv', encoding="utf-8-sig")
two_journal_test_data_set = pd.read_csv('/content/drive/MyDrive/Applied Data Science/Thesis/Code/Data (CSV)/two_journal_test_data_set.csv', encoding="utf-8-sig")

Classifier: Countvectorizer

In [18]:
paper_sections = ['Title + Abstract PP S', 'Main content PP S', 'Discussion / Conclusion PP S', 'References PP S']

for section in paper_sections:
  print('\n########### ' + section + ": \n")

  if four_journal_train_data_set[section].isnull().values.any():
    nan_values = four_journal_train_data_set[four_journal_train_data_set[section].isnull()]
    four_journal_train_data_set = four_journal_train_data_set[~four_journal_train_data_set.ID.isin(nan_values.ID)]

  raw_X = list(four_journal_train_data_set[section].values) # the texts --> X
  X = []
  y = list(four_journal_train_data_set.Retracted.values) # the labels we want to predict --> Y

  for i in raw_X:
    if len(i.split(" ")) > 420:
      head = i.split(" ")[0:210]
      tail = i.split(" ")[-210:]
      headandtail = " ".join(head) + " ".join(tail) 
    else:
      headandtail = i
    X.append(headandtail)

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.34, random_state = 1)


  le = LabelEncoder()

  y_train = le.fit_transform(y_train)
  y_test = le.fit_transform(y_test)

  Tfidf_vect = TfidfVectorizer()
  Tfidf_vect.fit(X)
  Train_X_Tfidf = Tfidf_vect.transform(X_train)
  Test_X_Tfidf = Tfidf_vect.transform(X_test)

  Naive = naive_bayes.MultinomialNB()
  Naive.fit(Train_X_Tfidf,y_train)
  predictions_NB = Naive.predict(Test_X_Tfidf)

  print(classification_report(y_test, predictions_NB))

  print(confusion_matrix(y_test, predictions_NB))

  print("\n####### NOW RUNNING ON THE EXTERNAL DATA SET #########\n")

  if two_journal_test_data_set[section].isnull().values.any():
    nan_values = two_journal_test_data_set[two_journal_test_data_set[section].isnull()]
    two_journal_test_data_set = two_journal_test_data_set[~two_journal_test_data_set.ID.isin(nan_values.ID)]

  raw_X = list(two_journal_test_data_set[section].values) # the texts --> X
  X = []
  y = list(two_journal_test_data_set.Retracted.values) # the labels we want to predict --> Y

  for i in raw_X:
    if len(i.split(" ")) > 420:
      head = i.split(" ")[0:210]
      tail = i.split(" ")[-210:]
      headandtail = " ".join(head) + " ".join(tail) 
    else:
      headandtail = i
    X.append(headandtail)

  Test_X_Tfidf = Tfidf_vect.transform(X)

  predictions_NB = Naive.predict(Test_X_Tfidf)
  print(confusion_matrix(y, predictions_NB))

  print(classification_report(y, predictions_NB))


########### Title + Abstract PP S: 

              precision    recall  f1-score   support

           0       0.96      0.63      0.76        71
           1       0.68      0.96      0.79        56

    accuracy                           0.78       127
   macro avg       0.82      0.80      0.78       127
weighted avg       0.83      0.78      0.78       127

[[45 26]
 [ 2 54]]

####### NOW RUNNING ON THE EXTERNAL DATA SET #########

[[ 51  81]
 [  0 132]]
              precision    recall  f1-score   support

           0       1.00      0.39      0.56       132
           1       0.62      1.00      0.77       132

    accuracy                           0.69       264
   macro avg       0.81      0.69      0.66       264
weighted avg       0.81      0.69      0.66       264


########### Main content PP S: 

              precision    recall  f1-score   support

           0       1.00      0.51      0.67        71
           1       0.62      1.00      0.76        56

    accurac

### Further Textual Analysis

In [19]:
df = four_journal_train_data_set

In [20]:
df[df['Title + Abstract PP L'].str.contains("cell")].Retracted.value_counts()

1    83
0    37
Name: Retracted, dtype: int64

In [21]:
df[df['Title + Abstract PP L'].str.contains("cancer")].Retracted.value_counts()

1    32
0    18
Name: Retracted, dtype: int64

In [22]:
df[df['Title + Abstract PP L'].str.contains("protein")].Retracted.value_counts()

1    56
0     9
Name: Retracted, dtype: int64

In [23]:
df[df['Title + Abstract PP L'].str.contains("tumor")].Retracted.value_counts()

1    31
0    11
Name: Retracted, dtype: int64

In [24]:
words = list(feature_importance.sort_values('coef', ascending=False).word)
for i in words:
  if len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 0)]) > 40 and len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 1)]) < 30:
    #print(df[df['References PP L'].str.contains(i)].Retracted.value_counts())
    print(i, " #### 0:", len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 0)]), "1:", len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 1)]))
    #print(len(df[(df['References PP L'].str.contains(i)) & (df['Retracted'] == 1)]))
    #print(i)

rock  #### 0: 46 1: 8
sediment  #### 0: 43 1: 1
indicate  #### 0: 45 1: 24
include  #### 0: 47 1: 17
publish  #### 0: 83 1: 21
follow  #### 0: 46 1: 25
present  #### 0: 60 1: 29
hr  #### 0: 48 1: 29
geo  #### 0: 50 1: 26
part  #### 0: 48 1: 29
graph  #### 0: 45 1: 22
assoc  #### 0: 43 1: 23
lc  #### 0: 52 1: 16


In [25]:
words = list(feature_importance.sort_values('coef', ascending=False).word)
for i in words:
  if len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 0)]) < 30 and len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 1)]) > 40:
    #print(df[df['References PP L'].str.contains(i)].Retracted.value_counts())
    print(i, " #### 0:", len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 0)]), "1:", len(df[(df['Title + Abstract PP L'].str.contains(i)) & (df['Retracted'] == 1)]))
    #print(len(df[(df['References PP L'].str.contains(i)) & (df['Retracted'] == 1)]))
    #print(i)

doi  #### 0: 22 1: 83
expression  #### 0: 7 1: 82
protect  #### 0: 8 1: 53
group  #### 0: 15 1: 49
model  #### 0: 29 1: 79
human  #### 0: 13 1: 57
target  #### 0: 18 1: 56
pathway  #### 0: 8 1: 45
promote  #### 0: 3 1: 64
protein  #### 0: 9 1: 56
signal  #### 0: 5 1: 42
proliferation  #### 0: 1 1: 47
inhibit  #### 0: 14 1: 63
regulate  #### 0: 5 1: 42
apoptosis  #### 0: 3 1: 55
growth  #### 0: 17 1: 42
control  #### 0: 18 1: 47
compare  #### 0: 27 1: 54
function  #### 0: 9 1: 42
time  #### 0: 29 1: 64
level  #### 0: 22 1: 59
characteristic  #### 0: 29 1: 46
mechanism  #### 0: 15 1: 43
improve  #### 0: 25 1: 63
jcb  #### 0: 5 1: 77
increase  #### 0: 26 1: 77
information  #### 0: 16 1: 46
tissue  #### 0: 1 1: 49
research  #### 0: 29 1: 67
life  #### 0: 9 1: 58
detect  #### 0: 13 1: 66
technology  #### 0: 12 1: 47
western  #### 0: 28 1: 43
assay  #### 0: 2 1: 41
decrease  #### 0: 11 1: 62
action  #### 0: 28 1: 45
difference  #### 0: 12 1: 42
express  #### 0: 9 1: 85
lead  #### 0: 25 1: 56