In [None]:
# Mount Google Drive into the Colab filesystem so the thesis data and the
# saved models under /content/drive are reachable from this runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
cd '/content/drive/MyDrive/Applied Data Science/Thesis/Code'

/content/drive/MyDrive/Applied Data Science/Thesis/Code


Install transformers and import libraries

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 24.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 57.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstallin

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification
import numpy as np
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

Load csv files of retracted and non-retracted articles

In [None]:
# Locations of the two CSV exports on the mounted Drive.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Applied Data Science/Thesis/Code/Data (CSV)/four_journal_train_data_set.csv'
EXTERNAL_TEST_CSV_PATH = '/content/drive/MyDrive/Applied Data Science/Thesis/Code/Data (CSV)/two_journal_test_data_set.csv'

# "utf-8-sig" strips a leading byte-order mark if the files were written with one.
four_journal_train_data_set = pd.read_csv(
    TRAIN_CSV_PATH,
    encoding="utf-8-sig",
)
two_journal_test_data_set = pd.read_csv(
    EXTERNAL_TEST_CSV_PATH,
    encoding="utf-8-sig",
)

Inspect the columns

In [None]:
# Display the column index of the training frame; the '... PP S' section columns
# and the 'Retracted' / 'ID' columns listed here are the ones the cells below read.
four_journal_train_data_set.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1', 'DOI_x', 'Retracted',
       'ID', 'Unnamed: 0.1.1.1', 'Unnamed: 0.1.1.1.1', 'Publication Type',
       'Authors',
       ...
       'Title + Abstract', 'Title + Abstract PP L', 'Main content PP L',
       'Discussion / Conclusion PP L', 'References PP L',
       'Title + Abstract PP S', 'Main content PP S',
       'Discussion / Conclusion PP S', 'References PP S', 'Journal_Name'],
      dtype='object', length=115)

## Train/test/val split, fine-tuning and evaluation per paper section

In [None]:
# Fine-tune one DistilBERT classifier per paper section, evaluate it on a held-out
# split of the four-journal data and on the external two-journal data, then save it.

SEED = 1           # used for the data splits, the training shuffle and TensorFlow's RNG
MAX_TOKENS = 510   # truncation length passed to the tokenizer for every split
HEAD_WORDS = 210   # words kept from the start of an over-long text
TAIL_WORDS = 210   # words kept from the end of an over-long text

# Class names. Defined here (not only in a later cell) so this cell runs on a fresh
# kernel; only their count is used below, as `num_labels`.
labels = ['non retracted', 'retracted']

paper_sections = ['Title + Abstract PP S', 'Main content PP S', 'Discussion / Conclusion PP S', 'References PP S']


def head_and_tail(text, head_words=HEAD_WORDS, tail_words=TAIL_WORDS):
  """Shorten `text` to its first `head_words` and last `tail_words` words.

  Words are obtained with ``text.split(" ")``. Texts with at most
  ``head_words + tail_words`` words are returned unchanged.
  """
  words = text.split(" ")
  if len(words) > head_words + tail_words:
    # Join head and tail as ONE list so a space separates the last head word
    # from the first tail word (joining them separately fuses the two words).
    return " ".join(words[:head_words] + words[-tail_words:])
  return text


def texts_and_labels(frame, section):
  """Return ``(texts, targets)`` for `section` without modifying `frame`.

  Rows whose `section` text is missing are skipped; the remaining texts are
  shortened with `head_and_tail` and paired with the `Retracted` column.
  """
  usable = frame.dropna(subset=[section])
  texts = [head_and_tail(text) for text in usable[section].values]
  targets = list(usable.Retracted.values)
  return texts, targets


# Loop-invariant: the same pretrained tokenizer is used for every section.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# Kept as a global because the reload cell further down slices with it.
amountofpapers = len(two_journal_test_data_set)

for section in paper_sections:
  print('\n########### ' + section + ": \n")

  # Re-seed per section so weight initialisation and dropout are repeatable.
  tf.random.set_seed(SEED)

  # NOTE(review): 420 space-separated words will often tokenize to more than
  # MAX_TOKENS word pieces, in which case the tokenizer still truncates part of
  # the tail - consider doing the head+tail cut at token level.
  X, y = texts_and_labels(four_journal_train_data_set, section)

  # 66% train, then the remaining 34% is halved into test and validation.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.34, random_state=SEED)
  X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=SEED)

  # Convert input strings to DistilBERT encodings.
  train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=MAX_TOKENS)
  test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=MAX_TOKENS)
  val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=MAX_TOKENS)

  # Wrap the encodings in batched tf.data pipelines; only the training set is shuffled.
  train_dataset = tf.data.Dataset.from_tensor_slices((
      dict(train_encodings),
      y_train
  )).shuffle(buffer_size=1000, seed=SEED).batch(16)
  val_dataset = tf.data.Dataset.from_tensor_slices((
      dict(val_encodings),
      y_val
  )).batch(64)
  test_dataset = tf.data.Dataset.from_tensor_slices((
      dict(test_encodings),
      y_test
  )).batch(64)

  model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased',
                                                            num_labels=len(labels))
  # Stop when val_loss has not improved for 3 epochs and roll back to the best weights.
  callbacks = [
          tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0,
                        mode='min', baseline=None,
                        restore_best_weights=True)]

  optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

  model.compile(optimizer=optimizer, loss=loss)

  # The datasets are already batched, so no batch_size is passed to fit().
  history = model.fit(train_dataset,
              epochs=5,
              callbacks=callbacks,
              validation_data=val_dataset)

  # Internal held-out test split: element 0 of the prediction output holds the logits.
  test_logits = model.predict(test_dataset)
  y_preds = np.argmax(test_logits[0], axis=1)
  print(classification_report(y_test, y_preds))

  print(confusion_matrix(y_test, y_preds))

  print("\n####### NOW RUNNING ON THE EXTERNAL DATA SET #########\n")

  # The external frame is filtered into a local copy only, so every section (and a
  # re-run of this cell) starts from the same rows.
  X, y = texts_and_labels(two_journal_test_data_set, section)

  # Same truncation length as for the training data.
  examples_encodings = tokenizer(X, truncation=True, padding=True, max_length=MAX_TOKENS)
  examples_encodings = tf.data.Dataset.from_tensor_slices((
                      dict(examples_encodings)
                        )).batch(64)
  pred_logits = model.predict(examples_encodings)
  predictions = np.argmax(pred_logits[0], axis=1)

  print(confusion_matrix(y, predictions))

  print(classification_report(y, predictions))

  # NOTE: the section name is appended verbatim, so the '/' in
  # 'Discussion / Conclusion PP S' acts as a path separator (nested directory).
  save_directory = '/content/drive/MyDrive/Applied Data Science/Thesis/Code/Classifiers/BERT/saved_models/' +  section
  model.save_pretrained(save_directory)


########### Title + Abstract PP S: 



Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it fo

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        33
           1       0.93      0.83      0.88        30

    accuracy                           0.89        63
   macro avg       0.89      0.89      0.89        63
weighted avg       0.89      0.89      0.89        63

[[31  2]
 [ 5 25]]

####### NOW RUNNING ON THE EXTERNAL DATA SET #########

[[ 88  44]
 [ 32 100]]
              precision    recall  f1-score   support

           0       0.73      0.67      0.70       132
           1       0.69      0.76      0.72       132

    accuracy                           0.71       264
   macro avg       0.71      0.71      0.71       264
weighted avg       0.71      0.71      0.71       264


########### Main content PP S: 



Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_99']
You should probably TRAIN this model on a down-stream task to be able to use it fo

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        33
           1       0.88      1.00      0.94        30

    accuracy                           0.94        63
   macro avg       0.94      0.94      0.94        63
weighted avg       0.94      0.94      0.94        63

[[29  4]
 [ 0 30]]

####### NOW RUNNING ON THE EXTERNAL DATA SET #########





[[ 89  43]
 [ 22 110]]
              precision    recall  f1-score   support

           0       0.80      0.67      0.73       132
           1       0.72      0.83      0.77       132

    accuracy                           0.75       264
   macro avg       0.76      0.75      0.75       264
weighted avg       0.76      0.75      0.75       264


########### Discussion / Conclusion PP S: 



Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['dropout_119', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it f

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.94      0.97      0.96        33
           1       0.97      0.93      0.95        30

    accuracy                           0.95        63
   macro avg       0.95      0.95      0.95        63
weighted avg       0.95      0.95      0.95        63

[[32  1]
 [ 2 28]]

####### NOW RUNNING ON THE EXTERNAL DATA SET #########





[[ 83  49]
 [ 27 105]]
              precision    recall  f1-score   support

           0       0.75      0.63      0.69       132
           1       0.68      0.80      0.73       132

    accuracy                           0.71       264
   macro avg       0.72      0.71      0.71       264
weighted avg       0.72      0.71      0.71       264


########### References PP S: 



Some layers from the model checkpoint at distilbert-base-cased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['dropout_139', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it f

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        33
           1       0.92      0.73      0.81        30

    accuracy                           0.84        63
   macro avg       0.86      0.84      0.84        63
weighted avg       0.85      0.84      0.84        63

[[31  2]
 [ 8 22]]

####### NOW RUNNING ON THE EXTERNAL DATA SET #########

[[97 35]
 [91 41]]
              precision    recall  f1-score   support

           0       0.52      0.73      0.61       132
           1       0.54      0.31      0.39       132

    accuracy                           0.52       264
   macro avg       0.53      0.52      0.50       264
weighted avg       0.53      0.52      0.50       264



In [None]:
# Reload the classifier saved for one paper section and re-evaluate it on the
# external two-journal data set, without needing the training cell to have run.
section = 'Main content PP S'
save_directory = '/content/drive/MyDrive/Applied Data Science/Thesis/Code/Classifiers/BERT/saved_models/' +  section
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

# Load the tokenizer here instead of relying on a `tokenizer` variable left over
# from an earlier cell; it is the same pretrained tokenizer used for training.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

print("\n####### NOW RUNNING ON THE EXTERNAL DATA SET #########\n")

# Skip rows whose text for this section is missing. A local frame is used so the
# shared `two_journal_test_data_set` is left untouched for other cells.
external_rows = two_journal_test_data_set.dropna(subset=[section])

# Class names; not used in this cell, but the training cell reads `labels`,
# so the definition is kept.
labels = ['non retracted', 'retracted']

X = []  # the texts --> X
for text in external_rows[section].values:
  words = text.split(" ")
  if len(words) > 420:
    # Keep the first and last 210 words; join them as one list so the boundary
    # words are separated by a space rather than fused together.
    text = " ".join(words[:210] + words[-210:])
  X.append(text)
y = list(external_rows.Retracted.values)  # the labels we want to predict --> Y

# max_length matches the value used when the training data was tokenized.
examples_encodings = tokenizer(X, truncation=True, padding=True, max_length=510)
examples_encodings = tf.data.Dataset.from_tensor_slices((
                    dict(examples_encodings)
                      )).batch(64)
pred_logits = loaded_model.predict(examples_encodings)

# Element 0 of the prediction output holds the logits; take the arg-max class per row.
predictions = np.argmax(pred_logits[0], axis=1)

print(confusion_matrix(y, predictions))

print(classification_report(y, predictions))

Some layers from the model checkpoint at /content/drive/MyDrive/Applied Data Science/Thesis/Code/Classifiers/BERT/saved_models/Main content PP S were not used when initializing TFDistilBertForSequenceClassification: ['dropout_99']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Applied Data Science/Thesis/Code/Classifiers/BERT/saved_models/Main content PP S and are newly initialized: ['dropout_179']
You should


####### NOW RUNNING ON THE EXTERNAL DATA SET #########

[[ 89  43]
 [ 22 110]]
              precision    recall  f1-score   support

           0       0.80      0.67      0.73       132
           1       0.72      0.83      0.77       132

    accuracy                           0.75       264
   macro avg       0.76      0.75      0.75       264
weighted avg       0.76      0.75      0.75       264

