In [1]:
!pip install datasets
!pip install accelerate -U



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import pandas as pd
import torch
import numpy as np
from tqdm.auto import tqdm
import torch.nn.functional as F

from transformers import BertForSequenceClassification, BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification, pipeline
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
import numpy as np
import tempfile
from huggingface_hub import HfFolder, Repository
import os

from datasets import load_dataset, Dataset, DatasetDict
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import EarlyStoppingCallback

import itertools
from datasets import load_metric


In [4]:
base_path = '/content/drive/Shareddrives/World Bank/NLP Project/'

In [5]:
df_gpt_negative = load_dataset("notrichardren/gpt_generated_10k")
gpt_data = pd.DataFrame(df_gpt_negative ['train'])
gpt_data = gpt_data[['claim']]
gpt_data['data_origin'] = 'https://huggingface.co/datasets/notrichardren/gpt_generated_10k'
gpt_data['label'] = 0
gpt_data.rename(columns = {"claim":"text"}, inplace=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
gpt_data.shape

(10869, 3)

In [6]:
file_name = 'Data/train_data_v2.xlsx'
train_data = pd.read_excel(base_path + file_name)

file_name = 'Data/val_data_v2.xlsx'
val_data = pd.read_excel(base_path + file_name)

In [7]:
train_data = pd.concat([train_data, gpt_data], ignore_index=True)

In [8]:
train_data['label'].value_counts()

0    14667
1     8952
Name: label, dtype: int64

In [9]:
val_data['label'].value_counts()

0    4467
1     254
Name: label, dtype: int64

# Modelling

In [10]:
model_name = 'DatastilBERT_v2'
model_save_path = base_path + "Trained Models/" + model_name


In [11]:
dataset_dict = DatasetDict(
    train=Dataset.from_pandas(train_data).class_encode_column("label"),
    val=Dataset.from_pandas(val_data).class_encode_column("label"),
)

Stringifying the column:   0%|          | 0/23619 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/23619 [00:00<?, ? examples/s]

Stringifying the column:   0%|          | 0/4721 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/4721 [00:00<?, ? examples/s]

In [12]:
MODEL_ID = "distilbert-base-cased"
# MODEL_ID = "finetuned-gpt2"

if MODEL_ID == "finetuned-gpt2":
    _MODEL_ID = "fine-tuned-gpt-models/9d083cbabe54c79b8705de407cac120c567b0f43ad95b778858a6ffce40a1455"
    tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
else:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


tokenized_dataset_dict = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/23619 [00:00<?, ? examples/s]

Map:   0%|          | 0/4721 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

def compute_metrics(eval_pred):
    """
    Computes various classification metrics based on predictions and actual labels.

    Args:
        eval_pred: A tuple of model predictions (logits) and actual labels.

    Returns:
        dict: A dictionary containing accuracy, precision, recall, f1 score, and ROC-AUC.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Ensure you have probabilities for the positive class in binary classification
    # logits are assumed to be for binary classification here (shape [batch_size, 2])
    probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()[:, 1]

    # Now compute the metrics
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='binary')
    recall = recall_score(labels, predictions, average='binary')
    f1 = f1_score(labels, predictions, average='binary')
    roc_auc = roc_auc_score(labels, probabilities)  # AUC expects probabilities for the positive class

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
    }

In [15]:
id2label = {0: "NO_DATA", 1: "WITH_DATA"}
label2id = {"NO_DATA": 0, "WITH_DATA": 1}

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_ID, num_labels=2, id2label=id2label, label2id=label2id,
        ignore_mismatched_sizes=True)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
LR = 1e-3
EPOCHS = 2000
BATCH_SIZE = 8
OPTIMIZER = "AdamW"

training_args = TrainingArguments(
    output_dir=model_save_path,
    warmup_ratio=0.2,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    metric_for_best_model = "roc_auc",
    # push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_dict["train"],
    eval_dataset=tokenized_dataset_dict["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()
trainer.save_model()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
100,0.6908,0.695843,0.405846,0.067187,0.779528,0.123711,0.621425
200,0.6764,0.677595,0.840712,0.128358,0.338583,0.186147,0.666543
300,0.6617,0.653905,0.935607,0.331081,0.192913,0.243781,0.701188
400,0.618,0.622254,0.92777,0.322449,0.311024,0.316633,0.730088
500,0.5048,0.567096,0.850879,0.189227,0.53937,0.280164,0.776319
600,0.2761,0.327271,0.935819,0.409594,0.437008,0.422857,0.785244
700,0.0825,0.181227,0.945562,0.491892,0.358268,0.414579,0.761841
800,0.0235,0.200178,0.937725,0.422481,0.429134,0.425781,0.761363
900,0.0134,0.231107,0.928405,0.367089,0.456693,0.407018,0.76241


# Re-evaluate training data

In [18]:
class BERTClassifier:
    def __init__(self, model_path="jamesliounis/DataBERT", tokenizer_name="bert-base-uncased"):
        """
        Initializes the BERTClassifier with a specified model and tokenizer.

        This constructor creates a classification pipeline using a pre-trained BERT model and tokenizer.
        The pipeline abstracts the details of preprocessing, prediction, and postprocessing,
        allowing for straightforward text classification.

        Args:
            model_path (str): Path or identifier for the pre-trained model.
            tokenizer_name (str): Path or identifier for the pre-trained tokenizer.
        """
        self.classifier = pipeline("text-classification", model=model_path, tokenizer=tokenizer_name)

    def predict(self, input_texts):
        """
        Generates predictions for a batch of input texts using the BERT model.

        This method utilizes the classification pipeline to predict the class of each input text.
        It simplifies the process by abstracting away direct model and tokenizer interactions.

        Args:
            input_texts (list of str): The batch of texts for which predictions are to be generated.

        Returns:
            List[dict]: A list of dictionaries, where each dictionary contains the label and score
                        for the corresponding input text.
        """
        return self.classifier(input_texts)

In [22]:
clf = BERTClassifier(model_path = model_save_path,
                     tokenizer_name = MODEL_ID)


classifier = lambda x: clf.predict(x)

In [25]:
classifier('"Rural road construction programs in India have received considerable attention in recent years as part of the governments efforts to promote regional integration and economic development, and we use data from the Indian Human Development Survey to study their effects on social mobility."')

[{'label': 'WITH_DATA', 'score': 0.882865846157074}]

In [30]:
file_name = 'Data/Annotated Data/training_data_annotated_v1.xlsx'
train_data_v1 = pd.read_excel(base_path + file_name)
train_data_v1 = train_data_v1.query("data_origin.str.startswith('Coleridge') == False")
train_data_v1 = pd.concat([train_data_v1, gpt_data.head(8353)], ignore_index = True)
train_data_v1.head(1)


Unnamed: 0.1,Unnamed: 0,text,label,data_origin,DatastilBERT_v1_classification
0,0.0,We analyze the relationship between climate ch...,1,GPT-generated data (old),"('WITH_DATA', 0.958536684513092)"


In [32]:
tqdm.pandas()

train_data_v1['DatastilBERT_v2_classification'] = train_data_v1['text'].progress_apply(classifier)

  0%|          | 0/25059 [00:00<?, ?it/s]

In [33]:
path = base_path + '/Data/Annotated Data/training_data_annotated_v2.xlsx'

train_data_v1.to_excel(path, index=False)

In [34]:
train_data_v1.head()

Unnamed: 0.1,Unnamed: 0,text,label,data_origin,DatastilBERT_v1_classification,DatastilBERT_v2_classification
0,0.0,We analyze the relationship between climate ch...,1,GPT-generated data (old),"('WITH_DATA', 0.958536684513092)","[{'label': 'WITH_DATA', 'score': 0.92627954483..."
1,1.0,Using a dataset from the World Trade Organizat...,1,GPT-generated data (old),"('WITH_DATA', 0.9534607529640198)","[{'label': 'WITH_DATA', 'score': 0.90745252370..."
2,2.0,This study examines the impact of school-based...,1,GPT-generated data (old),"('WITH_DATA', 0.9466453790664673)","[{'label': 'WITH_DATA', 'score': 0.89949506521..."
3,3.0,We explore the impact of electricity access on...,1,GPT-generated data (old),"('WITH_DATA', 0.9585261940956116)","[{'label': 'WITH_DATA', 'score': 0.92568415403..."
4,4.0,To investigate the relationship between social...,1,GPT-generated data (old),"('WITH_DATA', 0.9394758343696594)","[{'label': 'WITH_DATA', 'score': 0.89283215999..."


In [35]:
train_data_v1['DatastilBERT_v2_classification'].value_counts()

KeyboardInterrupt: 

In [None]:
train_data_v1['']