In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
import numpy as np
import json
from collections import defaultdict

In [2]:
SEED = 123
torch.manual_seed(SEED)
np.random.seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [13]:
train_path = '/kaggle/input/conv_derailment_data/datasets/cmv/dynamic/train.json'
test_path = '/kaggle/input/conv_derailment_data/datasets/cmv/dynamic/test.json'
val_path = '/kaggle/input/conv_derailment_data/datasets/cmv/dynamic/valid.json'

train_data = []
with open(train_path,'r') as f:
  for line in f:
    train_data.append(json.loads(line))

test_data = []
with open(test_path,'r') as f:
  for line in f:
    test_data.append(json.loads(line))

val_data = []
with open(val_path,'r') as f:
  for line in f:
    val_data.append(json.loads(line))

In [None]:
# grouped_train = defaultdict(list)
# for item in train_data:
#     convo_id = item['convo_id']
#     grouped_train[convo_id].append(item)


# # duplicated convo_ids
# duplicated_convo_ids = [k for k, v in grouped_train.items() if len(v) > 1]
# print(duplicated_convo_ids)

In [14]:
train_data[0]

{'src': ' you decided to edit it after 5 months. why would we leave up an empty comment?',
 'reply': [101,
  1045,
  2074,
  2064,
  1005,
  1056,
  2903,
  2017,
  2024,
  11347,
  2075,
  1019,
  3204,
  7928,
  1045,
  2081,
  1010,
  1045,
  2031,
  1037,
  10928,
  2017,
  2024,
  20070,
  2033,
  1998,
  19597,
  2009,
  1005,
  1055,
  14888,
  1012,
  102],
 'tgt': True,
 'convo_id': 3023845,
 'comment_id': 8,
 'is_last': False}

In [15]:
from transformers import BertTokenizer
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
for item in train_data:
    item['src'] = item['src'].replace('[SEP]', '[EOS]')
    item['src'] = item['src'].replace('[UNK]', ' ')
    item['reply'] = tokenizer_bert.decode(item['reply'], skip_special_tokens=True)

for item in val_data:
    item['src'] = item['src'].replace('[SEP]', '[EOS]')
    item['src'] = item['src'].replace('[UNK]', ' ')
    item['reply'] = tokenizer_bert.decode(item['reply'], skip_special_tokens=True)

for item in test_data:
    item['src'] = item['src'].replace('[SEP]', '[EOS]')
    item['src'] = item['src'].replace('[UNK]', ' ')
    item['reply'] = tokenizer_bert.decode(item['reply'], skip_special_tokens=True)


2024-04-19 02:49:17.229138: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-19 02:49:17.229237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-19 02:49:17.346469: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [19]:
train_data[10]

{'src': '##s? [EOS]  so someone needs to invent a self - cleaning prosthetic device.',
 'reply': "until that happens, it's a ludicrous proposition that women should use urinals.",
 'tgt': False,
 'convo_id': 4276573,
 'comment_id': 6,
 'is_last': False}

In [20]:
def data_for_gpt2(data):
  src_text = data['src']
  reply = data['reply']
  tgt = data['tgt']
  is_last = data['is_last']
  combined_text = f"<SRC>: {src_text}  <REPLY>: {reply} <IS_LAST>: {is_last}"
  return combined_text, tgt

In [21]:
train_data_gpt2 = []
tgt_train = []
for item in train_data:
  formatted_text, tgt = data_for_gpt2(item)
  train_data_gpt2.append(formatted_text)
  tgt_train.append(tgt)

In [22]:
tgt_train[0]

True

In [23]:
train_data_gpt2[0]

"<SRC>:  you decided to edit it after 5 months. why would we leave up an empty comment?  <REPLY>: i just can't believe you are browsing 5 month comments i made, i have a suspicion you are stalking me and frankly it's disturbing. <IS_LAST>: False"

In [24]:
val_data_gpt2 = []
tgt_val = []
for item in val_data:
  formatted_text, tgt = data_for_gpt2(item)
  val_data_gpt2.append(formatted_text)
  tgt_val.append(tgt)

In [25]:
val_data_gpt2[0]

'<SRC>: doesn\'t reflect who they consider themselves to be. this is more information, not less.  <REPLY>: but it isn\'t unclear who the pronoun is referring to : " they " had been both singular and plural long before this special snowflake syndrome phenomena. the only reason to change this is to ostensibly insert identity politics into casual language. eg : " have you met terry? they will be at the party tonight. " pronoun has successfully informed you that terry will be at the party. _ _ _ edit : additional edited text from deleted comment : i\'m talking about the word " they " ; there is nothing exclusive about " they ". it doesn\'t even have to refer to humans : " my dog is sick, they got into my chocolate. " it doesn\'t even have to refer to animate objects : " i got some groceries, they are in the car ". " they " can even refer to abstract concepts : " ever have discussions with trolls on the internet? they are so tiring. " i\'ll paraphrase again what i\'ve said elsewhere in this

In [26]:
tgt_val[0]

True

In [28]:
test_data[0]

{'src': " & gt ; it seems like every way of making a living which is based around humanitarianism and activism leaves people struggling to get by. anything that promotes human welfare falls under humanitarian. want to make the next facebook? humanitarian. loan people money for a car? humanitarian. loan people money for tractors and drought resistant seeds?... point being almost every market activity is humanitarian because it increases the welfare of consumers. activism is much more marketing than it is anything else and marketing is a transaction cost, which is to say it doesn't change welfare other than the resources it takes to produce. from that perspective you need to have a party who is willing to gain more from the activism than the resources it takes to change people's minds. chances are that activism will leave you poor because generally the people who want social and cultural change are people who are most marginalized by society and thus have little to give and not much fina

In [29]:
def test_data_for_gpt2(data):
  src_text = data['src']
  reply = data['reply']
  tgt = data['tgt']
  combined_text = f"<SRC>: {src_text}  <REPLY>: {reply} "
  return combined_text, tgt

In [30]:
test_data_gpt2 = []
tgt_test = []
for item in test_data:
  formatted_text, tgt = test_data_for_gpt2(item)
  test_data_gpt2.append(formatted_text)
  tgt_test.append(tgt)

In [31]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm

In [32]:
from transformers import GPT2ForSequenceClassification, GPT2Config
import torch
import torch.nn.functional as F

class GPT2ForSequenceClassificationMeanPooling(GPT2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.gpt2 = self.transformer  # Assuming the GPT2ForSequenceClassification model has a 'transformer' attribute
        self.score = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        outputs = self.gpt2(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )

        # Get the hidden states from the last layer
        hidden_states = outputs.last_hidden_state

        # Apply mean pooling on the hidden states
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_hidden_states = torch.sum(hidden_states * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        mean_hidden_states = sum_hidden_states / sum_mask

        # Pass the pooled hidden states through the classification head
        logits = self.score(mean_hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    loss_fct = torch.nn.MSELoss()
                    loss = loss_fct(logits.view(-1), labels.view(-1))
                else:
                    loss_fct = torch.nn.CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            else:
                raise ValueError("You should supply an instance of `PreTrainedModel` or a `config`")

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

In [33]:
### Loading Model and Tokenizers
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                         add_eos_token=True)
model = GPT2ForSequenceClassificationMeanPooling.from_pretrained('gpt2', num_labels=2)
print(model)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassificationMeanPooling were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassificationMeanPooling(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=True)
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop)

GPT2ForSequenceClassificationMeanPooling(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=True)
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop)

In [35]:
len(tokenizer)

50257

In [36]:
tokenizer.sep_token = '[EOS]'

In [37]:
tokenizer.sep_token

'[EOS]'

In [38]:
tokenizer.pad_token = tokenizer.eos_token

In [39]:
tokenizer.pad_token

'<|endoftext|>'

In [40]:
tokenizer.eos_token

'<|endoftext|>'

In [41]:
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'sep_token': '[EOS]', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [42]:
tokenizer.sep_token_id

50256

In [43]:
len(tokenizer)

50257

In [44]:
## Defining Special Tokens

special_tokens = {'additional_special_tokens': ['<SRC>:', ' <REPLY>:', '[UNK]','<IS_LAST>']}
num_added = tokenizer.add_special_tokens(special_tokens)
print(num_added)
model.resize_token_embeddings(len(tokenizer))

4


Embedding(50261, 768)

In [45]:
# Tokenizing Data
train_encodings = tokenizer(train_data_gpt2, truncation=False, padding=False)

In [46]:
tokenizer.decode(train_encodings['input_ids'][1])

"<SRC>:  comments i made, i have a suspicion you are stalking me and frankly it's disturbing.   <REPLY>:  no, we have an automoderator that checks for rule violations when a comment is posted or edited, and you showed up on the list. i really have no idea who you are and couldn't care less.  <IS_LAST> : False"

In [47]:
max_length = max(len(x) for x in train_encodings.input_ids)

In [48]:
max_length

573

In [49]:
val_encodings = tokenizer(val_data_gpt2, truncation=False, padding=False)

In [50]:
val_max_length = max(len(x) for x in val_encodings.input_ids)
val_max_length

573

In [51]:
test_encodings = tokenizer(test_data_gpt2, truncation=False, padding=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (1106 > 1024). Running this sequence through the model will result in indexing errors


In [52]:
test_max_length = max(len(x) for x in test_encodings.input_ids)
test_max_length

1106

In [53]:
token_test = tokenizer(train_data_gpt2, truncation=True, padding=True, max_length=1024)

In [54]:
## Redefining Tokenizer with padding and max length
# tokenizer.pad_token = tokenizer.eos_token
train_inputs = tokenizer(train_data_gpt2, truncation=True, padding=True, max_length=1024, return_tensors='pt')

# Create torch dataset
input_ids = train_inputs['input_ids']
attention_mask = train_inputs['attention_mask']
labels = torch.tensor([0 if tgt == False else 1 for tgt in tgt_train])
train_dataset = TensorDataset(input_ids, attention_mask, labels)

In [55]:
print(input_ids[0])

tensor([50257,   220,   345,  3066,   284,  4370,   340,   706,   642,  1933,
           13,  1521,   561,   356,  2666,   510,   281,  6565,  2912,    30,
          220, 50258,  1312,   655,   460,   470,  1975,   345,   389, 23182,
          642,  1227,  3651,  1312,   925,    11,  1312,   423,   257, 15123,
          345,   389, 34683,   502,   290, 17813,   340,   338, 14851,    13,
          220, 50260,    25, 10352, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

In [56]:
len(input_ids[0])

573

In [57]:
val_inputs = tokenizer(val_data_gpt2, truncation=True, padding=True, max_length=1024, return_tensors='pt')
val_input_ids = val_inputs['input_ids']
val_attention_mask = val_inputs['attention_mask']
val_labels = torch.tensor([0 if tgt == False else 1 for tgt in tgt_val])
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_labels)

In [58]:
test_inputs = tokenizer(test_data_gpt2, truncation=True, padding=True, max_length=1024, return_tensors='pt')
test_input_ids = test_inputs['input_ids']
test_attention_mask = test_inputs['attention_mask']
test_labels = torch.tensor([0 if tgt == False else 1 for tgt in tgt_test])
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

In [59]:
## CREATING DATALOADERS
batch_size = 4
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [60]:
from torch.optim import AdamW

In [61]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True)


In [62]:
import copy

In [63]:
from sklearn.metrics import f1_score
from tqdm import tqdm

# Training and validation loop
num_epochs = 5
train_losses = []
val_losses = []
f1_scores = []
best_val_loss = float('inf')
best_model_state = None

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1} of {num_epochs}")
    train_loss = 0
    
    model.train()
    for batch in tqdm(train_dataloader, desc='Training Progress', miniters=10):
        input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    
    avg_train_loss = train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Training loss: {avg_train_loss:.2f}")

    # Validation phase
    val_loss = 0
    model.eval()
    val_preds = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc='Validation Progress', miniters=10):
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs[0]
            val_loss += loss.item()
            logits = outputs[1]
            preds = torch.argmax(torch.nn.functional.softmax(logits, dim=1), dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true_labels.extend(batch_labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_dataloader)
        val_losses.append(avg_val_loss)
        f1 = f1_score(val_true_labels, val_preds, average='macro')
        f1_scores.append(f1)
        print(f"Validation loss: {avg_val_loss:.2f}, F1 score: {f1:.2f}")

    scheduler.step(avg_val_loss)

    # Check if the current model is the best one
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_state = model.state_dict()
        torch.save(best_model_state, 'CMV_Dynamic_model_GPT2.pth')  # Save the best model
        print("Saved new best model")


Epoch 1 of 5


Training Progress: 100%|██████████| 2968/2968 [14:32<00:00,  3.40it/s]


Training loss: 0.70


Validation Progress: 100%|██████████| 992/992 [01:35<00:00, 10.39it/s]


Validation loss: 0.65, F1 score: 0.61
Saved new best model
Epoch 2 of 5


Training Progress: 100%|██████████| 2968/2968 [14:31<00:00,  3.40it/s]


Training loss: 0.62


Validation Progress: 100%|██████████| 992/992 [01:35<00:00, 10.39it/s]


Validation loss: 0.68, F1 score: 0.56
Epoch 3 of 5


Training Progress: 100%|██████████| 2968/2968 [14:31<00:00,  3.40it/s]


Training loss: 0.57


Validation Progress: 100%|██████████| 992/992 [01:35<00:00, 10.39it/s]


Validation loss: 0.68, F1 score: 0.61
Epoch 00003: reducing learning rate of group 0 to 1.0000e-06.
Epoch 4 of 5


Training Progress: 100%|██████████| 2968/2968 [14:31<00:00,  3.40it/s]


Training loss: 0.45


Validation Progress: 100%|██████████| 992/992 [01:35<00:00, 10.39it/s]


Validation loss: 0.71, F1 score: 0.63
Epoch 5 of 5


Training Progress: 100%|██████████| 2968/2968 [14:31<00:00,  3.40it/s]


Training loss: 0.41


Validation Progress: 100%|██████████| 992/992 [01:35<00:00, 10.38it/s]


Validation loss: 0.75, F1 score: 0.63
Epoch 00005: reducing learning rate of group 0 to 1.0000e-07.


In [64]:
model.eval()
test_loss = 0
test_preds = []
test_true_labels = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Test Progress', miniters = 10):
        input_ids, attention_mask, batch_labels = batch
        input_ids, attention_mask, batch_labels = input_ids.to(device), attention_mask.to(device), batch_labels.to(device)
        model.config.pad_token_id = tokenizer.eos_token_id
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]  # Extract the loss value from the tuple
        test_loss += loss.item()
        logits = outputs[1] 
        probs = torch.nn.functional.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_true_labels.extend(batch_labels.cpu().numpy())
        
avg_test_loss = test_loss / len(test_dataloader)
f1 = f1_score(test_true_labels, test_preds, average='macro')
print(f"Test loss: {avg_test_loss:.2f}, F1 score: {f1:.2f}")

Test Progress: 100%|██████████| 2117/2117 [06:46<00:00,  5.21it/s]

Test loss: 0.86, F1 score: 0.56





In [65]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

F1_score = f1_score(test_true_labels, test_preds, average='macro')
accuracy = accuracy_score(test_true_labels, test_preds)
precision = precision_score(test_true_labels, test_preds, average='macro')
recall = recall_score(test_true_labels, test_preds, average='macro')

print(f"F1 score: {F1_score}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

F1 score: 0.5597581604117956
Accuracy: 0.559768485707536
Precision: 0.5612157740576851
Recall: 0.5611558388102705


In [66]:
conf_matrix = confusion_matrix(test_true_labels, test_preds)
classification_report = classification_report(test_true_labels, test_preds)

print(conf_matrix)
print(classification_report)

[[2390 1650]
 [2077 2349]]
              precision    recall  f1-score   support

           0       0.54      0.59      0.56      4040
           1       0.59      0.53      0.56      4426

    accuracy                           0.56      8466
   macro avg       0.56      0.56      0.56      8466
weighted avg       0.56      0.56      0.56      8466



In [67]:
data = {'True Labels': test_true_labels, 'Predictions': test_preds}
prediction_comparison_df = pd.DataFrame(data)

In [75]:
prediction_comparison_df[10:30]

Unnamed: 0,True Labels,Predictions
10,1,1
11,1,1
12,0,0
13,0,1
14,0,1
15,0,1
16,0,1
17,0,1
18,0,1
19,0,1


# LOADING BEST SAVED MODEL 

In [69]:
model_1 = GPT2ForSequenceClassificationMeanPooling.from_pretrained('gpt2', num_labels=2)
model_1.resize_token_embeddings(len(tokenizer)) 
model_1.load_state_dict(torch.load('/kaggle/working/CMV_Dynamic_model_GPT2.pth', map_location=device))
model_1.to(device)

Some weights of GPT2ForSequenceClassificationMeanPooling were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassificationMeanPooling(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=True)
  (gpt2): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop)

In [70]:
model_1.eval()
test_loss = 0
test_preds = []
test_true_labels = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Test Progress', miniters = 10):
        input_ids, attention_mask, batch_labels = batch
        input_ids, attention_mask, batch_labels = input_ids.to(device), attention_mask.to(device), batch_labels.to(device)
        model_1.config.pad_token_id = tokenizer.eos_token_id
        outputs = model_1(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]  # Extract the loss value from the tuple
        test_loss += loss.item()
        logits = outputs[1] 
        probs = torch.nn.functional.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_true_labels.extend(batch_labels.cpu().numpy())
        
avg_test_loss = test_loss / len(test_dataloader)
f1 = f1_score(test_true_labels, test_preds, average='macro')
print(f"Test loss: {avg_test_loss:.2f}, F1 score: {f1:.2f}")

Test Progress: 100%|██████████| 2117/2117 [06:46<00:00,  5.21it/s]

Test loss: 0.71, F1 score: 0.50





In [71]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

F1_score = f1_score(test_true_labels, test_preds, average='macro')
accuracy = accuracy_score(test_true_labels, test_preds)
precision = precision_score(test_true_labels, test_preds, average='macro')
recall = recall_score(test_true_labels, test_preds, average='macro')

print(f"F1 score: {F1_score}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

F1 score: 0.49928328790779825
Accuracy: 0.5655563430191354
Precision: 0.5935003005158856
Recall: 0.5501124095690184


In [72]:
conf_matrix = confusion_matrix(test_true_labels, test_preds)
classification_report = classification_report(test_true_labels, test_preds)

print(conf_matrix)
print(classification_report)

[[ 854 3186]
 [ 492 3934]]
              precision    recall  f1-score   support

           0       0.63      0.21      0.32      4040
           1       0.55      0.89      0.68      4426

    accuracy                           0.57      8466
   macro avg       0.59      0.55      0.50      8466
weighted avg       0.59      0.57      0.51      8466



In [73]:
data = {'True Labels': test_true_labels, 'Predictions': test_preds}
prediction_comparison_df = pd.DataFrame(data)

In [74]:
prediction_comparison_df[10:100]

Unnamed: 0,True Labels,Predictions
10,1,1
11,1,1
12,0,0
13,0,1
14,0,1
...,...,...
95,1,0
96,0,1
97,0,1
98,0,1


In [76]:
from IPython.display import FileLink
FileLink(r'CMV_Dynamic_model_GPT2.pth')