In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
import numpy as np
import json
from collections import defaultdict

In [3]:
# One seed shared by every random number generator seeded in this notebook.
SEED = 123
# Seed PyTorch, NumPy and finally all CUDA devices, in that order.
for seed_rng in (torch.manual_seed, np.random.seed, torch.cuda.manual_seed_all):
    seed_rng(SEED)

In [4]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

cuda


In [5]:
train_path = '/kaggle/input/conv-forecast/conv_derailment_data/datasets/cga/dynamic/train.json'
test_path = '/kaggle/input/conv-forecast/conv_derailment_data/datasets/cga/dynamic/test.json'
val_path = '/kaggle/input/conv-forecast/conv_derailment_data/datasets/cga/dynamic/valid.json'


def load_jsonl(path):
  """Read a JSON-Lines file and return its records as a list.

  Args:
    path: Path to a text file holding one JSON document per line.

  Returns:
    list: The parsed records, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The records contain non-ASCII characters, so the encoding is pinned
  # instead of being left to the platform default.
  with open(path, 'r', encoding='utf-8') as f:
    # Blank lines (e.g. a trailing newline at end of file) are skipped rather
    # than handed to json.loads, which would raise on an empty string.
    return [json.loads(line) for line in f if line.strip()]


train_data = load_jsonl(train_path)
test_data = load_jsonl(test_path)
val_data = load_jsonl(val_path)

In [6]:
# Bucket the training records by conversation id.
grouped_train = defaultdict(list)
for record in train_data:
    grouped_train[record['convo_id']].append(record)

# Conversation ids that occur in more than one training record.
duplicated_convo_ids = [
    convo_id for convo_id, records in grouped_train.items() if len(records) > 1
]
print(duplicated_convo_ids)

[4569448, 5461434, 6708987, 118702, 7822121, 4808578, 2171855, 3783223, 671691, 7081680, 2451372, 8496171, 9425804, 8547848, 6670383, 8586917, 7179807, 2849532, 5781549, 4749111, 9663737, 7718965, 7755334, 7151598, 9568134, 4081781, 953289, 3423465, 1557881, 7758882, 8465328, 3307571, 4974541, 9614895, 8446079, 783001, 1301916, 8104437, 7715639, 3184679, 6161206, 5776415, 8560268, 3416742, 5419906, 6790839, 8273257, 2563851, 1147370, 7659667, 1229326, 2700420, 6813496, 3657214, 303605, 850933, 6214526, 831343, 7036369, 5629390, 4238255, 5223012, 9773466, 501989, 8902663, 941320, 8935006, 4327789, 8190543, 655689, 5092287, 8176573, 4866503, 9758805, 2483192, 1245571, 7629018, 2032032, 4641590, 9086429, 590340, 4045092, 715300, 6037847, 1398088, 6932031, 9431031, 8259390, 8613122, 5304815, 7748706, 7798952, 1407131, 7991627, 4199295, 6177090, 7765054, 7104469, 7527019, 7762027, 4519376, 6131128, 785391, 6069785, 125901, 7859434, 5505626, 7816717, 5357895, 1555504, 6132566, 3806996, 19478

In [7]:
train_data[0]

{'src': " i notice that earier that moved wiki _ link to bill chen citing wiki _ link, then you reverted this change, bill chen doesn't commonly go by william, his book is even penned as bill chen. from what i read in wp : commonname patrikr seems to be correct, examples given are names such as : * wiki _ link ( not wiki _ link ) * wiki _ link ( not wiki _ link ) i think this revert may have been a mistake unless you know otherwise? ▪ [UNK] ▪",
 'reply': [101,
  8802,
  2001,
  2124,
  1999,
  1996,
  11662,
  2088,
  2004,
  1000,
  2520,
  1000,
  2005,
  2086,
  2077,
  2002,
  2150,
  4141,
  2124,
  2004,
  1000,
  3021,
  1000,
  1012,
  1045,
  2904,
  2009,
  2067,
  2138,
  18949,
  2015,
  3784,
  2164,
  2224,
  7159,
  2024,
  5560,
  5020,
  1010,
  2498,
  2012,
  2035,
  2066,
  3021,
  7207,
  1998,
  2520,
  7207,
  1010,
  1998,
  1999,
  5020,
  3572,
  2478,
  1996,
  2613,
  2171,
  3849,
  1996,
  2190,
  3601,
  1012,
  1006,
  1996,
  6327,
  1035,
  4957,
  393

In [8]:
from transformers import BertTokenizer
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

In [9]:
def data_for_gpt2(data):
  """Build the GPT-2 input string and the target for a single record.

  Reads the module-level `tokenizer_bert` to turn the reply token ids back
  into text.

  Args:
    data: Mapping with the keys 'src' (context text), 'reply' (token ids that
      `tokenizer_bert` can decode) and 'tgt' (the label).

  Returns:
    tuple: `(combined_text, tgt)` where `combined_text` has the form
      "<SRC>: ...  <REPLY>: ... <|endofcontext|>".
  """
  context, reply_ids, label = data['src'], data['reply'], data['tgt']
  # Special tokens are dropped while decoding the reply ids.
  reply_text = tokenizer_bert.decode(reply_ids, skip_special_tokens=True)
  return f"<SRC>: {context}  <REPLY>: {reply_text} <|endofcontext|>", label

In [10]:
# Format every training record once, then split the (text, target) pairs into
# the two parallel lists used by the later cells.
train_pairs = [data_for_gpt2(record) for record in train_data]
train_data_gpt2 = [text for text, _ in train_pairs]
tgt_train = [target for _, target in train_pairs]

2024-04-18 01:58:13.530668: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-18 01:58:13.530731: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-18 01:58:13.532358: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [11]:
tgt_train[0]

False

In [12]:
train_data_gpt2[0]

'<SRC>:  i notice that earier that moved wiki _ link to bill chen citing wiki _ link, then you reverted this change, bill chen doesn\'t commonly go by william, his book is even penned as bill chen. from what i read in wp : commonname patrikr seems to be correct, examples given are names such as : * wiki _ link ( not wiki _ link ) * wiki _ link ( not wiki _ link ) i think this revert may have been a mistake unless you know otherwise? ▪ [UNK] ▪  <REPLY>: chen was known in the poker world as " william " for years before he became commonly known as " bill ". i changed it back because incidences online including usenet are roughly equal, nothing at all like bill clinton and william clinton, and in equal cases using the real name seems the best choice. ( the external _ link page is especially pschizo... willam in the page title, bill in the page text ). however i suppose the book is the trump card, so using the name on the book is probably best. <|endofcontext|>'

In [13]:
tgt_train[0]

False

In [14]:
# Format every validation record once, then split the (text, target) pairs
# into parallel lists.
val_pairs = [data_for_gpt2(record) for record in val_data]
val_data_gpt2 = [text for text, _ in val_pairs]
tgt_val = [target for _, target in val_pairs]

In [15]:
val_data_gpt2[0]

'<SRC>:  you have been blocked for three hours for incivility. we cannot tolerate these types of edits :  <REPLY>: on the contrary, the edits you should not tolerate are the other side preaching their incorrect disambiguation. - - wiki _ link ) <|endofcontext|>'

In [16]:
tgt_val[0]

False

In [17]:
# Format every test record once, then split the (text, target) pairs into
# parallel lists.
test_pairs = [data_for_gpt2(record) for record in test_data]
test_data_gpt2 = [text for text, _ in test_pairs]
tgt_test = [target for _, target in test_pairs]

In [18]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm

In [19]:
from transformers import GPT2ForSequenceClassification, GPT2Config
import torch
import torch.nn.functional as F

class GPT2ForSequenceClassificationMeanPooling(GPT2ForSequenceClassification):
    """GPT-2 sequence classifier that mean-pools the last hidden states.

    The last-layer hidden states are averaged over the positions selected by
    `attention_mask`, and the pooled vector is passed through a linear
    `score` head that produces `config.num_labels` logits.
    """

    def __init__(self, config):
        """Create the backbone alias and the classification head.

        Args:
            config: Model configuration; `hidden_size` and `num_labels` size
                the `score` layer.
        """
        super().__init__(config)
        # Alias for the parent's `transformer` backbone. It is the same module
        # object, so its weights show up in the state dict under both
        # `transformer.*` and `gpt2.*`. The alias is kept so that checkpoints
        # written with both key prefixes still load with strict key matching.
        self.gpt2 = self.transformer
        # Replaces the head created by the parent class.
        self.score = torch.nn.Linear(config.hidden_size, config.num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        **kwargs
    ):
        """Run the backbone, mean-pool its output and classify.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Mask with 1 for positions to pool and 0 for
                positions to ignore. When None, every position is pooled.
            labels: Optional targets; when given, a loss is computed.
            **kwargs: Extra keyword arguments forwarded to the backbone.

        Returns:
            tuple: `(loss, logits, ...)` when `labels` is given, otherwise
                `(logits, ...)`; the trailing items are `outputs[2:]` of the
                backbone call.

        Raises:
            ValueError: If `labels` is given and `config.problem_type` is set;
                only the default (unset) problem type is handled.
        """
        outputs = self.gpt2(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )

        # Hidden states from the last layer
        # (presumably (batch, sequence, hidden) — TODO confirm).
        hidden_states = outputs.last_hidden_state

        if attention_mask is None:
            # No mask supplied: treat every position as a real token instead
            # of failing on `None.unsqueeze`.
            attention_mask = torch.ones(
                hidden_states.shape[:2],
                dtype=torch.long,
                device=hidden_states.device,
            )

        # Masked mean over dimension 1.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        sum_hidden_states = torch.sum(hidden_states * input_mask_expanded, 1)
        # Clamp so a row whose mask is all zeros pools to zeros rather than
        # NaN from a 0/0 division.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_hidden_states = sum_hidden_states / sum_mask

        # Pass the pooled hidden states through the classification head
        logits = self.score(mean_hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is not None:
                raise ValueError(
                    f"Unsupported config.problem_type {self.config.problem_type!r}: "
                    "this model only computes a loss when problem_type is None"
                )
            if self.num_labels == 1:
                # Single output: mean squared error.
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                # Several outputs: cross-entropy over the label logits.
                loss_fct = torch.nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        output = (logits,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output

In [20]:
### Loading Model and Tokenizers
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2ForSequenceClassificationMeanPooling.from_pretrained('gpt2', num_labels=2)
# `.to(device)` is the cell's last expression, so the notebook already renders
# the module tree once; an extra print(model) would show it a second time.
model.to(device)

Some weights of GPT2ForSequenceClassificationMeanPooling were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassificationMeanPooling(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=True)
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop)

GPT2ForSequenceClassificationMeanPooling(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=True)
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop)

In [21]:
## Defining Special Tokens

# Markers used by the prompt format, plus '[UNK]' and '[SEP]', registered as
# extra special tokens on the GPT-2 tokenizer.
extra_tokens = ['<SRC>:', ' <REPLY>:', ' <|endofcontext|>', '[UNK]', '[SEP]']
special_tokens = {'additional_special_tokens': extra_tokens}
num_added = tokenizer.add_special_tokens(special_tokens)
print(num_added)
# Resize the embedding matrix to the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

5


Embedding(50262, 768)

In [22]:
# Tokenizing Data
tokenizer.pad_token = tokenizer.eos_token
train_encodings = tokenizer(train_data_gpt2, truncation=False, padding=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (1122 > 1024). Running this sequence through the model will result in indexing errors


In [23]:
max_length = max(len(x) for x in train_encodings.input_ids)

In [24]:
max_length

1189

In [25]:
val_encodings = tokenizer(val_data_gpt2, truncation=False, padding=False)

In [26]:
val_max_length = max(len(x) for x in val_encodings.input_ids)
val_max_length

1072

In [27]:
test_encodings = tokenizer(test_data_gpt2, truncation=False, padding=False)

In [28]:
test_max_length = max(len(x) for x in test_encodings.input_ids)
test_max_length

1101

In [29]:
## Re-tokenizing the training text with truncation and padding (max length 1024)
# The end-of-sequence token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Truncate to at most 1024 tokens, pad to a common length, return tensors.
train_inputs = tokenizer(
    train_data_gpt2,
    truncation=True,
    padding=True,
    max_length=1024,
    return_tensors='pt',
)

# Create torch dataset
input_ids, attention_mask = train_inputs['input_ids'], train_inputs['attention_mask']
# Targets equal to False become class 0; anything else becomes class 1.
labels = torch.tensor([int(not (target == False)) for target in tgt_train])
train_dataset = TensorDataset(input_ids, attention_mask, labels)

In [30]:
len(input_ids[0])

1024

In [31]:
# Tokenise the validation text: truncate to 1024 tokens, pad, return tensors.
val_inputs = tokenizer(
    val_data_gpt2,
    truncation=True,
    padding=True,
    max_length=1024,
    return_tensors='pt',
)
val_input_ids, val_attention_mask = val_inputs['input_ids'], val_inputs['attention_mask']
# Targets equal to False become class 0; anything else becomes class 1.
val_labels = torch.tensor([int(not (target == False)) for target in tgt_val])
val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_labels)

In [32]:
# Tokenise the test text: truncate to 1024 tokens, pad, return tensors.
test_inputs = tokenizer(
    test_data_gpt2,
    truncation=True,
    padding=True,
    max_length=1024,
    return_tensors='pt',
)
test_input_ids, test_attention_mask = test_inputs['input_ids'], test_inputs['attention_mask']
# Targets equal to False become class 0; anything else becomes class 1.
test_labels = torch.tensor([int(not (target == False)) for target in tgt_test])
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

In [33]:
## CREATING DATALOADERS
batch_size = 4
# Only the training split is shuffled; validation and test keep dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_dataset)
)

In [34]:
from torch.optim import AdamW

In [35]:
optimizer = AdamW(model.parameters(), lr=1e-5)

In [36]:
import copy

In [37]:
from sklearn.metrics import f1_score

# Training and validation loop
num_epochs = 5
train_losses = []  # mean training loss of each epoch
val_losses = []  # mean validation loss of each epoch
f1_scores = []  # macro-F1 on the validation split of each epoch
best_val_loss = float('inf')
best_model_state = None  # weights of the epoch with the lowest validation loss

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1} of {num_epochs}")
    train_loss = 0

    # Training phase. model.train() runs at the top of every epoch, so the
    # eval mode set by the validation phase below never leaks into training.
    model.train()
    for batch in tqdm(train_dataloader, desc='Training Progress', miniters=10):
        input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]  # with labels supplied the model returns (loss, logits, ...)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Training loss: {avg_train_loss:.2f}")

    # Validation phase
    val_loss = 0
    model.eval()
    val_preds = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc='Validation Progress', miniters=10):
            input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss, logits = outputs[0], outputs[1]
            val_loss += loss.item()
            # Softmax is monotonic, so the argmax of the raw logits picks the
            # same class as the argmax of the probabilities.
            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_true_labels.extend(batch_labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_dataloader)
    val_losses.append(avg_val_loss)
    f1 = f1_score(val_true_labels, val_preds, average='macro')
    f1_scores.append(f1)
    print(f"Validation loss: {avg_val_loss:.2f}, F1 score: {f1:.2f}")

    # Check if the current model is the best one
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # state_dict() returns references to the live parameters, so without a
        # deep copy this snapshot would silently follow later optimizer steps
        # and end up holding the last weights instead of the best ones.
        best_model_state = copy.deepcopy(model.state_dict())
        torch.save(best_model_state, 'CGA_Dynamic_model_GPT2.pth')  # Save the best model
        print("Saved new best model")

Epoch 1 of 5


Training Progress: 100%|██████████| 1817/1817 [17:34<00:00,  1.72it/s]


Training loss: 0.52


Validation Progress: 100%|██████████| 610/610 [01:57<00:00,  5.21it/s]


Validation loss: 0.79, F1 score: 0.58
Epoch 3 of 5


Training Progress: 100%|██████████| 1817/1817 [17:33<00:00,  1.72it/s]


Training loss: 0.30


Validation Progress: 100%|██████████| 610/610 [01:57<00:00,  5.21it/s]


Validation loss: 1.00, F1 score: 0.60
Epoch 4 of 5


Training Progress: 100%|██████████| 1817/1817 [17:34<00:00,  1.72it/s]


Training loss: 0.12


Validation Progress: 100%|██████████| 610/610 [01:57<00:00,  5.21it/s]


Validation loss: 1.45, F1 score: 0.59
Epoch 5 of 5


Training Progress: 100%|██████████| 1817/1817 [17:35<00:00,  1.72it/s]


Training loss: 0.06


Validation Progress: 100%|██████████| 610/610 [01:57<00:00,  5.20it/s]

Validation loss: 1.60, F1 score: 0.59





In [39]:
# Read the saved weights, mapping their tensors onto the active device.
checkpoint_state = torch.load('/kaggle/working/CGA_Dynamic_model_GPT2.pth', map_location=device)
# Rebuild the architecture and resize its embeddings to the tokenizer's
# vocabulary before loading the saved weights into it.
model = GPT2ForSequenceClassificationMeanPooling.from_pretrained('gpt2', num_labels=2)
model.resize_token_embeddings(len(tokenizer))
model.load_state_dict(checkpoint_state)
model.to(device)

Some weights of GPT2ForSequenceClassificationMeanPooling were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.bias', 'score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassificationMeanPooling(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=True)
  (gpt2): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop)

<All keys matched successfully>

In [40]:
model.eval()
# Loop-invariant configuration, so it is set once before the batches are read.
model.config.pad_token_id = tokenizer.eos_token_id
test_loss = 0
test_preds = []
test_true_labels = []
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Test Progress', miniters=10):
        input_ids, attention_mask, batch_labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs[0]  # Extract the loss value from the tuple
        test_loss += loss.item()
        logits = outputs[1]
        probs = torch.nn.functional.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)
        test_preds.extend(preds.cpu().numpy())
        test_true_labels.extend(batch_labels.cpu().numpy())

avg_test_loss = test_loss / len(test_dataloader)
f1 = f1_score(test_true_labels, test_preds, average='macro')
# These numbers come from the test split, so they are labelled as test metrics.
print(f"Test loss: {avg_test_loss:.2f}, F1 score: {f1:.2f}")

Test Progress: 100%|██████████| 1069/1069 [03:25<00:00,  5.20it/s]

Validation loss: 0.69, F1 score: 0.57





In [45]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# F1, precision and recall use macro averaging; accuracy takes no averaging.
macro = {'average': 'macro'}
F1_score = f1_score(test_true_labels, test_preds, **macro)
accuracy = accuracy_score(test_true_labels, test_preds)
precision = precision_score(test_true_labels, test_preds, **macro)
recall = recall_score(test_true_labels, test_preds, **macro)

for metric_label, metric_value in (
    ("F1 score", F1_score),
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
):
    print(f"{metric_label}: {metric_value}")

F1 score: 0.5677137881380231
Accuracy: 0.5960233918128655
Precision: 0.6129450899046678
Recall: 0.5875172587608757


In [46]:
conf_matrix = confusion_matrix(test_true_labels, test_preds)
# The report text gets its own name. Assigning it to `classification_report`
# would overwrite the imported function, and re-running this cell would then
# fail with "'str' object is not callable".
class_report = classification_report(test_true_labels, test_preds)

print(conf_matrix)
print(class_report)

[[ 727 1333]
 [ 394 1821]]
              precision    recall  f1-score   support

           0       0.65      0.35      0.46      2060
           1       0.58      0.82      0.68      2215

    accuracy                           0.60      4275
   macro avg       0.61      0.59      0.57      4275
weighted avg       0.61      0.60      0.57      4275



In [47]:
# Two-column table with the true label and the prediction for each test item.
column_names = ('True Labels', 'Predictions')
data = dict(zip(column_names, (test_true_labels, test_preds)))
prediction_comparison_df = pd.DataFrame(data)

In [53]:
prediction_comparison_df[104:195]

Unnamed: 0,True Labels,Predictions
104,0,1
105,1,1
106,1,1
107,0,1
108,0,1
...,...,...
190,1,0
191,0,0
192,0,0
193,0,0
