# GPT

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, GPT2LMHeadModel
from sklearn.metrics import confusion_matrix, classification_report

In [40]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [41]:
df = pd.read_csv(r'Dataset.csv')
texts = df['clause'].tolist()
labels = df['risk'].tolist()

In [42]:
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [43]:
max_length = 64  # or any other suitable length

# Tokenize the texts
tokenized_texts = [tokenizer.encode(text, add_special_tokens=True) for text in texts]

# Truncate and pad sequences
padded_sequences = []
for seq in tokenized_texts:
    if len(seq) > max_length:
        # Truncate the sequence if it exceeds max_length
        seq = seq[:max_length]
    else:
        # Pad the sequence if it's shorter than max_length
        seq = seq + [tokenizer.pad_token_id] * (max_length - len(seq))
    padded_sequences.append(seq)

In [44]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = torch.LongTensor(self.texts[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return text, label

In [45]:
# Assuming you have 'labels' for your texts
dataset = TextClassificationDataset(padded_sequences, labels)

In [46]:
train_texts, test_texts, train_labels, test_labels = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

train_dataset = TextClassificationDataset(train_texts, train_labels)
test_dataset = TextClassificationDataset(test_texts, test_labels)

In [47]:
# Define batch size
batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [49]:
# Load pre-trained GPT model and tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# model = GPT2Model.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('bolbolzaban/gpt2-persian')
# Use GPU, if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [50]:
# Freeze GPT model weights
for param in model.parameters():
    param.requires_grad = False

In [56]:
# # Add classification head
# class GPTClassifier(nn.Module):
#     def __init__(self, gpt_model):
#         super(GPTClassifier, self).__init__()
#         self.gpt = gpt_model
#         self.dropout = nn.Dropout(0.5)  # Add dropout layer with dropout rate 0.5
#         self.fc = nn.Linear(128, 1)

#     def forward(self, input_ids):
#         outputs = self.gpt(input_ids)[0]
#         pooled_output = outputs[:, :, 0]  # Take the first token [CLS]
#         pooled_output = self.dropout(pooled_output)  # Apply dropout
#         logits = self.fc(pooled_output)
#         return logits


# # Add classification head
# class GPTClassifier(nn.Module):
#     def __init__(self, gpt_model):
#         super(GPTClassifier, self).__init__()
#         self.gpt = gpt_model
#         self.fc = nn.Linear(128, 1)

#     def forward(self, input_ids):
#         outputs = self.gpt(input_ids)[0]

#         pooled_output = outputs[:,:,0]  # Take the first token [CLS]
#         logits = self.fc(pooled_output)
#         return logits


class GPTClassifier(nn.Module):
    def __init__(self, gpt_model, dropout_prob=0.5):
        super(GPTClassifier, self).__init__()
        self.gpt = gpt_model
        self.fc = nn.Linear(25000, 1)  # Assuming hidden_state dimension is 25000
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, input_ids):
        outputs = self.gpt(input_ids)[0]  # Getting hidden states
        pooled_output = outputs[:, 0, :]  # Take the first token [CLS] 
        # print("pooled_output.shape", pooled_output.shape)
        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)

        return logits
    

In [57]:
def train(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs, device):
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        correct_train = 0
        total_train = 0

        for batch in train_dataloader:
            # input_ids, labels = batch
            input_ids = batch[0].to(device)
            labels = batch[1].to(device)
            optimizer.zero_grad()
            logits = model(input_ids)
            loss = criterion(logits.squeeze(-1), labels.float())
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

            # Calculate training accuracy
            predictions = (logits > 0).long()
            correct_train += (predictions == labels).sum().item()
            total_train += labels.size(0)

        train_loss = epoch_loss / len(train_dataloader)
        train_accuracy = correct_train / total_train

        # Validation
        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for batch in val_dataloader:
                input_ids, labels = batch
                logits = model(input_ids)
                loss = criterion(logits.squeeze(-1), labels.float())
                val_loss += loss.item()

                # Calculate validation accuracy
                predictions = (logits > 0).long()
                correct_val += (predictions == labels).sum().item()
                total_val += labels.size(0)

        val_loss /= len(val_dataloader)
        val_accuracy = correct_val / total_val

        # Print training and validation statistics
        print(f'Epoch {epoch + 1}/{num_epochs}, '
              f'Train Loss: {train_loss:.4f}, '
              f'Train Acc: {train_accuracy:.4f}, '
              f'Val Loss: {val_loss:.4f}, '
              f'Val Acc: {val_accuracy:.4f}')

        # Store losses and accuracies for plotting
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

        return train_losses, val_losses, train_accuracies, val_accuracies
    

def plot_val_acc_per_epoch(train_losses, val_losses, train_accuracies, val_accuracies):

    # Plot losses
    plt.plot(train_losses, label='Train Loss')
    plt.plot(val_losses, label='Val Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # Plot accuracies
    plt.plot(train_accuracies, label='Train Acc')
    plt.plot(val_accuracies, label='Val Acc')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

In [58]:
# Initialize the classifier
classifier = GPTClassifier(model)

In [59]:
# Define loss function and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-3)

In [60]:
train_losses, val_losses, train_accuracies, val_accuracies = train(classifier, train_dataloader, test_dataloader, optimizer, criterion, 10, device)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat1 in method wrapper_CUDA_addmm)

In [None]:
plot_val_acc_per_epoch(train_losses, val_losses, train_accuracies, val_accuracies)

In [172]:
# Set the model to evaluation mode
classifier.eval()

# Initialize lists to store true labels and predicted labels
true_labels = []
predicted_labels = []

# Iterate over the test dataset and generate predictions
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, labels = batch
        logits = classifier(input_ids)
        predictions = (logits > 0).long()  # Convert logits to binary predictions
        true_labels.extend(labels.numpy())
        predicted_labels.extend(predictions.numpy())

*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
*************outputs shape***********:  torch.Size([32, 128])
********

In [173]:
# Calculate the confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Print the confusion matrix with 4 decimal points precision
print("Confusion Matrix (with 4 decimal points precision):")
print(np.round(conf_matrix, 4))

Confusion Matrix (with 4 decimal points precision):
[[313   7]
 [203   3]]


In [176]:
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

         0.0       0.61      0.98      0.75       320
         1.0       0.30      0.01      0.03       206

    accuracy                           0.60       526
   macro avg       0.45      0.50      0.39       526
weighted avg       0.49      0.60      0.47       526



In [339]:
# Model parameters visualization
params = list(model.named_parameters())

print('GPT-2 has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== 1st Transformer Layer ====\n')

for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== 2nd Transformer Layer ====\n')

for p in params[14:26]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== 3rd Transformer Layer ====\n')

for p in params[26:38]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-5:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

GPT-2 has 292 different named parameters.

==== Embedding Layer ====

transformer.wte.weight                                  (25000, 1024)
transformer.wpe.weight                                   (256, 1024)

==== 1st Transformer Layer ====

transformer.h.0.ln_1.weight                                  (1024,)
transformer.h.0.ln_1.bias                                    (1024,)
transformer.h.0.attn.c_attn.weight                      (1024, 3072)
transformer.h.0.attn.c_attn.bias                             (3072,)
transformer.h.0.attn.c_proj.weight                      (1024, 1024)
transformer.h.0.attn.c_proj.bias                             (1024,)
transformer.h.0.ln_2.weight                                  (1024,)
transformer.h.0.ln_2.bias                                    (1024,)
transformer.h.0.mlp.c_fc.weight                         (1024, 4096)
transformer.h.0.mlp.c_fc.bias                                (4096,)
transformer.h.0.mlp.c_proj.weight                       (4096, 1024