# Assignment 6.1: Implementing and Evaluating a Large Language Model (LLM) for Text Classification

1. Install and Import Necessary Libraries

In [None]:
!pip install transformers

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m101.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.0 MB/s[0m eta [36m0:00:00[0m
Coll

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# The `google.colab` module is imported here, so this cell is meant to run inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


2. Load and Preprocess the Data

In [None]:
# Read the complaints CSV from the mounted Google Drive folder into a DataFrame
data = pd.read_csv(
    '/content/drive/MyDrive/complaints_processed.csv'
)

In [None]:
# Preview the top five records to sanity-check the loaded columns
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,product,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [None]:
# Summarize the raw frame: column names, dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162421 entries, 0 to 162420
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  162421 non-null  int64 
 1   product     162421 non-null  object
 2   narrative   162411 non-null  object
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [None]:
# Per-column tally of null entries in the raw frame
missing_values = data.isna().sum()

# Number of complaints recorded under each product label
label_distribution = data['product'].value_counts()

# Show both summaries together as the cell output
(label_distribution, missing_values)

(credit_reporting       91179
 debt_collection        23150
 mortgages_and_loans    18990
 credit_card            15566
 retail_banking         13536
 Name: product, dtype: int64,
 Unnamed: 0     0
 product        0
 narrative     10
 dtype: int64)

In [None]:
# Discard every record whose narrative text is absent
data_cleaned = data.dropna(axis=0, subset=['narrative'])

# Recount nulls per column to confirm the narrative column is now complete
missing_values_after_cleaning = data_cleaned.isna().sum()

missing_values_after_cleaning

Unnamed: 0    0
product       0
narrative     0
dtype: int64

In [None]:
# Summarize the cleaned frame to confirm the row count and that no column has nulls left
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162411 entries, 0 to 162420
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  162411 non-null  int64 
 1   product     162411 non-null  object
 2   narrative   162411 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.0+ MB


In [None]:
# Print a summary (index, dtype, non-null count) of the per-column null-count Series
missing_values_after_cleaning.info()


<class 'pandas.core.series.Series'>
Index: 3 entries, Unnamed: 0 to narrative
Series name: None
Non-Null Count  Dtype
--------------  -----
3 non-null      int64
dtypes: int64(1)
memory usage: 156.0+ bytes


In [None]:
# Draw a reproducible 1% random subset of the raw frame (fixed random_state),
# then keep only the sampled rows that actually carry a narrative.
sample_data = data.sample(frac=0.01, random_state=42)
sample_data_cleaned = sample_data.loc[sample_data['narrative'].notna()]

In [None]:
# Check the size and column completeness of the cleaned 1% sample
sample_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1624 entries, 156566 to 152058
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1624 non-null   int64 
 1   product     1624 non-null   object
 2   narrative   1624 non-null   object
dtypes: int64(1), object(2)
memory usage: 50.8+ KB


3. Tokenize the Data Using BERT Tokenizer

In [None]:
# Fetch the tokenizer that belongs to the bert-base-uncased checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every sampled narrative in a single call: sequences are padded,
# truncated at 512 tokens, and returned as PyTorch tensors.
inputs = tokenizer(
    list(sample_data_cleaned['narrative']),
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)

# Map product names to integer class codes and attach them to the encoded batch
labels = sample_data_cleaned['product'].astype('category').cat.codes
inputs['labels'] = torch.tensor(labels.to_numpy(), dtype=torch.long)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

4. Train-Test Split

In [None]:
# Split row positions rather than the tensors themselves, so every tensor in
# `inputs` is partitioned with the same 80/20 assignment (fixed random_state).
train_indices, test_indices = train_test_split(
    range(len(sample_data_cleaned)), test_size=0.2, random_state=42
)

# Slice each encoded tensor (input_ids, attention_mask, labels, ...) by position
train_inputs = {key: value[train_indices] for key, value in inputs.items()}
test_inputs = {key: value[test_indices] for key, value in inputs.items()}

# Take the label arrays from the tensors that were just split instead of the
# notebook-level `labels` Series: the training and evaluation loops rebind
# `labels` to a batch tensor, so re-running this cell after them used to fail
# on `.iloc`. `inputs['labels']` holds the same class codes and is never rebound.
train_labels = train_inputs['labels'].numpy()
test_labels = test_inputs['labels'].numpy()


5. Convert Data to DataLoader

In [None]:
# Convert tokenized inputs and labels into PyTorch DataLoader
train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], torch.tensor(train_labels, dtype=torch.long))
test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], torch.tensor(test_labels, dtype=torch.long))

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)


6. Load Pre-trained Model

In [None]:
# Seed PyTorch before loading: the classification head is not part of the
# checkpoint and is freshly initialised, so without a seed every run starts
# from different classifier weights.
torch.manual_seed(42)

# Size the output layer from the product labels present in the sampled data
# (the same column the integer class codes are derived from) instead of a
# hard-coded 5, so the head always matches the label encoding.
num_labels = sample_data_cleaned['product'].nunique()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


7. Manual Training Loop

In [None]:
# Fine-tune the classifier: AdamW over all model parameters, cross-entropy on the logits
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_function = CrossEntropyLoss()

num_epochs = 3

# Run on whichever device the model already lives on, so this cell works
# unchanged whether or not the model was moved to a GPU beforehand.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_dataloader:
        optimizer.zero_grad()

        # Unpack into `batch_labels`, not `labels`: the notebook-level `labels`
        # Series must survive this loop so earlier cells stay re-runnable.
        input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_function(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses; max(..., 1) avoids dividing by zero on an empty loader
    avg_train_loss = total_loss / max(len(train_dataloader), 1)
    print(f"Epoch: {epoch+1}, Average Training Loss: {avg_train_loss:.4f}")


Epoch: 1, Average Training Loss: 1.1836
Epoch: 2, Average Training Loss: 0.8668
Epoch: 3, Average Training Loss: 0.5958


8. Evaluate the Model

In [None]:
# Score the fine-tuned model on the held-out split
model.eval()
all_predictions = []
all_labels = []

# Feed batches to whichever device the model currently lives on
device = next(model.parameters()).device

with torch.no_grad():  # inference only, no gradients needed
    for batch in test_dataloader:
        # Unpack into `batch_labels`, not `labels`, so the notebook-level
        # `labels` Series is not overwritten by the last evaluation batch.
        input_ids, attention_mask, batch_labels = batch
        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device))
        logits = outputs.logits
        # Predicted class = index of the largest logit
        predictions = torch.argmax(logits, dim=-1)

        all_predictions.extend(predictions.tolist())
        all_labels.extend(batch_labels.tolist())

accuracy = accuracy_score(all_labels, all_predictions)
# Weighted averaging accounts for the class imbalance; zero_division=0 yields
# the same values as the default but without a warning when a class is never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_predictions, average='weighted', zero_division=0
)
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")


Accuracy: 0.7815, Precision: 0.7916, Recall: 0.7815, F1-score: 0.7764
