## DistilBERT Model

In [2]:
# Importing libraries and packages

In [4]:
!pip install transformers torch datasets



In [6]:
import pandas as pd
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
) 
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

## Loading and Preprocessing the data

In [9]:
# Read at most the first 100,000 rows of the complaints CSV into a DataFrame.
filename = "complaints.csv"
df = pd.read_csv(filename, nrows=100_000)

In [13]:
# Columns used by the rest of the notebook: the free-text narrative is the
# model input and the product name is the classification target.
feature_column = 'Consumer complaint narrative'
label_column = 'Product'

# Drop rows that are missing either the text or the label. A missing label is
# never counted by value_counts(), so it would slip through the rare-class
# filter and then break label encoding, where sorted() cannot order NaN
# against strings. Re-binding `df` avoids mutating the loaded frame in place.
df = df.dropna(subset=[feature_column, label_column])

In [15]:
# Collapse the different spellings of the credit-reporting product into a
# single category name.
clean_name = 'Credit Reporting'
credit_categories = [
    'Credit reporting, credit repair services, or other personal consumer reports',
    'Credit reporting or other personal consumer reports',
    'Credit reporting',
]
df[label_column] = df[label_column].replace(
    to_replace=credit_categories,
    value=clean_name,
)

In [17]:
# Separate the model input text (X) from the target labels (y).
# NOTE(review): no "XXXX" redaction cleaning is applied here; the narratives
# are used exactly as loaded. Add the cleaning step here if it is still wanted.
X, y = df[feature_column], df[label_column]

In [19]:
# Removing rare classes
# Drop every category that occurs fewer than 2 times, so each remaining class
# has at least two rows available for the stratified split further down.
class_counts = y.value_counts()
rare_classes = class_counts[class_counts < 2].index.tolist()

# isin() against an empty list is all-False, so this mask keeps every row when
# there are no rare classes; no separate branch is needed for that case.
keep_indices = ~y.isin(rare_classes)
X_filtered = X[keep_indices]
y_filtered = y[keep_indices]

print(f"Filtered data size: {len(y_filtered)}")
print(f"Total categories: {y_filtered.nunique()}")

Filtered data size: 6867
Total categories: 16


In [21]:
# Encode the category names as integers so the classifier can be trained on
# numeric class ids. Ids are assigned in the sorted order of the names.
labels = sorted(y_filtered.unique())

# Reverse lookup (id -> name) and forward lookup (name -> id).
id_to_label = dict(enumerate(labels))
label_to_id = {name: idx for idx, name in id_to_label.items()}

# Assemble the modelling frame: raw text, readable label name, integer label.
df_final = pd.DataFrame({'text': X_filtered, 'label_name': y_filtered}).assign(
    label=lambda frame: frame['label_name'].map(label_to_id)
)

print("\nData prepared with integer labels:")
print(df_final.head())
print(f"\nExample label mapping: 'Credit Reporting' is {label_to_id['Credit Reporting']}")



Data prepared with integer labels:
                                                  text        label_name  \
62                          These are not my accounts.  Credit Reporting   
94   For the past few years I was in and out of hot...   Debt collection   
117  Kindly address this issue on my credit report....  Credit Reporting   
120  I AM FORMALLY REQUESTING THE IMMEDIATE REMOVAL...   Debt collection   
216  There are XXXX collections being reported to t...  Credit Reporting   

     label  
62       3  
94       6  
117      3  
120      6  
216      3  

Example label mapping: 'Credit Reporting' is 3


In [23]:
# Hold out 20% of the rows for validation before any tokenization. The split
# is stratified on the integer label and uses a fixed seed so it is repeatable.
split_options = {
    'test_size': 0.2,
    'stratify': df_final['label'],
    'random_state': 42,
}
train_df, val_df = train_test_split(df_final, **split_options)

print(f"\nTraining set size: {len(train_df)}, Validation set size: {len(val_df)}")


Training set size: 5493, Validation set size: 1374


## Load the tokenizer and tokenize the dataset

In [26]:
# Checkpoint name; the same name is reused later when the model is loaded.
model_name = "distilbert-base-uncased"

# Fetch the tokenizer published with that checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

print(f"Loaded tokenizer for {model_name}.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Loaded tokenizer for distilbert-base-uncased.


In [28]:
# Wrap each pandas split in a Hugging Face Dataset. The DataFrame index is
# carried along as the '__index_level_0__' column.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

print("Converted DataFrames to Datasets:")
print(train_dataset)

Converted DataFrames to Datasets:
Dataset({
    features: ['text', 'label_name', 'label', '__index_level_0__'],
    num_rows: 5493
})


In [30]:
def tokenize_function(examples, max_length=512):
    """Tokenize a batch of complaint texts into fixed-length model inputs.

    Args:
        examples: Mapping whose 'text' entry holds the raw complaint texts;
            that entry is passed straight to the module-level `tokenizer`.
        max_length: Length every example is padded or truncated to. Defaults
            to 512, the value chosen during EDA.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch.
    """
    return tokenizer(
        examples['text'],
        padding='max_length',   # pad shorter complaints up to max_length
        truncation=True,        # cut complaints longer than max_length
        max_length=max_length,
    )

In [32]:
# Run the tokenizer over both splits, one batch of examples at a time.
print("\nTokenizing training data...")
tokenized_train_dataset = train_dataset.map(function=tokenize_function, batched=True)

print("Tokenizing validation data...")
tokenized_val_dataset = val_dataset.map(function=tokenize_function, batched=True)


Tokenizing training data...


Map:   0%|          | 0/5493 [00:00<?, ? examples/s]

Tokenizing validation data...


Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

In [34]:
# Keep only what the model consumes ('input_ids', 'attention_mask', 'label');
# the raw text, the readable label name and the carried-over pandas index are
# dropped from both splits to save memory.
unused_columns = ['text', 'label_name', '__index_level_0__']
tokenized_train_dataset = tokenized_train_dataset.remove_columns(unused_columns)
tokenized_val_dataset = tokenized_val_dataset.remove_columns(unused_columns)

In [36]:
# The model expects the target column to be called 'labels', so rename
# 'label' in both splits.
tokenized_train_dataset = tokenized_train_dataset.rename_column(
    original_column_name="label", new_column_name="labels"
)
tokenized_val_dataset = tokenized_val_dataset.rename_column(
    original_column_name="label", new_column_name="labels"
)

In [38]:
# Have both splits return PyTorch tensors when indexed.
for tokenized_split in (tokenized_train_dataset, tokenized_val_dataset):
    tokenized_split.set_format('torch')

print("\nTokenization complete. Final training dataset features:")
print(tokenized_train_dataset)


Tokenization complete. Final training dataset features:
Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 5493
})


## Load the pre-trained model

In [41]:
# Size of the classification head: one output per entry in the id -> name
# mapping that is handed to the model.
num_labels = len(id_to_label)
print(f"Number of unique labels: {num_labels}")

Number of unique labels: 16


In [43]:
# Load the pre-trained checkpoint with a sequence-classification head that has
# one output per category. Passing both label mappings stores the readable
# category names in the model config alongside the integer ids.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=num_labels, id2label=id_to_label, label2id=label_to_id
)

print(f"Successfully loaded {model_name} with a {num_labels}-class head.")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully loaded distilbert-base-uncased with a 16-class head.
