# BERT FOR PRIORITY CLASSIFICATION

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load the complaints table (it already carries a `priority` column) and
# preview the first five rows.
df = pd.read_csv(filepath_or_buffer="output_with_priority.csv")
df.head(n=5)

Unnamed: 0,complaint_id,subject,date_received,date_sent_to_company,complaint,Topic,priority
0,3229299,write notification about debt,2019-05-01T12:00:00-05:00,2019-05-01T12:00:00-05:00,good morning my name be and I appreciate it ...,Payment and Billing,not urgent
1,3199379,other feature term or problem,2019-04-02T12:00:00-05:00,2019-04-02T12:00:00-05:00,I upgrade my card in 2018 and be tell by the...,Dispute reporting,not urgent
2,3233499,incorrect information on your report,2019-05-06T12:00:00-05:00,2019-05-06T12:00:00-05:00,chase card be report on 2019 however fraudulen...,Credit card Management,not urgent
3,3180294,incorrect information on your report,2019-03-14T12:00:00-05:00,2019-03-15T12:00:00-05:00,on 2018 while try to book a ticket I com...,Credit card Management,not urgent
4,3224980,manage an account,2019-04-27T12:00:00-05:00,2019-04-27T12:00:00-05:00,my grand son give I check for 160000 I deposit...,Retail Banking Operations,not urgent


In [None]:
# Model input = complaint subject followed by the complaint body.
# Missing values are replaced with "" first: `NaN + str` evaluates to NaN, and
# a float NaN inside the text list is something the tokenizer cannot encode.
texts = df['subject'].fillna("") + " " + df['complaint'].fillna("")
texts

0        write notification about debt good morning my ...
1        other feature term or problem I upgrade my   c...
2        incorrect information on your report chase car...
3        incorrect information on your report on 2018 w...
4        manage an account my grand son give I check fo...
                               ...                        
21067    advertising and marketing include promotional ...
21068    other feature term or problem on wednesday  I ...
21069    problem with a lender or other company charge ...
21070    other feature term or problem I have have flaw...
21071    payment to acct not credit roughly 10 year ago...
Length: 21072, dtype: object

In [None]:
# Binary target encoding: 1 = urgent, 0 = not urgent.
label_map = {"urgent": 1, "not urgent": 0}
labels = df['priority'].map(label_map)

# `.map` silently turns every value that is missing from `label_map` into NaN,
# which would only surface later as float/NaN targets. Fail here, where the
# offending values can still be reported.
unmapped_priorities = df.loc[labels.isna(), 'priority'].unique().tolist()
if unmapped_priorities:
    raise ValueError(f"Unexpected values in 'priority': {unmapped_priorities}")
labels = labels.astype("int64")

In [None]:
# Hold out 20% of the complaints for evaluation; the fixed random_state keeps
# the split identical between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts.tolist(),
    labels.tolist(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seed PyTorch's global RNG before the model is built. The classification head
# is randomly initialised (see the "newly initialized" warning in this cell's
# output), so without a seed the reported accuracy differs on every run.
torch.manual_seed(42)

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Two output classes, matching the {"urgent", "not urgent"} label map.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


bert-base-uncased: A transformers model pretrained on a large corpus of English data in a self-supervised fashion. It was trained on BookCorpus, a dataset consisting of 11,038 unpublished books, and English Wikipedia, excluding lists, tables, and headers.

In [None]:
# Encode both splits with identical settings: truncation enabled, and each
# split padded to the length of its own longest sequence.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [None]:
def _as_tensor_dataset(encodings, targets):
    """Bundle token ids, attention masks and labels into one TensorDataset."""
    return TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(targets),
    )


# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = _as_tensor_dataset(train_encodings, train_labels)
test_dataset = _as_tensor_dataset(test_encodings, test_labels)

In [None]:
# Mini-batches of 4 samples. Only the training data is shuffled; the test
# data is iterated in its stored order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=4)

In [None]:
# AdamW over every model parameter, learning rate 1e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
# Stand-alone cross-entropy criterion. The training loop in this notebook
# reads the loss from `outputs.loss` rather than calling this object.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Training loop: fine-tune the model for a fixed number of passes over train_loader.
epochs = 3
# Use whatever device the model currently lives on, so batches and weights
# always end up in the same place.
model_device = next(model.parameters()).device
for epoch in range(epochs):
    model.train()  # switch to training mode at the start of every epoch
    for batch in train_loader:
        # Unpack into a batch-local label name. Reusing `labels` here would
        # overwrite the notebook-level label Series and break re-running the
        # train/test split cell after training.
        input_ids, attention_mask, batch_labels = (t.to(model_device) for t in batch)
        optimizer.zero_grad()
        # Passing `labels` makes the model return the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Evaluation loop: count correct predictions over the held-out test_loader.
model.eval()
# Use whatever device the model currently lives on for the input batches.
model_device = next(model.parameters()).device
total_correct = 0
total_samples = 0
with torch.no_grad():  # no gradients are needed for scoring
    for batch in test_loader:
        # Batch-local label name: reusing `labels` would overwrite the
        # notebook-level label Series created earlier.
        input_ids, attention_mask, batch_labels = (t.to(model_device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        predicted = logits.argmax(dim=1)
        total_correct += (predicted == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

In [None]:
# Test-set accuracy of the priority classifier: correct predictions divided by
# the number of evaluated samples.
accuracy_priority = total_correct / total_samples
# The original (misspelled) name is kept as an alias so existing references
# to it keep working.
accurracy_priority = accuracy_priority
print("Accuracy:", accuracy_priority)

Accuracy: 0.9263542502993823


# BERT FOR DEPARTMENT CLASSIFICATION

In [None]:
# Model input = complaint subject followed by the complaint body.
# Missing values are replaced with "" first: `NaN + str` evaluates to NaN, and
# a float NaN inside the text list is something the tokenizer cannot encode.
texts = df['subject'].fillna("") + " " + df['complaint'].fillna("")

In [None]:
# Integer class id for each value of the `Topic` column (five classes).
label_map = {
    "Retail Banking Operations": 0,
    "Credit card Management": 1,
    "Payment and Billing": 2,
    "Dispute reporting": 3,
    "Mortgages/loans": 4
}
labels = df['Topic'].map(label_map)

# `.map` silently turns every value that is missing from `label_map` into NaN,
# which would only surface later as float/NaN targets. Fail here, where the
# offending values can still be reported.
unmapped_topics = df.loc[labels.isna(), 'Topic'].unique().tolist()
if unmapped_topics:
    raise ValueError(f"Unexpected values in 'Topic': {unmapped_topics}")
labels = labels.astype("int64")

In [None]:
# Same 80/20 split settings as the priority task, now with the Topic labels.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts.tolist(),
    labels.tolist(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Seed PyTorch's global RNG before the model is built. The classification head
# is randomly initialised (see the "newly initialized" warning in this cell's
# output), so without a seed the reported accuracy differs on every run.
torch.manual_seed(42)

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# Five output classes, one per entry in the Topic label map.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the train and test texts in turn, with truncation enabled and each
# split padded to the length of its own longest sequence.
train_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, test_texts)
)

In [None]:
# Build (input_ids, attention_mask, label) datasets for both splits.
encoding_fields = ('input_ids', 'attention_mask')
train_dataset = TensorDataset(
    *[torch.tensor(train_encodings[field]) for field in encoding_fields],
    torch.tensor(train_labels),
)
test_dataset = TensorDataset(
    *[torch.tensor(test_encodings[field]) for field in encoding_fields],
    torch.tensor(test_labels),
)

In [None]:
# Mini-batches of 4 samples. Only the training data is shuffled; the test
# data is iterated in its stored order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=4)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=4)

In [None]:
# AdamW over every parameter of the freshly loaded 5-class model, learning rate 1e-5.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
# Stand-alone cross-entropy criterion. The training loop in this notebook
# reads the loss from `outputs.loss` rather than calling this object.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Training loop: fine-tune the model for a fixed number of passes over train_loader.
epochs = 3
# Use whatever device the model currently lives on, so batches and weights
# always end up in the same place.
model_device = next(model.parameters()).device
for epoch in range(epochs):
    model.train()  # switch to training mode at the start of every epoch
    for batch in train_loader:
        # Unpack into a batch-local label name. Reusing `labels` here would
        # overwrite the notebook-level label Series and break re-running the
        # train/test split cell after training.
        input_ids, attention_mask, batch_labels = (t.to(model_device) for t in batch)
        optimizer.zero_grad()
        # Passing `labels` makes the model return the classification loss itself.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [None]:
# Evaluation loop: count correct predictions over the held-out test_loader.
model.eval()
# Use whatever device the model currently lives on for the input batches.
model_device = next(model.parameters()).device
total_correct = 0
total_samples = 0
with torch.no_grad():  # no gradients are needed for scoring
    for batch in test_loader:
        # Batch-local label name: reusing `labels` would overwrite the
        # notebook-level label Series created earlier.
        input_ids, attention_mask, batch_labels = (t.to(model_device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        predicted = logits.argmax(dim=1)
        total_correct += (predicted == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

In [None]:
# Test-set accuracy of the department classifier: correct predictions divided
# by the number of samples counted in the evaluation loop.
accuracy_dept = total_correct / total_samples
print("Accuracy:", accuracy_dept)

Accuracy: 0.8100296354293823
