<a href="https://colab.research.google.com/github/kkumarsonu/Text_Classification_Tagging/blob/main/TextClassification_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Install the Hugging Face transformers library (quiet mode); add any other packages this notebook needs to this cell
!pip install -q transformers

In [None]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig, BertForSequenceClassification, AdamW

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# `device` is kept as a plain string ('cuda' / 'cpu') for later `.to(device)` calls.

from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [None]:

# Read the training CSV into a pandas DataFrame
df = pd.read_csv("/content/train - train.csv")

# The 'title' column supplies the input texts and the 'hotlist' column the class labels
X = df['title'].tolist()
y = df['hotlist'].tolist()

# Print the first few samples to verify that texts and labels line up
for i in range(min(50, len(X))):  # Preview at most 50 samples (fewer if the dataset is smaller)
    print(f"Feature: {X[i]} - Label: {y[i]}")


Feature: [Kinetoscope] User not able  to control the device volume during playback - Label: GTV - Source - QA Testing
Feature: [AD1A/AP2A][GTV] Show notification while the movie window is pinned. - Label: GTV - Source - QA Testing
Feature: [K] Pause + screensaver = lost playback progress - Label: GTV - Source - QA Testing
Feature: Freeplay Player Remains Active Behind EPG When Channel Errors - Label: GTV - Source - QA Testing
Feature: [Google TV] Java crash in Google TV : java.lang.IllegalArgumentException - Label: GTV - Source - QA Testing
Feature: [Tracking bug - Tubi ] GTVm : Transkeys are displayed in the description of the title "Eminem AKA" - Label: GTV - Source - QA Testing
Feature: [Tracking bug - Amazon Prime ] GTVm : Transkeys are displayed in the description of the title "Eminem AKA" - Label: GTV - Source - QA Testing
Feature: [Tracking bug - Plex ] GTVm : Transkeys are displayed in the description of the title "Eminem AKA" - Label: GTV - Source - QA Testing
Feature: [Tracki

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the raw label values, then map every label to its integer id.
# `label_encoder` is kept around so the ids can be translated back to label names.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)


In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: keep 70% of the samples for training and hold out the remaining 30%.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

# Stage 2: cut the held-out 30% in half -> validation and test sets.
# The same fixed seed keeps both stages reproducible.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)


In [None]:

# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
# The tokenizer files are fetched on first use (see the download progress output below).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Tokenize each split with identical settings: truncate to at most 128 tokens and
# pad every sequence of a split to that split's longest sequence.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=128)
    for split_texts in (X_train, X_val, X_test)
)

In [None]:
# Wrap the encoded labels of every split in a tensor (same order: train, val, test)
train_labels, val_labels, test_labels = (
    torch.tensor(split_labels) for split_labels in (y_train, y_val, y_test)
)


In [None]:
# Create PyTorch datasets
def _build_tensor_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into one TensorDataset.

    Each item of the returned dataset is an
    ``(input_ids, attention_mask, label)`` triple, in that order.
    """
    token_ids = torch.tensor(encodings['input_ids'])
    masks = torch.tensor(encodings['attention_mask'])
    return TensorDataset(token_ids, masks, labels)


train_dataset = _build_tensor_dataset(train_encodings, train_labels)
val_dataset = _build_tensor_dataset(val_encodings, val_labels)
test_dataset = _build_tensor_dataset(test_encodings, test_labels)

In [None]:
# Load pre-trained BERT model.
# `model_name` is also read by CustomBERTModel.__init__ further down, so it must stay defined.
model_name = 'bert-base-uncased'  # Checkpoint identifier passed to from_pretrained
# NOTE(review): `bert_model` is not referenced anywhere else in the visible notebook
# (CustomBERTModel loads its own encoder) — confirm whether this extra load is needed.
bert_model = BertModel.from_pretrained(model_name)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Classification head stacked on top of the pre-trained BERT encoder
class CustomBERTModel(torch.nn.Module):
    """BERT encoder followed by dropout and a single linear classification layer.

    Args:
        num_classes: number of output logits produced by the classifier.
    """

    def __init__(self, num_classes):
        super().__init__()
        encoder = BertModel.from_pretrained(model_name)
        self.bert = encoder
        self.dropout = torch.nn.Dropout(p=0.1)
        # The linear layer maps the encoder's hidden size to one logit per class.
        self.classifier = torch.nn.Linear(
            in_features=encoder.config.hidden_size,
            out_features=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) class logits for a batch.

        The encoder's ``pooler_output`` is passed through dropout and then
        through the linear classifier.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))

# Instantiate the classifier.
# The number of output logits is taken from the fitted label encoder instead of a
# hard-coded constant, so every encoded label id is guaranteed to have a matching
# logit (a mismatch would otherwise make the loss see out-of-range targets).
num_classes = len(label_encoder.classes_)
model = CustomBERTModel(num_classes)



In [None]:
# Training parameters
batch_size = 16  # samples per batch, passed to the DataLoaders
epochs = 3  # number of full passes over the training loader
learning_rate = 2e-5  # step size handed to the optimizer

In [None]:
# DataLoaders
# Only the training data is shuffled; validation and test batches keep their
# original order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# The test loader was previously commented out, which left the held-out test
# split without any way to be evaluated.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Optimizer and loss function
# Use PyTorch's own AdamW: the AdamW class re-exported by `transformers` is
# deprecated. `eps` and `weight_decay` are set explicitly so the update rule is
# intended to match what the transformers implementation did by default —
# NOTE(review): verify these two values against the transformers version in use.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# Cross-entropy over the raw logits returned by the model; targets are class indices.
loss_fn = torch.nn.CrossEntropyLoss()



In [None]:
# Batch accuracy helper
def accuracy(preds, labels):
    """Return the fraction of rows in ``preds`` whose arg-max equals the label.

    Args:
        preds: 2-D tensor of per-class scores; the arg-max is taken along dim 1.
        labels: tensor of target class indices, one per row of ``preds``.

    Returns:
        A Python float: number of matching predictions divided by ``len(labels)``.
    """
    hits = torch.eq(preds.argmax(dim=1), labels)
    n_hits = hits.sum().item()
    return n_hits / len(labels)


In [None]:
!pip install tqdm
from tqdm import tqdm
model.to(device)

best_val_loss = float('inf')
# Wrap in a try-except block to catch potential CUDA errors


for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_acc = 0.0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', leave=False)
    for batch in progress_bar:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Check if labels are within the valid range
        if torch.any(labels >= outputs.shape[1]):
            print("Warning: Labels out of bounds. Adjusting labels...")
            labels = torch.clamp(labels, max=outputs.shape[1] - 1)  # Clamp labels to the maximum index

        loss = loss_fn(outputs, labels)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        train_loss += loss.item()
        train_acc += accuracy(outputs, labels)

        progress_bar.set_postfix({'training_loss': train_loss / len(train_loader),
                                  'training_acc': train_acc / len(train_loader)})





In [None]:
# Validation
model.eval()
val_loss = 0.0
val_acc = 0.0

with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Check if labels are within the valid range and adjust if necessary
        if torch.any(labels >= outputs.shape[1]):
            print("Warning: Labels out of bounds during validation. Adjusting labels...")
            labels = torch.clamp(labels, max=outputs.shape[1] - 1)

        loss = loss_fn(outputs, labels)

        val_loss += loss.item()
        val_acc += accuracy(outputs, labels)

avg_train_loss = train_loss / len(train_loader)
avg_val_loss = val_loss / len(val_loader)
avg_train_acc = train_acc / len(train_loader)
avg_val_acc = val_acc / len(val_loader)

print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Train Acc: {avg_train_acc:.4f}, Val Acc: {avg_val_acc:.4f}')

# Save model if validation loss improves
if avg_val_loss < best_val_loss:
    best_val_loss = avg_val_loss
    torch.save(model.state_dict(), 'bert_model.pth')

print("Training complete!")

Epoch 3/3, Train Loss: 0.2461, Val Loss: 0.3894, Train Acc: 0.9249, Val Acc: 0.8913
Training complete!


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

# Model definition repeated here so this inference section can rebuild the network
# before loading the saved weights (same layout as the class used for training).
class CustomBERTModel(torch.nn.Module):
    """'bert-base-uncased' encoder followed by dropout and a linear classifier.

    Args:
        num_classes: number of output logits produced by the classifier.
    """

    def __init__(self, num_classes):
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        self.dropout = torch.nn.Dropout(p=0.1)
        # The linear layer maps the encoder's hidden size to one logit per class.
        self.classifier = torch.nn.Linear(
            in_features=encoder.config.hidden_size,
            out_features=num_classes,
        )

    def forward(self, input_ids, attention_mask):
        """Return the raw class logits computed from the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))

# Pick the device first so the checkpoint can be mapped onto it while loading.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load trained model state dict.
# `map_location` lets a checkpoint that was saved from a GPU run be restored on a
# CPU-only runtime as well.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load('bert_model.pth', map_location=device)

# Initialize model.
# The classifier width is read from the checkpoint (rows of the linear layer's
# weight matrix) so the rebuilt network always matches the saved weights.
num_classes = state_dict['classifier.weight'].shape[0]
model = CustomBERTModel(num_classes)
model.load_state_dict(state_dict)

# Move model to device (CPU or GPU) and switch to inference mode (disables dropout)
model.to(device)
model.eval()

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Collect the distinct label names in sorted order
unique_labels = sorted(set(y))

# Map each position in that sorted list to its label name (index -> name).
# NOTE(review): this presumably reproduces the ids assigned by `label_encoder`;
# verify against label_encoder.classes_ if the labels ever change.
label_map = dict(enumerate(unique_labels))

# Show the mapping
print("Label mapping:")
print(label_map)


Label mapping:
{0: 'GTV - Source - 3P App Partner', 1: 'GTV - Source - Analysis L1, GTV - Source - Analysis L2 - Auto-reports', 2: 'GTV - Source - Analysis L1, GTV - Source - Analysis L2 - Performance Reviews', 3: 'GTV - Source - Dogfood', 4: 'GTV - Source - QA Testing', 5: 'GTV - Source - VIP/Googler/TVC'}


In [None]:
def predict_text(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    The text is tokenized (truncated to at most 128 tokens), moved to ``device``
    and run through the model without gradient tracking. The logits are turned
    into probabilities with a softmax over the last dimension.

    Args:
        text: the input to classify. The ``.item()`` call below requires a single
            prediction, so this is expected to be one text rather than a batch.

    Returns:
        A tuple ``(predicted_class_idx, predicted_label, probabilities)`` where
        ``predicted_label`` is looked up in the module-level ``label_map`` and
        ``probabilities`` is the squeezed softmax output converted to a list.
    """
    encoded = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors='pt')

    # Only the token ids and the attention mask are fed to the model.
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
    }

    with torch.no_grad():
        logits = model(**model_inputs)

    probs = torch.softmax(logits, dim=-1)

    # Index of the most probable class, as a plain Python int.
    predicted_class_idx = torch.max(probs, dim=-1).indices.item()

    return predicted_class_idx, label_map[predicted_class_idx], probs.squeeze().tolist()


In [40]:
# Example usage: classify one title and print the predicted class index,
# its label name and the full list of class probabilities.
text = "/m/0kt_rt don’t have spanish audio track in all available watch actions"
predicted_class_idx, predicted_label, probabilities = predict_text(text)
print(f'Predicted class index: {predicted_class_idx}')
print(f'Predicted label: {predicted_label}')
print(f'Class probabilities: {probabilities}')


Predicted class index: 5
Predicted label: GTV - Source - VIP/Googler/TVC
Class probabilities: [0.0004672050126828253, 0.0007958764908835292, 0.0005094915977679193, 0.0074640922248363495, 0.005399305839091539, 0.9853641390800476]


In [37]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): nothing in the visible notebook reads from or writes to the
# mounted drive — confirm this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# Hand the saved checkpoint file to the Colab download helper.
# NOTE(review): this presumably is the 'bert_model.pth' written by torch.save during
# training — it assumes the notebook's working directory is /content; confirm.
from google.colab import files
files.download('/content/bert_model.pth')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>