<a href="https://colab.research.google.com/github/ks-chauhan/HCL-Training-Project/blob/main/Notebooks/Training_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from,
# and save the trained model to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import transformers
import torch
# Print whether PyTorch reports an available CUDA device in this runtime.
print(torch.cuda.is_available())

In [None]:
# Load the headlines CSV from the mounted Drive folder into a DataFrame.
# Later cells use its "tag", "Topic" and "headline" columns.
df = pd.read_csv("/content/drive/MyDrive/HCL Training Project/guardian_headlines.csv")

In [None]:
# Drop the "tag" column, which is not used by any later cell.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df = df.drop(columns="tag", errors="ignore")

In [None]:
# Display the DataFrame as this cell's output for a quick visual check.
df

In [None]:
# Encoding the labels
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the Topic column and keep the resulting integer codes
# in a new "label" column.
le = LabelEncoder()
encoded_topics = le.fit_transform(df['Topic'])
df['label'] = encoded_topics

# The number of distinct classes seen by the encoder sizes the classifier head.
num_labels = len(le.classes_)
print("Number of unique labels:", num_labels)

In [None]:
# Splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# 20% of the rows are held out for testing. The split is stratified on the
# encoded label and uses a fixed random_state so it is repeatable.
train_df, test_df = train_test_split(
    df,
    stratify=df['label'],
    test_size=0.2,
    random_state=18,
)

In [None]:
# Converting to Hugging Face Datasets
from datasets import Dataset

# Only the text and its integer label are needed for training.
# preserve_index=False: after the shuffled split the frames no longer have a
# plain RangeIndex, and from_pandas would otherwise store that index as an
# extra "__index_level_0__" column in each dataset.
train_dataset = Dataset.from_pandas(train_df[['headline', 'label']], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[['headline', 'label']], preserve_index=False)

In [None]:
# Loading the pre-trained model and tokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the LabelEncoder's id <-> class-name mapping in the model config so the
# saved model reports topic names instead of generic "LABEL_<i>" ids; the
# encoder itself is never persisted by this notebook.
# str() keeps the names JSON-serializable whatever dtype the classes have.
id2label = {idx: str(class_name) for idx, class_name in enumerate(le.classes_)}
label2id = {class_name: idx for idx, class_name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
def tokenize(x):
    """Tokenize the "headline" field of ``x`` with the notebook's tokenizer.

    Truncation is enabled and every sequence is padded to ``max_length`` (64),
    so all encoded examples come out the same length.

    Args:
        x: Mapping with a "headline" entry. It is used with
            ``Dataset.map(..., batched=True)`` below, so this is presumably a
            batch of headline strings — confirm if reused elsewhere.

    Returns:
        Whatever the tokenizer call returns for the given headlines.
    """
    headlines = x["headline"]
    return tokenizer(headlines, max_length=64, padding="max_length", truncation=True)

# Tokenize both splits in batches, then drop the raw "headline" text column,
# which is no longer needed once the tokenizer outputs have been added.
train_dataset = train_dataset.map(tokenize, batched=True).remove_columns(['headline'])
test_dataset = test_dataset.map(tokenize, batched=True).remove_columns(['headline'])

# Switch both datasets to the 'torch' output format.
train_dataset.set_format('torch')
test_dataset.set_format('torch')

In [None]:
# Setting up training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and other run outputs are written.
    output_dir="./results",
    # Evaluate and checkpoint on the same schedule: once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Training length and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log every 50 steps.
    logging_steps=50,
)

In [None]:
# Defining the compute_metrics function
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(eval_pred):
    """Compute macro-averaged F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(predictions, labels)``.
            The predictions are presumably per-class scores with classes
            along axis 1 — confirm against the model output.

    Returns:
        dict with keys ``"f1"`` (macro average) and ``"accuracy"``.
    """
    scores, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted = np.argmax(scores, axis=1)
    return {
        "f1": f1_score(labels, predicted, average="macro"),
        "accuracy": accuracy_score(labels, predicted),
    }

In [None]:
# Setting up the Trainer
from transformers import Trainer

# Wire together the model, the run configuration, both data splits and the
# metric function. The held-out test split doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Start training with the Trainer configured above; the call's return value
# is shown as this cell's output.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset (the test split); the call's return
# value is shown as this cell's output.
trainer.evaluate()

In [None]:
# Plotting Confusion Matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predict on the held-out split and reduce the per-class scores to class ids.
predictions = trainer.predict(test_dataset)

y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Pin the label order explicitly. Otherwise a class that is absent from both
# y_true and y_pred would be dropped from the matrix, and row/column positions
# would stop lining up with the encoder's label ids.
label_ids = np.arange(len(le.classes_))
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

# Label axes and ticks so the figure is readable on its own.
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set(
    xlabel="Predicted label",
    ylabel="True label",
    title="Confusion matrix (test set)",
)
plt.show()

# The arrow was previously mis-encoded as "â†’".
print("\nLabel ID → Class Name Mapping:")
for idx, class_name in enumerate(le.classes_):
    print(f"{idx} : {class_name}")

In [None]:
# Save the model and tokenizer

# Both artifacts are written to the same Drive folder, so name the path once.
final_model_dir = "/content/drive/MyDrive/HCL Training Project/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)