In [None]:
# Install the notebook's third-party dependencies into the active kernel's environment.
# NOTE(review): versions are unpinned, so installs may differ over time — consider pinning.
%pip install pandas transformers datasets evaluate accelerate torch scikit-learn

In [None]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
import evaluate
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Notebook configuration.
# The two filesystem paths keep their original local defaults but can be overridden
# without editing the notebook by setting the NEWS_CSV_PATH / NEWS_MODEL_DIR
# environment variables before the kernel starts.
CSV_FILE_PATH = os.environ.get(
    "NEWS_CSV_PATH",
    '/Users/himanshusharma/Personal_Code/FineTune/data/News_Category.csv',
)
# Hugging Face checkpoint used both for the tokenizer and as the fine-tuning base.
MODEL_NAME = "distilbert-base-uncased"
# Directory that receives training checkpoints and the final saved model.
OUTPUT_MODEL_DIR = os.environ.get(
    "NEWS_MODEL_DIR",
    '/Users/himanshusharma/Personal_Code/FineTune/news_classifier_model',
)

In [None]:
# Pick the best available accelerator: CUDA GPU first, then Apple-silicon MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # The `getattr` guard keeps this working on torch builds that predate the MPS backend.
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# --- 1. Load and Preprocess Data ---
def load_and_preprocess_data(file_path, test_size=0.2, random_state=42):
    """Read the news CSV and build stratified train/test Hugging Face datasets.

    The CSV must contain ``headline``, ``short_description`` and ``category``
    columns. Headline and description are joined into a single ``text`` column.
    A missing headline *or* description is treated as an empty string, so a row
    is kept as long as it still has some text; only rows with no category or no
    text at all are dropped.

    Args:
        file_path: Path of the CSV file to load.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed passed to ``train_test_split`` for a reproducible split.

    Returns:
        A tuple ``(train_dataset, test_dataset, id_to_category, category_to_id)``
        where both datasets have ``text``, ``category`` and ``label`` columns and
        the two dicts map integer label ids to category names and back.
    """
    raw_df = pd.read_csv(file_path)

    # A row without a category cannot be used as a training example.
    labelled = raw_df.dropna(subset=['category'])

    # Fill NaN before concatenating: `str + NaN` is NaN, which would otherwise
    # discard every row that merely lacks one of the two text fields.
    headline = labelled['headline'].fillna('').astype(str)
    description = labelled['short_description'].fillna('').astype(str)

    # Build a fresh frame (rather than mutating a slice of the raw one) so no
    # SettingWithCopy ambiguity arises.
    df = pd.DataFrame({
        'text': (headline + " " + description).str.strip(),
        'category': labelled['category'],
    })
    df = df[df['text'] != ''].reset_index(drop=True)

    # Map categories to integer ids in order of first appearance.
    unique_categories = df['category'].unique().tolist()
    category_to_id = {cat: i for i, cat in enumerate(unique_categories)}
    id_to_category = {i: cat for i, cat in enumerate(unique_categories)}
    df = df.assign(label=df['category'].map(category_to_id))

    # Stratify on the category so both splits keep the same class proportions.
    train_df, test_df = train_test_split(
        df, test_size=test_size, stratify=df['category'], random_state=random_state
    )

    # Convert to Hugging Face datasets; the pandas index carries no information.
    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

    return train_dataset, test_dataset, id_to_category, category_to_id


In [None]:
# --- 2. Tokenize Data ---
# Tokenizers already loaded in this kernel, keyed by checkpoint name, so that
# `Dataset.map` does not reload the tokenizer from disk for every batch.
_TOKENIZER_CACHE = {}


def tokenize_function(examples):
    """Tokenize a batch of examples for ``MODEL_NAME``.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens per
        example. Sequences are left unpadded: padding per training batch is the
        data collator's job (``Trainer`` defaults to ``DataCollatorWithPadding``
        when it is given a tokenizer), which avoids padding every example to the
        longest text of a whole map batch.
    """
    tokenizer = _TOKENIZER_CACHE.get(MODEL_NAME)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        _TOKENIZER_CACHE[MODEL_NAME] = tokenizer
    return tokenizer(examples["text"], truncation=True, max_length=512)


In [None]:

# --- 3. Train the Model ---
def train_model(train_dataset, test_dataset, id_to_category):
    """Fine-tune ``MODEL_NAME`` as a sequence classifier and save it.

    Uses the notebook-level ``MODEL_NAME``, ``OUTPUT_MODEL_DIR`` and ``device``.
    The model is evaluated and checkpointed after every epoch; the checkpoint
    with the best macro-F1 is reloaded at the end and saved to
    ``OUTPUT_MODEL_DIR``.

    Args:
        train_dataset: Dataset with ``text`` and ``label`` columns used for training.
        test_dataset: Dataset with the same columns, used for per-epoch evaluation.
        id_to_category: Dict mapping integer label id to category name.

    Returns:
        A tuple ``(model, trainer, id_to_category)``.
    """
    # Store the label names in the model config so the saved model is
    # self-describing (otherwise it only knows LABEL_0, LABEL_1, ...).
    num_labels = len(id_to_category)
    id2label = {int(idx): str(cat) for idx, cat in id_to_category.items()}
    label2id = {cat: idx for idx, cat in id2label.items()}
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
    ).to(device)

    # Tokenize the datasets
    tokenized_train = train_dataset.map(tokenize_function, batched=True)
    tokenized_test = test_dataset.map(tokenize_function, batched=True)

    # Define evaluation metrics
    metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        """Return the macro-averaged F1 of the argmax predictions."""
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        # Using 'macro' F1 score to treat all categories equally
        return metric.compute(predictions=predictions, references=labels, average="macro")

    # Define training arguments (hyperparameters)
    training_args = TrainingArguments(
        output_dir=OUTPUT_MODEL_DIR,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
    )

    # Newer transformers releases take the tokenizer as `processing_class` and
    # deprecate `tokenizer`; pass it under whichever name this version accepts.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    # Initialize the Trainer. Giving it the tokenizer makes it pad each batch
    # dynamically and save the tokenizer alongside the model.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        compute_metrics=compute_metrics,
        **{tokenizer_kwarg: tokenizer},
    )

    # Train the model
    trainer.train()

    # Save the final (best) model
    trainer.save_model(OUTPUT_MODEL_DIR)
    print(f"Model saved to {OUTPUT_MODEL_DIR}")

    return model, trainer, id_to_category


In [None]:
# --- 4. Use the Model for Prediction ---
def predict_category(text_to_predict, saved_model_path, id_to_category_map=None):
    """Predict the news category of one text or of a batch of texts.

    The tokenizer and model are loaded from ``saved_model_path`` on every call
    and run on the notebook-level ``device``.

    Args:
        text_to_predict: A single string, or an iterable of strings.
        saved_model_path: Directory containing the saved model and tokenizer.
        id_to_category_map: Mapping from label id to category name. Keys may be
            ints or their string form (as produced by a JSON round-trip). When
            omitted, the ``id2label`` mapping stored in the model config is used,
            which is only meaningful if the model was saved with label names.

    Returns:
        The predicted category for a single string input, or a list of
        predicted categories (one per text) for an iterable input.

    Raises:
        KeyError: If a predicted label id is not present in the mapping.
    """
    single_input = isinstance(text_to_predict, str)
    texts = [text_to_predict] if single_input else list(text_to_predict)
    if not texts:
        # Nothing to classify; avoid loading the model for an empty batch.
        return []

    # Load the tokenizer and model from the saved directory
    tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
    model = AutoModelForSequenceClassification.from_pretrained(saved_model_path).to(device)
    model.eval()

    # Preprocess the input text(s); padding aligns the lengths within a batch
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True).to(device)

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # One predicted label id per input text
    predicted_ids = torch.argmax(outputs.logits, dim=1).tolist()

    label_map = id_to_category_map if id_to_category_map is not None else model.config.id2label

    def _lookup(label_id):
        """Resolve a label id, also accepting string keys from JSON-loaded maps."""
        try:
            return label_map[label_id]
        except KeyError:
            return label_map[str(label_id)]

    categories = [_lookup(label_id) for label_id in predicted_ids]
    return categories[0] if single_input else categories

In [None]:
# --- 5. Load and prepare data
# Build the train/test datasets and the label-id <-> category lookups.
train_ds, test_ds, id_to_cat, cat_to_id = load_and_preprocess_data(CSV_FILE_PATH)
# Report how many categories were found (the count, not the whole mapping).
print(f"Loaded data with {len(id_to_cat)} unique categories.")

In [None]:
# --- 6. Train and save the model
# Note: GPU reqd.
# Fine-tunes on the training split, evaluates on the test split, and keeps the
# returned model, trainer and label mapping for later cells.
trained_model, trainer, id_to_cat_map = train_model(
    train_dataset=train_ds,
    test_dataset=test_ds,
    id_to_category=id_to_cat,
)

In [None]:
# --- 7. Test with a new headline/description
# Classify one unseen sentence with the model saved in OUTPUT_MODEL_DIR.
new_text = "Scientists discover new black hole in nearby galaxy, challenging existing theories about cosmic formation."
predicted_category = predict_category(
    text_to_predict=new_text,
    saved_model_path=OUTPUT_MODEL_DIR,
    id_to_category_map=id_to_cat,
)

# Show the input next to the predicted label.
print(f"\nText: '{new_text}'")
print(f"Predicted Category: {predicted_category}")