In [18]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import mlflow
import mlflow.sklearn


In [19]:
# Load training and test data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [20]:
train_df.head()

Unnamed: 0,headlines,description,content,url,category
0,RBI revises definition of politically-exposed ...,The central bank has also asked chairpersons a...,The Reserve Bank of India (RBI) has changed th...,https://indianexpress.com/article/business/ban...,business
1,NDTV Q2 net profit falls 57.4% to Rs 5.55 cror...,NDTV's consolidated revenue from operations wa...,Broadcaster New Delhi Television Ltd on Monday...,https://indianexpress.com/article/business/com...,business
2,"Akasa Air ‘well capitalised’, can grow much fa...",The initial share sale will be open for public...,Homegrown server maker Netweb Technologies Ind...,https://indianexpress.com/article/business/mar...,business
3,India’s current account deficit declines sharp...,The current account deficit (CAD) was 3.8 per ...,India’s current account deficit declined sharp...,https://indianexpress.com/article/business/eco...,business
4,"States borrowing cost soars to 7.68%, highest ...",The prices shot up reflecting the overall high...,States have been forced to pay through their n...,https://indianexpress.com/article/business/eco...,business


In [21]:
# Drop unnecessary column
train_df.drop(columns=["url"], inplace=True)
test_df.drop(columns=["url"], inplace=True)

# Combine text features into one
def combine_text_columns(df):
    return (
        df["headlines"].fillna("") + " " +
        df["description"].fillna("") + " " +
        df["content"].fillna("")
    )

train_df["combined_text"] = combine_text_columns(train_df)
test_df["combined_text"] = combine_text_columns(test_df)

# Define features and target
X = train_df["combined_text"]
y = train_df["category"]


In [22]:
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)
X_test_vectorized = vectorizer.transform(test_df["combined_text"])

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)


In [23]:
mlflow.set_tracking_uri("http://localhost:5000")  # Or your MLflow URI
mlflow.set_experiment("News_Classification")

with mlflow.start_run():
    mlflow.sklearn.autolog()

    model = MultinomialNB()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    print("Accuracy:", accuracy)
    print(classification_report(y_val, y_pred))

    mlflow.log_metric("val_accuracy", accuracy)


2025/05/08 12:50:35 INFO mlflow.tracking.fluent: Experiment with name 'News_Classification' does not exist. Creating a new experiment.


Accuracy: 0.9818840579710145
               precision    recall  f1-score   support

     business       0.99      0.96      0.98       245
    education       0.99      0.99      0.99       274
entertainment       0.97      0.99      0.98       178
       sports       0.98      0.99      0.99       137
   technology       0.97      0.97      0.97       270

     accuracy                           0.98      1104
    macro avg       0.98      0.98      0.98      1104
 weighted avg       0.98      0.98      0.98      1104

🏃 View run auspicious-newt-564 at: http://localhost:5000/#/experiments/195901218684556543/runs/9cfdd50a18394ef7bfd0306baac77712
🧪 View experiment at: http://localhost:5000/#/experiments/195901218684556543
