In [5]:
# Multiclass classification project for NEWS categories
# Author: Muhammad Humayun Khan


import pandas as pd
import numpy as np

# Location of the BBC news CSV, given relative to the current working directory
dataset_path = '../datasets/bbc_news_multiclass_classification/bbc-text.csv'

# Load the CSV with pandas' default settings (no explicit `encoding` such as latin-1 is passed)
df = pd.read_csv(dataset_path)

In [6]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [8]:
# (number of rows, number of columns) of the loaded frame
df.shape

(2225, 2)

In [9]:
# Look at 10 randomly chosen rows
# NOTE(review): no random_state is passed, so a different sample is drawn on every run
df.sample(10)

Unnamed: 0,category,text
1732,tech,mobiles not media players yet mobiles are no...
79,business,khodorkovsky ally denies charges a close assoc...
2061,politics,turkey deal to help world peace a deal bring...
94,politics,amnesty chief laments war failure the lack of ...
1861,sport,capriati out of australian open jennifer capri...
1771,tech,apple laptop is greatest gadget the apple po...
1088,business,germany calls for eu reform german chancellor ...
457,tech,podcasts mark rise of diy radio an apple ipod ...
1305,sport,parry relishes anfield challenge bbc sport ref...
1160,entertainment,csi shows give unrealistic view people have ...


In [13]:
# Count the missing values in each column
df.isnull().sum()

category    0
text        0
dtype: int64

In [14]:
# Inspect the column names and how many articles fall into each category
print("Columns:", df.columns)
print("\nCategory Distribution:\n", df['category'].value_counts())

Columns: Index(['category', 'text'], dtype='object')

Category Distribution:
 category
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64


In [15]:
# Text preprocessing step
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Lazily-built NLTK resources shared by every preprocess_text call (filled on first use)
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, building them only on the first call."""
    if not _PREPROCESS_RESOURCES:
        # Loading the stop-word corpus and constructing the lemmatizer is
        # loop-invariant work, so do it once instead of once per document.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


# Define the function for text preprocessing
def preprocess_text(text):
    """Normalize one document for TF-IDF vectorization.

    Steps, in order: lowercase, delete every character that is not ``a-z`` or
    whitespace, tokenize, drop English stop words, lemmatize each remaining
    token, and join the tokens back with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty string if nothing
        survives the filtering).

    Notes
    -----
    NOTE(review): the NLTK tokenizer, stop-word list and lemmatizer rely on
    NLTK data packages; no ``nltk.download`` call is visible in this notebook,
    so confirm that the environment already provides them.
    """
    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercase first so the a-z character class below matches every letter
    text = text.lower()

    # Remove punctuation, numbers, and special characters.
    # Characters are deleted (not replaced by a space), so "e-mail" becomes
    # "email"; non-ASCII letters are dropped as well.
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(tokens)



In [16]:
# Apply the text preprocessing to every article and store the result in a new `clean_text` column
df['clean_text'] = df['text'].apply(preprocess_text)
# Show the raw and the cleaned text side by side as a quick sanity check
df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,tv future in the hands of viewers with home th...,tv future hand viewer home theatre system plas...
1,worldcom boss left books alone former worldc...,worldcom bos left book alone former worldcom b...
2,tigers wary of farrell gamble leicester say ...,tiger wary farrell gamble leicester say rushed...
3,yeading face newcastle in fa cup premiership s...,yeading face newcastle fa cup premiership side...
4,ocean s twelve raids box office ocean s twelve...,ocean twelve raid box office ocean twelve crim...


In [17]:
# Vectorize the cleaned text with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Limit the vocabulary to at most 5000 features
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the cleaned text and transform it into the feature matrix X
# NOTE(review): the fit uses every row of df, i.e. it happens before the train/test
# split performed later in the notebook, so the held-out documents also influence the
# vocabulary and IDF weights. Consider fitting on the training split only.
X = vectorizer.fit_transform(df['clean_text'])

# Use the category column as the target y
y = df['category']

print("T-IDF Shape", X.shape)



T-IDF Shape (2225, 5000)


In [18]:
# Model training: try logistic regression, a robust choice for text classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Split the data: 20% is held out for testing, stratified on y, with a fixed
# random_state so the split is repeatable.
# NOTE(review): X comes from a vectorizer that was fitted on all rows, so the test
# rows already contributed to the TF-IDF statistics - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train the model
model = LogisticRegression(max_iter=1000, class_weight='balanced')  # Handles any imbalance
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate: overall accuracy, per-class precision/recall/F1, and the confusion matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9820224719101124

Classification Report:
                precision    recall  f1-score   support

     business       0.99      0.96      0.98       102
entertainment       0.97      1.00      0.99        77
     politics       0.98      0.96      0.97        84
        sport       1.00      1.00      1.00       102
         tech       0.96      0.99      0.98        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445


Confusion Matrix:
 [[ 98   1   2   0   1]
 [  0  77   0   0   0]
 [  0   1  81   0   2]
 [  0   0   0 102   0]
 [  1   0   0   0  79]]


In [19]:
# With the logistic regression model trained, expose a helper that classifies new input text
def predict_news_category(text):
    """Predict the news category of a single raw text.

    The text goes through the same pipeline used for training: the
    notebook-level ``preprocess_text`` helper, the already-fitted ``vectorizer``
    and the trained ``model``. Reusing the shared helper (instead of a private
    copy of it) keeps inference-time cleaning identical to training-time
    cleaning.

    Parameters
    ----------
    text : str
        Raw, uncleaned text such as a headline or an article body.

    Returns
    -------
    The first (and only) element of ``model.predict`` for this text, i.e. the
    predicted category label.
    """
    # Step 1: Preprocess with the shared helper used to build `clean_text`
    cleaned_text = preprocess_text(text)

    # Step 2: TF-IDF transform (transform only - the fitted vocabulary is reused);
    # the vectorizer expects an iterable of documents, hence the one-element list
    vectorized = vectorizer.transform([cleaned_text])

    # Step 3: Predict
    prediction = model.predict(vectorized)

    return prediction[0]


In [None]:
# Try the classifier on a handful of hand-written headlines, printing one prediction per line
sample_headlines = (
    "The stock market crashes amid global uncertainty",
    "The football team won the championship after a tense final",
    "The government announced new education policies",
    "Tech giants are investing heavily in artificial intelligence",
    "The new Marvel movie breaks all box office records",
)
for headline in sample_headlines:
    print(predict_news_category(headline))

# NOTE: the fourth headline should be classified as tech, but the model predicts business


business
sport
politics
business
entertainment
