In [15]:
import pandas as pd
import numpy as np
from nltk.corpus import  stopwords
import re
import nltk 
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Load Data

In [3]:
df = pd.read_csv("bbc_data.csv")

In [4]:
df

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment
...,...,...
2220,Warning over Windows Word files Writing a Mic...,tech
2221,Fast lifts rise into record books Two high-sp...,tech
2222,Nintendo adds media playing to DS Nintendo is...,tech
2223,Fast moving phone viruses appear Security fir...,tech


In [5]:
df['labels'].value_counts()

labels
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

# Preprocessing

In [6]:
def clean(text):
    result = text.lower()  # case folding
    
    punctuation_pattern = r'[^\w\s]'
    result = re.sub(punctuation_pattern, '', result)
    
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    result = url_pattern.sub(r'', result)
    
    html_tags_pattern = r'<.*?>'
    result = re.sub(html_tags_pattern, '', result)

    result = re.sub(r'[0-9]', '', result)   # remove digits
    return result

stop_words = set(stopwords.words("english"))
def remove_stopwords(text):
    word_tokens = text.split()
    result = [word for word in word_tokens if word not in stop_words]
    return ' '.join(result)

lemmatizer = WordNetLemmatizer() 
def lemmatize(text): 
    word_tokens = text.split()
    result = [lemmatizer.lemmatize(word) for word in word_tokens]
    return ' '.join(result)

def preprocess(text):
    result = clean(text)
    result = remove_stopwords(result)
    result = lemmatize(result)
    return result

In [7]:
df['preprocessed'] = df['data'].apply(preprocess)

# Data Splitting

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed'], df['labels'], test_size=0.2, random_state=42)

# Vectorization

In [9]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
print('Number of features:', X_train_tfidf.shape[1])

Number of features: 25245


# Classification

In [11]:
# Train with 10-fold cross validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [12]:
def train_model(X_train, X_test, y_train, y_test, model, param_grid):
    grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

	# Print accuracy scores for each combination
    results = grid_search.cv_results_
    for mean_score, params in zip(results['mean_test_score'], results['params']):
        print(f"Accuracy: {mean_score:.4f}, Parameters: {params}")
    
    # Get the Best Parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

    # Evaluate Final Model on Test Set
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("\nBest Parameters:", grid_search.best_params_)
    print("Accuracy on Test Set:", accuracy)
    print(classification_report(y_test, y_pred))

    return best_model

## Naive Bayes

In [13]:
nb_model = MultinomialNB()
param_grid = {'alpha': [0.1, 0.5, 1, 1.5, 2]}

best_nb_model = train_model(X_train_tfidf, X_test_tfidf, y_train, y_test, nb_model, param_grid)

Accuracy: 0.9742, Parameters: {'alpha': 0.1}
Accuracy: 0.9702, Parameters: {'alpha': 0.5}
Accuracy: 0.9629, Parameters: {'alpha': 1}
Accuracy: 0.9590, Parameters: {'alpha': 1.5}
Accuracy: 0.9551, Parameters: {'alpha': 2}

Best Parameters: {'alpha': 0.1}
Accuracy on Test Set: 0.9797752808988764
               precision    recall  f1-score   support

     business       0.97      0.96      0.97       103
entertainment       1.00      0.98      0.99        84
     politics       0.96      0.97      0.97        80
        sport       1.00      0.99      0.99        98
         tech       0.96      1.00      0.98        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445



## Decision Tree

In [21]:
dt_model = DecisionTreeClassifier()
param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

best_dt_model = train_model(X_train_tfidf, X_test_tfidf, y_train, y_test, dt_model, param_grid)

Accuracy: 0.7713, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 0.7725, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
Accuracy: 0.7730, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Accuracy: 0.7742, Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2}
Accuracy: 0.7663, Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
Accuracy: 0.7646, Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Accuracy: 0.7674, Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}
Accuracy: 0.7646, Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5}
Accuracy: 0.7685, Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Accuracy: 0.8309, Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
Accuracy: 0.8247, Parameters: {'max_depth': 20,

## Random Forests

In [22]:
rf_model = RandomForestClassifier()
param_grid = {
    'n_estimators': [25, 50, 100],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

best_rf_model = train_model(X_train_tfidf, X_test_tfidf, y_train, y_test, rf_model, param_grid)

Accuracy: 0.9146, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 25}
Accuracy: 0.9337, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Accuracy: 0.9388, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.9185, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 25}
Accuracy: 0.9287, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
Accuracy: 0.9410, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy: 0.9079, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 25}
Accuracy: 0.9287, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Accuracy: 0.9331, Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_

## Logistic Regression

In [23]:
lr_model = LogisticRegression()
param_grid = {
    'C' : [0.01, 0.1, 1, 10],
    'max_iter' : [100, 250, 500, 1000]
    
}

best_lr_model = train_model(X_train_tfidf, X_test_tfidf, y_train, y_test, lr_model, param_grid)

Accuracy: 0.4809, Parameters: {'C': 0.01, 'max_iter': 100}
Accuracy: 0.4809, Parameters: {'C': 0.01, 'max_iter': 250}
Accuracy: 0.4809, Parameters: {'C': 0.01, 'max_iter': 500}
Accuracy: 0.4809, Parameters: {'C': 0.01, 'max_iter': 1000}
Accuracy: 0.9146, Parameters: {'C': 0.1, 'max_iter': 100}
Accuracy: 0.9146, Parameters: {'C': 0.1, 'max_iter': 250}
Accuracy: 0.9146, Parameters: {'C': 0.1, 'max_iter': 500}
Accuracy: 0.9146, Parameters: {'C': 0.1, 'max_iter': 1000}
Accuracy: 0.9708, Parameters: {'C': 1, 'max_iter': 100}
Accuracy: 0.9708, Parameters: {'C': 1, 'max_iter': 250}
Accuracy: 0.9708, Parameters: {'C': 1, 'max_iter': 500}
Accuracy: 0.9708, Parameters: {'C': 1, 'max_iter': 1000}
Accuracy: 0.9725, Parameters: {'C': 10, 'max_iter': 100}
Accuracy: 0.9725, Parameters: {'C': 10, 'max_iter': 250}
Accuracy: 0.9725, Parameters: {'C': 10, 'max_iter': 500}
Accuracy: 0.9725, Parameters: {'C': 10, 'max_iter': 1000}

Best Parameters: {'C': 10, 'max_iter': 100}
Accuracy on Test Set: 0.9887640