In [3]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
sns.set_theme()

from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

## Data Import ##

In [5]:
# Read both splits from CSV files located next to the notebook.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

## Text Cleaning ##

### Remove URL ###

In [7]:
# Drop the article URL column. errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because 'url' was already
# removed by the first one.
train = train.drop(columns='url', errors='ignore')
train

Unnamed: 0,headlines,description,content,category
0,RBI revises definition of politically-exposed ...,The central bank has also asked chairpersons a...,The Reserve Bank of India (RBI) has changed th...,business
1,NDTV Q2 net profit falls 57.4% to Rs 5.55 cror...,NDTV's consolidated revenue from operations wa...,Broadcaster New Delhi Television Ltd on Monday...,business
2,"Akasa Air ‘well capitalised’, can grow much fa...",The initial share sale will be open for public...,Homegrown server maker Netweb Technologies Ind...,business
3,India’s current account deficit declines sharp...,The current account deficit (CAD) was 3.8 per ...,India’s current account deficit declined sharp...,business
4,"States borrowing cost soars to 7.68%, highest ...",The prices shot up reflecting the overall high...,States have been forced to pay through their n...,business
...,...,...,...,...
5515,"Samsung sends out invites for ‘Unpacked 2024’,...",Samsung is most likely to announce next-genera...,Samsung plans to reveal the next-generation fl...,technology
5516,Google Pixel 8 Pro accidentally appears on off...,The Pixel 8 Pro will most likely carry over it...,Google once again accidentally gave us a glimp...,technology
5517,Amazon ad on Google Search redirects users to ...,Clicking on the real looking Amazon ad will op...,A new scam seems to be making rounds on the in...,technology
5518,"Elon Musk’s X, previously Twitter, now worth l...","Elon Musk's X, formerly Twitter, has lost more...",More than a year after Elon Musk acquired Twit...,technology


In [9]:
# Drop the article URL column from the test split as well. errors='ignore'
# keeps this cell re-runnable: without it a second execution raises KeyError
# because 'url' was already removed by the first one.
test = test.drop(columns='url', errors='ignore')
test

Unnamed: 0,headlines,description,content,category
0,NLC India wins contract for power supply to Ra...,State-owned firm NLC India Ltd (NLCIL) on Mond...,State-owned firm NLC India Ltd (NLCIL) on Mond...,business
1,SBI Clerk prelims exams dates announced; admit...,SBI Clerk Prelims Exam: The SBI Clerk prelims ...,SBI Clerk Prelims Exam: The State Bank of Indi...,education
2,"Golden Globes: Michelle Yeoh, Will Ferrell, An...","Barbie is the top nominee this year, followed ...","Michelle Yeoh, Will Ferrell, Angela Bassett an...",entertainment
3,"OnePlus Nord 3 at Rs 27,999 as part of new pri...",New deal makes the OnePlus Nord 3 an easy purc...,"In our review of the OnePlus Nord 3 5G, we pra...",technology
4,Adani family’s partners used ‘opaque’ funds to...,Citing review of files from multiple tax haven...,Millions of dollars were invested in some publ...,business
...,...,...,...,...
1995,Sri Lankan government renews Lanka IOC’s petro...,The licence originally issued in 2003 was to e...,The Sri Lankan government has renewed the petr...,business
1996,Kia launches new Seltos with price starting at...,"Shares of Infosys, India's second-largest IT c...","Shares of Infosys, India’s second-largest IT c...",business
1997,Rebel Moon movie review: Zack Snyder’s spectac...,Rebel Moon movie review: Despite echoes of Lag...,Whatever the tonnage of the cash-laden trucks ...,entertainment
1998,ChatGPT most popular AI tool with 14.6 billion...,Coming in at a very distant second place was C...,ChatGPT has emerged as the undisputed winner w...,technology


In [11]:
# Separate the target label ('category') from the remaining input columns,
# once for the training split and once for the test split.
X_train, y_train = train.drop(columns='category'), train['category']
X_test, y_test = test.drop(columns='category'), test['category']

### Remove all punctuation ###

In [13]:
# The list holds references (not copies) to both feature frames, so the
# in-place cleaning loop further down modifies X_train and X_test themselves.
datasets = [X_train, X_test]

In [15]:
def remove_punctuation_and_special_characters(text):
  """Replaces punctuation and special characters in a string with spaces.

  Every character that is neither a word character (``\\w``: letters, digits
  and the underscore) nor whitespace is replaced by a single space. Characters
  are replaced rather than deleted so that tokens separated only by
  punctuation (e.g. ``"well-known"``) do not get glued together. Note that
  underscores are kept because they belong to ``\\w``.

  Args:
    text: The input value. Normally a string; ``None`` and float ``NaN``
      (how pandas represents a missing cell) are treated as empty text, and
      any other non-string value is converted with ``str()`` first.

  Returns:
    The cleaned string. Missing values yield ``''``.
  """
  # Missing cell: None, or NaN (the only float that is not equal to itself).
  # Without this guard re.sub raises TypeError on such values.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):
    text = str(text)
  pattern = r'[^\w\s]'  # Matches any character that is NOT a word character (\w) or whitespace (\s)
  return re.sub(pattern, ' ', text)

In [17]:
def clean_dataframe_column_string(df, column_name):
    """Strip punctuation from one column of a DataFrame, in place.

    Each value of ``df[column_name]`` is passed through
    ``remove_punctuation_and_special_characters`` and the column is
    overwritten with the results.

    Args:
        df: DataFrame to modify (it is mutated, not copied).
        column_name: Name of the column to clean.

    Returns:
        The same ``df`` object that was passed in.
    """
    cleaned_values = df[column_name].map(remove_punctuation_and_special_characters)
    df[column_name] = cleaned_values
    return df

In [19]:
# Clean only the three free-text columns that the rest of the notebook works
# with (they are the ones normalised and vectorised below). Iterating over
# every column of the frame would feed any non-text column (ids, numbers)
# into the regex-based cleaner.
for df in datasets:
    for column in ('headlines', 'description', 'content'):
        clean_dataframe_column_string(df, column)

In [21]:
# Make sure every text column holds plain strings with no missing values.
# fillna('') has to run BEFORE astype(str): casting first turns NaN into the
# literal string 'nan', after which fillna has nothing left to replace and
# 'nan' would be counted as a real token by the vectorizer.
for frame in (X_train, X_test):
    for text_column in ('headlines', 'description', 'content'):
        frame[text_column] = frame[text_column].fillna('').astype(str)

### CountVectorizer ###

In [23]:
def vectorize_news_data(X_train, X_test=None):
    """Vectorizes headlines, description, and content separately and combines them.

    A dedicated ``CountVectorizer`` is fitted for each text column, so every
    column gets its own vocabulary and the same word appearing in a headline
    and in the article content ends up in two different output columns. The
    per-column count matrices are then stacked horizontally.

    Each vectorizer drops English stop words, terms that occur in fewer than
    2 training documents (``min_df=2``) and terms that occur in more than half
    of the training documents (``max_df=0.5``).

    Args:
      X_train: Training data DataFrame with the columns 'headlines',
        'description' and 'content'. The vocabularies are learned from it.
      X_test: Optional test data DataFrame with the same columns. If provided,
        it is transformed using the vectorizers fitted on the training data
        (it never influences the vocabulary).

    Returns:
        A tuple containing:
        - X_train_combined: Combined sparse matrix for training data.
        - X_test_combined: Combined sparse matrix for test data (if X_test is provided), otherwise None.
    """
    features = ['headlines', 'description', 'content']  # Features to vectorise

    transformed_features_train = []
    transformed_features_test = []

    for feature in features:
        vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.5)

        # Learn the vocabulary from the training column and encode it in one pass.
        transformed_features_train.append(vectorizer.fit_transform(X_train[feature]))

        if X_test is not None:
            # Re-use the training vocabulary; words unseen in training are ignored.
            transformed_features_test.append(vectorizer.transform(X_test[feature]))

    X_train_combined = hstack(transformed_features_train)
    X_test_combined = hstack(transformed_features_test) if X_test is not None else None
    return X_train_combined, X_test_combined

In [25]:
# Build the stacked bag-of-words matrices for both splits.
X_train_combined, X_test_combined = vectorize_news_data(
    X_train=X_train,
    X_test=X_test,
)

## Defining Models ##

In [29]:
# Fix the seed so results are reproducible across Restart & Run All.
# Per the scikit-learn documentation, DecisionTreeClassifier randomly permutes
# features at each split, and LogisticRegression uses random_state to shuffle
# the data for the 'liblinear' solver (which is part of the search grid).
RANDOM_STATE = 42

# max_iter is raised above the library default to give the solvers more
# iterations to converge on the high-dimensional count features.
logistic_regression = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
decision_tree = DecisionTreeClassifier(random_state=RANDOM_STATE)

## Defining hyperparameters for tuning ##

In [34]:
# Search space for logistic regression: regularisation strength C
# and the optimisation solver.
# NOTE(review): newer scikit-learn releases reportedly warn about 'liblinear'
# on multiclass targets - confirm against the installed version.
param_grid_lr = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'lbfgs'],
)

# Search space for the decision tree: depth limit (None = no limit) and the
# minimum sample counts required to split a node / to form a leaf.
param_grid_dt = dict(
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
)

## Performing Grid Search with Cross-Validation for Logistic Regression ##

In [37]:
# Exhaustive search over the logistic-regression grid with 5-fold
# cross-validation, using all available CPU cores (n_jobs=-1).
grid_search_lr = GridSearchCV(
    estimator=logistic_regression,
    param_grid=param_grid_lr,
    cv=5,
    n_jobs=-1,
)
grid_search_lr.fit(X_train_combined, y_train)

## Performing Grid Search with Cross-Validation for Decision Tree ##

In [41]:
# Exhaustive search over the decision-tree grid with 5-fold
# cross-validation, using all available CPU cores (n_jobs=-1).
grid_search_dt = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid_dt,
    cv=5,
    n_jobs=-1,
)
grid_search_dt.fit(X_train_combined, y_train)

## Getting the best models ##

In [49]:
# Pull the best estimator found by each grid search.
best_lr, best_dt = (
    grid_search_lr.best_estimator_,
    grid_search_dt.best_estimator_,
)

## Making predictions on the test set ##

In [53]:
# Predict categories for the held-out test matrix with each tuned model.
y_pred_lr, y_pred_dt = (
    best_lr.predict(X_test_combined),
    best_dt.predict(X_test_combined),
)

## Print classification reports ##

In [56]:
# Per-class precision / recall / F1 on the test set, one report per model.
reports_to_print = [
    ("Logistic Regression Classification Report:", y_pred_lr),
    ("Decision Tree Classification Report:", y_pred_dt),
]
for report_title, predictions in reports_to_print:
    print(report_title)
    print(classification_report(y_test, predictions))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

     business       0.98      0.96      0.97       400
    education       0.99      0.99      0.99       400
entertainment       1.00      0.99      0.99       400
       sports       0.99      0.99      0.99       400
   technology       0.95      0.97      0.96       400

     accuracy                           0.98      2000
    macro avg       0.98      0.98      0.98      2000
 weighted avg       0.98      0.98      0.98      2000

Decision Tree Classification Report:
               precision    recall  f1-score   support

     business       0.83      0.88      0.85       400
    education       0.92      0.93      0.93       400
entertainment       0.97      0.94      0.96       400
       sports       0.93      0.87      0.90       400
   technology       0.82      0.85      0.83       400

     accuracy                           0.89      2000
    macro avg       0.90      0.89 