# News Article Classification Project

## Overview

### Load Dependencies

In [2]:
import string
import warnings

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [4]:
# Load the NLTK English stopword list.
# The corpus is downloaded only when it is not already installed, and quietly,
# so re-running the notebook neither hits the network nor prints the local
# nltk_data path into the cell output.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords", quiet=True)

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nb173419\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Function to clean and preprocess text
def clean_text(text, stopword_set=None):
    """Normalize a headline before vectorization.

    The input is lowercased, split on whitespace, filtered against a stopword
    list, stripped of ASCII punctuation and re-joined with single spaces.

    Parameters
    ----------
    text : object
        Raw headline. Non-string values are coerced with ``str``. ``None`` and
        float NaN (what pandas yields for an empty CSV cell) produce ``""``
        rather than the literal tokens "none" / "nan".
    stopword_set : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` set is
        used.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly empty).
    """
    # Missing values carry no text; NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    blocked = stop_words if stopword_set is None else stopword_set
    strip_punct = str.maketrans("", "", string.punctuation)

    kept = []
    for raw in str(text).lower().split():
        # Check the token with only its outer punctuation trimmed first:
        # stopword lists contain contractions such as "don't", which would no
        # longer match once the apostrophe has been removed.
        if raw.strip(string.punctuation) in blocked:
            continue
        word = raw.translate(strip_punct)  # remove all remaining punctuation
        # Skip tokens that were pure punctuation, and plain stopwords.
        if word and word not in blocked:
            kept.append(word)
    return " ".join(kept)


In [7]:
# Load the dataset: read the training and test splits from CSV files in the working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [8]:
# Add a 'cleaned' column to each split by running clean_text over its 'headlines' column
for frame in (df_train, df_test):
    frame["cleaned"] = frame["headlines"].apply(clean_text)

In [9]:
# Separate each split into model input (cleaned headline text) and target (category label)
X_train, y_train = df_train["cleaned"], df_train["category"]
X_test, y_test = df_test["cleaned"], df_test["category"]

In [10]:
# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Convert text data into TF-IDF vectors: fit on the training text only,
# then reuse the fitted vectorizer to transform the test text
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [11]:
# Create the logistic regression classifier with a higher iteration limit (max_iter=1000)
clf = LogisticRegression(max_iter=1000)

# Train the model on the TF-IDF training vectors and their category labels
clf.fit(X_train_vec, y_train)

In [12]:
# Predict a category for every vectorized headline in the test split
y_pred_test = clf.predict(X_test_vec)

# Print per-class precision, recall and F1 against the true test labels
print(classification_report(y_true=y_test, y_pred=y_pred_test))

               precision    recall  f1-score   support

     business       0.90      0.90      0.90       400
    education       0.95      0.95      0.95       400
entertainment       0.92      0.94      0.93       400
       sports       0.96      0.82      0.89       400
   technology       0.82      0.94      0.88       400

     accuracy                           0.91      2000
    macro avg       0.91      0.91      0.91      2000
 weighted avg       0.91      0.91      0.91      2000



In [13]:
# Function to classify a user provided headline
def classify_text(text):
    """Predict the news category for a single raw headline.

    The headline is cleaned with ``clean_text``, vectorized with the fitted
    notebook-level ``vectorizer`` and passed to the trained ``clf``.

    Parameters
    ----------
    text : str
        Raw headline as typed by the user.

    Returns
    -------
    The first (and only) element of ``clf.predict`` for this headline.

    Warns
    -----
    UserWarning
        When no word of the cleaned headline is in the vectorizer's
        vocabulary. The feature vector is then all zeros, so the returned
        label is determined by the model's intercepts rather than by the text.
    """
    cleaned = clean_text(text)  # Clean input text
    vec = vectorizer.transform([cleaned])  # Vectorize; unknown words are ignored

    # An all-zero row means the model has nothing from the text to go on.
    if vec.nnz == 0:
        warnings.warn(
            "None of the words in the headline are in the model's vocabulary; "
            "the predicted category is not based on the text.",
            stacklevel=2,
        )

    return clf.predict(vec)[0]  # Predict category

# Interactive demo: read one headline from the user and classify it.
# NOTE: input() blocks until something is typed, so this cell needs an interactive kernel.
user_input = input("Enter a headline to classify: ").strip()

if user_input:
    predicted_category = classify_text(user_input)
    print("\nPredicted Category:", predicted_category)
else:
    # An empty headline has no words to classify, so skip the model entirely.
    predicted_category = None
    print("\nNo headline entered; nothing to classify.")

Enter a headline to classify:  rugby



Predicted Category: technology
