# 📰 News Headline Classification
Multi-class classification using ML and Deep Learning.

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from wordcloud import WordCloud
# Fetch the NLTK 'stopwords' corpus; the preprocessing cell reads it through
# nltk.corpus.stopwords.
nltk.download('stopwords')

## Load Dataset

In [None]:
# The file holds one JSON record per line; only the headline text and its
# category label are used in this notebook.
df = pd.read_json(
    'News_Category_Dataset_v3.json',
    lines=True,
)[['headline', 'category']]
df.head()

## Filter Categories

In [None]:
# Keep only the five target classes and renumber the surviving rows from 0.
selected_categories = ['POLITICS', 'SPORTS', 'TECH', 'BUSINESS', 'ENTERTAINMENT']
keep_mask = df['category'].isin(selected_categories)
df = df.loc[keep_mask].reset_index(drop=True)
# Class balance after filtering.
df['category'].value_counts()

## Text Preprocessing

In [None]:
from nltk.corpus import stopwords
# Stored as a set because clean_text tests every word of every headline for
# membership.
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """Normalise a headline into lower-case, stop-word-free tokens.

    Every run of non-letter characters is replaced by a single space rather
    than deleted, so words that are joined only by punctuation or by
    whitespace other than a plain space (e.g. "state-of-the-art", "win/loss",
    a tab) stay separate tokens instead of being fused into one.

    Args:
        text: Raw headline. Any value that is not a ``str`` (``None``,
            NaN, ...) is treated as an empty headline.

    Returns:
        The remaining lower-case words joined by single spaces, or ``''``
        when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    # After lower-casing, a-z are the only characters kept; everything else
    # acts as a token separator.
    letters_only = re.sub(r'[^a-z]+', ' ', text.lower())
    return ' '.join(w for w in letters_only.split() if w not in stop_words)
# Store a cleaned copy of every headline and preview it beside the original.
df['clean_headline'] = df['headline'].map(clean_text)
df.loc[:, ['headline', 'clean_headline']].head()

## Label Encoding

In [None]:
# Learn the category -> integer mapping, then write the integer ids into a
# new 'label' column.
le = LabelEncoder().fit(df['category'])
df['label'] = le.transform(df['category'])
# Category names known to the encoder.
le.classes_

## Train-Test Split

In [None]:
# Hold out 20% of the rows for testing, stratified on the label, with a fixed
# seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_headline'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# 🤖 Model 1: TF-IDF + Logistic Regression

In [None]:
# Baseline: TF-IDF features (at most 5000 terms) fed to logistic regression.
tfidf = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training split only; the test split is
# transformed with the already-fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

lr_model = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

# 🧠 Model 2: LSTM

In [None]:
# The tokenizer is fitted on the training headlines only.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

# Step 1: headlines -> lists of integer word ids (train, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
# Step 2: bring every id list to a common length of 30.
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=30) for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Layer sizes are read from the objects that define them instead of being
# repeated as literals, so changing the tokenizer vocabulary, the padding
# length or the set of categories cannot leave the network out of sync with
# its inputs and labels.
vocab_size = tokenizer.num_words      # same cap the tokenizer was built with
sequence_length = X_train_pad.shape[1]  # width of the padded id matrix
num_classes = len(le.classes_)        # one output unit per encoded label

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=sequence_length),
    LSTM(128),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])
# Labels are plain integer ids (not one-hot), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 5 epochs in mini-batches of 64, keeping 10% of the training data
# aside for validation; the returned object is kept in `history`.
history = model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
)

In [None]:
# The model outputs one score per class for every headline; the predicted
# label is the index of the highest score in each row.
y_pred_lstm = model.predict(X_test_pad).argmax(axis=1)

# 📊 Evaluation

In [None]:
# Compute all metrics first, then print them: overall accuracy for both
# models, followed by the per-class reports.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lstm_accuracy = accuracy_score(y_test, y_pred_lstm)
lr_report = classification_report(y_test, y_pred_lr, target_names=le.classes_)
lstm_report = classification_report(y_test, y_pred_lstm, target_names=le.classes_)

print('Logistic Regression Accuracy:', lr_accuracy)
print('LSTM Accuracy:', lstm_accuracy)
print('\nLR Report\n', lr_report)
print('\nLSTM Report\n', lstm_report)

## Confusion Matrix

In [None]:
# Rows of the matrix are true classes and columns are predicted classes.
# Passing the encoded ids explicitly pins the row/column order to
# le.classes_, which is what labels the ticks.
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=le.transform(le.classes_))

plt.figure(figsize=(8,6))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
# Name the axes so the reader can tell predictions from ground truth.
plt.xlabel('Predicted category')
plt.ylabel('True category')
plt.title('Logistic Regression Confusion Matrix')
plt.show()

In [None]:
# Rows of the matrix are true classes and columns are predicted classes.
# Passing the encoded ids explicitly pins the row/column order to
# le.classes_, which is what labels the ticks.
cm_lstm = confusion_matrix(y_test, y_pred_lstm, labels=le.transform(le.classes_))

plt.figure(figsize=(8,6))
sns.heatmap(cm_lstm, annot=True, fmt='d', cmap='Greens', xticklabels=le.classes_, yticklabels=le.classes_)
# Name the axes so the reader can tell predictions from ground truth.
plt.xlabel('Predicted category')
plt.ylabel('True category')
plt.title('LSTM Confusion Matrix')
plt.show()

# ☁️ Word Clouds

In [None]:
# One figure per selected category, built from that category's cleaned
# headlines.
for cat in selected_categories:
    # All cleaned headlines of this category, joined into one string.
    text = ' '.join(df.loc[df['category'] == cat, 'clean_headline'])

    wc = WordCloud(background_color='white', width=800, height=400)
    wc.generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(cat)
    plt.show()

# 📈 Model Comparison

In [None]:
# Bar chart of test accuracy for both models on a fixed 0-1 axis.
models = ['Logistic Regression', 'LSTM']
accuracies = [accuracy_score(y_test, preds) for preds in (y_pred_lr, y_pred_lstm)]

plt.bar(models, accuracies)
plt.title('Model Accuracy Comparison')
plt.ylim(0, 1)
plt.show()

## ✅ Conclusion
Both models perform well. The LSTM may capture context better, while Logistic Regression is faster and a strong baseline.