In [1]:
# ✅ STEP 1: IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# ✅ STEP 2: LOAD DATA
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)
df = df[["headline", "short_description", "category"]].dropna().drop_duplicates()

# ✅ STEP 3: Create Sentiment Labels (Rule-Based)
def simple_sentiment(text):
    text = text.lower()
    if any(w in text for w in ["win", "great", "love", "happy", "celebration", "joy"]):
        return "positive"
    elif any(w in text for w in ["bad", "sad", "angry", "hate", "tragedy", "alarming"]):
        return "negative"
    else:
        return "neutral"

df["sentiment"] = df["short_description"].apply(simple_sentiment)

# ✅ STEP 4: ENCODING
cat_encoder = LabelEncoder()
df["cat_label"] = cat_encoder.fit_transform(df["category"])
y_cat = to_categorical(df["cat_label"])

sent_encoder = LabelEncoder()
df["sent_label"] = sent_encoder.fit_transform(df["sentiment"])
y_sent = to_categorical(df["sent_label"])

# ✅ STEP 5: TEXT TOKENIZATION
sentences = df["headline"].astype(str).values
max_words = 10000
max_len = 25

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
X = pad_sequences(tokenizer.texts_to_sequences(sentences), maxlen=max_len)

# ✅ STEP 6: SPLIT DATA
X_train, X_test, y_cat_train, y_cat_test, y_sent_train, y_sent_test = train_test_split(
    X, y_cat, y_sent, test_size=0.2, random_state=42
)

# ✅ STEP 7: MULTIPLE OUTPUT MODEL
inp = Input(shape=(max_len,))
x = Embedding(max_words, 64)(inp)
x = LSTM(64)(x)

out_category = Dense(y_cat.shape[1], activation='softmax', name='category')(x)
out_sentiment = Dense(y_sent.shape[1], activation='softmax', name='sentiment')(x)

model = Model(inputs=inp, outputs=[out_category, out_sentiment])

model.compile(
    optimizer='adam',
    loss={'category': 'categorical_crossentropy', 'sentiment': 'categorical_crossentropy'},
    metrics={'category': 'accuracy', 'sentiment': 'accuracy'}
)

# ✅ STEP 8: TRAIN
model.fit(
    X_train,
    {'category': y_cat_train, 'sentiment': y_sent_train},
    validation_split=0.1,
    epochs=5,
    batch_size=256,
    callbacks=[EarlyStopping(patience=2)]
)


Epoch 1/5
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 29ms/step - category_accuracy: 0.2368 - category_loss: 3.0558 - loss: 3.4780 - sentiment_accuracy: 0.8873 - sentiment_loss: 0.4222 - val_category_accuracy: 0.4199 - val_category_loss: 2.2392 - val_loss: 2.6136 - val_sentiment_accuracy: 0.8942 - val_sentiment_loss: 0.3756
Epoch 2/5
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 29ms/step - category_accuracy: 0.4710 - category_loss: 2.0269 - loss: 2.4001 - sentiment_accuracy: 0.8941 - sentiment_loss: 0.3732 - val_category_accuracy: 0.5083 - val_category_loss: 1.8691 - val_loss: 2.2400 - val_sentiment_accuracy: 0.8942 - val_sentiment_loss: 0.3714
Epoch 3/5
[1m588/588[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 29ms/step - category_accuracy: 0.5530 - category_loss: 1.6545 - loss: 2.0204 - sentiment_accuracy: 0.8954 - sentiment_loss: 0.3658 - val_category_accuracy: 0.5369 - val_category_loss: 1.7495 - val_loss: 2.1207 - val_sentiment

<keras.src.callbacks.history.History at 0x2854dc04050>

In [2]:
headline = "NASA Launches New Satellite"

# Convert to sequence
seq = tokenizer.texts_to_sequences([headline])
padded_input = pad_sequences(seq, maxlen=max_len)

# Make prediction
pred_cat, pred_sent = model.predict(padded_input)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step


In [3]:
print(pred_cat)

[[3.8271870e-03 4.9993733e-04 1.4080611e-03 2.7839690e-03 2.3692154e-04
  1.0437539e-03 2.3443110e-03 8.2195096e-04 1.5411191e-04 2.6266623e-04
  1.3880951e-03 2.6678145e-02 3.5265955e-04 3.8696909e-03 6.3549313e-03
  2.7924595e-02 1.6146407e-02 8.0445257e-04 2.1648947e-02 4.2519314e-04
  8.0663699e-04 1.3456309e-04 2.4478736e-03 1.0000396e-03 3.0713754e-03
  8.6234213e-04 1.0380455e-03 6.9304156e-01 8.6326245e-04 4.6829114e-04
  1.1436378e-03 1.7862410e-03 2.6040103e-03 6.0780975e-03 2.9947734e-02
  9.6796907e-04 5.9763395e-05 1.5806509e-02 1.0502318e-01 7.9431041e-04
  2.8842974e-03 1.0194301e-02]]


In [8]:
# 👇 Required for inverse transformation (convert label index → original label)
cat_labels = cat_encoder.inverse_transform(np.arange(y_cat.shape[1]))
sent_labels = sent_encoder.inverse_transform(np.arange(y_sent.shape[1]))

# 👇 Function to predict from user input
def predict_headline(headline_text):
    # Preprocess input
    seq = tokenizer.texts_to_sequences([headline_text])
    padded = pad_sequences(seq, maxlen=max_len)
    
    # Predict
    pred_cat, pred_sent = model.predict(padded, verbose=0)
    
    # Decode predictions
    pred_cat_label = cat_labels[np.argmax(pred_cat)]
    pred_sent_label = sent_labels[np.argmax(pred_sent)]
    
    print("📰 Headline:", headline_text)
    print("📂 Predicted Category:", pred_cat_label)
    print("🙂 Predicted Sentiment:", pred_sent_label)

# ✅ Example usage:
predict_headline("a boy awarded with 1 million dollars")


📰 Headline: a boy awarded with 1 million dollars
📂 Predicted Category: QUEER VOICES
🙂 Predicted Sentiment: neutral
