In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Load and prepare data
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)
df = df[['headline', 'category']]
df = df.dropna()

# Keep top 3 categories for multi-output demo
top3 = df['category'].value_counts().nlargest(3).index.tolist()
df = df[df['category'].isin(top3)]

# Simulate multiple outputs (just for demo)
# Let's say we split category into 3 parallel outputs
df = df.sample(frac=1).reset_index(drop=True)
df['cat1'] = df['category'].apply(lambda x: 1 if x == top3[0] else 0)
df['cat2'] = df['category'].apply(lambda x: 1 if x == top3[1] else 0)
df['cat3'] = df['category'].apply(lambda x: 1 if x == top3[2] else 0)

# 2. Tokenize text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['headline'])
sequences = tokenizer.texts_to_sequences(df['headline'])
X = pad_sequences(sequences, maxlen=20)

# 3. Prepare outputs
y1 = df['cat1'].values
y2 = df['cat2'].values
y3 = df['cat3'].values

X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=42)
_, _, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=42)
_, _, y3_train, y3_test = train_test_split(X, y3, test_size=0.2, random_state=42)

# 4. Build multi-output model
input_layer = tf.keras.Input(shape=(20,))
embed = tf.keras.layers.Embedding(10000, 64)(input_layer)
flat = tf.keras.layers.Flatten()(embed)
dense1 = tf.keras.layers.Dense(128, activation='relu')(flat)
dense2 = tf.keras.layers.Dense(64, activation='relu')(dense1)

# Each output layer uses sigmoid
out1 = tf.keras.layers.Dense(1, activation='sigmoid', name='cat1')(dense2)
out2 = tf.keras.layers.Dense(1, activation='sigmoid', name='cat2')(dense2)
out3 = tf.keras.layers.Dense(1, activation='sigmoid', name='cat3')(dense2)

model = tf.keras.Model(inputs=input_layer, outputs=[out1, out2, out3])

# 5. Compile model with multiple losses
model.compile(
    loss={
        'cat1': 'binary_crossentropy',
        'cat2': 'binary_crossentropy',
        'cat3': 'binary_crossentropy'
    },
    optimizer='adam',
    metrics={
        'cat1': 'accuracy',
        'cat2': 'accuracy',
        'cat3': 'accuracy'
    }
)


# 6. Train the model
model.fit(
    X_train,
    {'cat1': y1_train, 'cat2': y2_train, 'cat3': y3_train},
    epochs=5,
    batch_size=32,
    validation_split=0.1
)

# 7. Evaluate
model.evaluate(X_test, {'cat1': y1_test, 'cat2': y2_test, 'cat3': y3_test})


Epoch 1/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - cat1_accuracy: 0.8296 - cat1_loss: 0.3468 - cat2_accuracy: 0.8848 - cat2_loss: 0.2651 - cat3_accuracy: 0.8713 - cat3_loss: 0.3064 - loss: 0.9183 - val_cat1_accuracy: 0.9404 - val_cat1_loss: 0.1672 - val_cat2_accuracy: 0.9499 - val_cat2_loss: 0.1270 - val_cat3_accuracy: 0.9498 - val_cat3_loss: 0.1437 - val_loss: 0.4395
Epoch 2/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 8ms/step - cat1_accuracy: 0.9661 - cat1_loss: 0.0990 - cat2_accuracy: 0.9733 - cat2_loss: 0.0725 - cat3_accuracy: 0.9727 - cat3_loss: 0.0822 - loss: 0.2537 - val_cat1_accuracy: 0.9297 - val_cat1_loss: 0.1897 - val_cat2_accuracy: 0.9448 - val_cat2_loss: 0.1515 - val_cat3_accuracy: 0.9436 - val_cat3_loss: 0.1619 - val_loss: 0.5046
Epoch 3/5
[1m1596/1596[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 8ms/step - cat1_accuracy: 0.9868 - cat1_loss: 0.0398 - cat2_accuracy: 0.9894 - cat2_loss: 0.0321 - ca

[0.8590531349182129,
 0.3407551646232605,
 0.2317945659160614,
 0.287570595741272,
 0.9241997003555298,
 0.9414750933647156,
 0.938443124294281]

In [12]:
sample_text = ["Facebook faces government inquiry over data privacy issues"]
seq = tokenizer.texts_to_sequences(sample_text)
pad = pad_sequences(seq, maxlen=20)
p1, p2, p3 = model.predict(pad)

for i, prob in enumerate([p1, p2, p3]):
    print(f"{top3[i]}: {'Yes' if prob[0][0]>0.5 else 'No'} ({prob[0][0]:.2f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
POLITICS: Yes (1.00)
WELLNESS: No (0.00)
ENTERTAINMENT: No (0.00)


In [11]:
# Sample test headline
sample_text = ["Facebook faces government inquiry over data privacy issues"]

# Step 1: Tokenize and pad
sample_seq = tokenizer.texts_to_sequences(sample_text)
sample_pad = pad_sequences(sample_seq, maxlen=20)

# Step 2: Predict using model
pred1, pred2, pred3 = model.predict(sample_pad)

# Step 3: Threshold each output
threshold = 0.5
predicted = {
    top3[0]: int(pred1[0][0] > threshold),
    top3[1]: int(pred2[0][0] > threshold),
    top3[2]: int(pred3[0][0] > threshold),
}

# Step 4: Show predicted labels
print("Predicted categories:")
for label, val in predicted.items():
    if val == 1:
        print("✔️", label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
Predicted categories:
✔️ POLITICS


In [13]:
pip install kaggle

Note: you may need to restart the kernel to use updated packages.Collecting kaggle
  Downloading kaggle-1.7.4.5-py3-none-any.whl.metadata (16 kB)
Downloading kaggle-1.7.4.5-py3-none-any.whl (181 kB)
Installing collected packages: kaggle
Successfully installed kaggle-1.7.4.5




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
