│   │     • Tokenize & convert to integer sequences  
│   │     • Build a simple 1D-CNN (Embedding → Conv1D → GlobalMaxPool → Dense)  
│   │     • Train, plot loss/accuracy curves  

In [1]:
import tensorflow as tf
# Print the installed TensorFlow version so the environment used for this run
# is recorded next to the results.
print(tf.__version__)

2.16.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 1. Load data
def _read_with_combined_text(csv_path):
    """Read one CSV split and attach a non-empty ``combined_text`` column.

    The headline and short description are joined with a single space
    (missing values are treated as empty strings), and rows whose combined
    text is empty or whitespace-only are dropped.
    """
    df = pd.read_csv(csv_path)
    df["combined_text"] = (
        df["headline"].fillna("") + " " + df["short_description"].fillna("")
    )
    has_text = df["combined_text"].str.strip() != ""
    return df[has_text]


def load_data(train_path="../data/processed/train.csv",
              test_path="../data/processed/test.csv"):
    """Load the train/test splits and integer-encode their categories.

    Args:
        train_path: CSV file with ``headline``, ``short_description`` and
            ``category`` columns used for training. Defaults to the
            project's processed training split.
        test_path: CSV file with the same columns used for testing.
            Defaults to the project's processed test split.

    Returns:
        A tuple ``(train_texts, test_texts, y_train, y_test, le)`` where the
        texts are the ``combined_text`` columns of the filtered frames, the
        ``y_*`` values are the integer-encoded categories, and ``le`` is the
        fitted ``LabelEncoder``.
    """
    train_df = _read_with_combined_text(train_path)
    test_df = _read_with_combined_text(test_path)

    # The encoder is fitted on the training categories only; the test
    # categories are mapped with that same fitted encoder so both splits
    # share one label-to-index mapping.
    le = LabelEncoder()
    y_train = le.fit_transform(train_df["category"])
    y_test = le.transform(test_df["category"])

    return train_df["combined_text"], test_df["combined_text"], y_train, y_test, le

train_texts, test_texts, y_train, y_test, le = load_data()

# 2. Tokenize and pad sequences
# The tokenizer is fitted on the training texts only; the test texts are
# converted with that same fitted vocabulary. num_words must stay in sync
# with the Embedding layer's input_dim used when the model is built.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_texts)

X_train = tokenizer.texts_to_sequences(train_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

# Pad sequences to ensure uniform length for CNN input
MAX_SEQUENCE_LENGTH = 200  # You can adjust this based on your dataset
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

# Convert labels to categorical (one-hot) format.
# The width is fixed explicitly from the fitted encoder: left to infer it,
# to_categorical uses the largest label present, so a test split missing the
# highest-indexed class would yield fewer columns than the training labels.
num_classes = len(le.classes_)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)


In [None]:
# Build the 1D CNN Model: Embedding -> Conv1D -> GlobalMaxPool -> Dense head.
model = Sequential(
    [
        # Token ids in [0, 10000) are mapped to 128-dimensional vectors.
        Embedding(input_dim=10000, output_dim=128),
        # 128 filters, each spanning 5 consecutive token positions.
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        # Keep the maximum activation of every filter over the sequence.
        GlobalMaxPooling1D(),
        # Fully connected layer followed by dropout for regularization.
        Dense(128, activation='relu'),
        Dropout(0.5),
        # One softmax unit per column of the one-hot training labels.
        Dense(y_train.shape[1], activation='softmax'),
    ]
)

In [None]:
# Compile: Adam optimizer with categorical cross-entropy on one-hot labels,
# tracking accuracy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for a fixed number of epochs. The test split is passed as validation
# data, so the 'val_*' entries in the history are measured on the test set.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
)

# Final evaluation on the test split.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Plot accuracy (left panel) and loss (right panel) per epoch, each showing
# the training curve next to the curve measured on the validation data.
plt.figure(figsize=(12, 4))

for panel_index, (metric_key, metric_label) in enumerate(
    (("accuracy", "Accuracy"), ("loss", "Loss")), start=1
):
    plt.subplot(1, 2, panel_index)
    plt.plot(history.history[metric_key], label=f"Train {metric_label}")
    plt.plot(history.history[f"val_{metric_key}"], label=f"Test {metric_label}")
    plt.title(f"{metric_label} over Epochs")
    plt.xlabel("Epochs")
    plt.ylabel(metric_label)
    plt.legend()

plt.show()


from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
import seaborn as sns
import numpy as np

# Predict class probabilities
y_pred_proba = model.predict(X_test)

# Reduce both the predicted probabilities and the one-hot targets to class
# indices by taking the position of the largest value in each row.
y_pred = np.argmax(y_pred_proba, axis=1)
y_true = np.argmax(y_test, axis=1)

print("📌 CNN")
print("Accuracy:", accuracy_score(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))

# Print classification report.
# The full label range is passed explicitly so the rows always line up with
# le.classes_; without it, a class that appears in neither y_true nor y_pred
# makes the inferred label count differ from target_names and the call fails.
print("Classification Report:\n")
all_labels = np.arange(len(le.classes_))
print(classification_report(y_true, y_pred, labels=all_labels, target_names=le.classes_))
