In [48]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.metrics import Recall, Precision
# from tf.keras.utils import plot_model

In [49]:
df = pd.read_csv('uci-news-aggregator.csv')

In [50]:
# Clean the headlines
def clean_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower().strip()  # Convert to lowercase and remove leading/trailing spaces
    return text

df['clean_title'] = df['TITLE'].apply(clean_text)

In [51]:
# Split the data into training and testing sets
X = df['clean_title']
y = df['CATEGORY']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Encode the labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [53]:
vocab_size = 10000
max_length = 20

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

In [54]:
embedding_dim = 16

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(LSTM(64, dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', 
              metrics=['accuracy', Recall(), Precision()])

model.fit(X_train_pad, y_train, epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x1c4d4042890>

In [55]:
loss, accuracy, recall, precision = model.evaluate(X_test_pad, y_test)
f1 = 2 * (precision * recall) / (precision + recall)

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1-score: {f1}")

Accuracy: 0.5164883136749268
Recall: 0.973751425743103
Precision: 0.8646601438522339
F1-score: 0.9159690482634187


In [59]:
import matplotlib.pyplot as plt
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)
# Display the image using Matplotlib
img = plt.imread('model.png')
plt.figure(figsize=(10, 10))
plt.imshow(img)
plt.axis('off')
plt.show()

AttributeError: module 'pydot' has no attribute 'InvocationException'