In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer
import json
import pandasql as ps
from sklearn.metrics import accuracy_score, classification_report

In [6]:
file_data = "../data/News_Category_Dataset_v3.json"
with open(file_data, "r", encoding="utf-8") as file:
    data = [json.loads(line) for line in file]
print(len(data))

209527


In [7]:
for i in data[0]:
    print(i, ":", data[0][i])

link : https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9
headline : Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters
category : U.S. NEWS
short_description : Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
authors : Carla K. Johnson, AP
date : 2022-09-23


In [8]:
# Chuyển dữ liệu JSON thành DataFrame
df = pd.DataFrame(data)
df = df.drop(columns=["link", "date"])
print(df.head())

                                            headline   category  \
0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  
0  Health experts said it is too early to predict...  Carla K. Johnson, AP  
1  He was subdued by passengers and crew when he ...        Mary Papenfuss  
2  "Until you have a dog you don't understand wha...         Elyse Wanshel  
3  "Accidentally put grown-up toothpaste on my to...      Caroline Bologna  
4  Amy Cooper accused investment firm Franklin Te...        Nina Golgowski  


In [9]:
# Đếm category
query = "SELECT category, COUNT(category) AS count_category FROM df GROUP BY category ORDER BY count_category DESC LIMIT 10"
result = ps.sqldf(query, locals())
print(result)

         category  count_category
0        POLITICS           35602
1        WELLNESS           17945
2   ENTERTAINMENT           17362
3          TRAVEL            9900
4  STYLE & BEAUTY            9814
5       PARENTING            8791
6  HEALTHY LIVING            6694
7    QUEER VOICES            6347
8    FOOD & DRINK            6340
9        BUSINESS            5992


In [10]:
# Khai phá dữ liệu
print(df['headline'].iloc[0])
print(df['headline'].iloc[1])

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters
American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video


In [11]:
# Tiền xử lý dữ liệu bằng cách gộp headline, short_description, authors thành một chuỗi đầu vào

nltk.download("punkt")

def preprocess_text(row):
    text = row["headline"] + " " + row["short_description"] + " " + row["authors"]
    return text.lower().strip()

df["text"] = df.apply(preprocess_text, axis=1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\duclh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Encode labels (category)

label_encoder = LabelEncoder()
df["category"] = label_encoder.fit_transform(df["category"])
num_classes = len(label_encoder.classes_)  # Số lượng nhãn


In [16]:
# Tokenize dữ liệu văn bản

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

MAX_LEN = 50  # Giới hạn độ dài câu

def tokenize_text(text):
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=MAX_LEN, return_tensors="pt")
    return tokens["input_ids"].squeeze(0), tokens["attention_mask"].squeeze(0)

df["tokens"] = df["text"].apply(lambda x: tokenize_text(x)[0])
df["attention_masks"] = df["text"].apply(lambda x: tokenize_text(x)[1])


In [23]:
# Chia dữ liệu thành tập huấn luyện và tập kiểm tra

X = torch.stack(df["tokens"].tolist())  # Dữ liệu đầu vào
y = torch.tensor(df["category"].values, dtype=torch.long)  # Nhãn
masks = torch.stack(df["attention_masks"].tolist())  # Attention masks

# Chuyển tensor về numpy trước khi chia
X_list = X.detach().cpu().tolist()
y_list = y.detach().cpu().tolist()
masks_list = masks.detach().cpu().tolist()

# Chia tập dữ liệu (dùng random_state để tái lập kết quả)
X_train_list, X_test_list, y_train_list, y_test_list, masks_train_list, masks_test_list = train_test_split(
    X_list, y_list, masks_list, test_size=0.2, random_state=42
)

# Chuyển lại về tensor
X_train, X_test = torch.tensor(X_train_list), torch.tensor(X_test_list)
y_train, y_test = torch.tensor(y_train_list), torch.tensor(y_test_list)
masks_train, masks_test = torch.tensor(masks_train_list), torch.tensor(masks_test_list)


# In kích thước để kiểm tra
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")
print(f"masks_train: {masks_train.shape}, masks_test: {masks_test.shape}")


X_train: torch.Size([167621, 50]), X_test: torch.Size([41906, 50])
y_train: torch.Size([167621]), y_test: torch.Size([41906])
masks_train: torch.Size([167621, 50]), masks_test: torch.Size([41906, 50])


In [24]:
# Khởi tạo mô hình RNN/LSTM

VOCAB_SIZE = tokenizer.vocab_size
EMBED_DIM = 128  
HIDDEN_DIM = 256  
NUM_LAYERS = 2  
DROPOUT = 0.5  
LEARNING_RATE = 1e-3  
EPOCHS = 10  
BATCH_SIZE = 32  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embedding = nn.Embedding(VOCAB_SIZE, EMBED_DIM).to(device)
lstm = nn.LSTM(EMBED_DIM, HIDDEN_DIM, num_layers=NUM_LAYERS, batch_first=True, dropout=DROPOUT).to(device)
fc = nn.Linear(HIDDEN_DIM, num_classes).to(device)
dropout = nn.Dropout(DROPOUT).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(list(embedding.parameters()) + list(lstm.parameters()) + list(fc.parameters()), lr=LEARNING_RATE)


In [25]:
# Chuyển dữ liệu thành DataLoader

train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)


In [26]:
# Vòng lặp huấn luyện

for epoch in range(EPOCHS):
    total_loss = 0
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        
        embedded = embedding(inputs)
        lstm_out, _ = lstm(embedded)
        final_hidden_state = lstm_out[:, -1, :]
        output = fc(dropout(final_hidden_state))
        
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")


Epoch 1/10, Loss: 2.3061


KeyboardInterrupt: 

In [27]:
# Đánh giá mô hình

y_pred = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        
        embedded = embedding(inputs)
        lstm_out, _ = lstm(embedded)
        final_hidden_state = lstm_out[:, -1, :]
        output = fc(dropout(final_hidden_state))
        
        _, predicted = torch.max(output, 1)
        
        y_pred.extend(predicted.cpu().numpy())
        y_true.extend(targets.cpu().numpy())

accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))


RuntimeError: Numpy is not available

In [None]:
# Dự đoán với dữ liệu mới

def predict_category(text):
    tokenized_text, _ = tokenize_text(text)
    tokenized_text = tokenized_text.unsqueeze(0).to(device)  # Thêm batch dimension

    with torch.no_grad():
        embedded = embedding(tokenized_text)
        lstm_out, _ = lstm(embedded)
        final_hidden_state = lstm_out[:, -1, :]
        output = fc(dropout(final_hidden_state))
        _, predicted = torch.max(output, 1)

    return label_encoder.inverse_transform([predicted.cpu().item()])[0]

new_text = "Apple announces new iPhone with AI-powered camera"
print("Predicted Category:", predict_category(new_text))


In [None]:
# Lưu mô hình

torch.save({
    "embedding": embedding.state_dict(),
    "lstm": lstm.state_dict(),
    "fc": fc.state_dict()
}, "text_rnn_model.pth")
