In [None]:
!pip install gdown==4.6.0



## Lấy data từ Google Drive

In [None]:
from pathlib import Path

import gdown

# Create the target folder (relative to the working directory) in pure Python;
# this mirrors `mkdir -p`: parents are created and an existing folder is fine.
Path('content/data').mkdir(parents=True, exist_ok=True)

# Google Drive file IDs, listed in the same order as `file_paths` below.
file_ids = [
    '1JXdyh-UVveXhBToLJj8ZJNttiJif6zH8',  # X_train
    '1rtyLyNDBr3sPcyElMhpDPK3qTbNZH1YT',  # X_test
    '1IkNvEhPFfano0qSDZFWhsmpF3ttC1q0Y',  # y_train
    '1JlMWumYQP9OIH79dVsCiZ-eavZAhgSzv'   # y_test
]

# Local destination of each file, position-matched with `file_ids`.
file_paths = [
    'content/data/X_train.csv',
    'content/data/X_test.csv',
    'content/data/y_train.csv',
    'content/data/y_test.csv'
]

# zip() silently drops unmatched trailing items, so guard the pairing explicitly.
assert len(file_ids) == len(file_paths), 'file_ids and file_paths must have the same length'

# Download all files, stopping at the first one that cannot be fetched.
for file_id, file_path in zip(file_ids, file_paths):
    saved_path = gdown.download(f'https://drive.google.com/uc?id={file_id}', file_path, quiet=False)
    # gdown 4.x signals some failures (e.g. access denied) by printing a message
    # and returning None instead of raising; fail here rather than at read time.
    if saved_path is None:
        raise RuntimeError(f'gdown could not download Drive file {file_id} to {file_path}')

Downloading...
From: https://drive.google.com/uc?id=1JXdyh-UVveXhBToLJj8ZJNttiJif6zH8
To: /content/content/data/X_train.csv
100%|██████████| 5.03M/5.03M [00:00<00:00, 48.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1rtyLyNDBr3sPcyElMhpDPK3qTbNZH1YT
To: /content/content/data/X_test.csv
100%|██████████| 1.34M/1.34M [00:00<00:00, 21.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1IkNvEhPFfano0qSDZFWhsmpF3ttC1q0Y
To: /content/content/data/y_train.csv
100%|██████████| 16.9k/16.9k [00:00<00:00, 28.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1JlMWumYQP9OIH79dVsCiZ-eavZAhgSzv
To: /content/content/data/y_test.csv
100%|██████████| 4.24k/4.24k [00:00<00:00, 8.14MB/s]


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Parsing options shared by all four files: the first row is the header, and
# pandas' default NaN markers are disabled so empty cells stay as strings.
csv_options = dict(header=0, keep_default_na=False)

# Load the feature and label splits downloaded into content/data.
X_train = pd.read_csv('content/data/X_train.csv', **csv_options)
X_test = pd.read_csv('content/data/X_test.csv', **csv_options)
y_train = pd.read_csv('content/data/y_train.csv', **csv_options)
y_test = pd.read_csv('content/data/y_test.csv', **csv_options)

# Print the (rows, columns) shape of each split, one per line, in load order.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(8457, 1)
(2115, 1)
(8457, 1)
(2115, 1)


## Chỉ dùng TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Reload every single-column CSV and collapse it to a pandas Series in one step
# (squeeze() drops the length-1 column axis).
X_train = pd.read_csv('content/data/X_train.csv', header=0, keep_default_na=False).squeeze()
X_test = pd.read_csv('content/data/X_test.csv', header=0, keep_default_na=False).squeeze()
y_train = pd.read_csv('content/data/y_train.csv', header=0, keep_default_na=False).squeeze()
y_test = pd.read_csv('content/data/y_test.csv', header=0, keep_default_na=False).squeeze()

# TF-IDF vectorisation: the vocabulary is capped at 1000 features and fitted on
# the training texts only; the test texts are transformed with that vocabulary.
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train a seeded 100-tree random forest (fit() returns the fitted estimator).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Predict on the held-out split and print per-class metrics.
# NOTE(review): target_names assumes the sorted label values correspond to
# 'ham' then 'junk' - confirm against the contents of y_train.csv.
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred, target_names=['ham', 'junk'])
print(report)


              precision    recall  f1-score   support

         ham       0.96      0.97      0.97      1566
        junk       0.92      0.87      0.90       549

    accuracy                           0.95      2115
   macro avg       0.94      0.92      0.93      2115
weighted avg       0.95      0.95      0.95      2115



## Kết hợp TF-IDF và Embedding

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from sentence_transformers import SentenceTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import PCA
import logging
# Only show errors from the Hugging Face hub and sentence-transformers loggers.
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
logging.getLogger("sentence_transformers.SentenceTransformer").setLevel(logging.ERROR)

# NOTE: this cell reuses X_train, X_test, y_train, y_test, X_train_tfidf and
# X_test_tfidf produced by the TF-IDF cell above; run that cell first.
# The TF-IDF matrices are used as-is (no TruncatedSVD reduction is applied).

# Sentence embeddings from all-MiniLM-L6-v2.
# The texts are passed as plain Python lists so the result does not depend on
# how encode() indexes a pandas Series.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
X_train_embed = embedder.encode(list(X_train), convert_to_numpy=True)
X_test_embed = embedder.encode(list(X_test), convert_to_numpy=True)

# Reduce the embeddings to 100 dimensions. PCA is fitted on the training split
# only and seeded like the other estimators, because the automatically chosen
# solver may be a randomized one depending on the scikit-learn version.
pca = PCA(n_components=100, random_state=42)
X_train_embed_reduced = pca.fit_transform(X_train_embed)
X_test_embed_reduced = pca.transform(X_test_embed)

# NOTE(review): this scaler is instantiated but never applied - no feature
# scaling happens in this cell. It is kept only so the name stays defined.
scaler = StandardScaler()

# Concatenate the sparse TF-IDF columns with the reduced embeddings; the dense
# embedding arrays are converted to sparse first so hstack can join them.
X_train_embed_sparse = csr_matrix(X_train_embed_reduced)
X_train_combined = hstack([X_train_tfidf, X_train_embed_sparse])
X_test_embed_sparse = csr_matrix(X_test_embed_reduced)
X_test_combined = hstack([X_test_tfidf, X_test_embed_sparse])


# Train the random forest on the combined features.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_combined, y_train)

# Evaluate on the held-out split.
# NOTE(review): target_names assumes the sorted label values correspond to
# 'ham' then 'junk' - confirm against the contents of y_train.csv.
y_pred = model.predict(X_test_combined)
print(classification_report(y_test, y_pred, target_names=['ham', 'junk']))

              precision    recall  f1-score   support

         ham       0.96      0.99      0.97      1566
        junk       0.96      0.87      0.91       549

    accuracy                           0.96      2115
   macro avg       0.96      0.93      0.94      2115
weighted avg       0.96      0.96      0.95      2115

