**Read csv**

In [1]:
import pandas as pd

df = pd.read_csv('rating.csv')

**Encoding user and product**

In [19]:
from sklearn.preprocessing import LabelEncoder

# init 
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

# fit & transform
df['user_id_encoded'] = user_encoder.fit_transform(df['user_id'])
df['product_id_encoded'] = product_encoder.fit_transform(df['product_id'])


num_users = len(user_encoder.classes_)
num_products = len(product_encoder.classes_)

print(f"User IDs: 0 -> {num_users-1}")
print(f"Product IDs: 0 -> {num_products-1}")


User IDs: 0 -> 9950
Product IDs: 0 -> 1468


**NLP Preprocessing**

In [7]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-8.3.0-py3-none-any.whl.metadata (14 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.12-cp313-cp313-win_amd64.whl.metadata (4.4 kB)
Collecting underthesea_core==1.0.5 (from underthesea)
  Downloading underthesea_core-1.0.5-cp313-none-win_amd64.whl.metadata (1.4 kB)
Collecting huggingface-hub (from underthesea)
  Downloading huggingface_hub-1.3.2-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub->underthesea)
  Downloading hf_xet-1.2.0-cp37-abi3-win_amd64.whl.metadata (5.0 kB)
Collecting typer-slim (from huggingface-hub->underthesea)
  Downloading typer_slim-0.21.1-py3-none-any.whl.metadata (16 kB)
Downloading underthesea-8.3.0-py3-none-any.whl (8.3 MB)
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   

In [11]:
from underthesea import word_tokenize
import re

def clean_text(text):
    # lower
    text = text.lower()

    # remove emoji / icon
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags
        "\U00002700-\U000027BF"  # dingbats
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    text = emoji_pattern.sub('', text)

    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# clean
df['clean_comment'] = df['comment'].apply(clean_text)

# example : "sản phẩm rất tuyệt vời" -> ["sản_phẩm", "rất", "tuyệt_vời"]
df['tokenized_comment'] = df['clean_comment'].apply(lambda x: word_tokenize(x, format="text"))

print(df[['comment', 'tokenized_comment']].head(2))

                              comment                   tokenized_comment
0  Tôi hài lòng về chất lượng dịch vụ  tôi hài_lòng về chất_lượng dịch_vụ
1                  Âm thanh chân thật                  âm_thanh chân_thật


**Vectorization**

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# config 
MAX_VOCAB_SIZE = 10000  # 10000 most common words
MAX_LEN = 100 

# learn vocabulary from all comment
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df['tokenized_comment'])

# text to string number 
sequences = tokenizer.texts_to_sequences(df['tokenized_comment'])

# padding to all are equal
X_text = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

word_index = tokenizer.word_index
print(f"Dimension of dictionary: {len(word_index)}")
print(f"Shape of text input: {X_text.shape}")

# example: sentence "Tuyệt vời" convert to vector [34, 12, 0, 0, ..., 0]

Kích thước từ điển: 7398
Shape của dữ liệu Text đầu vào: (14612, 100)


**[1-5] -> [0-1]**

In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

y = scaler.fit_transform(df['rating'].values.reshape(-1, 1))

print(f"Min rating: {y.min()}, Max rating: {y.max()}")

Min rating: 0.0, Max rating: 1.0


**Split Data**

In [30]:
from sklearn.model_selection import train_test_split

X_user = df['user_id_encoded'].values
X_product = df['product_id_encoded'].values

X_train_u, X_test_u, X_train_p, X_test_p, X_train_t, X_test_t, y_train, y_test = train_test_split(
    X_user, X_product, X_text, y, 
    test_size=0.2, 
    random_state=42
)

print("--- Data Shapes ---")
print(f"Train User: {X_train_u.shape}")
print(f"Train Product: {X_train_p.shape}")
print(f"Train Text: {X_train_t.shape}")
print(f"Train Label: {y_train.shape}")



--- Data Shapes ---
Train User: (11689,)
Train Product: (11689,)
Train Text: (11689, 100)
Train Label: (11689, 1)
[[1.  ]
 [0.5 ]
 [0.5 ]
 ...
 [1.  ]
 [0.75]
 [0.75]]


In [27]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2 # use L2 Regularization

EMBEDDING_DIM = 8       
TEXT_EMBEDDING_DIM = 32 


# Collaborative Filtering
user_input = Input(shape=(1,), name='user_input')
product_input = Input(shape=(1,), name='product_input')

# add Regularization into Embedding penalty
user_embedding = Embedding(input_dim=num_users, output_dim=EMBEDDING_DIM, embeddings_regularizer=l2(1e-4), name='user_embedding')(user_input)
product_embedding = Embedding(input_dim=num_products, output_dim=EMBEDDING_DIM, embeddings_regularizer=l2(1e-4), name='product_embedding')(product_input)

user_vec = Flatten()(user_embedding)
product_vec = Flatten()(product_embedding)

# NLP 
text_input = Input(shape=(MAX_LEN,), name='text_input')
text_embed = Embedding(input_dim=MAX_VOCAB_SIZE + 1, output_dim=TEXT_EMBEDDING_DIM, input_length=MAX_LEN)(text_input)

text_vec = GlobalAveragePooling1D()(text_embed)

# Combine "Collaborative Filtering" vs "NLP"
concat = Concatenate()([user_vec, product_vec, text_vec])

# Deep Layers 
dense_1 = Dense(64, activation='relu', kernel_regularizer=l2(1e-3))(concat)
dropout_1 = Dropout(0.5)(dense_1) # Dropout 0.5

dense_2 = Dense(32, activation='relu', kernel_regularizer=l2(1e-3))(dropout_1)
dropout_2 = Dropout(0.5)(dense_2)

output = Dense(1, activation='sigmoid', name='output')(dense_2)

model = Model(inputs=[user_input, product_input, text_input], outputs=output)

# compile
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error',
              metrics=['mae'])

model.summary()

In [28]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# only save model which having val_loss lowest
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

# train 
history = model.fit(
    x={'user_input': X_train_u, 'product_input': X_train_p, 'text_input': X_train_t},
    y=y_train,
    batch_size=64,
    epochs=20,
    validation_data=({'user_input': X_test_u, 'product_input': X_test_p, 'text_input': X_test_t}, y_test),
    callbacks=[early_stop, checkpoint],
    verbose=1
)

Epoch 1/20
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.1507 - mae: 0.2607 - val_loss: 0.1171 - val_mae: 0.2515
Epoch 2/20
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.1020 - mae: 0.2409 - val_loss: 0.0905 - val_mae: 0.2308
Epoch 3/20
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0799 - mae: 0.2041 - val_loss: 0.0770 - val_mae: 0.2022
Epoch 4/20
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0664 - mae: 0.1789 - val_loss: 0.0744 - val_mae: 0.2016
Epoch 5/20
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.0560 - mae: 0.1546 - val_loss: 0.0722 - val_mae: 0.1814
Epoch 6/20
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0468 - mae: 0.1326 - val_loss: 0.0730 - val_mae: 0.1832
Epoch 7/20
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - 

In [39]:
from sklearn.metrics import mean_squared_error
import math

# prediction all set test
y_pred_normalized = model.predict(
    [X_test_u, X_test_p, X_test_t], 
    batch_size=64, 
    verbose=1
)

# [0-1] ->  [1-5], y_real = y_norm * (max - min) + min
y_pred_real = y_pred_normalized * 4.0 + 1.0 
y_true_real = y_test * 4.0 + 1.0

# RMSE
rmse = math.sqrt(mean_squared_error(y_true_real, y_pred_real))
print(f"RMSE: {rmse:.4f} (The average error is approximately {rmse:.2f} star)")


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
RMSE: 0.9519 (The average error is approximately 0.95 star)


In [42]:
import numpy as np
from tqdm import tqdm

def dcg_at_k(r, k):
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(r, k):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

def evaluate_ranking_metrics(model, test_users_indices, df_full, k=10, num_eval_users=50):
    
    precision_scores = []
    recall_scores = []
    ndcg_scores = []
    
    unique_test_users = np.unique(test_users_indices)
    
    # user random
    sampled_users = np.random.choice(unique_test_users, size=min(num_eval_users, len(unique_test_users)), replace=False)
    
    # Chuẩn bị dữ liệu input giả lập
    all_product_idxs = np.arange(num_products) 
    empty_text_input = np.zeros((num_products, MAX_LEN)) 
    
    for user_idx in tqdm(sampled_users):
        # Ground Truth
        user_id_real = user_encoder.inverse_transform([user_idx])[0]
        user_data = df_full[df_full['user_id'] == user_id_real]
        
        # get products >= 4star from product_id_encoded
        true_relevant_items = set(user_data[user_data['rating'] >= 4]['product_id_encoded'].values)
        
        if len(true_relevant_items) == 0:
            continue 
            
        # prediction
        user_input_data = np.full(shape=(num_products,), fill_value=user_idx)
        
        predictions = model.predict(
            [user_input_data, all_product_idxs, empty_text_input], 
            batch_size=128, verbose=0
        ).flatten()
        
        # top K
        top_k_indices = predictions.argsort()[-k:][::-1]
        
        # metrics
        r = [1 if item in true_relevant_items else 0 for item in top_k_indices]
        
        precision = sum(r) / k
        recall = sum(r) / len(true_relevant_items)
        ndcg = ndcg_at_k(r, k)
        
        precision_scores.append(precision)
        recall_scores.append(recall)
        ndcg_scores.append(ndcg)
        
    return np.mean(precision_scores), np.mean(recall_scores), np.mean(ndcg_scores)

K = 10
p_at_k, r_at_k, ndcg_at_k_score = evaluate_ranking_metrics(
    model, 
    X_test_u, 
    df,  
    k=K, 
    num_eval_users=50 
)

print(f"\nResult Ranking (Top-{K}):")
print(f"Precision@{K}: {p_at_k:.4f}")
print(f"Recall@{K}   : {r_at_k:.4f}")
print(f"NDCG@{K}     : {ndcg_at_k_score:.4f}")

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:04<00:00, 12.14it/s]


Result Ranking (Top-10):
Precision@10: 0.0024
Recall@10   : 0.0119
NDCG@10     : 0.0238





In [29]:
def recommend_products(user_id_raw, model, top_k=5):
    # 1. Kiểm tra xem user này có trong dữ liệu huấn luyện không
    try:
        user_idx = user_encoder.transform([user_id_raw])[0]
    except ValueError:
        print(f"User {user_id_raw} là người dùng mới, chưa có dữ liệu để gợi ý cá nhân hóa.")
        return []

    # 2. Lấy danh sách tất cả sản phẩm
    all_product_idxs = np.arange(num_products)
    
    # (Tuỳ chọn) Lọc bỏ các sản phẩm user đã mua rồi
    # Ở đây tôi bỏ qua bước này để code đơn giản, bạn có thể thêm logic lọc nếu cần
    
    # 3. Chuẩn bị dữ liệu đầu vào cho mô hình
    # - User input: Lặp lại ID của user cho bằng số lượng sản phẩm
    user_input_data = np.full(shape=(num_products,), fill_value=user_idx)
    
    # - Product input: Là danh sách tất cả các sản phẩm
    product_input_data = all_product_idxs
    
    # - Text input: Vì chưa mua nên chưa có comment. 
    # Ta dùng chuỗi rỗng hoặc tên sản phẩm để làm input giả lập.
    # Ở đây ta dùng padding (toàn số 0) đại diện cho "không có ý kiến"
    text_input_data = np.zeros((num_products, MAX_LEN)) 
    
    # 4. Dự đoán (Predict)
    # Kết quả trả về là rating dự đoán (đã chuẩn hóa 0-1)
    predictions = model.predict(
        [user_input_data, product_input_data, text_input_data], 
        batch_size=64, 
        verbose=0
    )
    
    # 5. Xử lý kết quả
    # Reshape về mảng 1 chiều
    predictions = predictions.flatten()
    
    # Lấy top k chỉ số có điểm dự đoán cao nhất
    # argsort sắp xếp tăng dần, nên ta lấy những phần tử cuối cùng [-top_k:] và đảo ngược [::-1]
    top_indices = predictions.argsort()[-top_k:][::-1]
    
    # 6. Giải mã (Decode) từ số thành tên sản phẩm/ID thật
    recommended_product_ids = product_encoder.inverse_transform(top_indices)
    recommended_scores = predictions[top_indices] * 5.0 # Nhân 5 để về thang điểm 5 sao
    
    # In kết quả
    print(f"--- Gợi ý cho User: {user_id_raw} ---")
    results = []
    for pid, score in zip(recommended_product_ids, recommended_scores):
        print(f"Sản phẩm: {pid} | Dự đoán: {score:.2f} sao")
        results.append((pid, score))
        
    return results

sample_user = df['user_id'].iloc[0] 
print(f"Đang gợi ý cho user ID gốc: {sample_user}")

recommendations = recommend_products(sample_user, model, top_k=5)

Đang gợi ý cho user ID gốc: 6652
--- Gợi ý cho User: 6652 ---
Sản phẩm: casio-mtp-1375l-1avdf-nam | Dự đoán: 4.67 sao
Sản phẩm: sac-xe-hoi-3-cong-20w-xmobile-sn-156-den-xam | Dự đoán: 4.65 sao
Sản phẩm: redmi-pad-se-8-7-wifi-4gb-64gb | Dự đoán: 4.64 sao
Sản phẩm: loa-bluetooth-alpha-works-aw-w88 | Dự đoán: 4.63 sao
Sản phẩm: hop-muc-brother-tn-2385-danh-cho-brother-dcpl2520d | Dự đoán: 4.62 sao
