In [3]:
import requests
import pandas as pd
import numpy as np
import json
from typing import List, Dict, Any
from torch.utils.data import DataLoader, TensorDataset
from datetime import datetime, timedelta



In [5]:
AUTH_URL    = "http://localhost:8080/auth/public/login"
ORDERS_API  = "http://localhost:8080/orders/secure/recommend"
PRODUCT_API = "http://localhost:8080/product/secure/reviews/recommend"
USERNAME    = "admin"
PASSWORD    = "123456"

# ----- OPTIONAL DATE FILTERS (ISO format) -----
end_date = datetime.now().date()
start_date = end_date - timedelta(days=90)
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
START_DATE = start_date_str          # e.g. "2025-11-01"
END_DATE   = end_date_str          # e.g. "2025-11-07"

# ========================== 1. GET TOKEN ==========================
def get_token() -> str:
    r = requests.post(AUTH_URL, json={"username": USERNAME, "password": PASSWORD})
    r.raise_for_status()
    token = r.json().get("token") or r.json().get("accessToken")
    if not token:
        raise ValueError("Token missing")
    print("Token OK")
    return token

token   = get_token()
headers = {"Authorization": f"Bearer {token}"}

Token OK


In [8]:
def fetch_orders() -> List[Dict[str, Any]]:
    params = {"status": "DELIVERED"}
    if START_DATE:
        params["startDate"] = START_DATE
    if END_DATE:
        params["endDate"] = END_DATE

    r = requests.get(ORDERS_API, headers=headers, params=params)
    r.raise_for_status()
    raw = r.json()

    data_block = raw["data"]
    if isinstance(data_block, str):
        data_block = json.loads(data_block)

    orders = data_block["content"]

    orders_filtered = [
        {
            "orderId": order["id"],
            "customerId": order.get("customerId"),
            # "orderDate": order.get("orderDate"),
            "totalAmount": order.get("totalAmount"),
            "items": [
                {
                    "variantId": item.get("variantId"),
                    "productId": item.get("productId"),
                    "quantity": item.get("quantity"),
                    "unitPrice": item.get("unitPrice"),
                    "variantName": item.get("variantName"),
                }
                for item in order.get("items", [])
            ],
        }
        for order in orders
    ]

    return orders_filtered

orders_raw = fetch_orders()
print("\n--- SAMPLE RATINGS (first 1) ---")
print(orders_raw[:1])


--- SAMPLE RATINGS (first 1) ---
[{'orderId': 24, 'customerId': 5, 'totalAmount': 18810000.0, 'items': [{'variantId': 26, 'productId': 17, 'quantity': 1, 'unitPrice': 18810000.0, 'variantName': 'Máy chơi game PS5 Pro'}]}]


In [9]:
def fetch_reviews() -> List[Dict[str, Any]]:
    params = {"status": "DELIVERED"}
    if START_DATE:
        params["startDate"] = START_DATE
    if END_DATE:
        params["endDate"] = END_DATE
    r = requests.get(PRODUCT_API, headers=headers,params=params)
    r.raise_for_status()
    raw = r.json()
    data_block = raw
    if isinstance(data_block, str):
        data_block = json.loads(data_block)

    # reviews may be a plain list or also have "content"
    reviews = data_block
    reviews_filtered = [
    {
        "reviewId": review["id"],
        "orderId": review.get("orderId"),
        "productId": review.get("productId"),
        "variantId": review.get("variantId"),
        "customerId": review.get("customerId"),
        "rating": review.get("rating"),
        # "createdAt": review.get("createdAt")
    }
    for review in reviews
]
    return reviews_filtered

reviews_raw = fetch_reviews()
print("\n--- SAMPLE REVIEWS ---")
print(reviews_raw)


--- SAMPLE REVIEWS ---
[{'reviewId': 9, 'orderId': 6, 'productId': 15, 'variantId': 18, 'customerId': 2, 'rating': 2}, {'reviewId': 10, 'orderId': 5, 'productId': 13, 'variantId': 16, 'customerId': 2, 'rating': 4}, {'reviewId': 11, 'orderId': 7, 'productId': 14, 'variantId': 22, 'customerId': 1, 'rating': 5}, {'reviewId': 12, 'orderId': 3, 'productId': 14, 'variantId': 17, 'customerId': 1, 'rating': 3}, {'reviewId': 13, 'orderId': 2, 'productId': 13, 'variantId': 16, 'customerId': 1, 'rating': 5}, {'reviewId': 14, 'orderId': 8, 'productId': 16, 'variantId': 25, 'customerId': 1, 'rating': 2}, {'reviewId': 15, 'orderId': 9, 'productId': 14, 'variantId': 20, 'customerId': 1, 'rating': 4}, {'reviewId': 16, 'orderId': 15, 'productId': 19, 'variantId': 29, 'customerId': 5, 'rating': 4}, {'reviewId': 17, 'orderId': 17, 'productId': 25, 'variantId': 37, 'customerId': 5, 'rating': 1}, {'reviewId': 18, 'orderId': 11, 'productId': 17, 'variantId': 26, 'customerId': 3, 'rating': 4}, {'reviewId': 

In [12]:
def process_orders(orders: List[Dict[str, Any]]) -> pd.DataFrame:
    # Convert to DataFrame
    df = pd.DataFrame(orders)
    # Explode items if necessary
    if 'items' in df.columns:
        df = df.explode('items').reset_index(drop=True)
        items_df = pd.json_normalize(df['items'])
        df = pd.concat([df.drop(columns=['items']), items_df], axis=1)
    orders_agg=df.groupby(['customerId', 'productId']).agg({'quantity':'sum'}).reset_index()
    orders_agg['implicit_rating']=np.minimum(orders_agg['quantity']*2.5,5.0)
    orders_agg['source']='order'
    return orders_agg
orders_agg=process_orders(orders_raw)
print(orders_agg)


   customerId  productId  quantity  implicit_rating source
0           1         13         1              2.5  order
1           1         14         3              5.0  order
2           1         16         1              2.5  order
3           2         13         2              5.0  order
4           2         14         1              2.5  order
5           2         15         2              5.0  order
6           3         14         1              2.5  order
7           3         17         1              2.5  order
8           5         19         1              2.5  order


In [13]:
def process_reviews(reviews: List[Dict[str, Any]]) -> pd.DataFrame:
    df = pd.DataFrame(reviews)
    reviews_agg=df.groupby(['customerId', 'productId']).agg({'rating':'mean'}).reset_index()
    reviews_agg['source']='review'
    return reviews_agg
reviews_agg=process_reviews(reviews_raw)
print(reviews_agg)

   customerId  productId  rating  source
0           1         13     5.0  review
1           1         14     4.0  review
2           1         16     2.0  review
3           2         13     4.0  review
4           2         15     2.0  review
5           5         19     4.0  review


In [14]:
merged = pd.merge(orders_agg, reviews_agg, on=['customerId', 'productId'], how='outer')
print("Raw Merged:\n", merged)  # Check for NaNs

Raw Merged:
    customerId  productId  quantity  implicit_rating source_x  rating source_y
0           1         13         1              2.5    order     5.0   review
1           1         14         3              5.0    order     4.0   review
2           1         16         1              2.5    order     2.0   review
3           2         13         2              5.0    order     4.0   review
4           2         14         1              2.5    order     NaN      NaN
5           2         15         2              5.0    order     2.0   review
6           3         14         1              2.5    order     NaN      NaN
7           3         17         1              2.5    order     NaN      NaN
8           5         19         1              2.5    order     4.0   review


In [15]:
merged['final_rating'] = np.where(merged['rating'].notna(), merged['rating'], merged['implicit_rating'])
merged['source']=merged['source_y'].fillna(merged['source_x'])
interactions_df=merged[['customerId', 'productId', 'final_rating', 'source','quantity']].copy()
interactions_df=interactions_df.dropna(subset=['final_rating'])
print("Interactions DF:\n", interactions_df)

Interactions DF:
    customerId  productId  final_rating  source  quantity
0           1         13           5.0  review         1
1           1         14           4.0  review         3
2           1         16           2.0  review         1
3           2         13           4.0  review         2
4           2         14           2.5   order         1
5           2         15           2.0  review         2
6           3         14           2.5   order         1
7           3         17           2.5   order         1
8           5         19           4.0  review         1


In [16]:
user_item_matrix = interactions_df.pivot_table(index='customerId', columns='productId', values='final_rating',fill_value=0)
print("User-Item Matrix:\n", user_item_matrix)

User-Item Matrix:
 productId    13   14   15   16   17   19
customerId                              
1           5.0  4.0  0.0  2.0  0.0  0.0
2           4.0  2.5  2.0  0.0  0.0  0.0
3           0.0  2.5  0.0  0.0  2.5  0.0
5           0.0  0.0  0.0  0.0  0.0  4.0


In [17]:
np.random.seed(44)  # Reproducible fakes

print( user_item_matrix.columns.tolist())
products=user_item_matrix.columns.tolist()
fake_ids=np.arange(6, 30)
fake_data = []
real_avg_ratings = interactions_df.groupby('productId')['final_rating'].mean().to_dict()
for cust_id in fake_ids:
    num_interactions = np.random.randint(2, 5)  # 2-4 products per cust
    selected_products = np.random.choice(products, size=num_interactions, replace=False)
    for prod_id in selected_products:
        real_avg = real_avg_ratings.get(prod_id, 3.0)  
        rating = np.clip(np.random.normal(real_avg, 1.0), 1, 5)
        
        quantity = np.random.randint(1, 2) 
        
        fake_data.append({
            'customerId': cust_id,
            'productId': prod_id,
            'final_rating': round(rating, 1), 
            'source': 'synthetic',
            'quantity': quantity
        })

fake_df = pd.DataFrame(fake_data)
print("Sample Fake:\n", fake_df.head())


[13, 14, 15, 16, 17, 19]
Sample Fake:
    customerId  productId  final_rating     source  quantity
0           6         19           5.0  synthetic         1
1           6         15           1.0  synthetic         1
2           7         13           2.2  synthetic         1
3           7         19           2.6  synthetic         1
4           7         15           1.8  synthetic         1


In [18]:
augmented_df = pd.concat([interactions_df, fake_df], ignore_index=True)
print(augmented_df.head(10))
augmented_df=augmented_df.drop_duplicates(subset=['customerId', 'productId'], keep='last')
augmented_df=augmented_df.sort_values(['customerId', 'productId']).reset_index(drop=True)

print(f"Clean Augmented: {len(augmented_df)} pairs")
print(augmented_df.groupby('source').size())

   customerId  productId  final_rating     source  quantity
0           1         13           5.0     review         1
1           1         14           4.0     review         3
2           1         16           2.0     review         1
3           2         13           4.0     review         2
4           2         14           2.5      order         1
5           2         15           2.0     review         2
6           3         14           2.5      order         1
7           3         17           2.5      order         1
8           5         19           4.0     review         1
9           6         19           5.0  synthetic         1
Clean Augmented: 79 pairs
source
order         3
review        6
synthetic    70
dtype: int64


In [19]:
augmented_matrix= augmented_df.pivot_table(
    index='customerId',
    columns='productId',
    values='final_rating',
    fill_value=0)
print("Augmented Matrix Shape:", augmented_matrix.shape)
print(augmented_matrix.head())

Augmented Matrix Shape: (28, 6)
productId    13   14   15   16   17   19
customerId                              
1           5.0  4.0  0.0  2.0  0.0  0.0
2           4.0  2.5  2.0  0.0  0.0  0.0
3           0.0  2.5  0.0  0.0  2.5  0.0
5           0.0  0.0  0.0  0.0  0.0  4.0
6           0.0  0.0  1.0  0.0  0.0  5.0


In [20]:
sparsity=1.0-(augmented_matrix>0).sum().sum()/(augmented_matrix.shape[0]*augmented_matrix.shape[1])
print(f"New Sparsity: {sparsity*100:.1f}%")
augmented_df.to_csv("augmented_interactions.csv", index=False)
print("Saved! Use augmented_df or augmented_matrix for modeling.")

New Sparsity: 53.0%
Saved! Use augmented_df or augmented_matrix for modeling.


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

scaler=MinMaxScaler()
normalized_matrix = scaler.fit_transform(augmented_matrix)
normalized_df=pd.DataFrame(normalized_matrix, index=augmented_matrix.index, columns=augmented_matrix.columns)
print("Normalized Sample (Cust1):\n", normalized_df.loc[1].head())  # Ratings 0-1

In [None]:
norm_array= normalized_df.values
similarity_matrix = cosine_similarity(norm_array)
sim_df=pd.DataFrame(similarity_matrix, index=augmented_matrix.index, columns=augmented_matrix.index)
print("Similarity Sample (Cust1 to others):\n", sim_df.loc[1].sort_values(ascending=False).head())

In [None]:
def get_recommendations(customer_id,matrix=augmented_matrix,sim_df=sim_df,k=5,n=3):
    if customer_id not in matrix.index:
        return "New customer: Recommend popular products (e.g., top avg rated)."
    similar_custs = sim_df.loc[customer_id].sort_values(ascending=False).iloc[1:k+1].index.tolist()
    print(f"Top {k} similar to {customer_id}: {similar_custs}")

    customer_ratings= matrix.loc[customer_id]
    unseen=customer_ratings[customer_ratings==0].index.tolist()
    if not unseen:
        return "No new recs! All seen!"
    
    predictions = {}
    for prod in unseen:
        weighted_sum=0
        sim_sum=0
        for sim_cust in similar_custs:
            if(matrix.loc[sim_cust,prod]>0):
                sim_score=sim_df.loc[customer_id,sim_cust]
                rating=matrix.loc[sim_cust,prod]
                weighted_sum += sim_score * rating
                sim_sum += sim_score
        if sim_sum>0:
            predictions[prod]=weighted_sum/sim_sum
        else:
            predictions[prod]=0
    
    recs=sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:n]
    return [(prod, round(score,2)) for prod, score in recs]

# Test on real cust
recs = get_recommendations(1, n=2)
print(f"Recs for Cust1: {recs}")

# Popular fallback (for cold starts)
popular = augmented_matrix.mean().sort_values(ascending=False).head(3)
print("Global Popular:", popular)

In [21]:
import torch
import torch.nn as nn
from torch.optim import Adam

class RecModel(nn.Module):
    def __init__(self, num_users, num_items, emb_dim=32):
        super(RecModel, self).__init__()
        self.user_emb=nn.Embedding(num_users, emb_dim)
        self.item_emb=nn.Embedding(num_items, emb_dim)
        self.fc=nn.Linear(emb_dim*2, 1)
    def forward(self, user_ids, item_ids):
        user_vecs=self.user_emb(user_ids)
        item_vecs=self.item_emb(item_ids)
        x=torch.cat([user_vecs, item_vecs], dim=-1)
        out=self.fc(x)
        return out.squeeze()

if __name__ == "__main__":
    # Your existing model test (keep it)
    num_users = 29  # Updated: Enough for fake_ids up to 29
    num_items = 17  # From your Step 1
    model = RecModel(num_users, num_items)
    print("Model created! Layers:", model)
    print("Sample forward pass:")
    sample_u = torch.tensor([0])  # User index 0
    sample_i = torch.tensor([2])  # Product index 2
    pred = model(sample_u, sample_i)
    print(f"Predicted rating: {pred.item():.2f}")


    # NEW: Create mappings (real_id → internal_idx) - Updated for larger IDs
    unique_customers = sorted(fake_df['customerId'].unique())  # Only fakes for now; add reals later
    unique_products = sorted(set(products))  # All possible
    user_map = {cust: idx for idx, cust in enumerate(unique_customers)}
    item_map = {prod: idx for idx, prod in enumerate(unique_products)}
    print(f"\nUser map (sample): {dict(list(user_map.items())[:5])}...")  # First 5
    print(f"Item map keys (sample): {list(item_map.keys())[:5]}...")

    # NEW: Prepare training data (list of (u_idx, i_idx, normalized_rating))
    train_data = []
    for _, row in fake_df.iterrows():  # Train on fakes only for test
        u_idx = user_map[row['customerId']]
        i_idx = item_map[row['productId']]
        r_norm = row['final_rating'] / 5.0  # Normalize to [0,1]
        train_data.append((u_idx, i_idx, r_norm))
    print(f"\nTraining data ready: {len(train_data)} samples")
    print("Sample batch:", train_data[:3])  # First 3 tuples

Model created! Layers: RecModel(
  (user_emb): Embedding(29, 32)
  (item_emb): Embedding(17, 32)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)
Sample forward pass:
Predicted rating: -0.09

User map (sample): {6: 0, 7: 1, 8: 2, 9: 3, 10: 4}...
Item map keys (sample): [13, 14, 15, 16, 17]...

Training data ready: 70 samples
Sample batch: [(0, 5, 1.0), (0, 2, 0.2), (1, 0, 0.44000000000000006)]


In [26]:
print("\n=== Training the Model (Fixed) ===")
optimizer = Adam(model.parameters(), lr=0.01)  # Optimizer
criterion = nn.MSELoss()  # Loss: Mean Squared Error

num_epochs = 20

train_size=int(0.9*len(train_data))
val_size=len(train_data)-train_size
train_split,val_split=torch.utils.data.random_split(train_data,[train_size,val_size])

train_tensors=[(torch.tensor([u],dtype=torch.long),torch.tensor([i],dtype=torch.long),
                torch.tensor([r],dtype=torch.long)) for u,i,r in train_split]
val_tensors = [(torch.tensor([u], dtype=torch.long), torch.tensor([i], dtype=torch.long),
                 torch.tensor([r], dtype=torch.float)) for u, i, r in val_split]

def build_dataset(split):
    users, items, ratings = zip(*[
        (torch.tensor(u, dtype=torch.long),
         torch.tensor(i, dtype=torch.long),
         torch.tensor(float(r), dtype=torch.float))
        for (u, i, r) in split
    ])

    return TensorDataset(
        torch.stack(users),
        torch.stack(items),
        torch.stack(ratings)
    )

train_dataset = build_dataset(train_split)
val_dataset = build_dataset(val_split)

batch_size = min(8, len(train_dataset))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=False)

best_val_loss=float('inf')
patience=3
no_improve=0

model.train()  # Enable training mode (gradients on)
for epoch in range(num_epochs):
    train_loss = 0
    for batch in train_loader:
        users,items,targets=batch
        pred = model(users, items).unsqueeze(1)
        loss = criterion(pred, targets.unsqueeze(1))
        loss.backward()
        optimizer.step()
        train_loss+=loss.item()+len(batch)
    avg_train_loss=train_loss/len(train_dataset)

    model.eval()
    val_loss=0
    with torch.no_grad():
        for batch in val_loader:
            users,items,targets=batch
            pred = model(users, items).unsqueeze(1)
            val_loss+=criterion(pred,targets.unsqueeze(1)).item()*len(batch)
    avg_val_loss=val_loss/len(val_dataset)
    model.train()
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    if avg_val_loss<best_val_loss:
        best_val_loss=avg_val_loss
        no_improve=0
    else:
        no_improve+=1
        if no_improve>= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
    model.eval()
    print("Training completed! Model learned embeddings.")

    print("\n=== Adding Negative Sampling for Better Recs ===")
    neg_samples=[]
    num_neg=len(train_data)*2
    for _ in range(num_neg):
        u_idx=np.random.randint(0,num_users)
        i_idx=np.random.randint(0,num_items)
        if(u_idx,i_idx,0.0) not in train_data:
            neg_samples.append((u_idx,i_idx,0.0))
    
    model.train()
    for u,i,r in neg_samples[:100]:
        pred=model(torch.tensor([u],dtype=torch.long),torch.tensor([i],dtype=torch.long)).unsqueeze(0)
        loss=criterion(pred,torch.tensor([r],dtype=torch.float))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    model.eval()
    print(f"Added {len(neg_samples)} negatives-model now penalizes unseen as low-rated")

test_u=6
test_i=15
u_idx=user_map[test_u]
i_idx=item_map[test_i]
with torch.no_grad():
    pred_norm=model(torch.tensor([u_idx],dtype=torch.long),
                     torch.tensor([i_idx],dtype=torch.long)).item()
    pred_rating=pred_norm*5.0
    pred_rating=max(1.0, min(5.0, pred_rating))
print(f"Test Prediction for Cust{test_u}, Prod{test_i}: {pred_rating:.2f}")
if not fake_df[(fake_df['customerId']==test_u) & (fake_df['productId']==test_i)].empty:
    true_rating = fake_df[(fake_df['customerId']==test_u) & (fake_df['productId']==test_i)]['final_rating'].iloc[0]
    print(f"Ground truth rating: {true_rating}")
    print(f"Error: {abs(pred_rating - true_rating):.2f}")
else:
    print("No exact match—model generalizes!")


=== Training the Model (Fixed) ===
Epoch 1/20, Train Loss: 0.4348, Val Loss: 0.2978
Training completed! Model learned embeddings.

=== Adding Negative Sampling for Better Recs ===
Added 140 negatives-model now penalizes unseen as low-rated
Epoch 2/20, Train Loss: 0.4146, Val Loss: 0.1365
Training completed! Model learned embeddings.

=== Adding Negative Sampling for Better Recs ===
Added 140 negatives-model now penalizes unseen as low-rated
Epoch 3/20, Train Loss: 0.4150, Val Loss: 0.0664
Training completed! Model learned embeddings.

=== Adding Negative Sampling for Better Recs ===
Added 140 negatives-model now penalizes unseen as low-rated
Epoch 4/20, Train Loss: 0.4208, Val Loss: 0.0783
Training completed! Model learned embeddings.

=== Adding Negative Sampling for Better Recs ===
Added 140 negatives-model now penalizes unseen as low-rated
Epoch 5/20, Train Loss: 0.4085, Val Loss: 0.0454
Training completed! Model learned embeddings.

=== Adding Negative Sampling for Better Recs ===

In [41]:
def get_recommendations(model,user_map,item_map,df,customer_id,n=3):
    if customer_id not in user_map:
        popular=df.groupby('productId')['final_rating'].mean().sort_values(ascending=False).head(n)
        return[{'product_id':int(pid),'score':float(score)} for pid,score in popular.items()]
    u_idx=user_map[customer_id]
    seen_products=df[df['customerId']==customer_id]['productId'].unique()
    all_products=list(item_map.keys())
    unseen=[p for p in all_products if p not in seen_products]

    if not unseen:
        return []
    
    predictions = {}
    model.eval()
    with torch.no_grad():
        for prod in unseen:
            i_idx=item_map[prod]
            pred_norm=model(torch.tensor([u_idx],dtype=torch.long),
                            torch.tensor([i_idx],dtype=torch.long)).item()
            pred_rating=pred_norm*5.0
            pred_rating=max(1.0,min(5.0,pred_rating))
            predictions[int(prod)]=pred_rating
    
    top_recs=sorted(predictions.items(),key=lambda x:x[1],reverse=True)[:n]
    return [{'product_id':prod,"score":round(score,2)} for prod,score in top_recs]

print("\n=== Generating Recommendations ===")
recs=get_recommendations(model,user_map,item_map,fake_df,customer_id=3,n=6)
print(f"Top 3 recommendations for Customer 4: ")
for rec in recs:
    print(f"Product {rec['product_id']}: Score {rec['score']}")


=== Generating Recommendations ===
Top 3 recommendations for Customer 4: 
Product 13: Score 4.177777777777778
Product 19: Score 3.7999999999999994
Product 14: Score 3.3
Product 17: Score 2.835714285714286
Product 15: Score 2.2714285714285714
Product 16: Score 1.907142857142857
