# Low Rank Matrix Completion

In [143]:
import os
import torch
import pandas as pd
import numpy as np
from PIL import Image
import clip
from sklearn.preprocessing import StandardScaler
import hdbscan
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from IPython.display import display, HTML

In [101]:
# load data
excel_path = "/Users/laurali/Downloads/STAT-4830-vllm-project/dataset_sheet.xlsx"
df = pd.read_excel(excel_path, sheet_name='Sheet1')

# If the first column is useless, drop it
df.drop(df.columns[0], axis=1, inplace=True)

df.head()


Unnamed: 0,brand,name,description,price,user
0,Forever 21,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49,vivian
1,gifteabox,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68,megan
2,Forever 21,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49,vivian
3,binary01,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53,megan
4,wonderwonder,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57,megan


In [102]:
#load clip
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)


In [103]:
image_folder = "../dataset/"
df["text_embedding"] = None
df["image_embedding"] = None

In [104]:
for idx, row in df.iterrows():
    # ----------------------------------
    # a) Find the corresponding image file
    # ----------------------------------
    # As per your description, for row i in df, 
    # the image file is named (i+2).jpg or (i+2).png
    image_number = idx + 2
    jpg_path = os.path.join(image_folder, f"{image_number}.jpg")
    png_path = os.path.join(image_folder, f"{image_number}.png")

    # Convert .jpg to .png if needed
    if os.path.isfile(jpg_path) and not os.path.isfile(png_path):
        try:
            img = Image.open(jpg_path).convert("RGB")
            img.save(png_path, "PNG")  # Save as PNG
            os.remove(jpg_path)  # Delete original JPG
            print(f"Converted {jpg_path} → {png_path}")
        except Exception as e:
            print(f"Error converting {jpg_path}: {e}")

    image_path = png_path if os.path.isfile(png_path) else None

    if not image_path:
        print(f"Warning: No image found for row {idx}, expected {image_number}.png")
        continue

    # ----------------------------------
    # b) Load text and image
    # ----------------------------------
    text_str = f"{row['brand']} {row['name']} {row['description']}"
    text_str = text_str[:77]

    # Load and preprocess the image
    raw_image = Image.open(image_path).convert("RGB")
    image_input = preprocess(raw_image).unsqueeze(0).to(device)

    # ----------------------------------
    # c) Encode text and image using CLIP
    # ----------------------------------
    with torch.no_grad():
        # Text encoding
        text_tokens = clip.tokenize([text_str]).to(device)  # batch with 1 item
        text_embeds = model.encode_text(text_tokens)
        # Image encoding
        image_embeds = model.encode_image(image_input)

    # Convert embeddings to CPU and then to list (or NumPy array) for easier storage
    text_embedding = text_embeds[0].cpu().numpy().tolist()
    image_embedding = image_embeds[0].cpu().numpy().tolist()

    # ----------------------------------
    # d) Insert embeddings into the DataFrame
    # ----------------------------------
    df.at[idx, "text_embedding"] = text_embedding
    df.at[idx, "image_embedding"] = image_embedding
    df.at[idx, "image_path"] = image_path

In [105]:
# ----------------------------
# 4) Verify your DataFrame
# ----------------------------
print(df.head())

          brand                                               name  \
0    Forever 21                         Twill Wide-Leg Cargo Pants   
1     gifteabox  Denim for all seasons, 3-length vintage Faded ...   
2    Forever 21                      Mid-Rise Wide-Leg Cargo Pants   
3      binary01                          Fine layered check blouse   
4  wonderwonder  Blena Heart Strap Button Lace Cropped Sleevele...   

                                         description  price    user  \
0  A pair of twill pants featuring contrasting to...  24.49  vivian   
1  Country of Origin : China(OEM)\nMaterial : 100...  25.68   megan   
2  A pair of twill pants featuring a mid-rise wai...  24.49  vivian   
3  Country of Origin : China(OEM)\nMaterial : Cot...  33.53   megan   
4  Country of Origin : Korea/China(OEM)\nMaterial...  24.57   megan   

                                      text_embedding  \
0  [0.2016282081604004, 0.28957486152648926, -0.4...   
1  [0.29881617426872253, 0.0287922360002

In [106]:
df.head(10)

Unnamed: 0,brand,name,description,price,user,text_embedding,image_embedding,image_path
0,Forever 21,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...,24.49,vivian,"[0.2016282081604004, 0.28957486152648926, -0.4...","[0.12176767736673355, 0.12389594316482544, -0....",../dataset/2.png
1,gifteabox,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68,megan,"[0.29881617426872253, 0.028792236000299454, -0...","[0.07662051916122437, 0.6227596402168274, -0.1...",../dataset/3.png
2,Forever 21,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49,vivian,"[0.29192599654197693, 0.23970846831798553, -0....","[0.05716700479388237, 0.08652622997760773, -0....",../dataset/4.png
3,binary01,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...,33.53,megan,"[0.06796179711818695, 0.4492296576499939, -0.2...","[-0.2535107433795929, 0.1870366334915161, -0.3...",../dataset/5.png
4,wonderwonder,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...,24.57,megan,"[0.24646836519241333, -0.13289624452590942, -0...","[-0.08947852998971939, 0.2080671191215515, -0....",../dataset/6.png
5,binary01,Kikuni Frill Midi Skirt,Country of Origin : Korea\nMaterial : Cotton 1...,36.02,megan,"[-0.3107108473777771, 0.12375964969396591, -0....","[-0.589783251285553, 0.2945042550563812, 0.022...",../dataset/7.png
6,Uniqlo,AIRism Cotton Oversized T-Shirt | Half-Sleeve,- Sheer: Not Sheer\n- Fit: Oversized\n- Pocket...,19.9,matt,"[0.23920343816280365, 0.28880682587623596, -0....","[-0.12310713529586792, 0.13516740500926971, 0....",../dataset/8.png
7,Uniqlo,Crew Neck T-Shirt,The Uniqlo U collection is the realization of ...,14.9,matt,"[0.3258145749568939, 0.10569125413894653, -0.3...","[-0.14776739478111267, 0.1866864264011383, -0....",../dataset/9.png
8,Uniqlo,Sweat Oversized Pullover Hoodie,- Sheer: Not Sheer\n- Fit: Relaxed\n- Pockets:...,49.9,matt,"[-0.0857551097869873, 0.15847620368003845, -0....","[-0.2460365891456604, 0.37825092673301697, 0.4...",../dataset/10.png
9,Uniqlo,3D Knit Sweater,- Updated silhouette for a sleeker fit.\n- Sty...,49.9,matt,"[0.1999933123588562, -0.049161288887262344, -0...","[-0.051011890172958374, 0.07367589324712753, 0...",../dataset/11.png


In [107]:
# Drops any row where at least one column is NaN/None
df = df.dropna(axis="index", how="any")

In [108]:
df["text_embedding"].head(10)

0    [0.2016282081604004, 0.28957486152648926, -0.4...
1    [0.29881617426872253, 0.028792236000299454, -0...
2    [0.29192599654197693, 0.23970846831798553, -0....
3    [0.06796179711818695, 0.4492296576499939, -0.2...
4    [0.24646836519241333, -0.13289624452590942, -0...
5    [-0.3107108473777771, 0.12375964969396591, -0....
6    [0.23920343816280365, 0.28880682587623596, -0....
7    [0.3258145749568939, 0.10569125413894653, -0.3...
8    [-0.0857551097869873, 0.15847620368003845, -0....
9    [0.1999933123588562, -0.049161288887262344, -0...
Name: text_embedding, dtype: object

In [109]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

text_embs = np.array(df["text_embedding"].to_list())  # (n, 512)
img_embs  = np.array(df["image_embedding"].to_list()) # (n, 512)

# Scale price
scaler = StandardScaler()
price_scaled = scaler.fit_transform(df[["price"]])  # shape (n, 1)

# Combine features => shape (n, 1025)
features = np.hstack([text_embs, img_embs, price_scaled])


In [110]:
import hdbscan

# Create the HDBSCAN model. 
# You can tune 'min_cluster_size' to control how large a cluster must be 
# to be considered valid. 
# 'min_samples' can help control outlier sensitivity.

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=2,  # e.g. each cluster must have at least 10 points
    min_samples=1,        # adjusts how strictly points are judged as outliers
    metric='euclidean'
)

# Fit and predict cluster labels
cluster_labels = clusterer.fit_predict(features)

# Add them to the DataFrame
df["cluster"] = cluster_labels

# HDBSCAN may label some points as -1 if they are outliers/noise


In [111]:
unique_clusters = np.unique(cluster_labels)
print("Clusters found:", unique_clusters)  # e.g. [0, 1, 2, 3, -1]


Clusters found: [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17]


In [112]:
from pathlib import Path

image_folder = Path("/Users/laurali/Downloads/STAT-4830-vllm-project/dataset")

df["image_path"] = [
    (image_folder / f"{i+2}.png").as_posix()
    for i in df.index
]

In [113]:
df["image_path"][0]

'/Users/laurali/Downloads/STAT-4830-vllm-project/dataset/2.png'

In [114]:
from IPython.display import display, Image
import os

unique_clusters = sorted(df["cluster"].unique())

# for label in unique_clusters:
#     cluster_df = df[df["cluster"] == label]
    
#     print(f"=== Cluster {label} === (Total items: {len(cluster_df)})")
    
#     # Show just a few images from this cluster
#     for _, row in cluster_df.head(5).iterrows():
#         img_path = row["image_path"]
#         if os.path.isfile(img_path):
#             display(Image(filename=img_path, width=200))
#         else:
#             print(f"Image not found: {img_path}")
    
#     print()  # blank line

### Process ratings data

In [115]:
# Read the ratings sheet and rename columns to align with sheet 1
sheet2 = pd.read_excel(excel_path, sheet_name='Copy of Sheet1')
sheet2.drop(sheet2.columns[0], axis=1, inplace=True)
sheet2.rename(
    columns={
        "Matt": "matt",
        "Laura": "Laura",
        "Megan": "megan",
        "Vivian": "vivian"
    },
    inplace=True
)
sheet2.head()

Unnamed: 0,matt,Laura,megan,vivian,Medium,user
0,6.0,3.0,4.0,,4.333333,vivian
1,7.0,,,7.0,7.0,megan
2,,,3.0,,3.0,vivian
3,,,,1.0,1.0,megan
4,,4.0,,,4.0,megan


In [129]:
rating_col = ["matt", "Laura", "megan", "vivian"]
partial_ratings = sheet2[rating_col]
partial_ratings = partial_ratings.reindex(df.index) # shape (62, 4)

# Each row = an item
# Each column = one user's rating (NaN if that user did not rate)
print(partial_ratings.head())
print(partial_ratings.shape)

   matt  Laura  megan  vivian
0   6.0    3.0    4.0     NaN
1   7.0    NaN    NaN     7.0
2   NaN    NaN    3.0     NaN
3   NaN    NaN    NaN     1.0
4   NaN    4.0    NaN     NaN
(62, 4)


In [132]:
# Create the full ratings dataframe containing all items
ratings_df = partial_ratings.copy()

# Fill in 10 for items chosen by user
for i, row in df.iterrows():
    chosen_user = str(row["user"]).strip().lower()
    if chosen_user in rating_col:
        ratings_df.loc[i, chosen_user] = 10.0

print("Final ratings_df shape:", ratings_df.shape)
print(ratings_df.head(10))

Final ratings_df shape: (62, 4)
   matt  Laura  megan  vivian
0   6.0    3.0    4.0    10.0
1   7.0    NaN   10.0     7.0
2   NaN    NaN    3.0    10.0
3   NaN    NaN   10.0     1.0
4   NaN    4.0   10.0     NaN
5   1.0    NaN   10.0     NaN
6  10.0    NaN    NaN     NaN
7  10.0    5.0    NaN     NaN
8  10.0    NaN    NaN     NaN
9  10.0    NaN    6.0     7.0


### Create rating matrix

In [133]:
# R[i, j] is the rating for item i by user j, or NaN if missing.
R = ratings_df.values.astype(np.float32)  # shape (m, n)

# m = number of items
# n = number of users
m, n = R.shape

print("Shape of rating matrix:", R.shape)

Shape of rating matrix: (62, 4)


In [134]:
# known_mask[i, j] = True if R[i,j] is a known rating (not NaN)
known_mask = ~np.isnan(R)

# We will fill the missing entries with 0 just so the numeric array is well-defined,
# but the mask ensures we only compute errors on the known entries.
R_filled = np.nan_to_num(R, nan=0.0)

In [135]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

M_tensor = torch.tensor(R_filled, dtype=torch.float32, device=device)
mask_tensor = torch.tensor(known_mask, dtype=torch.bool, device=device)

### Implement low-rank

Initialize low‐rank factors $U$ and $V$

In [136]:
rank = 3

U = torch.randn(m, rank, device=device, requires_grad=True)
V = torch.randn(n, rank, device=device, requires_grad=True)

Set up optimizer and training loop:

Here we use the Adam optimizer, but could also try plain SGD or any other solver.

In [137]:
optimizer = torch.optim.Adam([U, V], lr=1e-2)
num_epochs = 1000

for epoch in range(num_epochs):
    # Forward: predicted matrix (m x n)
    pred = U @ V.t()
    
    # We only compute error on known entries
    diff = pred - M_tensor
    diff_known = diff[mask_tensor]  # 1D tensor of differences for known entries
    
    # MSE on known entries
    loss = torch.mean(diff_known**2)
    
    # Backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # (Optional) print progress occasionally
    if (epoch+1) % 100 == 0:
        print(f"Epoch {epoch+1}/{num_epochs} - Loss = {loss.item():.4f}")

Epoch 100/1000 - Loss = 25.2943
Epoch 200/1000 - Loss = 2.8642
Epoch 300/1000 - Loss = 0.8047
Epoch 400/1000 - Loss = 0.4378
Epoch 500/1000 - Loss = 0.2814
Epoch 600/1000 - Loss = 0.1839
Epoch 700/1000 - Loss = 0.1296
Epoch 800/1000 - Loss = 0.1050
Epoch 900/1000 - Loss = 0.0951
Epoch 1000/1000 - Loss = 0.0905


In [138]:
# Convert predictions back to a DataFrame
pred_np = pred.detach().cpu().numpy()  # shape (m, n)
predicted_df = pd.DataFrame(pred_np, 
                            index=ratings_df.index, 
                            columns=ratings_df.columns)

### Generate recommendations for users

In [139]:
def get_top_n_recs_for_user(
    user_name,
    ratings_df,     # second sheet with actual user ratings
    predictions_df, # same shape, but predicted ratings
    original_df,      # your df from the first sheet, which has 'image_path'
    n=3
):
    # 1) All predicted ratings for this user
    user_pred = predictions_df[user_name]
    
    # 2) Actual ratings for this user
    user_actual = ratings_df[user_name]
    
    # 3) Identify items that are not chosen or rated by user
    unrated = user_actual.isna() | (user_actual == 0)
    not_chosen = (original_df["user"] != user_name)
    combined_mask = unrated & not_chosen

    # 4) Sort by highest predicted rating
    top_n_idx = user_pred[combined_mask].sort_values(ascending=False).head(n).index
    
    # 5) Build a recommended subset from ratings_df (or from sheet2)
    recs = ratings_df.loc[top_n_idx].copy()
    recs["predicted_rating"] = user_pred.loc[top_n_idx].values
    
    # 6) Join with the first sheet to get brand/name/description/image_path
    #    This works IF ratings_df and df share the same index for each item
    recs = recs.join(original_df[["brand","name","description","price","image_path"]], 
                     how="left")
    
    return recs

In [144]:
for user in ["matt","Laura","megan","vivian"]:
    top_recs = get_top_n_recs_for_user(
        user_name=user,
        ratings_df=ratings_df,
        predictions_df=predicted_df,
        original_df=df,
        n=5
    )
    print(f"Top 5 recommended items for user '{user}':")
    display(top_recs)
    
    html_snippet = '<div style="display:flex; flex-direction:row;">'
    for idx, row in top_recs.iterrows():
        img_path = row.get("image_path")
        if img_path and os.path.isfile(img_path):
            # Add an <img> tag with a bit of spacing
            html_snippet += f'<div style="margin-right:10px;"><img src="{img_path}" width="200"/></div>'
        else:
            # You could skip or add a placeholder
            html_snippet += '<div style="width:200px;height:200px;margin-right:10px;">No image</div>'
    html_snippet += '</div>'
    
    display(HTML(html_snippet))
    # Now display each image, if found
    # for idx, row in top_recs.iterrows():
    #     img_path = row.get("image_path", None)
    #     if img_path and os.path.isfile(img_path):
    #         display(Image(filename=img_path, width=200))
    #     else:
    #         print(f"No image found for index {idx}: {img_path}")

Top 5 recommended items for user 'matt':


Unnamed: 0,matt,Laura,megan,vivian,predicted_rating,brand,name,description,price,image_path
31,,9.0,,10.0,10.52517,White House Black Market,V-Neck Open Stitch Sequin Sweater,The open stitch design of this pullover is jus...,99.0,/Users/laurali/Downloads/STAT-4830-vllm-projec...
44,,,,10.0,9.866011,Champion,"Powerblend Hoodie, Relaxed, Full Embroidered S...",THAT CLASSIC POWERBLEND COMBO\n\nWith its comf...,41.25,/Users/laurali/Downloads/STAT-4830-vllm-projec...
60,,,,10.0,9.60691,Tommy Hilfiger,Slim Fit Zip Polo,About\nTommy Hilfiger women's polo. Designed i...,38.7,/Users/laurali/Downloads/STAT-4830-vllm-projec...
2,,,3.0,10.0,8.365989,Forever 21,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...,24.49,/Users/laurali/Downloads/STAT-4830-vllm-projec...
61,,,,10.0,7.359422,Forever 21,Hooded Varsity Letterman Jacket,Details\nA fleece varsity letterman jacket fea...,20.99,/Users/laurali/Downloads/STAT-4830-vllm-projec...


Top 5 recommended items for user 'Laura':


Unnamed: 0,matt,Laura,megan,vivian,predicted_rating,brand,name,description,price,image_path
61,,,,10.0,9.55954,Forever 21,Hooded Varsity Letterman Jacket,Details\nA fleece varsity letterman jacket fea...,20.99,/Users/laurali/Downloads/STAT-4830-vllm-projec...
1,7.0,,10.0,7.0,8.037586,gifteabox,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...,25.68,/Users/laurali/Downloads/STAT-4830-vllm-projec...
10,10.0,,6.0,,6.85692,Uniqlo,Wide Straight Jeans,- Fit: Loose\n- Silhouette: Straight\n- Pocket...,49.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...
16,2.0,,,10.0,6.744753,Forever 21,Buttoned Denim Shorts,"Crafted from stretch-denim, this pair of mid-r...",4.5,/Users/laurali/Downloads/STAT-4830-vllm-projec...
25,7.0,,10.0,,6.334893,Lewkin,Juliana Washed Wide-Leg Jeans CJ509,"Style : Street, Casual\nOccasion : Back to sch...",48.0,/Users/laurali/Downloads/STAT-4830-vllm-projec...


Top 5 recommended items for user 'megan':


Unnamed: 0,matt,Laura,megan,vivian,predicted_rating,brand,name,description,price,image_path
31,,9.0,,10.0,10.7131,White House Black Market,V-Neck Open Stitch Sequin Sweater,The open stitch design of this pullover is jus...,99.0,/Users/laurali/Downloads/STAT-4830-vllm-projec...
61,,,,10.0,10.429358,Forever 21,Hooded Varsity Letterman Jacket,Details\nA fleece varsity letterman jacket fea...,20.99,/Users/laurali/Downloads/STAT-4830-vllm-projec...
15,10.0,,,3.0,7.101832,Uniqlo,Ultra Stretch DRY-EX Shorts,- Sheer: Not Sheer\n- Fit: Regular\n- Pockets:...,29.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...
16,2.0,,,10.0,4.196191,Forever 21,Buttoned Denim Shorts,"Crafted from stretch-denim, this pair of mid-r...",4.5,/Users/laurali/Downloads/STAT-4830-vllm-projec...
48,,,,,4.142276,Zara,TRF Denim Midi Dress,Midi dress with a straight neckline and off-th...,69.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...


Top 5 recommended items for user 'vivian':


Unnamed: 0,matt,Laura,megan,vivian,predicted_rating,brand,name,description,price,image_path
7,10.0,5.0,,,20.437674,Uniqlo,Crew Neck T-Shirt,The Uniqlo U collection is the realization of ...,14.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...
6,10.0,,,,17.735176,Uniqlo,AIRism Cotton Oversized T-Shirt | Half-Sleeve,- Sheer: Not Sheer\n- Fit: Oversized\n- Pocket...,19.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...
14,10.0,,,,17.023617,Uniqlo,Jersey Overshirt,- Sheer: Not Sheer\n- Fit: Relaxed\n,39.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...
12,10.0,7.0,4.0,,15.348933,Uniqlo,Washable Milano Ribbed Sweater,- The images shown may include colors that are...,49.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...
10,10.0,,6.0,,12.172935,Uniqlo,Wide Straight Jeans,- Fit: Loose\n- Silhouette: Straight\n- Pocket...,49.9,/Users/laurali/Downloads/STAT-4830-vllm-projec...
