In [1]:
import os
import random
import re

import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Programmatic Hugging Face Hub login (the shell variant is kept for reference):
# !huggingface-cli login
from huggingface_hub import login

# Pull variables from a local .env file into the process environment, then
# hand the token stored under HUGGINGFACE_TOKEN to the hub client.
load_dotenv()
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
login(token=hf_token)

In [3]:
# Location of the item catalogue CSV. The DATASET_PATH environment variable
# (it can also live in the .env file loaded earlier) overrides the original
# machine-specific default, so the notebook can run outside the author's home
# directory without editing this cell.
dataset_path = os.getenv("DATASET_PATH", "/home/mkuo/code/dataset_m.csv")

# Only the two text columns that are later interpolated into the LLM prompt
# are loaded.
dataset_df = pd.read_csv(dataset_path, usecols=['name', 'description'])

print(dataset_df.shape)
dataset_df.head()


(6219, 2)


Unnamed: 0,name,description
0,Twill Wide-Leg Cargo Pants,A pair of twill pants featuring contrasting to...
1,"Denim for all seasons, 3-length vintage Faded ...",Country of Origin : China(OEM)\nMaterial : 100...
2,Mid-Rise Wide-Leg Cargo Pants,A pair of twill pants featuring a mid-rise wai...
3,Fine layered check blouse,Country of Origin : China(OEM)\nMaterial : Cot...
4,Blena Heart Strap Button Lace Cropped Sleevele...,Country of Origin : Korea/China(OEM)\nMaterial...


In [5]:
## Load the mistral model
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# model_name = "microsoft/phi-2"

# Use the first GPU when one is visible; otherwise fall back to the CPU instead
# of crashing on the `.to("cuda:0")` call below.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Half precision is kept for the GPU path (as in the original cell); on CPU the
# weights are loaded in float32 because half-precision CPU kernels are limited.
model_dtype = torch.float16 if device.startswith("cuda") else torch.float32

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype).to(device)


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.07it/s]


In [31]:
# Synthetic shopper personas. Each entry has a "name" and a free-text "bio"
# describing fashion preferences. Names are used as the column labels of the
# ratings matrix, so every name must be unique across BOTH lists (the original
# data repeated "Morgan" and "Jordan"; the second occurrences were renamed).
females = [
    # Female Personas
    {
        "name": "Alex",
        "bio": "A 28-year-old graphic designer who prefers minimalist styles, neutral colors, and high-quality basics. Loves clean lines and avoids bold patterns.",
    },
    {
        "name": "Morgan",
        "bio": "A 22-year-old college student with a vintage aesthetic. Loves thrifted items, 90s fashion, and unique statement pieces.",
    },
    {
        "name": "Taylor",
        "bio": "A 31-year-old corporate professional who needs polished business attire. Prefers classic cuts, muted colors, and sophisticated pieces.",
    },
    {
        "name": "Casey",
        "bio": "A 24-year-old photographer who loves eclectic, artistic fashion. Enjoys mixing bold prints with vintage finds and doesn’t shy away from colorful statement pieces.",
    },
    {
        "name": "Skylar",
        "bio": "A 29-year-old environmental activist who embraces sustainable fashion. Prefers organic fabrics, earthy tones, and secondhand clothing, often choosing pieces with a minimal environmental footprint.",
    },
    {
        # Renamed from a second "Morgan" to keep persona names unique.
        "name": "Reese",
        "bio": "A 19-year-old social media influencer with a vibrant, trendy style. Loves oversized clothing, graphic tees, neon colors, and staying ahead of fast fashion trends.",
    },
    {
        "name": "Quinn",
        "bio": "A 26-year-old artist with an avant-garde style. Enjoys experimental fashion, often incorporating custom-made pieces, avant-garde designers, and unusual materials into their wardrobe.",
    },
    {
        "name": "Sage",
        "bio": "A 28-year-old yoga instructor who values comfort and flexibility. Prefers soft, breathable fabrics in muted colors and loves layering for both style and practicality.",
    },
    {
        "name": "Avery",
        "bio": "A 30-year-old graphic novelist with a quirky and artistic style. Prefers eclectic, vintage clothing, often mixing bold colors, unusual patterns, and creative accessories.",
    },
    {
        "name": "Blair",
        "bio": "A 34-year-old interior designer who enjoys elegant, sophisticated fashion. Prefers high-quality fabrics like silk and wool, and gravitates toward luxurious, timeless pieces in soft, muted colors.",
    },
    {
        "name": "Harper",
        "bio": "A 27-year-old photographer who enjoys streetwear and casual outfits. Loves oversized sweaters, distressed jeans, and chunky sneakers, blending comfort with urban fashion influences.",
    },
    {
        "name": "Dylan",
        "bio": "A 23-year-old aspiring actor who loves to experiment with bold, dramatic fashion. Enjoys statement pieces like leather jackets, oversized coats, and edgy accessories.",
    },
    {
        "name": "Charlie",
        "bio": "A 38-year-old lawyer who enjoys classic, tailored outfits. Prefers suits, structured blazers, and high-end brands, with a preference for neutral tones and timeless elegance.",
    },
]

males = [
    {
        "name": "Jordan",
        "bio": "A 35-year-old fitness instructor who prioritizes athleisure and performance wear. Prefers bright colors and modern athletic brands.",
    },
    {
        "name": "Sam",
        "bio": "A 27-year-old tech entrepreneur who enjoys futuristic, sleek styles. Prefers smart casual wear with minimalist accessories, and often gravitates toward neutral tones with tech-inspired designs.",
    },
    {
        "name": "Parker",
        "bio": "A 40-year-old chef who values comfort and practicality. Prefers durable, easy-to-maintain clothes, such as denim, aprons, and breathable fabrics, with a penchant for simple, clean lines.",
    },
    {
        "name": "Blake",
        "bio": "A 33-year-old travel blogger who gravitates toward versatile, functional clothing. Prefers outdoor gear, durable fabrics, and items that can easily transition from urban exploration to adventure travel.",
    },
    {
        # Renamed from a second "Jordan" to keep persona names unique.
        "name": "Kai",
        "bio": "A 26-year-old professional gamer who prefers casual, comfortable, and tech-inspired clothing. Loves oversized hoodies, graphic tees, and accessories with a futuristic vibe.",
    },
    {
        "name": "Emerson",
        "bio": "A 32-year-old scientist who prefers practical and functional clothing. Prefers neutral colors, casual yet professional fits, and well-made, comfortable pieces suitable for long hours in the lab.",
    },
    {
        "name": "Riley",
        "bio": "A 25-year-old musician with an edgy style. Loves leather, black clothing, band tees, and anything with an alternative vibe.",
    },
]

personas = females + males

# Guard against reintroducing duplicate names when the lists are edited.
assert len({p["name"] for p in personas}) == len(personas), "persona names must be unique"


In [32]:
def _parse_rating(response):
    """Map raw model text onto the rating scale {-1, 1, ..., 10}.

    Returns None when the text contains no integer at all.
    """
    match = re.search(r'-?\d+', response)
    if match is None:
        return None

    rating = int(match.group())
    if rating > 10:
        # Clamp overshoots (e.g. "11" or "100") to the top of the scale.
        return 10
    if rating < 1:
        # 0 and any negative number are off the 1-10 "like" scale; they are
        # all folded into the single "dislike" value. -1 stays -1.
        return -1
    return rating


def generate_rating(persona, item_name, item_description):
    """Ask the language model how a persona would rate one clothing item.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        persona: dict with ``'name'`` and ``'bio'`` keys.
        item_name: item title inserted into the prompt.
        item_description: item description inserted into the prompt.

    Returns:
        int: a rating in 1..10, or -1 for "dislike". If the model's reply
        contains no integer, a uniformly random value from that same set is
        returned (and a notice is printed).
    """
    # The tokenizer prepends the BOS token itself, so the prompt must not spell
    # out "<s>", and no "</s>" may follow "[/INST]": an end-of-sequence token
    # there would ask the model to continue *after* the conversation ended.
    prompt = f"""[INST] You are helping create realistic synthetic data for fashion recommendations.

    Here is information about a persona:
    Name: {persona['name']}
    Fashion preferences: {persona['bio']}

    And here is a clothing item:
    Name: {item_name}
    Description: {item_description}

    Based on the persona's fashion preferences, would they like this item?
    Rate it on a scale from 1-10 if they would like it, or -1 if they would dislike it.
    Return only the numeric rating. [/INST]"""

    # Tokenize and generate
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=10,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            # No dedicated pad token is configured here; reuse EOS explicitly.
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated, so digits or "[/INST]" text inside the item description can
    # never be mistaken for the model's answer.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    rating = _parse_rating(response)
    if rating is None:
        # If no number found, randomly assign
        print("Randomly assigning because LLM didn't return a number")
        return random.choice([-1] + list(range(1, 11)))
    return rating

In [33]:
def create_sparse_ratings(dataset_df, personas, sparsity=0.9):
    """Build an items x personas ratings matrix in which most cells stay NaN.

    For every persona a random subset of items is drawn and each drawn item is
    rated through ``generate_rating``.

    Args:
        dataset_df: DataFrame with ``'name'`` and ``'description'`` columns;
            rows are addressed by position.
        personas: list of dicts with at least a ``'name'`` key. Column ``i`` of
            the result belongs to ``personas[i]``.
        sparsity: fraction of items left unrated per persona (0..1).

    Returns:
        pd.DataFrame: index ``0..len(dataset_df)-1``, one column per persona
        labelled with the persona's name, NaN where no rating was generated.

    Raises:
        ValueError: if ``sparsity`` is outside ``[0, 1]``.
    """
    if not 0 <= sparsity <= 1:
        raise ValueError(f"sparsity must be between 0 and 1, got {sparsity}")

    n_items = len(dataset_df)

    # Initialize ratings matrix with NaN
    ratings_matrix = pd.DataFrame(
        np.nan,
        index=range(n_items),
        columns=[p['name'] for p in personas]
    )

    # Same number of rated items for every persona; computed once.
    n_ratings = int(n_items * (1 - sparsity))

    # Pull the text columns out once instead of building a row Series per item.
    item_names = dataset_df['name'].tolist()
    item_descriptions = dataset_df['description'].tolist()

    for col_pos, persona in enumerate(personas):
        # Choose the items to rate
        items_to_rate = random.sample(range(n_items), n_ratings)

        for idx in tqdm(items_to_rate, desc=f"Generating ratings for {persona['name']}", position=0, leave=True):
            rating = generate_rating(persona, item_names[idx], item_descriptions[idx])
            # Write by column POSITION: a label-based `.loc[idx, name]` would
            # fill every column sharing that name, letting personas with the
            # same name overwrite each other's ratings.
            ratings_matrix.iat[idx, col_pos] = rating

    return ratings_matrix

In [None]:
sparsity = 0.95

# Seed both random sources used during generation (item sampling via `random`,
# token sampling via torch) so a re-run draws the same items; exact GPU
# sampling may still vary between hardware/library versions.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

ratings_matrix = create_sparse_ratings(dataset_df, personas, sparsity)

# Display stats about the ratings
rated_cells = ratings_matrix.count().sum()
print(f"Total cells: {ratings_matrix.size}")
print(f"Rated cells: {rated_cells}")
print(f"Sparsity: {1 - rated_cells / ratings_matrix.size:.2%}")

# Show distribution of ratings by persona
print("\nRating distribution by persona:")
for col_pos, persona in enumerate(personas):
    name = persona['name']
    # Columns are created in persona order, so select by position; a label
    # lookup would return several columns if two personas share a name.
    ratings = ratings_matrix.iloc[:, col_pos].dropna()
    likes = (ratings > 0).sum()
    dislikes = (ratings == -1).sum()
    print(f"{name}: {len(ratings)} ratings - {likes} likes, {dislikes} dislikes")

# Save the ratings matrix
ratings_matrix.to_csv('sparse_ratings_matrix.csv')

# Display a sample of the ratings matrix
ratings_matrix.head(10)

In [36]:
## Count the cells whose rating equals 1.0 (the lowest "like" score).
## NOTE(review): the previous comment said -1.0 while the code tested 1.0;
## compare against -1.0 instead if the number of dislikes is what is wanted.
(ratings_matrix == 1.0).sum().sum()

78