<a href="https://colab.research.google.com/github/jackysiupuichung/mlops_zoomcamp/blob/main/amazon_product_search_Recommendation_Engines_mlflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pkg_resources
import os
import sys
import subprocess

# === 🔧 Install Missing Packages ===
def ensure_packages():
    """
    Installs any required third-party package that is not already installed.

    Distribution names are compared in canonical form (lowercase, with runs
    of ``-``, ``_`` and ``.`` collapsed to ``-``), so ``sentence_transformers``
    matches an installed ``sentence-transformers`` distribution instead of
    being reported as missing and reinstalled on every call.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    # Local imports keep the cell self-contained and avoid the deprecated
    # pkg_resources API.
    import re
    from importlib import metadata

    def _canonical(name):
        return re.sub(r"[-_.]+", "-", name).lower()

    required = {'sentence_transformers', 'annoy', 'mlflow', 'pytorch-lightning'}
    installed = {
        _canonical(dist.metadata['Name'])
        for dist in metadata.distributions()
        if dist.metadata['Name']  # skip broken installs without a name
    }
    missing = {pkg for pkg in required if _canonical(pkg) not in installed}
    if missing:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing])
        print(f"✅ Installed missing packages: {missing}")

ensure_packages()

  import pkg_resources


✅ Installed missing packages: {'sentence_transformers'}


In [None]:
# File: data_ingestion/setup.py

import os
import subprocess
import gc
import torch
import pandas as pd
import hashlib
import mlflow


def setup_environment():
    """
    Applies the display defaults used throughout the notebook.

    Torch tensors are printed without scientific notation and pandas is told
    not to truncate the contents of wide columns.
    """
    torch.set_printoptions(sci_mode=False)
    pd.options.display.max_colwidth = None
    print("✅ Environment setup complete.")


def download_amazon_query_product_dataset(download_path="data/"):
    """
    Downloads the Amazon Query Product Search dataset from Kaggle using the Kaggle CLI
    and records the downloaded file in MLflow.

    When the expected CSV is present after the download, its path, SHA256 hash
    and row count are logged to MLflow and the file is uploaded as an artifact.
    When it is absent, only a warning is printed.

    Args:
        download_path (str): The path where the dataset should be stored and unzipped.

    Raises:
        subprocess.CalledProcessError: If the Kaggle CLI exits with a non-zero status.
    """
    os.makedirs(download_path, exist_ok=True)
    print(f"📦 Downloading dataset to: {download_path}")
    # check=True makes a failed download raise instead of falling through to
    # the success message below.
    subprocess.run([
        "kaggle", "datasets", "download",
        "-d", "abhishekmungoli/amazon-query-product-search",
        "-p", download_path, "--unzip"
    ], check=True)
    print("✅ Dataset downloaded and unzipped.")

    # Log dataset location and hash to MLflow
    dataset_file = os.path.join(download_path, "train_v0.1.csv")  # Example file, adjust as needed
    if not os.path.exists(dataset_file):
        print(f"⚠️ Warning: Expected dataset file not found at {dataset_file}")
        return

    data_hash = hash_file(dataset_file)
    mlflow.log_param("dataset_path", dataset_file)
    mlflow.log_param("data_hash", data_hash)
    mlflow.log_artifact(dataset_file, artifact_path="data")
    df = pd.read_csv(dataset_file)
    mlflow.log_metric("total_rows", len(df))
    print(f"🔐 Data hash logged: {data_hash}")


def set_working_directory(path):
    """
    Makes ``path`` the current working directory of the process.

    Args:
        path (str): The path to switch to.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    # Guard clause: bail out before touching the process state.
    if not os.path.exists(path):
        raise FileNotFoundError(f"Directory {path} does not exist")

    os.chdir(path)
    print(f"📂 Changed working directory to: {os.getcwd()}")


def clean_memory():
    """
    Runs the garbage collector and, when CUDA is available, empties the
    CUDA cache.
    """
    gc.collect()
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
    print("🧹 Memory cleaned up.")


def hash_file(filepath, block_size=65536):
    """
    Compute the SHA256 digest of a file, reading it in fixed-size chunks.

    Args:
        filepath (str): Path to the file to hash.
        block_size (int): Number of bytes requested per read.

    Returns:
        str: SHA256 hex digest of the file.
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as stream:
        while True:
            chunk = stream.read(block_size)
            if chunk == b'':
                # An empty read marks end of file.
                break
            digest.update(chunk)
    return digest.hexdigest()


In [None]:
import os
import sys
import torch
import subprocess
import numpy as np
from sentence_transformers import SentenceTransformer

# === 🔧 Install Missing Packages ===
def ensure_packages():
    """
    Installs any required third-party package that is not already installed.

    Distribution names are compared in canonical form (lowercase, with runs
    of ``-``, ``_`` and ``.`` collapsed to ``-``), so ``sentence_transformers``
    matches an installed ``sentence-transformers`` distribution instead of
    being reported as missing and reinstalled on every call.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command fails.
    """
    # Local imports: this cell does not import pkg_resources itself, and the
    # importlib.metadata API replaces the deprecated pkg_resources one.
    import re
    from importlib import metadata

    def _canonical(name):
        return re.sub(r"[-_.]+", "-", name).lower()

    required = {'sentence_transformers', 'annoy', 'mlflow'}
    installed = {
        _canonical(dist.metadata['Name'])
        for dist in metadata.distributions()
        if dist.metadata['Name']  # skip broken installs without a name
    }
    missing = {pkg for pkg in required if _canonical(pkg) not in installed}
    if missing:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing])
        print(f"✅ Installed missing packages: {missing}")

# === 🧠 Load Model ===
def load_model(model_name='distilbert-base-uncased'):
    """
    Builds a SentenceTransformer for ``model_name`` on CUDA when available,
    otherwise on the CPU, and prints whether a GPU was detected.

    Args:
        model_name (str): Model identifier passed to SentenceTransformer.

    Returns:
        SentenceTransformer: The constructed model.
    """
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print("GPU available:", torch.cuda.is_available())
    encoder = SentenceTransformer(model_name, device=device)
    return encoder

# === 🔄 Embedding Reshaper ===
def reshape_array(input_array: np.ndarray, d: int) -> np.ndarray:
    """
    Reduces each row of a 2-D array to ``d`` values by averaging contiguous
    chunks of columns.

    The columns are split into ``d`` chunks of ``width // d`` columns each;
    the last chunk also absorbs the remainder. The chunk size is derived from
    the actual input width rather than a fixed 768, so encoders with other
    embedding sizes are pooled correctly. Results for 768-wide input are
    unchanged.

    Args:
        input_array (np.ndarray): Array of shape (k, width).
        d (int): Number of output columns.

    Returns:
        np.ndarray: float64 array of shape (k, d).

    Raises:
        ValueError: If ``d`` exceeds the input width (chunks would be empty).
    """
    k, width = input_array.shape
    if d > width:
        raise ValueError(
            f"Cannot reduce {width} columns to {d}: target dimension exceeds input width"
        )

    new_array = np.zeros((k, d))
    chunk = width // d if d > 0 else 0
    for j in range(d):
        start_idx = j * chunk
        # The final chunk runs to the end so no trailing column is dropped.
        end_idx = start_idx + chunk if j < (d - 1) else width
        # One vectorised mean per output column instead of one per element.
        new_array[:, j] = input_array[:, start_idx:end_idx].mean(axis=1)
    return new_array

# === 🔍 Sentence Embedding Function ===
def find_embeddings(lst_product_title, i, step, model, product_dim, maxlen=None):
    """
    Encodes one batch of sentences and pools each embedding to ``product_dim`` values.

    Args:
        lst_product_title: Iterable of sentences; the batch is items ``i`` to ``i + step``.
        i (int): Start position of the batch.
        step (int): Batch size.
        model: Encoder exposing ``encode(sentences, convert_to_tensor=True)``.
        product_dim (int): Number of values kept per embedding (see reshape_array).
        maxlen (int | None): Maximum number of whitespace-separated words kept
            per sentence. Defaults to the word count of the first sentence in
            the batch.

    Returns:
        np.ndarray: Array of shape (batch_size, product_dim).
    """
    sentences = list(lst_product_title)[i:i + step]

    # Nothing to encode: return an empty result of the right width instead of
    # passing an empty batch to the encoder and reshape_array.
    if not sentences:
        return np.zeros((0, product_dim))

    # Default maxlen: word count of first sentence
    if maxlen is None:
        maxlen = len(sentences[0].split())

    # Truncate all sentences to the maxlen
    sentences = [' '.join(s.split()[:maxlen]) for s in sentences]

    embeddings = model.encode(sentences, convert_to_tensor=True).cpu().detach().numpy()
    return reshape_array(embeddings, product_dim)


In [None]:
import pandas as pd
import numpy as np
import mlflow
import joblib
from sklearn.preprocessing import LabelEncoder
import os

def preprocess_product_search_data(
    df_path='Dataset.csv',
    products_path='shopping_queries_dataset_products.parquet',
    positive_label='E',
    sample_fraction=0.01,
    random_state=42,
    output_dir='outputs/'
):
    """
    Builds the sampled query/product tables used by the rest of the pipeline.

    Keeps the rows whose ``esci_label`` equals ``positive_label``, samples a
    fraction of the unique queries, label-encodes queries and product ids,
    joins the product metadata, lowercases the text columns, and writes the
    encoders and both tables to ``output_dir``. Parameters, metrics and output
    files are logged to the active MLflow run.

    Args:
        df_path (str): CSV with at least ``query``, ``product_id`` and ``esci_label`` columns.
        products_path (str): Parquet file with the product metadata columns.
        positive_label (str): ``esci_label`` value treated as a positive pair.
        sample_fraction (float): Fraction of unique positive queries to keep.
        random_state (int | None): Seed for the query sampling; ``None`` samples
            without a fixed seed.
        output_dir (str): Directory for the encoders and CSV outputs.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(df_sample, df_products)``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load dataset
    df = pd.read_csv(df_path)
    df_pos = df[df['esci_label'] == positive_label]
    mlflow.log_metric("rows_positive", len(df_pos))

    # Sample subset of unique queries. The generator is seeded with
    # random_state so the logged seed actually reproduces the sample.
    unique_queries = df_pos['query'].unique()
    rng = np.random.default_rng(random_state)
    sampled_queries = rng.choice(
        unique_queries, int(len(unique_queries) * sample_fraction), replace=False
    )
    df_sample = df_pos[df_pos['query'].isin(sampled_queries)].copy()
    mlflow.log_param("sample_fraction", sample_fraction)
    mlflow.log_param("random_state", random_state)
    mlflow.log_metric("rows_sampled", len(df_sample))
    mlflow.log_metric("unique_queries", df_sample['query'].nunique())

    # Encode queries and product IDs
    query_encoder = LabelEncoder()
    product_encoder = LabelEncoder()
    df_sample['queryid'] = query_encoder.fit_transform(df_sample['query'])
    df_sample['productid'] = product_encoder.fit_transform(df_sample['product_id'])

    query_encoder_path = os.path.join(output_dir, "query_encoder.pkl")
    product_encoder_path = os.path.join(output_dir, "product_encoder.pkl")
    joblib.dump(query_encoder, query_encoder_path)
    joblib.dump(product_encoder, product_encoder_path)
    mlflow.log_artifact(query_encoder_path, artifact_path="encoders")
    mlflow.log_artifact(product_encoder_path, artifact_path="encoders")

    # Load product metadata
    df_products = pd.read_parquet(products_path)
    product_cols = ['product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color', 'product_id']
    df_products = df_products[product_cols].drop_duplicates()
    df_products = df_products.merge(df_sample[['product_id', 'productid']].drop_duplicates(), on='product_id', how='inner')

    # Lowercase text columns (missing values become empty strings)
    text_cols = ['product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color']
    for col in text_cols:
        df_products[col] = df_products[col].apply(lambda x: str(x).lower() if pd.notna(x) else '')
    df_sample['query'] = df_sample['query'].apply(lambda x: str(x).lower() if pd.notna(x) else '')

    # Save and log outputs
    sample_path = os.path.join(output_dir, "df_sample.csv")
    products_out_path = os.path.join(output_dir, "df_products.csv")
    df_sample.to_csv(sample_path, index=False)
    df_products.to_csv(products_out_path, index=False)
    mlflow.log_artifact(sample_path, artifact_path="processed")
    mlflow.log_artifact(products_out_path, artifact_path="processed")

    return df_sample, df_products


In [None]:
import os
import numpy as np
import pandas as pd
import mlflow

def embed_queries(queries_df, model, query_dim=32, step=10000, output_dir='outputs/query_embeddings'):
    """
    Encodes queries in batches and writes one parquet file per batch.

    Each file holds the pooled embedding columns ``q0..q{query_dim-1}``
    followed by ``query`` and ``query_id``, and is logged to MLflow under
    ``query_embeddings``.

    Args:
        queries_df (pd.DataFrame): Frame with ``query`` and ``queryid`` columns.
        model: Encoder exposing ``encode(sentences, convert_to_tensor=True)``.
        query_dim (int): Number of values kept per embedding (see reshape_array).
        step (int): Number of rows per batch.
        output_dir (str): Directory receiving ``query_<n>.parquet`` files.
    """
    os.makedirs(output_dir, exist_ok=True)
    embed_cols = ['q' + str(x) for x in range(query_dim)]

    for cnt, i in enumerate(range(0, len(queries_df), step), start=1):
        batch = queries_df.iloc[i:i + step]
        # Pass a plain list to the encoder: a Series slice keeps its original
        # index labels, so integer indexing into it is label-based and stops
        # lining up with positions for every batch after the first.
        sentences = batch['query'].tolist()
        embeddings = model.encode(sentences, convert_to_tensor=True).cpu().numpy()
        reshaped = reshape_array(embeddings, query_dim)

        # Build the frame column-wise so the embedding columns stay numeric
        # instead of being coerced to object by a mixed-type concatenate.
        df_tmp = pd.DataFrame(reshaped, columns=embed_cols)
        df_tmp['query'] = sentences
        df_tmp['query_id'] = batch['queryid'].to_numpy()

        path = os.path.join(output_dir, f'query_{cnt}.parquet')
        df_tmp.to_parquet(path, index=False)
        mlflow.log_artifact(path, artifact_path='query_embeddings')
        print(f"✅ Saved query batch {cnt}: {path}")

def embed_products(df_products, model, product_dim=32, step=1000, output_dir='outputs/product_embeddings'):
    """
    Encodes the five product text fields in batches and writes one parquet file per batch.

    For every batch the pooled embeddings of title, description, bullet
    points, brand and color are concatenated into ``p0..p{5*product_dim-1}``,
    followed by ``product_id`` and ``productid``. Each file is logged to
    MLflow under ``product_embeddings``.

    The text columns of ``df_products`` are lowercased in place (missing
    values become empty strings).

    Args:
        df_products (pd.DataFrame): Frame with the five text columns plus
            ``product_id`` and ``productid``.
        model: Encoder exposing ``encode(sentences, convert_to_tensor=True)``.
        product_dim (int): Number of values kept per field embedding.
        step (int): Number of rows per batch.
        output_dir (str): Directory receiving ``product_<n>.parquet`` files.
    """
    os.makedirs(output_dir, exist_ok=True)

    # (column, max words kept per sentence)
    text_fields = [
        ('product_title', 50),
        ('product_description', 20),
        ('product_bullet_point', 50),
        ('product_brand', 5),
        ('product_color', 5),
    ]
    embed_cols = ['p' + str(i) for i in range(product_dim * len(text_fields))]

    # Normalise each text column once, up front, rather than re-processing the
    # whole column for every batch.
    texts = {}
    for col, _ in text_fields:
        df_products[col] = df_products[col].apply(lambda x: str(x).lower() if pd.notna(x) else '')
        texts[col] = df_products[col].tolist()

    for cnt, i in enumerate(range(0, len(df_products), step), start=1):
        embeds = [
            find_embeddings(texts[col], i, step, model, product_dim, maxlen)
            for col, maxlen in text_fields
        ]

        batch = df_products.iloc[i:i + step]
        # Column-wise assembly keeps embedding columns numeric instead of
        # coercing everything to object through a mixed-type concatenate.
        df_tmp = pd.DataFrame(np.concatenate(embeds, axis=1), columns=embed_cols)
        df_tmp['product_id'] = batch['product_id'].to_numpy()
        df_tmp['productid'] = batch['productid'].to_numpy()

        path = os.path.join(output_dir, f'product_{cnt}.parquet')
        df_tmp.to_parquet(path, index=False)
        mlflow.log_artifact(path, artifact_path='product_embeddings')
        print(f"✅ Saved product batch {cnt}: {path}")

In [None]:
import pandas as pd
import numpy as np
from annoy import AnnoyIndex
import mlflow

def build_annoy_index(embeddings: np.ndarray, metric: str = 'angular', n_trees: int = 20):
    """
    Creates an Annoy index over the rows of ``embeddings``.

    Row ``r`` of ``embeddings`` is stored under item id ``r``.

    Args:
        embeddings (np.ndarray): 2-D array, one vector per row.
        metric (str): Distance metric passed to AnnoyIndex.
        n_trees (int): Number of trees to build.

    Returns:
        AnnoyIndex: The built index.
    """
    vector_size = embeddings.shape[1]
    annoy_index = AnnoyIndex(vector_size, metric)
    for item_id, row in enumerate(embeddings):
        annoy_index.add_item(item_id, row)
    annoy_index.build(n_trees)
    print(f"✅ Annoy index built with {n_trees} trees")
    return annoy_index

def get_similar_candidates(index, embeddings: np.ndarray, top_k: int = 100):
    """
    Looks up the nearest neighbours of every vector, excluding the vector itself.

    Assumes row ``r`` of ``embeddings`` is the vector stored in ``index`` under
    item id ``r`` (as done by build_annoy_index). The item is removed from its
    own neighbour list by id rather than by position, because an approximate
    search does not guarantee the query item comes back first (ties,
    duplicate vectors).

    Args:
        index: Object exposing ``get_nns_by_vector(vector, n, include_distances=True)``.
        embeddings (np.ndarray): Query vectors, one per row.
        top_k (int): Number of neighbours requested per vector; at most
            ``top_k - 1`` are returned once the item itself is excluded.

    Returns:
        list[list[tuple]]: For each row, ``(item_id, distance)`` pairs in the
        order returned by the index.
    """
    keep = max(top_k - 1, 0)
    similar_candidates = []
    for item_id, vector in enumerate(embeddings):
        indices, distances = index.get_nns_by_vector(vector, top_k, include_distances=True)
        neighbours = [
            (idx, dist) for idx, dist in zip(indices, distances) if idx != item_id
        ]
        # If the item was not among the results, drop the farthest neighbour
        # so every list still has at most top_k - 1 entries.
        similar_candidates.append(neighbours[:keep])
    return similar_candidates

def construct_triplets(df_pos: pd.DataFrame, similar_candidates: list):
    """
    Builds (query, positive product, negative candidates) rows.

    For every positive product of a query, the negatives are that product's
    similar candidates minus all products that are positive for the same query.

    NOTE(review): ``similar_candidates`` is indexed with ``productid`` values,
    so entry ``p`` must describe the product whose ``productid`` is ``p`` —
    confirm the embedding row order matches before relying on the negatives.

    Args:
        df_pos (pd.DataFrame): Positive pairs with ``queryid`` and ``productid`` columns.
        similar_candidates (list): Per product, a list of ``(item_id, distance)`` pairs.

    Returns:
        pd.DataFrame: Columns ``query_id``, ``product_id``, ``negative_samples``.
    """
    triplets = []
    for user_id, group in df_pos.groupby('queryid'):
        pos_items = group['productid'].tolist()
        # Set lookup keeps the per-candidate membership test constant-time.
        pos_lookup = set(pos_items)
        for pos_item in pos_items:
            candidates = similar_candidates[pos_item]
            negatives = [cand[0] for cand in candidates if cand[0] not in pos_lookup]
            triplets.append((user_id, pos_item, negatives))
    return pd.DataFrame(triplets, columns=['query_id', 'product_id', 'negative_samples'])

def save_triplets(triplets_df: pd.DataFrame, path: str = "outputs/triplets.parquet"):
    """
    Writes the triplets to a parquet file and logs it to MLflow under ``triplet_data``.

    The parent directory of ``path`` is created when missing, so the default
    ``outputs/`` location works on a fresh checkout.

    Args:
        triplets_df (pd.DataFrame): Triplets to persist.
        path (str): Destination parquet file.
    """
    import os  # local import: this cell does not import os itself

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    triplets_df.to_parquet(path, index=False)
    mlflow.log_artifact(path, artifact_path="triplet_data")
    print(f"✅ Triplets saved and logged: {path}")

In [None]:
# Mount Google Drive so files stored there are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Make the Drive backup folder the working directory, so the relative paths
# used by later cells resolve against it. Relies on `os` from an earlier cell.
os.chdir('/content/drive/MyDrive/colab_backup')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# # import mlflow
# # import pandas as pd
# # import torch
# # import os
# # import numpy as np
# # from sentence_transformers import SentenceTransformer

# # from data_ingestion.setup import setup_environment, download_amazon_query_product_dataset
# # from preprocess_product_search_data import preprocess_product_search_data
# # from features.generate_embeddings import embed_queries, embed_products
# # from features.generate_triplets import build_annoy_index, get_similar_candidates, construct_triplets, save_triplets


# # === 🔧 SETUP ===
# mlflow.set_tracking_uri("file:./mlruns")
# mlflow.set_experiment("amazon_query_product_retrieval")

# with mlflow.start_run():
#     # Step 1: Environment setup + download raw data
#     # setup_environment()
#     # # download_amazon_query_product_dataset("data/")

#     # # Step 2: Preprocess data
#     # df_sample, df_products = preprocess_product_search_data(
#     #     df_path="Dataset.csv",
#     #     products_path="shopping_queries_dataset_products.parquet",
#     #     sample_fraction=0.1,
#     #     random_state=42,
#     #     output_dir="outputs/"
#     # )

#     # # Step 3: Generate Embeddings
#     # model = SentenceTransformer('distilbert-base-uncased', device='cuda' if torch.cuda.is_available() else 'cpu')

#     # queries_df = df_sample[['query', 'queryid']].drop_duplicates().reset_index(drop=True)
#     # embed_queries(queries_df=queries_df, model=model, query_dim=32, step=10000, output_dir='outputs/query_embeddings')
#     # embed_products(df_products=df_products, model=model, product_dim=32, step=1000, output_dir='outputs/product_embeddings')

# # Step 4: Load concatenated embeddings
#     # query_embeds = pd.concat([pd.read_parquet(f'outputs/query_embeddings/{f}') for f in os.listdir('outputs/query_embeddings')])
#     # product_embeds = pd.concat([pd.read_parquet(f'outputs/product_embeddings/{f}') for f in os.listdir('outputs/product_embeddings')])

#     # query_embeddings = query_embeds.drop(columns=['query', 'query_id']).values
#     # product_embeddings = product_embeds.drop(columns=['product_id', 'productid']).values

#     # # Step 5: Build index + sample negatives
#     # annoy_index = build_annoy_index(product_embeddings)
#     # similar_candidates = get_similar_candidates(annoy_index, product_embeddings)

#     # triplets_df = construct_triplets(df_sample, similar_candidates)
#     save_triplets(triplets_df)

In [None]:
# import torch
# import torch.nn as nn
# import pytorch_lightning as pl
# import numpy as np
# from sklearn.metrics import ndcg_score, precision_score, recall_score
# from collections import defaultdict
# from sklearn.model_selection import train_test_split
# import random


# class RecommendationModel(pl.LightningModule):
#     def __init__(self, query_embeddings, product_embeddings, learning_rate=1e-3):
#         super().__init__()
#         self.query_embeddings = nn.Embedding.from_pretrained(
#             torch.tensor(query_embeddings, dtype=torch.float32), freeze=True
#         )
#         self.product_embeddings = nn.Embedding.from_pretrained(
#             torch.tensor(product_embeddings, dtype=torch.float32), freeze=True
#         )
#         self.fc = nn.Linear(query_embeddings.shape[1] + product_embeddings.shape[1], 1)
#         self.learning_rate = learning_rate
#         self.validation_step_outputs = []
#         self.test_step_outputs = []
#         self.save_hyperparameters()

#     def forward(self, query_ids, product_ids):
#         query_emb = self.query_embeddings(query_ids)
#         product_emb = self.product_embeddings(product_ids)
#         combined = torch.cat((query_emb, product_emb), dim=1)
#         return self.fc(combined).squeeze()

#     def training_step(self, batch, batch_idx):
#         user_ids, product_ids, labels = batch
#         predictions = self(user_ids, product_ids)
#         loss = nn.BCEWithLogitsLoss()(predictions, labels.float())
#         self.log('train_loss', loss)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         user_ids, product_ids, labels = batch
#         logits = self(user_ids, product_ids)
#         probs = torch.sigmoid(logits)
#         loss = nn.BCEWithLogitsLoss()(logits, labels.float())
#         self.log('val_loss', loss)

#         self.validation_step_outputs.append({
#             'user_ids': user_ids.detach().cpu(),
#             'product_ids': product_ids.detach().cpu(),
#             'labels': labels.detach().cpu(),
#             'probs': probs.detach().cpu()
#         })

#     def on_validation_epoch_end(self):
#         self._log_epoch_metrics(self.validation_step_outputs, prefix="Val")
#         self.validation_step_outputs.clear()

#     def test_step(self, batch, batch_idx):
#         user_ids, product_ids, labels = batch
#         logits = self(user_ids, product_ids)
#         probs = torch.sigmoid(logits)
#         self.test_step_outputs.append({
#             'user_ids': user_ids.detach().cpu(),
#             'product_ids': product_ids.detach().cpu(),
#             'labels': labels.detach().cpu(),
#             'probs': probs.detach().cpu()
#         })

#     def on_test_epoch_end(self):
#         self._log_epoch_metrics(self.test_step_outputs, prefix="Test")
#         self.test_step_outputs.clear()

#     def _log_epoch_metrics(self, outputs, prefix=""):
#         user_pred_dict = defaultdict(list)
#         user_true_dict = defaultdict(set)
#         for output in outputs:
#             for uid, pid, label, score in zip(output['user_ids'], output['product_ids'], output['labels'], output['probs']):
#                 uid, pid = int(uid), int(pid)
#                 user_pred_dict[uid].append((pid, score))
#                 if label == 1:
#                     user_true_dict[uid].add(pid)
#         y_true_grouped = []
#         y_pred_grouped = []
#         for user in user_pred_dict:
#             sorted_preds = sorted(user_pred_dict[user], key=lambda x: x[1], reverse=True)
#             y_pred_grouped.append([pid for pid, _ in sorted_preds])
#             y_true_grouped.append(list(user_true_dict[user]))
#         metrics = calculate_metrics(y_true_grouped, y_pred_grouped, k_values=[5, 10, 15])
#         for key, value in metrics.items():
#             self.log(f"{prefix}/{key}", value, prog_bar=True)

#     def configure_optimizers(self):
#         return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

#     def predict_from_embedding(self, query_emb_float, product_emb_matrix):
#         """
#         Predict top-k scores from float query embedding and full product embedding matrix.

#         Args:
#             query_emb_float: torch.Tensor of shape [1, d]
#             product_emb_matrix: torch.Tensor of shape [N, d]

#         Returns:
#             scores: torch.Tensor of shape [N]
#         """
#         with torch.no_grad():
#             repeated_query = query_emb_float.repeat(product_emb_matrix.shape[0], 1)
#             combined = torch.cat((repeated_query, product_emb_matrix), dim=1)
#             logits = self.fc(combined).squeeze()
#             return torch.sigmoid(logits)


# def calculate_metrics(y_true, y_pred, k_values=[5, 10, 15]):
#     results = {}
#     for k in k_values:
#         y_pred_truncated = [pred[:k] if len(pred) >= k else pred + [0] * (k - len(pred)) for pred in y_pred]
#         y_true_truncated = [true[:k] if len(true) >= k else true + [0] * (k - len(true)) for true in y_true]
#         try:
#             results[f'NDCG_{k}'] = ndcg_score(np.array([y_true_truncated]), np.array([y_pred_truncated]))
#         except ValueError:
#             results[f'NDCG_{k}'] = 0
#         try:
#             results[f'Precision_{k}'] = precision_score(np.array(y_true_truncated).flatten(),
#                                                         np.array(y_pred_truncated).flatten(),
#                                                         average='macro', zero_division=0)
#         except ValueError:
#             results[f'Precision_{k}'] = 0
#         try:
#             results[f'Recall_{k}'] = recall_score(np.array(y_true_truncated).flatten(),
#                                                   np.array(y_pred_truncated).flatten(),
#                                                   average='macro', zero_division=0)
#         except ValueError:
#             results[f'Recall_{k}'] = 0
#     return results


# def explode_triplets(triplets, negatives):
#     exploded_triplets = []
#     for user_id, pos_items, neg_items in triplets:
#         pos_items = [pos_items] if isinstance(pos_items, int) else pos_items
#         for pos_item in pos_items:
#             exploded_triplets.append([user_id, pos_item, 1])
#         for neg_item in random.sample(list(neg_items), negatives):
#             exploded_triplets.append([user_id, neg_item, 0])
#     return exploded_triplets


# def train_val_test_split(triplets, negatives, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42):
#     exploded_data = explode_triplets(triplets, negatives)
#     df_exploded = pd.DataFrame(exploded_data, columns=['user_id', 'item_id', 'label'])
#     train_data_df, temp_data_df = train_test_split(df_exploded, train_size=train_size, random_state=random_state)
#     val_data_df, test_data_df = train_test_split(temp_data_df, train_size=val_size/(val_size + test_size), random_state=random_state)
#     train_data = train_data_df.values.tolist()
#     val_data = val_data_df.values.tolist()
#     test_data = test_data_df.values.tolist()
#     return train_data, val_data, test_data


In [None]:
# import os
# import sys
# import mlflow
# import joblib
# import torch
# import pandas as pd
# from itertools import product
# from torch.utils.data import DataLoader, TensorDataset
# from pytorch_lightning import Trainer
# from pytorch_lightning.callbacks import ModelCheckpoint
# from pytorch_lightning.loggers import TensorBoardLogger
# # from recommendation_model import RecommendationModel, train_val_test_split, calculate_metrics


# def install_dependencies():
#     import subprocess
#     import pkg_resources
#     required = {'mlflow', 'pytorch-lightning'}
#     installed = {pkg.key for pkg in pkg_resources.working_set}
#     missing = required - installed
#     if missing:
#         python = sys.executable
#         subprocess.check_call([python, '-m', 'pip', 'install', *missing])


# def make_loader(data, batch_size=128):
#     return DataLoader(TensorDataset(
#         torch.tensor([x[0] for x in data]),
#         torch.tensor([x[1] for x in data]),
#         torch.tensor([x[2] for x in data])
#     ), batch_size=batch_size, shuffle=True)


# def run_training_pipeline():
#     # Load embeddings and triplets
#     df_triplets = pd.read_parquet('outputs/triplets.parquet')
#     query_embeds = pd.concat([pd.read_parquet(f'outputs/query_embeddings/{f}') for f in os.listdir('outputs/query_embeddings')])
#     product_embeds = pd.concat([pd.read_parquet(f'outputs/product_embeddings/{f}') for f in os.listdir('outputs/product_embeddings')])

#     product_embeddings = product_embeds.sort_values(by='product_id').drop(columns=['product_id', 'productid']).values
#     query_embeddings = query_embeds.sort_values(by='query_id').drop(columns=['query_id', 'query']).values

#     # MLflow setup
#     mlflow.set_tracking_uri("file:./mlruns")
#     mlflow.set_experiment("my_recommendation_experiment")

#     negative_samples_list = [1, 10, 30]
#     learning_rates = [1e-3]
#     max_epoch = 5
#     batch_size = 256

#     for negative_samples, lr in product(negative_samples_list, learning_rates):
#         run_name = f"neg{negative_samples}_lr{lr}"

#         with mlflow.start_run(run_name=run_name):
#             mlflow.log_param("num_negative_samples", negative_samples)
#             mlflow.log_param("learning_rate", lr)
#             mlflow.log_param("max_epochs", max_epoch)
#             mlflow.log_param("data_version", "hard_negative_22042025")

#             train_data, val_data, test_data = train_val_test_split(
#                 df_triplets.values,
#                 negative_samples,
#                 train_size=0.8,
#                 val_size=0.1,
#                 test_size=0.1,
#                 random_state=42
#             )

#             train_loader = make_loader(train_data, batch_size)
#             val_loader = make_loader(val_data, batch_size)
#             test_loader = make_loader(test_data, batch_size)

#             model = RecommendationModel(
#                 query_embeddings=query_embeddings,
#                 product_embeddings=product_embeddings,
#                 learning_rate=lr
#             )

#             tb_logger = TensorBoardLogger("lightning_logs", name=run_name)
#             checkpoint_callback = ModelCheckpoint(monitor='val_loss')

#             trainer = Trainer(
#                 max_epochs=max_epoch,
#                 logger=tb_logger,
#                 callbacks=[checkpoint_callback],
#                 enable_model_summary=True
#             )

#             trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
#             mlflow.pytorch.log_model(model, artifact_path="rec_model")

#             results = trainer.test(model, dataloaders=test_loader)
#             for metric, value in results[0].items():
#                 mlflow.log_metric(metric, value)

#             # Register the model
#             model_uri = f"runs:/{mlflow.active_run().info.run_id}/rec_model"
#             model_name = "RecommendationModel"
#             try:
#                 mlflow.register_model(model_uri, model_name)
#             except Exception:
#                 pass

#             from mlflow.tracking import MlflowClient
#             client = MlflowClient()
#             model_versions = client.get_latest_versions(model_name, stages=["Production"])
#             new_version = client.get_latest_versions(model_name, stages=["None"])[0].version

#             # Get metric to compare
#             new_score = results[0].get("Test/Recall_5", 0)
#             if model_versions:
#                 old_version = model_versions[0].version
#                 old_score = float(client.get_model_version(model_name, old_version).tags.get("Test/Recall_5", 0))
#                 if new_score > old_score:
#                     client.transition_model_version_stage(model_name, new_version, stage="Production")
#                     client.transition_model_version_stage(model_name, old_version, stage="Archived")
#             else:
#                 client.transition_model_version_stage(model_name, new_version, stage="Production")
#             client.set_model_version_tag(model_name, new_version, "Test/Recall_5", str(new_score))


# if __name__ == "__main__":
#     # install_dependencies()
#     run_training_pipeline()


In [None]:
# import mlflow

# client = mlflow.tracking.MlflowClient()
# model_name = "RecommendationModel"

# # Get all versions
# versions = client.get_latest_versions(model_name, stages=["Production"])

# # Demote each Production version (optional: move to Archived)
# for v in versions:
#     client.transition_model_version_stage(
#         name=model_name,
#         version=v.version,
#         stage="Archived"  # or "Staging"
#     )
#     print(f"Version {v.version} transitioned to Archived.")

In [None]:
from flask import Flask, request, jsonify
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import mlflow.pytorch
import joblib
from threading import Thread
import os

# --- Load everything once ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Ranking model from the MLflow registry, switched to inference mode.
rec_model = mlflow.pytorch.load_model("models:/RecommendationModel/Production").to(device)
rec_model.eval()

query_encoder_model = SentenceTransformer('distilbert-base-uncased', device=device)

# Read only the parquet batches, in a deterministic order: os.listdir returns
# entries in arbitrary order and may include unrelated files.
product_embedding_df = pd.concat([pd.read_parquet(
    f'outputs/product_embeddings/{f}') for f in sorted(os.listdir(
        'outputs/product_embeddings')) if f.endswith('.parquet')])

# Sort once and derive both the id table and the embedding matrix from the
# same frame so their rows stay aligned.
_sorted_product_df = product_embedding_df.sort_values(by='product_id')
product_ids = _sorted_product_df[['product_id', 'productid']]
product_embeddings = _sorted_product_df.drop(
            columns=['product_id', 'productid']).values

product_emb_tensor = torch.tensor(product_embeddings, dtype=torch.float32).to(device)

products_df = pd.read_csv('outputs/df_products.csv')
product_encoder = joblib.load('outputs/product_encoder.pkl')
# df_products.csv can hold several rows for one productid. Keep the first so a
# lookup by productid returns exactly one metadata row per id; otherwise
# attaching a fixed-length score column to the lookup fails with a length
# mismatch.
productid_to_metadata = products_df.drop_duplicates(subset='productid').set_index(
    'productid')[['product_id', 'product_title']]
# Row position in the embedding matrix -> encoded productid.
index_to_productid = product_ids.reset_index(drop=True)['productid'].values

query_dim = 32

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# --- Define Flask app ---
app = Flask(__name__)

@app.route('/health', methods=['GET'])
def health():
    """Liveness endpoint: always answers with a fixed ``status: ok`` JSON body."""
    payload = {'status': 'ok'}
    return jsonify(payload)

def _embed_query(query_text):
    """
    Encodes one query string into a float32 tensor of shape [1, query_dim] on ``device``.

    The raw encoder output is pooled with reshape_array, as the single-query
    endpoint has always done, so both endpoints feed the ranking model a
    query vector of the same width.
    """
    raw = query_encoder_model.encode([query_text], convert_to_tensor=True)
    pooled = reshape_array(raw.cpu().numpy(), query_dim)
    return torch.tensor(pooled, dtype=torch.float32).to(device)


def _top_products(query_emb, topk=5):
    """
    Scores every product against ``query_emb`` and returns the best ``topk`` as records.

    Each record holds ``product_id``, ``product_title``, ``real_product_id``
    and ``score``.
    """
    with torch.no_grad():
        # The RecommendationModel.predict_from_embedding definition in this
        # notebook already applies torch.sigmoid, so no second sigmoid here.
        # NOTE(review): confirm the registered Production model matches it.
        scores = rec_model.predict_from_embedding(query_emb, product_emb_tensor).reshape(-1)

    # One topk call for both indices and values; never ask for more items
    # than there are products.
    best = torch.topk(scores, min(topk, scores.numel()))
    top_indices = best.indices.cpu().numpy()
    top_scores = best.values.cpu().numpy()

    top_productids = index_to_productid[top_indices]
    original_product_ids = product_encoder.inverse_transform(top_productids)

    # The metadata index can contain the same productid more than once; keep
    # one row per id so the lookup returns exactly len(top_productids) rows.
    unique_metadata = productid_to_metadata[
        ~productid_to_metadata.index.duplicated(keep='first')
    ]
    results = unique_metadata.loc[top_productids].copy()
    results['real_product_id'] = original_product_ids
    results['score'] = top_scores

    return results.reset_index(drop=True).to_dict(orient='records')


@app.route('/predict', methods=['POST'])
def predict():
    """
    Returns the top products for a single query.

    Expects a JSON body ``{"query": "<text>"}``. Responds with a JSON list of
    product records, 400 when no query is given, or 500 with debugging
    details when scoring fails.
    """
    query_emb = None
    try:
        data = request.get_json(force=True, silent=True)
        query_text = data.get('query') if isinstance(data, dict) else None
        if not query_text:
            return jsonify({'error': 'No query provided'}), 400

        query_emb = _embed_query(query_text)
        return jsonify(_top_products(query_emb))

    except Exception as e:
        import traceback
        tb = traceback.format_exc()

        # NOTE(review): the traceback is returned to the caller for notebook
        # debugging; remove it before exposing this endpoint publicly.
        debug_info = {
            "error": str(e),
            "traceback": tb,
            "query_emb_shape": list(query_emb.shape) if query_emb is not None else "undefined",
            # product_emb_tensor is a module-level global, so read it directly
            # (a locals() check never finds it).
            "product_emb_tensor_shape": list(product_emb_tensor.shape),
            "fc_input_dim": list(rec_model.fc.weight.shape) if hasattr(rec_model, 'fc') else "undefined"
        }

        return jsonify(debug_info), 500


@app.route('/batch_predict', methods=['POST'])
def batch_predict():
    """
    Returns the top products for each query in a list.

    Expects a JSON body ``{"queries": ["<text>", ...]}``. Responds with a JSON
    list of ``{"query", "results"}`` objects, 400 when the list is missing or
    not a list, or 500 with the error message when scoring fails.
    """
    try:
        data = request.get_json(force=True, silent=True)
        queries = data.get('queries') if isinstance(data, dict) else None

        if not queries or not isinstance(queries, list):
            return jsonify({'error': 'No valid queries list provided'}), 400

        batch_results = [
            {
                'query': query_text,
                'results': _top_products(_embed_query(query_text)),
            }
            for query_text in queries
        ]

        return jsonify(batch_results)

    except Exception as e:
        import traceback
        print(traceback.format_exc())
        return jsonify({'error': str(e)}), 500

def run_server():
    """
    Starts the Flask development server on all interfaces, port 5000.

    Debug mode is off: with ``host="0.0.0.0"`` the Werkzeug interactive
    debugger would be reachable from the network, and the request handlers
    already catch and report their own errors. The reloader stays disabled
    because the server is started from a background thread.
    """
    print("🟢 Starting Flask server...")
    app.run(host="0.0.0.0", port=5000, debug=False, use_reloader=False)

In [None]:
from threading import Thread
# Run the Flask server on its own thread so the blocking app.run() call does
# not hold up the notebook cell. The thread is created with Thread defaults.
server_thread = Thread(target=run_server)
server_thread.start()

🟢 Starting Flask server...
 * Serving Flask app '__main__'
 * Debug mode: on


In [None]:
!curl http://localhost:5000/health

 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [25/Apr/2025 12:39:50] "GET /health HTTP/1.1" 200 -


{
  "status": "ok"
}


In [None]:
import requests
# Smoke test: POST one query to the locally running /predict endpoint and
# print the decoded JSON response (results on success, error details otherwise).
response = requests.post("http://localhost:5000/predict", json={"query": "wireless earbuds"})
print(response.json())

INFO:werkzeug:127.0.0.1 - - [25/Apr/2025 12:39:51] "[35m[1mPOST /predict HTTP/1.1[0m" 500 -


{'error': 'Length of values (5) does not match length of index (6)', 'fc_input_dim': [1, 192], 'product_emb_tensor_shape': 'undefined', 'query_emb_shape': [1, 32], 'traceback': 'Traceback (most recent call last):\n  File "<ipython-input-13-d4d07a3840bd>", line 32, in predict\n    results[\'real_product_id\'] = original_product_ids\n    ~~~~~~~^^^^^^^^^^^^^^^^^^^\n  File "/usr/local/lib/python3.11/dist-packages/pandas/core/frame.py", line 4311, in __setitem__\n    self._set_item(key, value)\n  File "/usr/local/lib/python3.11/dist-packages/pandas/core/frame.py", line 4524, in _set_item\n    value, refs = self._sanitize_column(value)\n                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "/usr/local/lib/python3.11/dist-packages/pandas/core/frame.py", line 5266, in _sanitize_column\n    com.require_length_match(value, self.index)\n  File "/usr/local/lib/python3.11/dist-packages/pandas/core/common.py", line 573, in require_length_match\n    raise ValueError(\nValueError: Length of values (