In [None]:
# Cell 1 - Install required packages (run once)
%pip install --quiet pandas numpy scikit-learn mlxtend flask requests python-dotenv matplotlib seaborn tqdm
# mlxtend supplies apriori/association rules


In [3]:
# Cell 2 - Imports and global settings
import os
import math
import json
import pickle
from pathlib import Path
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from mlxtend.frequent_patterns import apriori, association_rules

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
tqdm.pandas()

# For LLM calls
import requests
from dotenv import load_dotenv
load_dotenv()  # loads .env if exists

# reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Paths
DATA_PATH = "Online Retail Updated.csv"  # change if necessary
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True)


In [4]:
# Cell 3 - Load data (CSV)
# Use latin1 encoding commonly required for this dataset
df = pd.read_csv(DATA_PATH, encoding='latin1')

print("Raw shape:", df.shape)
df.head(5)


Raw shape: (541909, 8)


Unnamed: 0,InvoiceNo,Product_id,Description,Quantity,InvoiceDate,UnitPrice,customer_id,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [5]:
# Cell 4 - Quick EDA and schema confirmation
df.info()
display(df.describe(include='all').T)
print("\nSample countries:", df['Country'].value_counts().head(10))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   Product_id   541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   customer_id  406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
InvoiceNo,541909.0,25900.0,573585,1114.0,,,,,,,
Product_id,541909.0,4070.0,85123A,2313.0,,,,,,,
Description,540455.0,4223.0,WHITE HANGING HEART T-LIGHT HOLDER,2369.0,,,,,,,
Quantity,541909.0,,,,9.55225,218.081158,-80995.0,1.0,3.0,10.0,80995.0
InvoiceDate,541909.0,23260.0,10/31/2011 14:41,1114.0,,,,,,,
UnitPrice,541909.0,,,,4.611114,96.759853,-11062.06,1.25,2.08,4.13,38970.0
customer_id,406829.0,,,,15287.69057,1713.600303,12346.0,13953.0,15152.0,16791.0,18287.0
Country,541909.0,38.0,United Kingdom,495478.0,,,,,,,



Sample countries: Country
United Kingdom    495478
Germany             9495
France              8557
EIRE                8196
Spain               2533
Netherlands         2371
Belgium             2069
Switzerland         2002
Portugal            1519
Australia           1259
Name: count, dtype: int64


In [6]:
# Cell 5 - Basic cleaning & preprocessing
# - strip strings
# - convert dates
# - drop exact duplicate rows if exist
# - ensure customer_id numeric (some missing)
df['Description'] = df['Description'].astype(str).str.strip()
df['Product_id'] = df['Product_id'].astype(str).str.strip()
df['InvoiceNo'] = df['InvoiceNo'].astype(str).str.strip()

# convert InvoiceDate to datetime (coerce errors)
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')

# Drop duplicates
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Dropped {before-after} duplicate rows")

# Observe negative quantity / unitprice -> returns or errors
print("Negative qty count:", (df['Quantity'] < 0).sum())
print("Negative price count:", (df['UnitPrice'] < 0).sum())

# Keep a cleaned copy where Quantity>0 and UnitPrice>0 for "valid sales" (we'll keep raw df for returns analysis)
sales = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)].copy()
print("Valid sales shape:", sales.shape)


Dropped 5268 duplicate rows
Negative qty count: 10587
Negative price count: 2
Valid sales shape: (524878, 8)


In [7]:
# Cell 6 - Add TotalSales and helpful columns
sales['TotalSales'] = sales['Quantity'] * sales['UnitPrice']
sales['InvoiceMonth'] = sales['InvoiceDate'].dt.to_period('M')
sales['InvoiceDay'] = sales['InvoiceDate'].dt.date


In [8]:
# Cell 7 - Basic summaries
total_revenue = sales['TotalSales'].sum()
num_customers = int(sales['customer_id'].nunique())
num_products = int(sales['Product_id'].nunique())
num_transactions = int(sales['InvoiceNo'].nunique())

print(f"Total revenue (valid sales): {total_revenue:,.2f}")
print("Unique customers (valid sales):", num_customers)
print("Unique products:", num_products)
print("Unique transactions:", num_transactions)


Total revenue (valid sales): 10,642,110.80
Unique customers (valid sales): 4338
Unique products: 3922
Unique transactions: 19960


In [9]:
# Cell 8 - Prepare product-level metadata
# We'll create a product dataframe with aggregated fields
product_df = (
    sales.groupby(['Product_id', 'Description'], as_index=False)
    .agg(total_quantity=('Quantity', 'sum'),
         total_revenue=('TotalSales', 'sum'),
         unitprice_mean=('UnitPrice', 'mean'),
         transactions=('InvoiceNo', 'nunique'))
)
product_df = product_df.sort_values('total_revenue', ascending=False)
product_df.head(10)


Unnamed: 0,Product_id,Description,total_quantity,total_revenue,unitprice_mean,transactions
4147,DOT,DOTCOM POSTAGE,706,206248.77,292.137068,706
1340,22423,REGENCY CAKESTAND 3 TIER,13851,174156.54,13.983936,1988
2665,23843,"PAPER CRAFT , LITTLE BIRDIE",80995,168469.6,2.08,1
3637,85123A,WHITE HANGING HEART T-LIGHT HOLDER,37580,104284.24,3.116159,2189
2874,47566,PARTY BUNTING,18283,99445.23,5.797928,1685
3616,85099B,JUMBO BAG RED RETROSPOT,48371,94159.81,2.486197,2089
2120,23166,MEDIUM CERAMIC TOP STORAGE JAR,78033,81700.92,1.46848,247
4150,POST,POSTAGE,3150,78101.88,31.076581,1126
4148,M,Manual,6984,77750.27,234.489652,289
2026,23084,RABBIT NIGHT LIGHT,30739,66870.03,2.386401,994


In [10]:
# Cell 9 - Apriori: prepare transaction-product one-hot encoding
# We will create a basket per InvoiceNo (transaction-level)
basket = sales.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().fillna(0)
# Convert to 1/0 for apriori (presence/absence)
basket_binary = basket.applymap(lambda x: 1 if x > 0 else 0)

print("Basket shape:", basket_binary.shape)


  basket_binary = basket.applymap(lambda x: 1 if x > 0 else 0)


Basket shape: (19960, 4015)


In [12]:
# Example: keep only products that appear in at least min_txn transactions
min_txn = 200   # tune: 200 means product appears in >=200 invoices
product_freq = sales.groupby('Description')['InvoiceNo'].nunique().sort_values(ascending=False)
keep_products = product_freq[product_freq >= min_txn].index.tolist()
print("Products before:", sales['Description'].nunique(), "-> after filter:", len(keep_products))

# Build the basket only for filtered products
basket_small = (sales[sales['Description'].isin(keep_products)]
                .groupby(['InvoiceNo', 'Description'])['Quantity'].sum()
                .unstack(fill_value=0))
# Convert to boolean (True/False) — important to avoid deprecation warning
basket_binary_small = (basket_small > 0).astype(bool)

from mlxtend.frequent_patterns import apriori, association_rules
freq_items = apriori(basket_binary_small, min_support=0.01, use_colnames=True)
rules = association_rules(freq_items, metric="lift", min_threshold=1.2)
rules[['antecedents','consequents','support','confidence','lift']].head()


Products before: 4015 -> after filter: 809


Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(6 RIBBONS RUSTIC CHARM),(DOTCOM POSTAGE),0.010554,0.211518,5.734064
1,(DOTCOM POSTAGE),(6 RIBBONS RUSTIC CHARM),0.010554,0.286119,5.734064
2,(6 RIBBONS RUSTIC CHARM),(JAM MAKING SET PRINTED),0.012331,0.24712,4.070256
3,(JAM MAKING SET PRINTED),(6 RIBBONS RUSTIC CHARM),0.012331,0.203098,4.070256
4,(6 RIBBONS RUSTIC CHARM),(JAM MAKING SET WITH JARS),0.010816,0.216754,3.664711


In [13]:
# Cell 11 - Content-based: TF-IDF on product descriptions
# Build a product-level DataFrame 'product_df' if you haven't already,
# using the filtered products (keep_products) or the full sales data as appropriate.
product_df = (
    sales[sales['Description'].isin(keep_products)]
    .groupby(['Product_id','Description'], as_index=False)
    .agg(total_quantity=('Quantity','sum'),
         total_revenue=('Quantity','sum'))  # total_revenue will be Quantity; replace if you prefer UnitPrice*Quantity 
)

# Fill any NaN descriptions safely
product_df['Description'] = product_df['Description'].fillna('').astype(str)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words='english')
product_tfidf = tfidf.fit_transform(product_df['Description'].values)

# maps for indexing
prod_idx_to_id = product_df['Product_id'].values
prod_id_to_idx = {pid: i for i, pid in enumerate(prod_idx_to_id)}

# product-product content similarity (cosine)
product_product_sim = cosine_similarity(product_tfidf, product_tfidf)

print("Product DF shape:", product_df.shape)
print("TF-IDF matrix shape:", product_tfidf.shape)


Product DF shape: (842, 4)
TF-IDF matrix shape: (842, 824)


In [14]:
# Cell 12 - Collaborative Filtering: build user-item matrix (using filtered product set)
# We'll pivot to customers x products (quantities)
user_item = sales[sales['Description'].isin(keep_products)].pivot_table(
    index='customer_id', columns='Product_id', values='Quantity', aggfunc='sum', fill_value=0
)

print("User-item matrix shape:", user_item.shape)
# Save item list for references
item_ids = list(user_item.columns)
itemid_to_itemidx = {pid: idx for idx, pid in enumerate(item_ids)}


User-item matrix shape: (4285, 805)


In [15]:
# Cell 13 - Item-item similarity via user interactions (cosine)
# Use sklearn's normalize for numerical stability
from sklearn.preprocessing import normalize
import numpy as np

# item_user_matrix: items x users (rows = items)
item_user_matrix = user_item.T.values  # shape (num_items, num_users)
# normalize rows
item_user_norm = normalize(item_user_matrix, axis=1)
# compute dense similarity matrix (watch memory) — ok on filtered set
item_item_sim_matrix = item_user_norm.dot(item_user_norm.T)

print("Item-item similarity matrix shape:", item_item_sim_matrix.shape)


Item-item similarity matrix shape: (805, 805)


In [16]:
# Cell 14 - Utility functions: top-k similar products by content and by item-CF
def top_k_similar_by_content(product_id, k=10):
    if product_id not in prod_id_to_idx:
        return []
    idx = prod_id_to_idx[product_id]
    sims = product_product_sim[idx]
    top_idxs = np.argsort(-sims)[1:k+1]  # skip self at 0
    return [(product_df.iloc[i]['Product_id'], product_df.iloc[i]['Description'], float(sims[i])) for i in top_idxs]

def top_k_similar_by_item_cf(product_id, k=10):
    if product_id not in itemid_to_itemidx:
        return []
    idx = itemid_to_itemidx[product_id]
    sims = item_item_sim_matrix[idx]
    top_idxs = np.argsort(-sims)[1:k+1]
    return [(item_ids[i], product_df.loc[product_df['Product_id']==item_ids[i],'Description'].values[0], float(sims[i])) for i in top_idxs]

# quick check
sample_pid = product_df['Product_id'].iloc[0]
print("Sample product:", sample_pid, product_df['Description'].iloc[0])
print("Top by content:", top_k_similar_by_content(sample_pid, 5))
print("Top by item-CF:", top_k_similar_by_item_cf(sample_pid, 5))


Sample product: 10133 COLOURING PENCILS BROWN TUBE
Top by content: [('10135', 'COLOURING PENCILS BROWN TUBE', 1.0), ('22561', 'WOODEN SCHOOL COLOURING SET', 0.23401586019336376), ('22422', 'TOOTHPASTE TUBE PEN', 0.21708663376319695), ('20974', '12 PENCILS SMALL TUBE SKULL', 0.21143044143287065), ('20978', '36 PENCILS TUBE SKULLS', 0.20654324250431502)]
Top by item-CF: [('10135', 'COLOURING PENCILS BROWN TUBE', 0.2244662249141693), ('22208', 'WOOD STAMP SET THANK YOU', 0.19421218332471438), ('84596B', 'SMALL DOLLY MIX DESIGN ORANGE BOWL', 0.15318544098560233), ('23191', 'BUNDLE OF 3 RETRO NOTE BOOKS', 0.15077361449894827), ('22996', 'TRAVEL CARD WALLET VINTAGE TICKET', 0.1425250187860388)]


In [17]:
# Cell 15 - User-level recommendation functions: item-CF and content-based aggregation
def recommend_item_based_cf(customer_id, top_n=10):
    if customer_id not in user_item.index:
        return []
    user_vector = user_item.loc[customer_id].values  # shape = (num_items,)
    # score each item by similarity to items purchased weighted by user quantities
    scores = user_vector.dot(item_item_sim_matrix)  # shape num_items
    # mask out already purchased items
    purchased_mask = user_vector > 0
    scores[purchased_mask] = -np.inf
    top_idxs = np.argsort(-scores)[:top_n]
    results = []
    for idx in top_idxs:
        pid = item_ids[idx]
        desc = product_df.loc[product_df['Product_id']==pid, 'Description'].values[0]
        results.append({'Product_id': pid, 'Description': desc, 'score': float(scores[idx])})
    return results

def recommend_content_based(customer_id, top_n=10, top_k_past=10):
    if customer_id not in user_item.index:
        return []
    # pick the customer's most frequently bought products
    user_series = user_item.loc[customer_id]
    purchased = user_series[user_series > 0].sort_values(ascending=False)
    if purchased.empty:
        return []
    top_purchased = purchased.index[:top_k_past].tolist()
    # aggregate tf-idf similarities
    agg_scores = np.zeros(product_product_sim.shape[0])
    for pid in top_purchased:
        if pid in prod_id_to_idx:
            idx = prod_id_to_idx[pid]
            agg_scores += product_product_sim[idx]
    # remove already purchased
    purchased_set = set(purchased.index)
    recs = []
    order = np.argsort(-agg_scores)
    for i in order:
        pid = product_df.iloc[i]['Product_id']
        if pid in purchased_set:
            continue
        recs.append({'Product_id': pid, 'Description': product_df.iloc[i]['Description'], 'score': float(agg_scores[i])})
        if len(recs) >= top_n:
            break
    return recs


In [18]:
# Cell 16 - Build apriori rule lookup (convert frozensets to tuples) for quick online scoring
from collections import defaultdict
rules_processed = []
for _, row in rules.iterrows():
    antecedents = tuple(sorted(list(row['antecedents'])))
    consequents = tuple(sorted(list(row['consequents'])))
    rules_processed.append({
        'antecedents': antecedents,
        'consequents': consequents,
        'support': float(row['support']),
        'confidence': float(row['confidence']),
        'lift': float(row['lift'])
    })
# index rules by length of antecedent to speed up subset checks
rules_by_len = defaultdict(list)
for r in rules_processed:
    rules_by_len[len(r['antecedents'])].append(r)

print("Processed", len(rules_processed), "rules and indexed by antecedent length.")


Processed 4080 rules and indexed by antecedent length.


In [19]:
# Cell 17 - Hybrid recommender: combine CF, content, and apriori-boost
import numpy as np

def apriori_boost_for_history(user_history_descriptions):
    """Return dictionary mapping Description -> aggregated apriori boost"""
    user_set = set([d.strip() for d in user_history_descriptions if isinstance(d, str)])
    candidate_boost = defaultdict(float)
    for r in rules_processed:
        if set(r['antecedents']).issubset(user_set):
            boost = r['confidence'] * r['lift']  # tunable formula
            for cons in r['consequents']:
                candidate_boost[cons] += boost
    return candidate_boost

def recommend_hybrid(customer_id, top_n=10, weights={'cf':0.5,'content':0.4,'apriori':0.1}):
    # fallback popular if no history
    if customer_id not in user_item.index:
        top_pop = product_df.sort_values('total_revenue', ascending=False).head(top_n)
        return [{'Product_id': r['Product_id'], 'Description': r['Description'], 'score': float(r['total_revenue'])} for _, r in top_pop.iterrows()]
    # obtain candidate lists
    cf_candidates = recommend_item_based_cf(customer_id, top_n=500)
    content_candidates = recommend_content_based(customer_id, top_n=500)
    cf_map = {c['Product_id']: c['score'] for c in cf_candidates}
    content_map = {c['Product_id']: c['score'] for c in content_candidates}
    candidates = set(list(cf_map.keys()) + list(content_map.keys()))
    # normalize scores
    def normalize_map(m):
        if not m:
            return {}
        vals = np.array(list(m.values()))
        minv, maxv = vals.min(), vals.max()
        if maxv - minv <= 0:
            return {k: 0.0 for k in m.keys()}
        return {k: (v - minv)/(maxv - minv) for k, v in m.items()}
    cf_norm = normalize_map(cf_map)
    content_norm = normalize_map(content_map)
    # compute apriori boosts
    user_history = sales[sales['customer_id']==customer_id]['Description'].unique().tolist()
    apriori_map = apriori_boost_for_history(user_history)
    # score combination
    scored = []
    for pid in candidates:
        desc = product_df.loc[product_df['Product_id']==pid,'Description'].values[0]
        score = weights['cf']*cf_norm.get(pid,0.0) + weights['content']*content_norm.get(pid,0.0) + weights['apriori']*apriori_map.get(desc,0.0)
        scored.append({'Product_id': pid, 'Description': desc, 'score': float(score)})
    scored = sorted(scored, key=lambda x: -x['score'])[:top_n]
    return scored

# quick run
sample_cid = user_item.index[0]
print("Hybrid recs for sample customer:", sample_cid)
recommend_hybrid(sample_cid, top_n=5)


Hybrid recs for sample customer: 12346.0


[{'Product_id': '23165',
  'Description': 'LARGE CERAMIC TOP STORAGE JAR',
  'score': 0.8926262007317242},
 {'Product_id': '23167',
  'Description': 'SMALL CERAMIC TOP STORAGE JAR',
  'score': 0.6254960044126419},
 {'Product_id': '22652',
  'Description': 'TRAVEL SEWING KIT',
  'score': 0.41539605833640836},
 {'Product_id': '21984',
  'Description': 'PACK OF 12 PINK PAISLEY TISSUES',
  'score': 0.3978062317433716},
 {'Product_id': '21980',
  'Description': 'PACK OF 12 RED RETROSPOT TISSUES',
  'score': 0.38056172330465543}]

In [20]:
# Cell 18 - Offline evaluation (approximate): precision@K on a small sample of users
from random import sample
def precision_at_k(recommended, actual, k=5):
    rec_ids = [r['Product_id'] for r in recommended[:k]]
    if not rec_ids:
        return 0.0
    actual_set = set(actual)
    return sum(1 for r in rec_ids if r in actual_set) / k

# sample customers with at least 2 transactions
customers_with_history = sales.groupby('customer_id')['InvoiceNo'].nunique()
eligible_customers = customers_with_history[customers_with_history >= 2].index.tolist()
sample_customers = sample(eligible_customers, min(200, len(eligible_customers)))

results = {'cf':[], 'content':[], 'hybrid':[]}
for cid in sample_customers:
    user_txns = sales[sales['customer_id']==cid].sort_values('InvoiceDate')
    if len(user_txns) < 2:
        continue
    train_products = user_txns.iloc[:-1]['Product_id'].unique().tolist()
    test_products = user_txns.iloc[-1:]['Product_id'].unique().tolist()
    # naive: use current recommenders (they depend on full data; this is an approximation)
    rec_cf = recommend_item_based_cf(cid, top_n=5)
    rec_content = recommend_content_based(cid, top_n=5)
    rec_hybrid = recommend_hybrid(cid, top_n=5)
    results['cf'].append(precision_at_k(rec_cf, test_products, 5))
    results['content'].append(precision_at_k(rec_content, test_products, 5))
    results['hybrid'].append(precision_at_k(rec_hybrid, test_products, 5))

for k,v in results.items():
    print(f"{k} mean precision@5 (sample): {np.mean(v):.4f} (n={len(v)})")


cf mean precision@5 (sample): 0.0000 (n=200)
content mean precision@5 (sample): 0.0000 (n=200)
hybrid mean precision@5 (sample): 0.0000 (n=200)


In [21]:
# Cell 19 - Persist artifacts for Flask or later reuse
import pickle, os
ARTIFACTS_DIR = Path("artifacts")
ARTIFACTS_DIR.mkdir(exist_ok=True)

pickle.dump(tfidf, open(ARTIFACTS_DIR/"tfidf_vectorizer.pkl","wb"))
pickle.dump(product_df, open(ARTIFACTS_DIR/"product_df.pkl","wb"))
pickle.dump(prod_idx_to_id, open(ARTIFACTS_DIR/"prod_idx_to_id.pkl","wb"))
pickle.dump(prod_id_to_idx, open(ARTIFACTS_DIR/"prod_id_to_idx.pkl","wb"))
pickle.dump(product_product_sim, open(ARTIFACTS_DIR/"product_product_sim.pkl","wb"))
pickle.dump(item_ids, open(ARTIFACTS_DIR/"item_ids.pkl","wb"))
pickle.dump(item_item_sim_matrix, open(ARTIFACTS_DIR/"item_item_sim_matrix.pkl","wb"))
pickle.dump(user_item, open(ARTIFACTS_DIR/"user_item.pkl","wb"))
pickle.dump(rules_processed, open(ARTIFACTS_DIR/"apriori_rules_processed.pkl","wb"))

print("Saved artifacts:", list(ARTIFACTS_DIR.iterdir()))


Saved artifacts: [WindowsPath('artifacts/apriori_rules_processed.pkl'), WindowsPath('artifacts/item_ids.pkl'), WindowsPath('artifacts/item_item_sim_matrix.pkl'), WindowsPath('artifacts/product_df.pkl'), WindowsPath('artifacts/product_product_sim.pkl'), WindowsPath('artifacts/prod_idx_to_id.pkl'), WindowsPath('artifacts/prod_id_to_idx.pkl'), WindowsPath('artifacts/tfidf_vectorizer.pkl'), WindowsPath('artifacts/user_item.pkl')]


In [None]:
# Cell 20 (REPLACE existing): Robust per-product OpenRouter explanation generator
# NOTE: set OPENROUTER_API_KEY in your environment before running this cell.
import os, json, time, socket, logging, requests, hashlib
from urllib.parse import urlparse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("explain")

OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", None)
# Correct host+path per OpenRouter docs
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Choose a model you have access to
MODEL_NAME = "meta-llama/llama-3-3.3b-instruct:free"

# Few-shot examples to show the model how to vary style and content (3 different flavors)
_FEW_SHOT_EXAMPLES = [
    {
        "product": "Vintage Tea Tin",
        "ex": "Because you’ve chosen several classic kitchen pieces, this Vintage Tea Tin complements that aesthetic while preserving tea freshness. Its airtight seal and retro finish make it both practical and a decorative accent for your counter."
    },
    {
        "product": "Childrens Apron - Red Spot",
        "ex": "Perfect for family baking, this apron’s washable fabric and adjustable straps make it a safe, long-lasting choice for kids. It matches the colorful, playful kitchen accessories you’ve been buying."
    },
    {
        "product": "Ceramic Money Bank - Retro",
        "ex": "A decorative tabletop accent that doubles as a practical savings tool, it suits your taste for functional vintage objects. Its sturdy ceramic build and charming pattern make it a keepsake as well as a helper for saving."
    }
]

def _host_resolves(hostname: str) -> bool:
    try:
        socket.gethostbyname(hostname)
        return True
    except Exception as e:
        logger.warning("DNS resolution failed for %s: %s", hostname, e)
        return False

def _get_product_features_text(description: str, max_len=140) -> str:
    """
    Create a short 'features' string from the product description (or other metadata if available).
    Truncate to keep prompts compact.
    """
    s = description.strip()
    return (s[:max_len] + "...") if len(s) > max_len else s

def _fallback_explanation(product_name: str, product_features_text: str, user_history: list) -> str:
    """Deterministic fallback explanation that is still contextual & unique-ish."""
    hist_snippet = ""
    if user_history:
        hist_snippet = " Since you bought " + ", ".join(user_history[:3]) + ","
    return f"{product_name} pairs well with items in your recent purchases.{hist_snippet} it offers {product_features_text}. This combination makes it both practical and a good match for your style."

def _call_openrouter_single(product_name: str, product_features_text: str, user_history: list, max_tokens=200):
    """
    Calls OpenRouter for a single product and returns the explanation string or (None, err)
    """
    if not OPENROUTER_API_KEY:
        return None, "no_api_key"
    if not _host_resolves("openrouter.ai"):
        return None, "dns_fail"

    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }

    # Build few-shot block
    few_shot_block = "\n\n".join([f"Product: {ex['product']}\nExplanation: {ex['ex']}" for ex in _FEW_SHOT_EXAMPLES])

    prompt = (
        f"You are a skilled e-commerce assistant that writes short, persuasive, and personal explanations.\n\n"
        f"Customer recent purchases (most recent first): {user_history}\n\n"
        f"{few_shot_block}\n\n"
        f"Now write a unique, personalized 2–3 sentence explanation for WHY the following product would be useful for this customer:\n\n"
        f"Product: \"{product_name}\"\n"
        f"Product features (summary): {product_features_text}\n\n"
        "Requirements:\n"
        "- Keep it to 2–3 sentences.\n"
        "- Directly reference the customer's purchases or needs when relevant.\n"
        "- Avoid generic phrases like 'based on your past purchases'. Be concrete (mention features, use-case, benefit).\n"
        "- Output ONLY the explanation text (no JSON wrapper)."
    )

    payload = {
        "model": MODEL_NAME,
        "messages": [
            {"role": "system", "content": "You are a creative retail assistant who writes personalized recommendation explanations."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": 0.9
    }

    try:
        resp = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=30)
    except Exception as e:
        logger.warning("OpenRouter network error: %s", e)
        return None, str(e)

    if resp.status_code != 200:
        logger.warning("OpenRouter returned status %s: %s", resp.status_code, resp.text[:300])
        return None, f"status_{resp.status_code}"

    try:
        data = resp.json()
        content = data.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
    except Exception as e:
        logger.warning("OpenRouter JSON parse error: %s", e)
        content = ""

    # Paraphrase retry if content too short or clearly templated
    if not content or len(content.split()) < 6:
        paraphrase_prompt = prompt + "\n\nIf you produced a generic response, please paraphrase now into a different, more specific style (2–3 sentences)."
        payload["messages"][1]["content"] = paraphrase_prompt
        try:
            resp2 = requests.post(OPENROUTER_URL, headers=headers, json=payload, timeout=25)
            if resp2.status_code == 200:
                data2 = resp2.json()
                content2 = data2.get("choices", [{}])[0].get("message", {}).get("content", "").strip()
                if content2:
                    content = content2
        except Exception as e:
            logger.warning("OpenRouter paraphrase attempt failed: %s", e)

    return content if content else None, None

def generate_explanation_openrouter_notebook(customer_id, recommended_products, user_history, max_tokens=200):
    """
    Main function to call in notebook.
    - customer_id: not used in prompt but passed for logging/caching.
    - recommended_products: list of dicts {'Product_id','Description','score'}
    - user_history: list of description strings (recent purchases)
    Returns: list of dicts: [{'product': desc, 'explanation': text}, ...]
    """
    out = []
    # Limit history length in prompts
    history_for_prompt = user_history[:6] if isinstance(user_history, list) else []
    # Simple in-memory cache for the function call (per session)
    # Key: hash of (customer_id + recommended product descriptions)
    try:
        cache = generate_explanation_openrouter_notebook._cache
    except AttributeError:
        cache = {}
        generate_explanation_openrouter_notebook._cache = cache

    key_source = json.dumps({"cid": str(customer_id), "recs": [r["Description"] for r in recommended_products]}, sort_keys=True)
    key = hashlib.sha256(key_source.encode()).hexdigest()
    if key in cache:
        logger.info("Returning cached explanations for key %s", key[:8])
        return cache[key]

    for r in recommended_products:
        desc = r.get("Description", "")
        feat = _get_product_features_text(desc)
        text, err = _call_openrouter_single(desc, feat, history_for_prompt, max_tokens=max_tokens)
        if text:
            # Post-process: ensure 2-3 sentences, truncate if too long
            sentences = [s.strip() for s in text.split('.') if s.strip()]
            if len(sentences) > 3:
                text = '. '.join(sentences[:3]).strip() + '.'
            out.append({"product": desc, "explanation": text})
        else:
            # fallback deterministic explanation
            fb = _fallback_explanation(desc, feat, history_for_prompt)
            out.append({"product": desc, "explanation": fb})
        # small delay to be polite and avoid rate-limits
        time.sleep(1.25)

    # final uniqueness pass: if exact duplicates exist, tweak the duplicate ones
    seen = {}
    for i, item in enumerate(out):
        txt = item["explanation"].strip()
        if txt in seen:
            # add a short product-specific clause to differentiate
            out[i]["explanation"] = txt + " Notably, this item stands out with its " + (_get_product_features_text(item["product"], max_len=60)) + "."
        else:
            seen[txt] = True

    # cache results
    cache[key] = out
    return out


In [None]:
# Cell 21 - High-level API wrapper for getting recommendations (used by Flask later)
def recommend_for_customer(customer_id, n=10, weights={'cf': 0.5, 'content': 0.4, 'apriori': 0.1}, include_explanations=False):
    """
    Returns recommendations and optional LLM-generated explanations.
    """
    if customer_id not in user_item.index:
        # fallback: top popular products
        top_pop = product_df.sort_values('total_revenue', ascending=False).head(n)
        results = [
            {'Product_id': str(r['Product_id']), 'Description': r['Description'], 'score': float(r['total_revenue'])}
            for _, r in top_pop.iterrows()
        ]
        explanation = None

    else:
        # Hybrid recommendation (CF + content + apriori mix)
        results = recommend_hybrid(customer_id, top_n=n)
        explanation = None

        if include_explanations:
            # Prepare the recent purchase history for context (limit to latest 10)
            if 'InvoiceDate' in sales.columns and 'Description' in sales.columns and 'CustomerID' in sales.columns:
                user_history = (
                    sales[sales['CustomerID'] == customer_id]
                    .sort_values('InvoiceDate', ascending=False)
                    .head(10)['Description']
                    .dropna()
                    .unique()
                    .tolist()
                )
            else:
                # fallback to previously defined helper
                user_history = sales[sales['customer_id']==customer_id]['Description'].unique().tolist()

            # Generate personalized explanations using improved function from Cell 20
            explanation = generate_explanation_openrouter_notebook(customer_id, results, user_history)

    # Build unified response object
    return {
        'customer_id': customer_id,
        'recommendations': results,
        'explanation': explanation
    }


In [25]:
# Cell 22 - Example usage: get 5 recommendations and optionally explanation (if API key available)
sample_customer = user_item.index[0] if len(user_item.index)>0 else None
if sample_customer is not None:
    out = recommend_for_customer(sample_customer, n=5, include_explanations=False)
    import json
    print(json.dumps(out, indent=2))
else:
    print("No customers in user_item; ensure user_item was built correctly.")


{
  "customer_id": 12346.0,
  "recommendations": [
    {
      "Product_id": "23165",
      "Description": "LARGE CERAMIC TOP STORAGE JAR",
      "score": 0.8926262007317242
    },
    {
      "Product_id": "23167",
      "Description": "SMALL CERAMIC TOP STORAGE JAR",
      "score": 0.6254960044126419
    },
    {
      "Product_id": "22652",
      "Description": "TRAVEL SEWING KIT",
      "score": 0.41539605833640836
    },
    {
      "Product_id": "21984",
      "Description": "PACK OF 12 PINK PAISLEY TISSUES",
      "score": 0.3978062317433716
    },
    {
      "Product_id": "21980",
      "Description": "PACK OF 12 RED RETROSPOT TISSUES",
      "score": 0.38056172330465543
    }
  ],
  "explanation": null
}


In [26]:
# Cell 23 - (Optional) Write a simple Flask app + index.html (writes files to disk)
# This writes app.py and templates/index.html so you can run `python app.py` to serve the recommender.
from pathlib import Path
app_py = r"""
from flask import Flask, render_template, request, jsonify
import pickle
from pathlib import Path

ARTIFACTS_DIR = Path("artifacts")
product_df = pickle.load(open(ARTIFACTS_DIR/"product_df.pkl","rb"))
user_item = pickle.load(open(ARTIFACTS_DIR/"user_item.pkl","rb"))
item_ids = pickle.load(open(ARTIFACTS_DIR/"item_ids.pkl","rb"))
item_item_sim_matrix = pickle.load(open(ARTIFACTS_DIR/"item_item_sim_matrix.pkl","rb"))

from flask import Flask, render_template, request, jsonify
app = Flask(__name__)

@app.route('/')
def index():
    return render_template('index.html')

@app.route('/api/recommend', methods=['GET'])
def api_recommend():
    cid = request.args.get('customer_id', type=float)
    n = request.args.get('n', default=10, type=int)
    if cid is None:
        return jsonify({'error':'customer_id required'}), 400
    if cid not in user_item.index:
        top_pop = product_df.sort_values('total_revenue', ascending=False).head(n)
        recs = [{'Product_id': r['Product_id'], 'Description': r['Description'], 'score': float(r['total_revenue'])} for _, r in top_pop.iterrows()]
        return jsonify({'customer_id': cid, 'recommendations': recs})
    # simple item-based scoring
    user_vector = user_item.loc[cid].values
    scores = user_vector.dot(item_item_sim_matrix)
    purchased_mask = user_vector > 0
    scores[purchased_mask] = -1e9
    top_idxs = scores.argsort()[::-1][:n]
    recs = []
    for idx in top_idxs:
        pid = item_ids[idx]
        desc = product_df.loc[product_df['Product_id']==pid, 'Description'].values[0]
        recs.append({'Product_id': pid, 'Description': desc, 'score': float(scores[idx])})
    return jsonify({'customer_id': cid, 'recommendations': recs})

if __name__ == '__main__':
    app.run(debug=True, port=5000)
"""
templates_html = r"""
<!doctype html>
<html>
  <head>
    <meta charset="utf-8">
    <title>Product Recommender</title>
    <style>
      body { font-family: Arial; background:#f4f6f8; color:#333; }
      .container { width:800px; margin:40px auto; background:white; padding:20px; border-radius:8px; box-shadow:0 4px 18px rgba(0,0,0,0.08); }
      input, button { padding:10px; font-size:16px; }
      .rec { padding:10px; border-bottom:1px solid #eee; }
    </style>
  </head>
  <body>
    <div class="container">
      <h2>Product Recommender</h2>
      <input id="customer_id" placeholder="customer id (e.g. 17850)" />
      <button onclick="getRec()">Get Recommendations</button>
      <div id="results"></div>
    </div>
    <script>
      async function getRec(){
        const cid = document.getElementById('customer_id').value;
        if(!cid){ alert('enter customer id'); return; }
        const res = await fetch(`/api/recommend?customer_id=${cid}&n=10`);
        const data = await res.json();
        if(data.error){ alert(data.error); return; }
        const el = document.getElementById('results');
        let html = '<h3>Recommendations</h3>';
        data.recommendations.forEach(r=>{
          html += `<div class="rec"><b>${r.Description}</b><div>Product ID: ${r.Product_id} | Score: ${r.score.toFixed(3)}</div></div>`;
        });
        el.innerHTML = html;
      }
    </script>
  </body>
</html>
"""
Path("app.py").write_text(app_py)
Path("templates").mkdir(exist_ok=True)
Path("templates/index.html").write_text(templates_html)
print("Wrote app.py and templates/index.html. Run `python app.py` to start.")


Wrote app.py and templates/index.html. Run `python app.py` to start.
