In [1]:
%pip install pandas numpy tensorflow scikit-learn

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting tensorflow
  Using cached tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_excel("Superstore.xlsx")
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2013-152156,2013-11-09,2013-11-12,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2013-152156,2013-11-09,2013-11-12,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2013-138688,2013-06-13,2013-06-17,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2012-108966,2012-10-11,2012-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2012-108966,2012-10-11,2012-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [3]:
df.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [4]:
interactions_cols =  ["Product ID", "Customer ID"]
interactions = df[interactions_cols]
interactions.head()

Unnamed: 0,Product ID,Customer ID
0,FUR-BO-10001798,CG-12520
1,FUR-CH-10000454,CG-12520
2,OFF-LA-10000240,DV-13045
3,FUR-TA-10000577,SO-20335
4,OFF-ST-10000760,SO-20335


In [5]:
products_cols = ["Product ID", "Product Name"]
products = df[products_cols]
products.head()

Unnamed: 0,Product ID,Product Name
0,FUR-BO-10001798,Bush Somerset Collection Bookcase
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,..."
2,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...
3,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table
4,OFF-ST-10000760,Eldon Fold 'N Roll Cart System


In [6]:
# useless_cols = ["Row ID", "Order ID", "Customer Name", "Ship Date", "Ship Mode", "Region", "Product Name", "Profit", "Quantity", "Sales"]
# df = df.drop(columns=useless_cols)
# df.head()

In [7]:
df.columns = [col.strip().replace(" ", "_") for col in df.columns]

In [8]:
from sklearn.preprocessing import LabelEncoder

user_enc = LabelEncoder()
item_enc = LabelEncoder()

In [9]:
df['user_id'] = user_enc.fit_transform(df['Customer_ID'])
df['item_id'] = item_enc.fit_transform(df['Product_ID'])

In [10]:
num_users = df['user_id'].nunique()
num_items = df['item_id'].nunique()

In [11]:
item_features_df = df[['item_id', 'Category', 'Sub-Category', 'Discount']].drop_duplicates('item_id')
item_features_df = pd.get_dummies(item_features_df.set_index('item_id'), columns=['Category', 'Sub-Category'])
item_features = item_features_df.sort_index().values.astype('float32')  # shape: (num_items, feature_dim)

In [12]:
interactions = df[['user_id', 'item_id']].drop_duplicates()

In [13]:
user_item_set = interactions.groupby('user_id')['item_id'].apply(set).to_dict()

In [14]:
product_info = df[['item_id', 'Product_ID', 'Product_Name']].drop_duplicates('item_id')
product_id_to_name = dict(zip(product_info['item_id'], product_info['Product_Name']))

In [15]:
def sample_data(interactions, num_items, num_negatives=3):
    user_ids, item_ids, labels = [], [], []

    for user, item in interactions.to_numpy():
        user_ids.append(user)
        item_ids.append(item)
        labels.append(1)

        for _ in range(num_negatives):
            neg = np.random.randint(0, num_items)
            while neg in user_item_set[user]:
                neg = np.random.randint(0, num_items)
            user_ids.append(user)
            item_ids.append(neg)
            labels.append(0)
    
    return np.array(user_ids), np.array(item_ids), np.array(labels)

In [16]:
user_ids, item_ids, labels = sample_data(interactions, num_items)

In [17]:
from tensorflow.keras import layers, Model
import tensorflow as tf

embedding_dim = 64
item_content_tensor = tf.convert_to_tensor(item_features)  # constant

class HybridRecommender(Model):
    def __init__(self, num_users, item_feature_dim, embedding_dim):
        super().__init__()
        self.user_embedding = layers.Embedding(num_users, embedding_dim)
        self.item_content_encoder = tf.keras.Sequential([
            layers.Dense(128, activation='relu'),
            layers.Dense(embedding_dim)
        ])
        self.item_features = tf.constant(item_content_tensor)

    def call(self, inputs):
        user_input, item_input = inputs
        user_emb = self.user_embedding(user_input)
        item_feat = tf.gather(self.item_features, item_input)
        item_emb = self.item_content_encoder(item_feat)
        dot_product = tf.reduce_sum(user_emb * item_emb, axis=1)
        return dot_product


In [18]:
from sklearn.model_selection import train_test_split

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(
    np.stack([user_ids, item_ids], axis=1), labels, test_size=0.1, random_state=42
)

# Prepare tf.data datasets
train_ds = tf.data.Dataset.from_tensor_slices(((X_train[:,0], X_train[:,1]), y_train)).batch(256).shuffle(1024)
val_ds = tf.data.Dataset.from_tensor_slices(((X_val[:,0], X_val[:,1]), y_val)).batch(256)

# Instantiate and compile model
model = HybridRecommender(num_users, item_features.shape[1], embedding_dim)
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])

# Train
model.fit(train_ds, validation_data=val_ds, epochs=5)


Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.0098 - loss: 0.6575 - val_accuracy: 0.0000e+00 - val_loss: 0.5745
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0051 - loss: 0.5637 - val_accuracy: 0.0000e+00 - val_loss: 0.5723
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0082 - loss: 0.5574 - val_accuracy: 0.0000e+00 - val_loss: 0.5743
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 0.5494 - val_accuracy: 0.0000e+00 - val_loss: 0.5825
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0031 - loss: 0.5334 - val_accuracy: 0.0000e+00 - val_loss: 0.5937


<keras.src.callbacks.history.History at 0x22f67c62e70>

In [20]:
model.save('hybrid_recommender_model.h5')



In [21]:
def get_user_product_status_df(user_str, top_k=5):
    try:
        user_id = user_enc.transform([user_str])[0]
    except ValueError:
        raise ValueError(f"User '{user_str}' not found in the dataset.")
    
    # --- Seen products ---
    seen_item_ids = list(user_item_set.get(user_id, []))
    seen_product_ids = item_enc.inverse_transform(seen_item_ids)
    seen_names = [product_id_to_name.get(iid, 'Unknown') for iid in seen_item_ids]

    seen_df = pd.DataFrame({
        'Product ID': seen_product_ids,
        'Product Name': seen_names,
        'Status': ['purchased previously'] * len(seen_item_ids)
    })

    # --- Predicted products (excluding seen) ---
    all_items = np.arange(num_items)
    candidate_items = np.setdiff1d(all_items, seen_item_ids)
    user_tensor = np.full_like(candidate_items, fill_value=user_id)

    scores = model.predict([user_tensor, candidate_items], batch_size=512)
    top_indices = np.argsort(scores)[-top_k:][::-1]
    top_item_ids = candidate_items[top_indices]
    predicted_product_ids = item_enc.inverse_transform(top_item_ids)
    predicted_names = [product_id_to_name.get(iid, 'Unknown') for iid in top_item_ids]

    predicted_df = pd.DataFrame({
        'Product ID': predicted_product_ids,
        'Product Name': predicted_names,
        'Status': ['predicted'] * top_k
    })

    # --- Combine both ---
    return pd.concat([seen_df, predicted_df], ignore_index=True)


In [22]:
user_product_df = get_user_product_status_df('CG-12520', top_k=5)
user_product_df

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step


Unnamed: 0,Product ID,Product Name,Status
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,purchased previously
1,OFF-PA-10003001,Xerox 1986,purchased previously
2,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",purchased previously
3,FUR-FU-10004952,C-Line Cubicle Keepers Polyproplyene Holder w/...,purchased previously
4,OFF-ST-10000615,"SimpliFile Personal File, Black Granite, 15w x...",purchased previously
5,FUR-FU-10004712,Westinghouse Mesh Shade Clip-On Gooseneck Lamp...,predicted
6,FUR-FU-10004748,"Howard Miller 16"" Diameter Gallery Wall Clock",predicted
7,FUR-FU-10004845,"Deflect-o EconoMat Nonstudded, No Bevel Mat",predicted
8,FUR-FU-10004864,"Howard Miller 14-1/2"" Diameter Chrome Round Wa...",predicted
9,FUR-FU-10004960,"Seth Thomas 12"" Clock w/ Goldtone Case",predicted
