In [1]:
import numpy as np
import pandas as pd

In [2]:
data= pd.read_csv('/content/drive/MyDrive/Retail/sample_merged.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,22678753,2392060,2604,7,0,51072,prior,35,6,11,3.0,Lemon Zest Sorbetto,37,1,ice cream ice,frozen
1,13346427,1408550,34243,1,1,25449,prior,14,6,11,1.0,Organic Baby Broccoli,83,4,fresh vegetables,produce
2,22925325,2418006,21616,35,1,138390,prior,12,0,15,30.0,Organic Baby Arugula,123,4,packaged vegetables fruits,produce
3,17216127,1816124,43209,3,0,42441,prior,1,6,10,,Large Eggs,86,16,eggs,dairy eggs
4,30363018,3202917,43368,2,0,81257,prior,14,5,12,3.0,Wild Salmon Florentine Cat Food,41,8,cat food care,pets


In [3]:
# Filter for add_to_cart_order < 5
df = data[data['add_to_cart_order'] < 5]

In [5]:
# Encode users and products to integer indices for matrix representation
user_ids = df['user_id'].unique()
product_ids = df['product_id'].unique()
user_map = {id: idx for idx, id in enumerate(user_ids)}
product_map = {id: idx for idx, id in enumerate(product_ids)}
df['user_idx'] = df['user_id'].map(user_map)
df['product_idx'] = df['product_id'].map(product_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['user_idx'] = df['user_id'].map(user_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['product_idx'] = df['product_id'].map(product_map)


In [6]:
df['user_idx'].head(),df['product_idx'].head()

(1    0
 3    1
 4    2
 5    3
 8    4
 Name: user_idx, dtype: int64,
 1    0
 3    1
 4    2
 5    3
 8    4
 Name: product_idx, dtype: int64)

In [7]:
# Define model parameters
n_users, n_items = len(user_ids), len(product_ids)
n_factors = 10  # Number of latent factors
lr = 0.01  # Learning rate
n_epochs = 50  # Number of training epochs
reg = 0.01  # Regularization term to prevent overfitting

# Initialize user and item matrices with random values
user_matrix = np.random.normal(scale=1./n_factors, size=(n_users, n_factors))
item_matrix = np.random.normal(scale=1./n_factors, size=(n_items, n_factors))

# Training with Stochastic Gradient Descent (SGD)
for epoch in range(n_epochs):
    for _, row in df.iterrows():
        u = row['user_idx']
        i = row['product_idx']
        rating = row['add_to_cart_order']

        # Predicted rating
        pred = np.dot(user_matrix[u], item_matrix[i])

        # Error
        error = rating - pred

        # Update user and item latent factors
        user_matrix[u] += lr * (error * item_matrix[i] - reg * user_matrix[u])
        item_matrix[i] += lr * (error * user_matrix[u] - reg * item_matrix[i])

    # Calculate loss
    loss = 0
    for _, row in df.iterrows():
        u = row['user_idx']
        i = row['product_idx']
        rating = row['add_to_cart_order']
        pred = np.dot(user_matrix[u], item_matrix[i])
        loss += (rating - pred) ** 2 + reg * (np.linalg.norm(user_matrix[u]) + np.linalg.norm(item_matrix[i]))

    print(f"Epoch {epoch+1}/{n_epochs}, Loss: {loss}")

Epoch 1/50, Loss: 2517.5685995844115
Epoch 2/50, Loss: 2507.2105754185473
Epoch 3/50, Loss: 2496.7754856187044
Epoch 4/50, Loss: 2486.1967239992896
Epoch 5/50, Loss: 2475.407369513135
Epoch 6/50, Loss: 2464.339662100563
Epoch 7/50, Loss: 2452.9245835024235
Epoch 8/50, Loss: 2441.0915673281256
Epoch 9/50, Loss: 2428.768374371203
Epoch 10/50, Loss: 2415.8811811980972
Epoch 11/50, Loss: 2402.354941040727
Epoch 12/50, Loss: 2388.1140834932526
Epoch 13/50, Loss: 2373.0836194687677
Epoch 14/50, Loss: 2357.1907047432314
Epoch 15/50, Loss: 2340.366682434293
Epoch 16/50, Loss: 2322.5495653117305
Epoch 17/50, Loss: 2303.6868291694936
Epoch 18/50, Loss: 2283.7382720030937
Epoch 19/50, Loss: 2262.678566073459
Epoch 20/50, Loss: 2240.499022832662
Epoch 21/50, Loss: 2217.2080508210743
Epoch 22/50, Loss: 2192.8298653113497
Epoch 23/50, Loss: 2167.401240254111
Epoch 24/50, Loss: 2140.9664684070235
Epoch 25/50, Loss: 2113.5711407988374
Epoch 26/50, Loss: 2085.2557401705158
Epoch 27/50, Loss: 2056.05021

In [8]:
# Function to predict for a given user and product
def predict(user_id, product_id):
    u = user_map.get(user_id)
    i = product_map.get(product_id)
    if u is None or i is None:
        return "User or product not found."
    return np.dot(user_matrix[u], item_matrix[i])

# Example prediction for a specific user and product
user_id = 41658
product_id = 9550
prediction = predict(user_id, product_id)
print(f"Prediction (user {user_id} for product {product_id}): {prediction}")

# Generate recommendations
def recommend(user_id, n_recommendations=5):
    u = user_map.get(user_id)
    if u is None:
        return "User not found."
    scores = np.dot(user_matrix[u], item_matrix.T)
    product_indices = np.argsort(-scores)[:n_recommendations]
    recommended_products = [product_ids[i] for i in product_indices]
    return recommended_products

# Example recommendation for a specific user
user_id = 41658
recommendations = recommend(user_id, n_recommendations=3)
print(f"Top recommendations for user {user_id}: {recommendations}")

Prediction (user 41658 for product 9550): User or product not found.
Top recommendations for user 41658: User not found.


In [10]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# SGD Training Function for Matrix Factorization
def matrix_factorization_sgd(df, user_matrix, item_matrix, n_epochs=30, lr=0.01, reg=0.02):

    losses = []

    for epoch in range(n_epochs):
        total_loss = 0
        for _, row in df.iterrows():
            u_idx = row['user_idx']
            i_idx = row['product_idx']
            actual = row['add_to_cart_order']

            # Predicted score
            pred = np.dot(user_matrix[u_idx], item_matrix[i_idx].T)
            error = actual - pred

            # Update latent factors
            user_matrix[u_idx] += lr * (error * item_matrix[i_idx] - reg * user_matrix[u_idx])
            item_matrix[i_idx] += lr * (error * user_matrix[u_idx] - reg * item_matrix[i_idx])

            total_loss += error ** 2

        mse = total_loss / len(df)
        losses.append(mse)
        print(f"Epoch {epoch+1}/{n_epochs}, Loss: {mse:.4f}")

    return user_matrix, item_matrix, losses

# Prediction Function
def predict(df, user_matrix, item_matrix):
    """Predict user-product interactions using trained matrices."""
    return [np.dot(user_matrix[row['user_idx']], item_matrix[row['product_idx']]) for _, row in df.iterrows()]

# Load and preprocess data
df = pd.read_csv('/content/drive/MyDrive/Retail/sample_merged.csv')

# Map users and products to indices
user_map = {id_: idx for idx, id_ in enumerate(df['user_id'].unique())}
product_map = {id_: idx for idx, id_ in enumerate(df['product_id'].unique())}

df['user_idx'] = df['user_id'].map(user_map)
df['product_idx'] = df['product_id'].map(product_map)

# Hyperparameter Tuning
latent_factors = [10, 20, 30, 40]
learning_rates = [0.01, 0.005]
regularization_params = [0.01, 0.02, 0.05]

best_loss = float('inf')
best_params = {}

for n_factors in latent_factors:
    for lr in learning_rates:
        for reg in regularization_params:
            user_matrix = np.random.normal(scale=1./n_factors, size=(len(user_map), n_factors))
            item_matrix = np.random.normal(scale=1./n_factors, size=(len(product_map), n_factors))

            _, _, losses = matrix_factorization_sgd(df, user_matrix, item_matrix, n_epochs=20, lr=lr, reg=reg)
            final_loss = losses[-1]

            print(f"Tuning params: Factors={n_factors}, LR={lr}, Reg={reg}, Final MSE={final_loss:.4f}")
            if final_loss < best_loss:
                best_loss = final_loss
                best_params = {'n_factors': n_factors, 'lr': lr, 'reg': reg}

print("Optimal Parameters Found:", best_params)

Epoch 1/20, Loss: 127.9556
Epoch 2/20, Loss: 127.4126
Epoch 3/20, Loss: 126.7051
Epoch 4/20, Loss: 125.5683
Epoch 5/20, Loss: 123.4545
Epoch 6/20, Loss: 119.3356
Epoch 7/20, Loss: 112.6704
Epoch 8/20, Loss: 105.1877
Epoch 9/20, Loss: 96.2691
Epoch 10/20, Loss: 86.2895
Epoch 11/20, Loss: 76.2942
Epoch 12/20, Loss: 67.0055
Epoch 13/20, Loss: 58.5854
Epoch 14/20, Loss: 51.0903
Epoch 15/20, Loss: 44.5582
Epoch 16/20, Loss: 38.9188
Epoch 17/20, Loss: 34.0589
Epoch 18/20, Loss: 29.8728
Epoch 19/20, Loss: 26.2685
Epoch 20/20, Loss: 23.1628
Tuning params: Factors=10, LR=0.01, Reg=0.01, Final MSE=23.1628
Epoch 1/20, Loss: 127.9285
Epoch 2/20, Loss: 127.3740
Epoch 3/20, Loss: 126.6169
Epoch 4/20, Loss: 125.3244
Epoch 5/20, Loss: 122.7924
Epoch 6/20, Loss: 118.1259
Epoch 7/20, Loss: 112.2344
Epoch 8/20, Loss: 104.9623
Epoch 9/20, Loss: 95.3905
Epoch 10/20, Loss: 84.9546
Epoch 11/20, Loss: 75.3264
Epoch 12/20, Loss: 66.8538
Epoch 13/20, Loss: 59.1155
Epoch 14/20, Loss: 51.9351
Epoch 15/20, Loss: 4