In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [11]:
merged_data = pd.read_csv("/content/drive/MyDrive/Retail/sample_merged.csv")

In [12]:
merged_data.shape

(1000, 16)

In [13]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              1000 non-null   int64  
 1   order_id                1000 non-null   int64  
 2   product_id              1000 non-null   int64  
 3   add_to_cart_order       1000 non-null   int64  
 4   reordered               1000 non-null   int64  
 5   user_id                 1000 non-null   int64  
 6   eval_set                1000 non-null   object 
 7   order_number            1000 non-null   int64  
 8   order_dow               1000 non-null   int64  
 9   order_hour_of_day       1000 non-null   int64  
 10  days_since_prior_order  932 non-null    float64
 11  product_name            1000 non-null   object 
 12  aisle_id                1000 non-null   int64  
 13  department_id           1000 non-null   int64  
 14  aisle                   1000 non-null   o

In [14]:
merged_data.head()

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department
0,22678753,2392060,2604,7,0,51072,prior,35,6,11,3.0,Lemon Zest Sorbetto,37,1,ice cream ice,frozen
1,13346427,1408550,34243,1,1,25449,prior,14,6,11,1.0,Organic Baby Broccoli,83,4,fresh vegetables,produce
2,22925325,2418006,21616,35,1,138390,prior,12,0,15,30.0,Organic Baby Arugula,123,4,packaged vegetables fruits,produce
3,17216127,1816124,43209,3,0,42441,prior,1,6,10,,Large Eggs,86,16,eggs,dairy eggs
4,30363018,3202917,43368,2,0,81257,prior,14,5,12,3.0,Wild Salmon Florentine Cat Food,41,8,cat food care,pets


In [15]:
# Filter for reordered items
user_item_data = merged_data[merged_data['reordered'] == 1]

# Aggregate to handle duplicates
user_item_data = user_item_data.groupby(['user_id', 'product_id']).agg({'reordered': 'sum'}).reset_index()

# Create user-item interaction matrix
user_item_matrix = user_item_data.pivot(index='user_id', columns='product_id', values='reordered').fillna(0)
user_item_matrix = user_item_matrix.values  # Convert to NumPy array
print("User-Item Matrix Shape:", user_item_matrix.shape)

User-Item Matrix Shape: (579, 458)


In [16]:
def optimized_matrix_factorization(R, K, steps=1000, alpha=0.002, beta=0.02, sample_size=1000):
    num_users, num_items = R.shape
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K)).T

    for step in range(steps):
        # Sample user-item pairs
        user_indices = np.random.randint(0, num_users, sample_size)
        item_indices = np.random.randint(0, num_items, sample_size)

        for i, j in zip(user_indices, item_indices):
            if R[i][j] > 0:
                eij = R[i][j] - np.dot(P[i, :], Q[:, j])
                P[i, :] += alpha * (eij * Q[:, j] - beta * P[i, :])
                Q[:, j] += alpha * (eij * P[i, :] - beta * Q[:, j])

        if step % 100 == 0:
            R_hat = np.dot(P, Q)
            non_zero_indices = R > 0
            error = np.mean((R[non_zero_indices] - R_hat[non_zero_indices]) ** 2)
            print(f"Iteration: {step}, Error: {error}")

    return P, Q.T

In [17]:
# Set parameters
K = 2  # Number of latent features
steps = 1000
alpha = 0.002
beta = 0.02
sample_size = 1000  # Number of user-item pairs to sample in each iteration

# Train the model
P, Q = optimized_matrix_factorization(user_item_matrix, K, steps, alpha, beta, sample_size)

# Reconstructed user-item matrix
R_hat = np.dot(P, Q.T)

Iteration: 0, Error: 1.139051597656341
Iteration: 100, Error: 1.1374266464950806
Iteration: 200, Error: 1.1350572038586346
Iteration: 300, Error: 1.1333955730892344
Iteration: 400, Error: 1.131385144926978
Iteration: 500, Error: 1.1293097813615378
Iteration: 600, Error: 1.127471836462331
Iteration: 700, Error: 1.1255120392421782
Iteration: 800, Error: 1.1238950006238146
Iteration: 900, Error: 1.1218176282251047


In [18]:
# Calculate Mean Squared Error
mse = mean_squared_error(user_item_matrix[user_item_matrix.nonzero()], R_hat[user_item_matrix.nonzero()])
print("Mean Squared Error:", mse)

Mean Squared Error: 1.1200334306657034


In [19]:
import numpy as np
from sklearn.model_selection import ParameterGrid

def empirical_tuning(user_item_matrix, param_grid):
    best_params = None
    best_error = float('inf')

    for params in ParameterGrid(param_grid):
        K = params['K']
        alpha = params['alpha']
        beta = params['beta']
        steps = params['steps']

        # Train the model with the current hyperparameters
        P, Q = optimized_matrix_factorization(user_item_matrix, K, steps, alpha, beta)

        # Reconstruct the user-item matrix
        R_hat = np.dot(P, Q.T)

        # Calculate the error
        non_zero_indices = user_item_matrix > 0
        error = np.sum((user_item_matrix[non_zero_indices] - R_hat[non_zero_indices]) ** 2)

        # Check if this is the best error
        if error < best_error:
            best_error = error
            best_params = params

    return best_params, best_error

# Define the parameter grid
param_grid = {
    'K': [2, 5, 10],
    'alpha': [0.001, 0.002, 0.005],
    'beta': [0.01, 0.02, 0.05],
    'steps': [500, 1000]
}

# Perform empirical tuning
best_params, best_error = empirical_tuning(user_item_matrix, param_grid)

print(f"Best Parameters: {best_params}")
print(f"Best Error: {best_error}")

Iteration: 0, Error: 1.1041115703959405
Iteration: 100, Error: 1.1031248829451423
Iteration: 200, Error: 1.1020164344153953
Iteration: 300, Error: 1.101208050733718
Iteration: 400, Error: 1.100337645696658
Iteration: 0, Error: 1.0829513805194477
Iteration: 100, Error: 1.081874668625943
Iteration: 200, Error: 1.0810779446129295
Iteration: 300, Error: 1.0802112370658297
Iteration: 400, Error: 1.0795017939314584
Iteration: 500, Error: 1.0786536619455895
Iteration: 600, Error: 1.0778534378725537
Iteration: 700, Error: 1.07722796409647
Iteration: 800, Error: 1.0762899411947269
Iteration: 900, Error: 1.0754336885562457
Iteration: 0, Error: 1.0791203315705449
Iteration: 100, Error: 1.0784219690202765
Iteration: 200, Error: 1.0776713624510406
Iteration: 300, Error: 1.0766972935487098
Iteration: 400, Error: 1.0760645092524777
Iteration: 0, Error: 1.142510520209722
Iteration: 100, Error: 1.1416066703043672
Iteration: 200, Error: 1.140695242034401
Iteration: 300, Error: 1.1395745105793975
Iterati

In [20]:
# Hyperparameter tuning for K, alpha, and beta
results = []

for K in [1, 2, 5]:
    for alpha in [0.001, 0.002, 0.005]:
        for beta in [0.01, 0.02, 0.05]:
            P, Q = optimized_matrix_factorization(user_item_matrix, K, steps, alpha, beta)
            R_hat = np.dot(P, Q.T)
            mse = mean_squared_error(user_item_matrix[user_item_matrix.nonzero()], R_hat[user_item_matrix.nonzero()])
            results.append({'K': K, 'alpha': alpha, 'beta': beta, 'MSE': mse})

# Print results
results_df = pd.DataFrame(results)
print(results_df)

Iteration: 0, Error: 1.9473545770735812
Iteration: 100, Error: 1.938088687561665
Iteration: 200, Error: 1.9336570675057099
Iteration: 300, Error: 1.9280527856491347
Iteration: 400, Error: 1.9237917509560296
Iteration: 500, Error: 1.9154563269005882
Iteration: 600, Error: 1.9079711005400035
Iteration: 700, Error: 1.902803794023156
Iteration: 800, Error: 1.8946939747249238
Iteration: 900, Error: 1.8899766014479982
Iteration: 0, Error: 2.012870676540365
Iteration: 100, Error: 1.9979210251795243
Iteration: 200, Error: 1.9840776672103575
Iteration: 300, Error: 1.9779049361272945
Iteration: 400, Error: 1.9720679481770245
Iteration: 500, Error: 1.9626577204100755
Iteration: 600, Error: 1.9557047819381945
Iteration: 700, Error: 1.9457439984354787
Iteration: 800, Error: 1.9412671597105906
Iteration: 900, Error: 1.934146623223897
Iteration: 0, Error: 1.988975774652747
Iteration: 100, Error: 1.9825498854095673
Iteration: 200, Error: 1.9779787088070873
Iteration: 300, Error: 1.971032819279738
Iter

In [21]:
# Train the final model with the best parameters
P, Q = optimized_matrix_factorization(user_item_matrix,
                                      int(best_params['K']),
                                      int(best_params['steps']),
                                      best_params['alpha'],
                                      best_params['beta'])

# Reconstruct the user-item matrix
R_hat = np.dot(P, Q.T)

# Calculate final MSE
mse = mean_squared_error(user_item_matrix[user_item_matrix.nonzero()], R_hat[user_item_matrix.nonzero()])
print("Final Mean Squared Error:", mse)

Iteration: 0, Error: 1.0010672260060332
Iteration: 100, Error: 0.9995365969666714
Iteration: 200, Error: 0.9981585971307025
Iteration: 300, Error: 0.9967435325148463
Iteration: 400, Error: 0.9953091717168268
Iteration: 500, Error: 0.9938635486962866
Iteration: 600, Error: 0.9923546232905696
Iteration: 700, Error: 0.9908795815211273
Iteration: 800, Error: 0.989097594675815
Iteration: 900, Error: 0.9876880669273901
Final Mean Squared Error: 0.986064773905864


In [22]:
from sklearn.metrics import mean_absolute_error, precision_score, recall_score
from collections import defaultdict

In [23]:
from sklearn.metrics import mean_absolute_error

def calculate_mse(actual, predicted):
    return mean_squared_error(actual, predicted)

def calculate_rmse(actual, predicted):
    return np.sqrt(mean_squared_error(actual, predicted))

def calculate_mae(actual, predicted):
    return mean_absolute_error(actual, predicted)

def precision_at_k(actual, predicted, k):
    actual_set = set(np.nonzero(actual)[0])
    predicted_indices = np.argsort(predicted)[-k:][::-1]
    hits = len(set(predicted_indices).intersection(actual_set))
    return hits / k if k > 0 else 0

def recall_at_k(actual, predicted, k):
    actual_set = set(np.nonzero(actual)[0])
    predicted_indices = np.argsort(predicted)[-k:][::-1]
    hits = len(set(predicted_indices).intersection(actual_set))
    return hits / len(actual_set) if len(actual_set) > 0 else 0

def hit_rate(actual, predicted, k):
    actual_set = set(np.nonzero(actual)[0])
    predicted_indices = np.argsort(predicted)[-k:][::-1]
    return 1 if len(set(predicted_indices).intersection(actual_set)) > 0 else 0

def coverage(user_item_matrix, predicted, k):
    recommended_items = set(np.argsort(predicted)[-k:][::-1])
    return len(recommended_items) / user_item_matrix.shape[1]

In [24]:
# Assuming R_hat is the predicted ratings and user_item_matrix is the actual ratings
actual = user_item_matrix
predicted = R_hat

# Flatten the arrays for evaluation
actual_flat = actual.flatten()
predicted_flat = predicted.flatten()

# Calculate metrics
mse = calculate_mse(actual_flat[actual_flat > 0], predicted_flat[actual_flat > 0])
rmse = calculate_rmse(actual_flat[actual_flat > 0], predicted_flat[actual_flat > 0])
mae = calculate_mae(actual_flat[actual_flat > 0], predicted_flat[actual_flat > 0])

k = 3  # You can change this value to evaluate at different K
precision = precision_at_k(actual_flat, predicted_flat, k)
recall = recall_at_k(actual_flat, predicted_flat, k)
hit = hit_rate(actual_flat, predicted_flat, k)
cov = coverage(user_item_matrix, predicted_flat, k)

# Print metrics
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Mean Absolute Error (MAE):", mae)
print("Precision at K:", precision)
print("Recall at K:", recall)
print("Hit Rate:", hit)
print("Coverage:", cov)

Mean Squared Error (MSE): 0.986064773905864
Root Mean Squared Error (RMSE): 0.9930079425190234
Mean Absolute Error (MAE): 0.9887123534736547
Precision at K: 0.0
Recall at K: 0.0
Hit Rate: 0
Coverage: 0.006550218340611353
