In [29]:
# Import the required modules
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import numpy as np





Prepare the Data

In [30]:
# Read the shopping_behavior_updated.csv file into a pandas DataFrame
consumer_data_df = pd.read_csv('shopping_behavior_updated.csv')

# Review the first rows of the DataFrame
consumer_data_df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [31]:
# Summarize each column's dtype and non-null count to check for missing values.
consumer_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [32]:
# Drop rows with null values; dropna() returns a new frame, so consumer_data_df is not modified here.
df_clean = consumer_data_df.dropna().copy()
# Second independent copy of the cleaned frame.
# NOTE(review): the later cells visible in this notebook keep reading consumer_data_df
# rather than df_clean / consumer_df - confirm which frame is meant to be used downstream.
consumer_df = df_clean.copy()

In [None]:
# Store the distinct values of 'Item Purchased' as they are right now.
# A later cell label-encodes this column in place, so this cell has to run before
# that one for the stored values to be the original item names.
original_item_names = consumer_data_df['Item Purchased'].unique()

Load and preprocess the data:

In [None]:
# Label-encode every object-dtype column of consumer_data_df in place and keep the
# fitted encoder for each column so the integer codes can be mapped back later.
# Caution: because the frame is overwritten, re-running this cell finds no object
# columns left and rebinds label_encoders to an empty dict.
categorical_columns = consumer_data_df.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder().fit(consumer_data_df[column])
    consumer_data_df[column] = le.transform(consumer_data_df[column])
    label_encoders[column] = le

In [None]:
# Create a user-item interaction matrix: one row per 'Customer ID', one column per
# 'Item Purchased' value, cells holding 'Purchase Amount (USD)' (pivot_table's
# default aggregation applies), with 0 where a customer/item pair has no purchase.
interaction_matrix = pd.pivot_table(
    consumer_data_df,
    values='Purchase Amount (USD)',
    index='Customer ID',
    columns='Item Purchased',
    fill_value=0,
)


Apply Truncated SVD:

In [None]:
# Array form of the interaction matrix, used as the reference for the error metric.
original_matrix = interaction_matrix.values

# Factorise the interaction matrix with a 20-component Truncated SVD (seeded for repeatability).
svd = TruncatedSVD(n_components=20, random_state=42)
svd_matrix = svd.fit_transform(interaction_matrix)

# Map the reduced representation back to item space to get the reconstructed matrix.
reconstructed_matrix = svd_matrix.dot(svd.components_)

# Root-mean-squared error between the original and reconstructed matrices.
rmse = np.sqrt(mean_squared_error(original_matrix, reconstructed_matrix))
print(f'RMSE: {rmse}')

Compute the adjusted R-squared of the reconstruction

In [None]:


def adjusted_r2_score(y_true, y_pred, n_samples, n_features):
    """Return R-squared adjusted for the number of predictors.

    Computes ``1 - (1 - R2) * (n_samples - 1) / (n_samples - n_features - 1)``,
    where ``R2`` is ``r2_score(y_true, y_pred)``.

    Args:
        y_true: Observed values, in any form accepted by ``r2_score``.
        y_pred: Predicted values, aligned with ``y_true``.
        n_samples: Number of observations behind the R-squared value.
        n_features: Number of predictors used by the model.

    Returns:
        The adjusted R-squared value.

    Raises:
        ValueError: If ``n_samples - n_features - 1`` is not positive, in which
            case the adjustment is undefined (it would divide by zero or flip sign).
    """
    residual_dof = n_samples - n_features - 1
    if residual_dof <= 0:
        raise ValueError(
            'adjusted R-squared requires n_samples > n_features + 1 '
            f'(got n_samples={n_samples}, n_features={n_features})'
        )
    r2 = r2_score(y_true, y_pred)
    return 1 - (1 - r2) * ((n_samples - 1) / residual_dof)

# R-squared below is computed over every cell of the flattened matrices, so the
# observation count used in the adjustment is the number of cells, not the number of rows.
n_samples = original_matrix.size
# Number of predictors: the component count configured on the TruncatedSVD estimator.
n_features = svd.n_components

# Calculate adjusted R-squared
adjusted_r2 = adjusted_r2_score(original_matrix.flatten(), reconstructed_matrix.flatten(), n_samples, n_features)
print(f'Adjusted R-squared: {adjusted_r2}')

Generate recommendations:

In [None]:
def recommend_items(user_id, original_matrix, reconstructed_matrix, n_recommendations=5):
    """Rank item columns for one user by how much the reconstruction exceeds the original.

    Args:
        user_id: 1-based user identifier; row ``user_id - 1`` of both matrices is used.
            (Assumes matrix rows are ordered so that row ``i`` belongs to ID ``i + 1``.)
        original_matrix: 2-D array of observed user-item values.
        reconstructed_matrix: 2-D array of the same shape holding reconstructed values.
        n_recommendations: Maximum number of column indices to return.

    Returns:
        Array of column indices, ordered from the largest
        ``reconstructed - original`` difference downwards.

    Raises:
        IndexError: If ``user_id`` does not address an existing row. Values below 1
            are rejected explicitly; otherwise negative indexing would silently
            return recommendations for a different user.
    """
    user_index = user_id - 1
    n_users = len(original_matrix)
    if not 0 <= user_index < n_users:
        raise IndexError(f'user_id {user_id} is out of range; expected 1..{n_users}')
    score_gain = reconstructed_matrix[user_index] - original_matrix[user_index]
    return np.argsort(score_gain)[::-1][:n_recommendations]

# Create a reverse mapping from encoded values to original item names.
# A fitted LabelEncoder keeps the original values in `classes_`, ordered so that
# the value at position i is the one encoded as i.
reverse_item_mapping = dict(enumerate(label_encoders['Item Purchased'].classes_))

def get_item_names(recommendations, item_mapping):
    """Translate item indices into item names.

    Args:
        recommendations: Iterable of item indices.
        item_mapping: Mapping from item index to item name.

    Returns:
        List of names, in the same order as ``recommendations``.

    Raises:
        KeyError: If an index is missing from ``item_mapping``.
    """
    names = []
    for item_index in recommendations:
        names.append(item_mapping[item_index])
    return names

def get_recommendations_for_user(customer_id):
    """Return the recommended item names for a single Customer ID.

    Ranks items with ``recommend_items`` using the notebook-level
    ``original_matrix`` and ``reconstructed_matrix``, then converts the resulting
    indices to names through ``reverse_item_mapping``.

    Args:
        customer_id: Customer ID passed straight to ``recommend_items``.

    Returns:
        List of recommended item names.
    """
    item_indices = recommend_items(customer_id, original_matrix, reconstructed_matrix)
    return get_item_names(item_indices, reverse_item_mapping)

# Example usage: prompt for a Customer ID at run time.
# int() raises ValueError if the typed text is not an integer, and input() waits
# for the user, so this cell cannot complete unattended.
customer_id = int(input("Enter Customer ID: "))
# Fetch the recommended item names for that customer and report them.
recommended_item_names = get_recommendations_for_user(customer_id)
print(f'Recommended items for customer ID {customer_id}: {recommended_item_names}')


Find the total number of different items purchased.

In [None]:
# Count the distinct values in the 'Item Purchased' column and display the result.
number_of_different_items = consumer_data_df['Item Purchased'].nunique()
number_of_different_items

Find recommended items for all customer IDs

In [None]:
def get_recommendations_for_all_users(original_matrix, reconstructed_matrix, n_recommendations=5):
    """Build the recommended item names for every row of the interaction matrix.

    Each row is ranked by ``recommend_items`` (so the single-user and all-users
    paths share one ranking rule) and the indices are converted to names through
    the notebook-level ``reverse_item_mapping``.

    Args:
        original_matrix: 2-D array of observed user-item values.
        reconstructed_matrix: 2-D array of the same shape holding reconstructed values.
        n_recommendations: Maximum number of items to recommend per user.

    Returns:
        Dict keyed by 1-based row number (row index + 1) whose values are lists of
        recommended item names.
    """
    return {
        customer_id: get_item_names(
            recommend_items(customer_id, original_matrix, reconstructed_matrix, n_recommendations),
            reverse_item_mapping,
        )
        for customer_id in range(1, original_matrix.shape[0] + 1)
    }

# Example usage: build the recommendations for every row, then print one line per customer.
all_recommendations = get_recommendations_for_all_users(original_matrix, reconstructed_matrix)
# NOTE: user_id and recommended_item_names are notebook-level names, so this loop
# rebinds any value previously stored under them.
for user_id, recommended_item_names in all_recommendations.items():
    print(f'Recommended items for customer ID {user_id}: {recommended_item_names}')