# Content Based Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

from multi_hot_encoder import MultiHotEncoder

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

from sklearn.neighbors import NearestNeighbors

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import joblib
import json

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hiral\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
products = pd.read_csv("data/final-products.csv")
user_item_matrix = pd.read_csv('data/user-item-matrix.csv', index_col=0)
user_product_data = pd.read_csv('data/user-product-data.csv')

In [3]:
# Calculate Top 10 Products
product_scores = user_item_matrix.sum(axis=0)
top_products_ids = product_scores.sort_values(ascending=False)[:10].index.astype('int').tolist()
top_products = products[products['Product ID'].isin(top_products_ids)].reset_index()

# Extract user IDs and store them in a variable.
user_ids = user_product_data['User ID'].unique()

In [4]:
# class MultiHotEncoder(BaseEstimator, TransformerMixin):
#     def __init__(self):
#         self.mlbs = list()
#         self.n_columns = 0
#         self.categories_ = self.classes_ = list()

#     def fit(self, X, y=None):
#         for col in X.columns:
#             mlb = MultiLabelBinarizer(sparse_output=False)
#             mlb.fit(X[col])
#             self.mlbs.append(mlb)
#             self.classes_.append(mlb.classes_)
#             self.n_columns += 1
#         return self

#     def transform(self, X:pd.DataFrame):
#         if self.n_columns == 0:
#             raise ValueError('Please fit the transformer first.')
#         if self.n_columns != X.shape[1]:
#             raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
#                              f'while the input has {X.shape[1]}.'
#                             )
#         result = list()
#         for i in range(self.n_columns):
#             result.append(self.mlbs[i].transform(X.iloc[:,i]))

#         result = np.concatenate(result, axis=1)
#         return result

In [5]:
def get_word2vec_vector(text, model, vector_size=100):
    if not text:
        return np.zeros(vector_size)
    words = word_tokenize(text.lower())
    words = [word for word in words if word in model.wv.key_to_index]
    if len(words) == 0:
        return np.zeros(vector_size)
    word_vectors = [model.wv[word] for word in words]
    return np.mean(word_vectors, axis=0)

In [6]:
# Preprocessing the data to transform it into a format suitable for the model.
products['Multi Categories'] = products['Categories'].str.split(',')
products['Combined Text'] = products['Name'] + ' ' + products['Description'] + ' ' + products['Short Description']
products['Size'].fillna('', inplace=True)
X = products.drop(['Product ID','Status','Stock Status', 'Categories', 'Name', 'Description', 'Short Description', 'Combined Text'], axis=1, errors='ignore')

categorical_features = ['Size']
numerical_features = ['Final Price']
multi_categorical_features = ['Multi Categories']
# text_features = 'Combined Text'

product_text = products['Combined Text'].fillna('')
tokenized_text = [word_tokenize(text.lower()) for text in product_text]
word_vec_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Transform the 'Name', 'Description', and 'Short Description' columns
name_vectors = np.array([get_word2vec_vector(text, word_vec_model) for text in products['Name']])
description_vectors = np.array([get_word2vec_vector(text, word_vec_model) for text in products['Description']])
short_desc_vectors = np.array([get_word2vec_vector(text, word_vec_model) for text in products['Short Description']])
text_features_combined = np.hstack([name_vectors, description_vectors, short_desc_vectors])

X[numerical_features] = X[numerical_features].astype('float')
preprocessor = ColumnTransformer(
    remainder = 'passthrough',
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ('multicat', MultiHotEncoder(), multi_categorical_features),
        # ('text', TfidfVectorizer(stop_words='english'), text_features)
    ]
)

X_processed = preprocessor.fit_transform(X)
X = np.hstack([X_processed, text_features_combined])

### k-Nearest Neighbors

In [7]:
# KNN model for the product matrix
knn = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='cosine')
knn.fit(X)

In [8]:
# Test the model on the first product
distances, indices = knn.kneighbors(X[0].reshape(1, -1))
print("Top 5 similar products to the first product:")
similar_product_indices = indices.flatten()[1:]
recommended_products = products.iloc[similar_product_indices].copy()
recommended_products['Similarity'] = 1 - distances.flatten()[1:]
print(recommended_products[['Product ID', 'Similarity']])

Top 5 similar products to the first product:
    Product ID  Similarity
2         4013    0.995752
14        4026    0.808974
8         4020    0.804988
4         4015    0.804891


In [9]:
# Recommends similar products based on the KNN model.
def recommend_products(product_index, knn_model, X, products, n_recommendations=5):
    product_vector = X[product_index].reshape(1, -1)
    distances, indices = knn_model.kneighbors(product_vector, n_neighbors=n_recommendations + 1)  # +1 to exclude itself
    similar_product_indices = indices.flatten()[1:]
    recommended_products = products.iloc[similar_product_indices].copy()
    recommended_products['Similarity'] = 1 - distances.flatten()[1:]  # Convert cosine distance to similarity
    return recommended_products

In [10]:
# Select a random product and use the KNN model to find similar products.
product_index = y = random.randrange(len(products))
recommendations = recommend_products(product_index, knn, X, products)
print("Top 5 similar products to the product Index:", product_index, 'Product ID:', products.loc[product_index,'Product ID'] )
print(recommendations[['Product ID', 'Similarity']])

Top 5 similar products to the product Index: 5 Product ID: 4016
    Product ID  Similarity
20        6080    0.999226
22         499    0.995392
7         4019    0.932625
36        8053    0.886238
11        4023    0.874958


In [11]:
# Recommends products for a user based on their purchase history.
def recommend_for_user(user_product_indices, knn_model, X, products, n_recommendations=5):
    # If the user has no bought products
    if not user_product_indices:
        top_products['Similarity'] = 1
        return top_products
    all_recommendations = pd.DataFrame()
    for product_index in user_product_indices:
        recommendations = recommend_products(product_index, knn_model, X, products, n_recommendations)
        all_recommendations = pd.concat([all_recommendations, recommendations])
    all_recommendations = all_recommendations.drop_duplicates(subset='Product ID')
    all_recommendations = all_recommendations.sort_values(by='Similarity', ascending=False)
    return all_recommendations[:n_recommendations]

In [12]:
# Select a random user and recommend similar products based on their purchase history. 
user_id = random.choice(user_ids)
product_bought_ids = user_product_data[(user_product_data['User ID']== user_id) & (user_product_data['Product Bought'] != 0)]['Product ID'].tolist()
product_bought =  products[products['Product ID'].isin(product_bought_ids)]
user_bought_indices = product_bought.index.tolist()
user_recommendations = recommend_for_user(user_bought_indices, knn, X, products)

print(f"Recommendation for user: {user_id}")
print("Bought products:")
print(product_bought['Product ID'])
print("Recommended products:")
print(user_recommendations[['Product ID', 'Similarity']])

Recommendation for user: 5074
Bought products:
38    16104
Name: Product ID, dtype: int64
Recommended products:
    Product ID  Similarity
25        1707    0.999888
33        4211    0.901243
39       16751    0.889257
37       15535    0.844229
27        1981    0.733149


In [13]:
# Export the model for live processing
joblib.dump(preprocessor, 'model/column_transformer.pkl')
word_vec_model.save('model/word2vec_model.model')
joblib.dump(knn, 'model/knn_model.pkl')

['model/knn_model.pkl']