In [2]:
!pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
     ---------------------------------------- 0.0/154.4 kB ? eta -:--:--
     -------------------- ------------------ 81.9/154.4 kB 2.3 MB/s eta 0:00:01
     ------------------------------ ------- 122.9/154.4 kB 1.4 MB/s eta 0:00:01
     -------------------------------------- 154.4/154.4 kB 1.3 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml): started
  Building wheel for scikit-surprise (pyproject.toml): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp

In [33]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split

In [34]:
# Load datasets
# The frames read from these files show a redundant "Unnamed: 0" column (presumably a
# row index that was written into the CSV). Drop it when present so it does not travel
# into the returned recommendations or the pickled model; the default 0..n-1 index is kept.
users_df = pd.read_csv('users.csv').drop(columns='Unnamed: 0', errors='ignore')       # User interactions
products_df = pd.read_csv('products.csv').drop(columns='Unnamed: 0', errors='ignore') # Product attributes

In [35]:
# Preview the raw interaction log; pandas elides the middle rows of a long frame.
users_df


Unnamed: 0,user_id,product_id,interaction,timestamp
0,528,326,purchase,2025-01-02 08:39:43
1,227,214,view,2025-01-18 06:10:07
2,800,379,purchase,2025-01-05 16:48:16
3,475,226,add_to_cart,2025-01-07 18:41:04
4,729,196,view,2025-01-10 19:49:42
...,...,...,...,...
4995,320,462,purchase,2025-01-18 20:11:41
4996,955,95,purchase,2025-01-15 05:42:28
4997,855,496,add_to_cart,2025-01-20 06:08:59
4998,650,183,purchase,2025-01-15 02:51:06


In [36]:
# The interaction column is categorical; translate each event type into a numeric
# weight so it can be used as a rating by the collaborative-filtering model.
interaction_mapping = {
    "view": 0.2,
    "add_to_cart": 0.7,
    "purchase": 1.0
}

# Apply the mapping to the interaction column.
# Guarded so the cell is safe to re-run: mapping an already-numeric column would look
# the floats up in the dict and silently replace every value with NaN.
if not pd.api.types.is_numeric_dtype(users_df['interaction']):
    mapped_interaction = users_df['interaction'].map(interaction_mapping)
    unmapped_rows = mapped_interaction.isna()
    if unmapped_rows.any():
        # Fail loudly instead of feeding NaN ratings to the model.
        unknown_values = users_df.loc[unmapped_rows, 'interaction'].unique().tolist()
        raise ValueError(f"Unmapped interaction values: {unknown_values}")
    users_df['interaction'] = mapped_interaction

In [37]:
# ================================
# Step 1: Collaborative Filtering
# ================================
# Surprise expects exactly three columns, in the order (user, item, rating);
# the numeric interaction weight plays the role of the rating on a 0-1 scale.
rating_columns = ['user_id', 'product_id', 'interaction']
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(users_df[rating_columns], reader)

In [38]:
# Split the data (80% train / 20% test).
# The shuffle is seeded so the same partition is produced on every run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [39]:
# Train collaborative filtering model
# SVD starts from randomly initialised factors, so seed it to make the fitted
# estimates repeatable across kernel restarts.
cf_model = SVD(random_state=42)
cf_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2a0a54ab950>

In [40]:
# Preview the product catalogue; pandas elides the middle rows of a long frame.
products_df

Unnamed: 0,product_id,product_name,description,category,price
0,1,Product 1,Different second floor Mr treatment join.,Home,446.08
1,2,Product 2,Now newspaper air smile.,Electronics,168.93
2,3,Product 3,Likely even husband use rate discuss.,Home,164.12
3,4,Product 4,Its or television their effect election so.,Beauty,419.89
4,5,Product 5,Glass phone even step show year.,Toys,89.62
...,...,...,...,...,...
495,496,Product 496,Show power land fire seven choose.,Beauty,276.63
496,497,Product 497,Agency return develop history exist trouble.,Home,38.81
497,498,Product 498,Question become go article first.,Books,350.82
498,499,Product 499,Smile fire project nation.,Beauty,427.35


In [41]:
# ================================
# Step 2: Content-Based Filtering
# ================================
# Combine the product text features (category and description) for similarity.
# Price is numeric and is not part of the text that gets vectorised.
# Missing values become empty strings: a NaN in either column would otherwise make
# the whole concatenated value NaN, which cannot be tokenised as a document.
products_df['combined_features'] = (
    products_df['category'].fillna('').astype(str) + " " +
    products_df['description'].fillna('').astype(str)
)

In [42]:
# Turn each product's combined text into a TF-IDF vector, ignoring English stop words.
# The fitted vectorizer is kept because it is stored alongside the model later on.
product_texts = products_df['combined_features']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(product_texts)



In [43]:
# Pairwise cosine similarity between all product TF-IDF vectors.
# With the second argument omitted the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)


In [44]:

# Map each product ID to its row *position* in products_df for easy lookup.
# The similarity matrix rows and the later .iloc lookups are positional, so positions
# (not index labels) are stored; the two only coincide while products_df keeps its
# default 0..n-1 index. If a product ID repeats, the last occurrence wins.
product_indices = {
    product_id: position
    for position, product_id in enumerate(products_df['product_id'].tolist())
}


In [45]:
# Function to get content-based recommendations
def get_content_recommendations(product_id, top_n=5):
    """Return the IDs of the products most similar to ``product_id``.

    Similarity is read from the precomputed ``cosine_sim`` matrix, using
    ``product_indices`` to translate the product ID into a row position.

    Args:
        product_id: ID of the product to find neighbours for.
        top_n: Maximum number of similar products to return.

    Returns:
        A list of up to ``top_n`` product IDs ordered from most to least similar.
        The query product itself is never included. An empty list is returned
        when ``product_id`` is not present in ``product_indices``.
    """
    idx = product_indices.get(product_id)
    if idx is None:
        return []

    # Drop the query row explicitly instead of assuming it sorts first: when another
    # product ties with it (identical text) or the row has no similarity at all,
    # skipping "the first result" would discard a real neighbour and return the
    # query product itself.
    neighbours = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    neighbours.sort(key=lambda pair: pair[1], reverse=True)

    # max(..., 0) keeps a non-positive top_n returning nothing, as before.
    top_indices = [position for position, _ in neighbours[:max(top_n, 0)]]
    return products_df.iloc[top_indices]['product_id'].tolist()

In [49]:
# ================================
# Step 3: Hybrid Recommendation System
# ================================
def hybrid_recommendations(user_id, top_n=5, weight_cf=0.7, weight_cb=0.3):
    # Collaborative filtering predictions
    all_product_ids = products_df['product_id'].unique()
    cf_scores = []
    for product_id in all_product_ids:
        pred = cf_model.predict(user_id, product_id)
        cf_scores.append((product_id, pred.est))
    cf_scores = sorted(cf_scores, key=lambda x: x[1], reverse=True)[:top_n]

    # Content-based recommendations
    user_history = users_df[users_df['user_id'] == user_id]['product_id']
    cb_scores = []
    for product_id in user_history:
        cb_scores.extend(get_content_recommendations(product_id))
    cb_scores = pd.Series(cb_scores).value_counts().head(top_n).index.tolist()

    # Merge scores (Weighted)
    hybrid_scores = {}
    for product_id, score in cf_scores:
        hybrid_scores[product_id] = hybrid_scores.get(product_id, 0) + score * weight_cf
    for product_id in cb_scores:
        hybrid_scores[product_id] = hybrid_scores.get(product_id, 0) + weight_cb

    # Sort by hybrid score and return top N
    hybrid_scores = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    recommended_product_ids = [product_id for product_id, _ in hybrid_scores[:top_n]]
    return products_df[products_df['product_id'].isin(recommended_product_ids)]


In [51]:
# ================================
# Step 4: Save the Hybrid Model
# ================================
# Bundle the fitted artefacts into a single dictionary so they can be restored together.
hybrid_model = dict(
    cf_model=cf_model,
    content_sim=cosine_sim,
    tfidf=tfidf,
    product_indices=product_indices,
    products_df=products_df,
)

# Serialise the bundle to disk; pickle requires the file to be opened in binary mode.
with open('hybrid_recommender.pkl', 'wb') as model_file:
    pickle.dump(hybrid_model, model_file)

print("Hybrid recommendation system saved as hybrid_recommender.pkl")

Hybrid recommendation system saved as hybrid_recommender.pkl
