In [72]:
import pandas as pd
# Load the combined retail interactions file into a DataFrame.
df = pd.read_csv("retail_combined_data.csv")

In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization


In [74]:
# Overwrite the raw customer/product identifiers with the integer codes
# produced by a dedicated LabelEncoder per column. The encoders are kept so
# the codes can be mapped back with `inverse_transform` later.
customer_encoder, product_encoder = LabelEncoder(), LabelEncoder()
for column, encoder in (
    ("customer_id", customer_encoder),
    ("product_id", product_encoder),
):
    df[column] = encoder.fit_transform(df[column])
df

Unnamed: 0,customer_id,product_id,purchase_score,name,description
0,269,2,5,Headphones,Noise-canceling wireless headphones
1,53,4,2,Smartwatch,Smartwatch with health tracking features
2,412,3,4,Camera,DSLR camera with 4K video recording
3,130,0,5,Laptop,High-performance laptop with latest processor
4,609,1,4,Smartphone,Feature-rich smartphone with excellent camera
...,...,...,...,...,...
19995,52,3,2,Camera,DSLR camera with 4K video recording
19996,788,2,4,Headphones,Noise-canceling wireless headphones
19997,396,1,2,Smartphone,Feature-rich smartphone with excellent camera
19998,193,3,3,Camera,DSLR camera with 4K video recording


In [75]:
# Distinct encoded product ids, in order of first appearance.
pd.unique(df["product_id"])

array([2, 4, 3, 0, 1], dtype=int64)

In [76]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

## Collaborative filtering

In [77]:
# Embedding table sizes: number of distinct ids plus one spare row.
embedding_dim = 10
id_cardinality = df[["customer_id", "product_id"]].nunique()
num_customers = int(id_cardinality["customer_id"]) + 1
num_products = int(id_cardinality["product_id"]) + 1

In [78]:
# Collaborative-filtering network.
# Customer branch: scalar id -> embedding row -> flat vector.
customer_input = Input(shape=(1,), name="customer_input")
customer_embedding = Embedding(
    num_customers, embedding_dim, name="customer_embedding"
)(customer_input)

# Product branch: same shape of pipeline with its own embedding table.
product_input = Input(shape=(1,), name="product_input")
product_embedding = Embedding(
    num_products, embedding_dim, name="product_embedding"
)(product_input)

customer_vec = Flatten()(customer_embedding)
product_vec = Flatten()(product_embedding)

# Join both vectors and regress the purchase score through a small MLP.
collab_layer = Concatenate()([customer_vec, product_vec])
dense_layer = collab_layer
for units in (128, 64):
    dense_layer = Dense(units, activation="relu")(dense_layer)
collab_output = Dense(1, activation="linear", name="collab_output")(dense_layer)

collab_model = Model(
    inputs=[customer_input, product_input],
    outputs=collab_output,
)
collab_model.compile(optimizer="adam", loss="mse", metrics=["mae"])

In [79]:
# Train the collaborative model on (customer, product) -> purchase_score,
# validating on the held-out split.
collab_train_inputs = [train_data["customer_id"], train_data["product_id"]]
collab_val_inputs = [test_data["customer_id"], test_data["product_id"]]

collab_model.fit(
    collab_train_inputs,
    train_data["purchase_score"],
    validation_data=(collab_val_inputs, test_data["purchase_score"]),
    epochs=1,
    batch_size=200,
)

[1m80/80[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - loss: 7.4246 - mae: 2.3178 - val_loss: 2.0279 - val_mae: 1.2307


<keras.src.callbacks.history.History at 0x2154008d110>

## Content-based filtering

In [80]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit one tokenizer on the lower-cased names and descriptions together so
# both text fields share a single vocabulary, then convert each field to
# integer sequences.
names_lower = df["name"].str.lower()
descriptions_lower = df["description"].str.lower()

raw_text = np.hstack([names_lower, descriptions_lower])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_text)

description_token = tokenizer.texts_to_sequences(descriptions_lower)
name_token = tokenizer.texts_to_sequences(names_lower)

In [81]:
# Longest token sequence per text field.
max_description_token_length = np.max([len(seq) for seq in description_token])
max_name_token_length = np.max([len(seq) for seq in name_token])

# Embedding input size: largest token id seen in either field, plus one.
vocab_size = np.max(np.concatenate(description_token + name_token)) + 1

In [82]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Pad (or truncate) every token sequence to a fixed length for the network.
# NOTE(review): the lengths are hard-coded to 2 and 5 instead of using the
# max_name_token_length / max_description_token_length computed earlier, and
# the content model's Input shapes use the same literals. Any sequence longer
# than maxlen is truncated (from the front, with pad_sequences defaults) —
# confirm this is intended.
name_padded = pad_sequences(sequences=name_token, maxlen=2)
desc_padded = pad_sequences(sequences=description_token, maxlen=5)

In [83]:
# Content based model
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
# Description branch: 5 token ids -> 5-d embeddings -> GRU summary vector.
desc_input = Input(shape=(5,), name="desc_input")
desc_embedding = Embedding(input_dim=vocab_size, output_dim=5)(desc_input)

# Name branch: 2 token ids -> 5-d embeddings -> smaller GRU summary vector.
name_input = Input(shape=(2,), name="name_input")
name_embedding = Embedding(input_dim=vocab_size, output_dim=5)(name_input)

desc_rnn = GRU(16)(desc_embedding)
name_rnn = GRU(8)(name_embedding)

# Join both text summaries and regress the purchase score.
merged = Concatenate()([desc_rnn, name_rnn])
dense_layer = merged
for _ in range(2):
    dense_layer = Dense(64, activation="relu")(dense_layer)
content_output = Dense(1, activation="linear", name="content_output")(dense_layer)

In [84]:
# Split the padded text features with the same test_size and seed as the
# DataFrame split above.
text_splits = train_test_split(
    name_padded,
    desc_padded,
    test_size=0.2,
    random_state=42,
)
name_train, name_test, desc_train, desc_test = text_splits

In [85]:
# Sanity check: the text features and the training targets have the same row count.
for training_array in (name_train, desc_train, train_data['purchase_score']):
    print(training_array.shape)

(16000, 2)
(16000, 5)
(16000,)


In [86]:
# Content-based model: predicts purchase_score from the padded product name
# and description token sequences.
content_model = Model(inputs=[name_input, desc_input], outputs=content_output)
content_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Pass the targets to Keras as plain NumPy arrays. The previous version wrote
# `np.array(train_data['purchase_score'])` back into the same column, which
# changes no values and, because `train_data` is a slice of `df`, can raise
# pandas' SettingWithCopyWarning.
score_train = train_data['purchase_score'].to_numpy()
score_test = test_data['purchase_score'].to_numpy()

# NOTE(review): batch_size=2 means 8000 optimizer steps per epoch on the
# 16000 training rows; kept as-is so the recorded training log stays valid.
content_model.fit(
    [name_train, desc_train],
    score_train,
    validation_data=([name_test, desc_test], score_test),
    epochs=1,
    batch_size=2,
)

[1m8000/8000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 7ms/step - loss: 2.2415 - mae: 1.2799 - val_loss: 2.0064 - val_mae: 1.2101


<keras.src.callbacks.history.History at 0x21543fbdc90>

## Hybrid filtering

In [87]:
# Hybrid network: feed the scalar outputs of the collaborative and content
# graphs into a small dense head. The underlying layers are shared with
# collab_model / content_model because the same output tensors are reused.
merged_layer = Concatenate()([collab_output, content_output])
hybrid_dense = Dense(64, activation='relu')(merged_layer)
hybrid_output = Dense(1, activation='linear', name="hybrid_output")(hybrid_dense)

hybrid_model = Model(
    inputs=[customer_input, product_input, name_input, desc_input],
    outputs=hybrid_output,
)
hybrid_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Input order must match the `inputs` list given to Model above.
hybrid_train_inputs = [
    train_data['customer_id'],
    train_data['product_id'],
    name_train,
    desc_train,
]
hybrid_val_inputs = [
    test_data['customer_id'],
    test_data['product_id'],
    name_test,
    desc_test,
]

hybrid_model.fit(
    hybrid_train_inputs,
    train_data['purchase_score'],
    validation_data=(hybrid_val_inputs, test_data['purchase_score']),
    epochs=2,
    batch_size=20,
)


Epoch 1/2
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 12ms/step - loss: 2.4399 - mae: 1.3189 - val_loss: 2.0067 - val_mae: 1.2160
Epoch 2/2
[1m800/800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 2.0284 - mae: 1.2332 - val_loss: 2.0411 - val_mae: 1.2343


<keras.src.callbacks.history.History at 0x2155275e590>

In [111]:
# Ids seen during training. Only the KEYS (the label-encoded ids the models
# were trained on) are used below; the enumerate positions stored as values
# are not model ids and must not be fed to the networks.
customer_map = {customer_id: idx for idx, customer_id in enumerate(train_data['customer_id'].unique())}
product_map = {product_id: idx for idx, product_id in enumerate(train_data['product_id'].unique())}


def recommend_products(customer_id, top_n=3):
    """Recommend the highest-scoring products for one customer.

    Every product seen in training is scored with both the collaborative
    model and the content model; the two scores are averaged with equal
    weight and the ids of the ``top_n`` best products are returned.

    Parameters
    ----------
    customer_id : int
        Label-encoded customer id, i.e. a value of ``df["customer_id"]``
        after encoding.
    top_n : int, default 3
        Maximum number of product ids to return.

    Returns
    -------
    list of int or str
        Encoded product ids ordered from best to worst hybrid score, or the
        string ``"Customer not found."`` when the customer was not in the
        training split.
    """
    if customer_id not in customer_map:
        return "Customer not found."

    # Candidate products, expressed in the encoded ids the models expect.
    product_ids = np.array(list(product_map.keys()))

    # name_padded / desc_padded are row-aligned with df, so use the first
    # dataset row of each product to fetch its text features.
    # Assumes all rows of a product carry the same name/description — TODO confirm.
    unique_ids, first_positions = np.unique(
        df["product_id"].to_numpy(), return_index=True
    )
    first_row_of = dict(zip(unique_ids, first_positions))
    row_positions = np.array([first_row_of[pid] for pid in product_ids])
    product_names = name_padded[row_positions]
    product_descs = desc_padded[row_positions]

    # Collaborative score: this customer paired with every candidate product.
    customer_ids = np.full(len(product_ids), customer_id)
    collab_scores = collab_model.predict([customer_ids, product_ids]).flatten()

    # Content score: one prediction per candidate product, so both score
    # vectors have the same length and order as product_ids.
    content_scores = content_model.predict([product_names, product_descs]).flatten()

    # Hybrid score: equal-weight blend of the two models.
    hybrid_scores = 0.5 * collab_scores + 0.5 * content_scores

    # Highest score first.
    top_indices = np.argsort(-hybrid_scores)[:top_n]
    return product_ids[top_indices].tolist()

In [112]:
# Get recommendations
print(recommend_products(270, top_n=2))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
[2, 4]
