In [48]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

In [49]:
product_descriptions = pd.read_csv('./data/product_descriptions.csv/product_descriptions.csv')
product_descriptions.shape

(124428, 2)

In [50]:
# Missing values

product_descriptions = product_descriptions.dropna()
product_descriptions.shape
product_descriptions.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [51]:
product_descriptions1 = product_descriptions.head(500)
# product_descriptions1.iloc[:,1]

product_descriptions1["product_description"].head(10)

0    Not only do angles make joints stronger, they ...
1    BEHR Premium Textured DECKOVER is an innovativ...
2    Classic architecture meets contemporary design...
3    The Grape Solar 265-Watt Polycrystalline PV So...
4    Update your bathroom with the Delta Vero Singl...
5    Achieving delicious results is almost effortle...
6    The Quantum Adjustable 2-Light LED Black Emerg...
7    The Teks #10 x 1-1/2 in. Zinc-Plated Steel Was...
8    Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...
9    Valley View Industries Metal Stakes (4-Pack) a...
Name: product_description, dtype: object

In [52]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(product_descriptions['product_description'])
X

<124428x268559 sparse matrix of type '<class 'numpy.float64'>'
	with 8256891 stored elements in Compressed Sparse Row format>

In [53]:
# Apply Latent Semantic Analysis (LSA)
num_topics = 100
lsa = TruncatedSVD(n_components=num_topics)
X_lsa = lsa.fit_transform(X)

In [54]:
# Split data into training and testing sets
num_train_samples = 100000
X_train = X_lsa[:num_train_samples]
X_test = X_lsa[num_train_samples:]

In [55]:
# Choose the optimal number of clusters (as before)
num_clusters = 10

In [56]:
# K-Means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_train)

  super()._check_params_vs_input(X, default_n_init=10)


In [57]:
# Map cluster IDs to product indices
cluster_indices = {}
for i, cluster_id in enumerate(kmeans.labels_):
    if cluster_id not in cluster_indices:
        cluster_indices[cluster_id] = []
    cluster_indices[cluster_id].append(i)


In [58]:
# Calculate cluster centroids
cluster_centroids = kmeans.cluster_centers_

In [70]:
def recommend_products(query, num_recommendations=5):
    query_vector = vectorizer.transform([query])
    query_vector_lsa = lsa.transform(query_vector)
    predicted_cluster = kmeans.predict(query_vector_lsa)[0]
    cluster_products = cluster_indices[predicted_cluster]

    similarity_scores = cosine_similarity(query_vector_lsa, [cluster_centroids[predicted_cluster]])[0]
    top_indices = similarity_scores.argsort()[-num_recommendations:][::-1]

    recommended_products = [cluster_products[i] for i in top_indices]
    return recommended_products

In [73]:
# Example usage
query = "playstation 4"
recommended_indices = recommend_products(query)
recommended_products = product_descriptions.iloc[recommended_indices]

In [74]:
print("Recommended Products:")
print(recommended_products)

Recommended Products:
   product_uid                                product_description
0       100001  Not only do angles make joints stronger, they ...
