In [50]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
# Load the main product descriptions dataset
product_descriptions = pd.read_csv('./data/product_descriptions.csv/product_descriptions.csv')
product_descriptions = product_descriptions.dropna()

# Load the additional dataset
additional_dataset = pd.read_csv('./data/train.csv/train.csv', encoding = "ISO-8859-1")

In [52]:
# Merge the main product descriptions with the additional dataset
combined_dataset = pd.merge(product_descriptions, additional_dataset, on='product_uid')

In [53]:
combined_dataset.head()

Unnamed: 0,product_uid,product_description,id,product_title,search_term,relevance
0,100001,"Not only do angles make joints stronger, they ...",2,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,100001,"Not only do angles make joints stronger, they ...",3,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,100002,BEHR Premium Textured DECKOVER is an innovativ...,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,100005,Update your bathroom with the Delta Vero Singl...,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,100005,Update your bathroom with the Delta Vero Singl...,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [54]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(combined_dataset['product_description'])

In [55]:
# Apply Latent Semantic Analysis (LSA)
num_topics = 300
lsa = TruncatedSVD(n_components=num_topics)
X_lsa = lsa.fit_transform(X)

In [56]:
# Choose the optimal number of clusters (as before)
num_clusters = 10

In [57]:
# K-Means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_lsa)

  super()._check_params_vs_input(X, default_n_init=10)


In [58]:
# Map cluster IDs to product indices
cluster_indices = {}
for i, cluster_id in enumerate(kmeans.labels_):
    if cluster_id not in cluster_indices:
        cluster_indices[cluster_id] = []
    cluster_indices[cluster_id].append(i)


In [59]:
# Calculate cluster centroids
cluster_centroids = kmeans.cluster_centers_

In [60]:
def recommend_products(query_product_title, num_recommendations=5):
    query_vector = vectorizer.transform([query_product_title])
    query_vector_lsa = lsa.transform(query_vector)
    
    predicted_clusters = kmeans.predict(query_vector_lsa)
    
    recommended_products = []
    for cluster_id in predicted_clusters:
        cluster_products = cluster_indices.get(cluster_id, [])
        similarity_scores = cosine_similarity(query_vector_lsa, [cluster_centroids[cluster_id]])[0]
        top_indices = similarity_scores.argsort()[-num_recommendations:][::-1]
        recommended_products.extend([(cluster_id, cluster_products[i], similarity_scores[i]) for i in top_indices])
    
    return recommended_products

In [61]:
# Example usage
product_title = "metal saw"
num_recommendations = 5
recommended_indices = recommend_products(product_title, num_recommendations)
recommended_products = []

for cluster_id, index, _ in recommended_indices:
    recommended_products.append(combined_dataset.iloc[index])
print("Recommended Products:")
for product in recommended_products:
    print(product['product_title'])

Recommended Products:
Simpson Strong-Tie 12-Gauge Angle


In [63]:
import pickle

# ... Your existing code ...

# Save components to files
with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

with open('cluster_indices.pkl', 'wb') as f:
    pickle.dump(cluster_indices, f)

with open('cluster_centroids.pkl', 'wb') as f:
    pickle.dump(cluster_centroids, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('lsa_model.pkl', 'wb') as f:
    pickle.dump(lsa, f)


In [65]:
combined_dataset.to_csv('./data/combined_dataset.csv')