In [1]:
import pandas as pd
import pandas as pd
import ast

In [2]:
# Load the raw Flipkart product catalogue from the CSV in the working directory.
data = pd.read_csv("flipkart_com-ecommerce_sample.csv")

In [3]:
# Combine relevant columns for recommendation
def _flatten_category_tree(raw):
    """Turn a serialized category list into one space-separated string.

    Non-string values (e.g. NaN) become ''. Text that parses as a Python
    list/tuple literal is joined with spaces. Anything else -- including text
    that was already flattened by a previous run of this cell -- is returned
    unchanged, which keeps the cell safe to re-run.
    """
    if not isinstance(raw, str):
        return ''
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a Python literal (already-flattened or malformed text): keep as is.
        return raw
    if isinstance(parsed, (list, tuple)):
        return ' '.join(str(part) for part in parsed)
    # A literal that is not a sequence (number, quoted string, ...): keep raw text.
    return raw


data['product_category_tree'] = data['product_category_tree'].apply(_flatten_category_tree)

# Fill missing text with '' *before* casting to str; otherwise missing values
# can be stringified as the token 'nan' and leak into the feature text.
data['combined_features'] = (
    data['product_name'].fillna('').astype(str) + ' '
    + data['description'].fillna('').astype(str) + ' '
    + data['product_category_tree'].fillna('').astype(str)
)


In [4]:
# Make sure the feature text column contains no missing entries.
data['combined_features'] = data['combined_features'].fillna('')

# Show the first rows: product name next to its combined feature text.
print(data.head()[['product_name', 'combined_features']])

                            product_name  \
0    Alisha Solid Women's Cycling Shorts   
1    FabHomeDecor Fabric Double Sofa Bed   
2                             AW Bellies   
3    Alisha Solid Women's Cycling Shorts   
4  Sicons All Purpose Arnica Dog Shampoo   

                                   combined_features  
0  Alisha Solid Women's Cycling Shorts Key Featur...  
1  FabHomeDecor Fabric Double Sofa Bed FabHomeDec...  
2  AW Bellies Key Features of AW Bellies Sandals ...  
3  Alisha Solid Women's Cycling Shorts Key Featur...  
4  Sicons All Purpose Arnica Dog Shampoo Specific...  


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [6]:
# Turn each product's combined text into a TF-IDF vector, ignoring the
# built-in English stop-word list. `tfidf` keeps the fitted vectorizer;
# `tfidf_matrix` holds one row per product.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(data["combined_features"])

In [7]:
# Pairwise cosine similarity between every pair of product vectors.
# With a single argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)


In [9]:
# Function to recommend products
def recommend_products(keyword, data, cosine_sim, top_n=5):
    """Recommend products similar to the first product matching ``keyword``.

    Parameters
    ----------
    keyword : str
        Text searched for (case-insensitively, as a literal substring -- not
        a regular expression) in ``data['combined_features']``.
    data : pandas.DataFrame
        Must contain ``combined_features``, ``product_name`` and
        ``product_url`` columns.
    cosine_sim : 2-D indexable
        ``cosine_sim[i]`` is the sequence of similarity scores between row
        position ``i`` of ``data`` and every row position of ``data``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or list
        ``product_name`` / ``product_url`` of the most similar rows, best
        first. If nothing matches the keyword, a one-element list holding a
        "not found" message is returned instead of a DataFrame.
    """
    # regex=False: treat the keyword literally so input such as "c++" or "("
    # neither raises a regex error nor matches unintended rows.
    is_match = data['combined_features'].str.contains(
        keyword, case=False, na=False, regex=False
    )
    # Work with row *positions*: cosine_sim and .iloc are both positional,
    # whereas index labels need not be 0..n-1.
    matching_positions = is_match.to_numpy(dtype=bool).nonzero()[0]

    if len(matching_positions) == 0:
        return ["No products found for the given keyword."]

    seed_pos = int(matching_positions[0])

    # Drop the seed product explicitly rather than assuming it sorts first:
    # an identical listing at a lower position would tie with it and push the
    # seed itself into the results.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[seed_pos])
        if pos != seed_pos
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get top N similar products (a non-positive top_n yields an empty frame).
    top_positions = [pos for pos, _ in sim_scores[:max(top_n, 0)]]
    return data.iloc[top_positions][['product_name', 'product_url']]


In [10]:
# Try the recommender on a sample keyword and print what it returns.
keyword = "pants"
recommendations = recommend_products(keyword, data, cosine_sim)
print("Top Recommendations:", recommendations, sep="\n")

Top Recommendations:
                                 product_name  \
7603      o.h.m Solid Men's Black Track Pants   
10488        Finger's Solid Men's Track Pants   
8070   TeeSort Solid Women's Grey Track Pants   
10492      Quiksilver Solid Men's Track Pants   
1419           TAMBAS Solid Men's Track Pants   

                                             product_url  
7603   http://www.flipkart.com/o-h-m-solid-men-s-blac...  
10488  http://www.flipkart.com/finger-s-solid-men-s-t...  
8070   http://www.flipkart.com/teesort-solid-women-s-...  
10492  http://www.flipkart.com/quiksilver-solid-men-s...  
1419   http://www.flipkart.com/tambas-solid-men-s-tra...  


In [11]:
import pickle

# Persist the similarity matrix to 'cosine_sim.pkl' in the working directory
# so it can be reloaded without recomputing it.
with open('cosine_sim.pkl', 'wb') as out_file:
    pickle.dump(cosine_sim, out_file)
