In [79]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
files = os.listdir("../data/raw")

data = pd.DataFrame()

for file in files:
    temp_data = pd.read_csv(
        "../data/raw/" + file, 
        quotechar='"', 
        usecols=lambda column: column != 'Unnamed: 0'
    )
    temp_data = temp_data.sample(frac=0.1, random_state=1)  # Obtener solo el 50% de los datos
    data = pd.concat([data, temp_data])


In [168]:
data = pd.read_csv("../data/raw/All Electronics.csv", quotechar='"')
data = data.sample(n=3000, random_state=1)

In [169]:
data.columns

Index(['name', 'main_category', 'sub_category', 'image', 'link', 'ratings',
       'no_of_ratings', 'discount_price', 'actual_price'],
      dtype='object')

In [170]:
data.shape

(3000, 9)

In [171]:
data["desc"] = data["main_category"] + " " + data["sub_category"] + " " + data["name"]

In [174]:
from sklearn.feature_extraction import text

stop_words = text.ENGLISH_STOP_WORDS

data["desc"] = data["desc"].apply(lambda x: ' '.join([word.lower() for word in x.split() if word not in stop_words]))
data["index"] = [i for i in range(data.shape[0])]

In [176]:
data.to_csv("../data/interim/data.csv", index=False)

In [177]:
data = pd.read_csv("../data/interim/data.csv")

In [178]:
vectorizer = TfidfVectorizer(max_features=5000, max_df=0.85, min_df=5)
tfidf_matrix = vectorizer.fit_transform(data['desc'])

In [179]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [180]:
import numpy as np

np.save("../data/interim/cosine_sim.npy", cosine_sim)

In [183]:
def recommend(product_index, data = data, cosine_sim=cosine_sim):
    
    similar_products = list(enumerate(cosine_sim[product_index]))
    similar_products = sorted(similar_products, key=lambda x: x[1], reverse=True)[1:6]  # Top 5 similares
    return data.iloc[[i[0] for i in similar_products]]

recommend(10)

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,desc,index
669,realme Buds Air 2 True Wireless in Ear Earbuds...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/71WdDANbqD...,https://www.amazon.in/realme-Wireless-Cancella...,3.7,3629,"₹3,299","₹4,999","tv, audio & cameras electronics realme buds ai...",669
167,truke Buds PRO Hybrid Active Noise Cancelling ...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/51Bb4BJB9h...,https://www.amazon.in/truke-Cancelling-Bluetoo...,3.4,345,"₹1,499","₹4,499","tv, audio & cameras electronics truke buds pro...",167
1818,truke Buds PRO Hybrid Active Noise Cancelling ...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/51Bb4BJB9h...,https://www.amazon.in/truke-Cancelling-Bluetoo...,3.4,345,"₹1,499","₹4,499","tv, audio & cameras electronics truke buds pro...",1818
1862,truke Buds PRO Hybrid Active Noise Cancelling ...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/51VJxkEBRH...,https://www.amazon.in/truke-Cancelling-Bluetoo...,3.5,1361,"₹1,499","₹4,499","tv, audio & cameras electronics truke buds pro...",1862
651,Boult Audio Curve ANC Wireless in Ear Wireless...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/Boult-Audio-Wireless-Ear...,3.8,104356,"₹1,499","₹5,499","tv, audio & cameras electronics boult audio cu...",651


In [136]:
data["ratings"] = pd.to_numeric(data["ratings"], errors='coerce')
data["ratings"].mean()

np.float64(4.081876476544043)

In [None]:
def weighted_rating(df, min_ratings=50):
    df["ratings"] = pd.to_numeric(data["ratings"], errors='coerce')
    df["no_of_ratings"] = pd.to_numeric(data["no_of_ratings"], errors='coerce')
    C = df['ratings'].mean()  # Promedio de todas las calificaciones
    m = df['no_of_ratings'].quantile(0.75)  # Cantidad mínima de calificaciones (percentil 75)
    
    def bayesian_rating(row):
        v = row['no_of_ratings']
        R = row['ratings']
        return (v / (v + m) * R) + (m / (m + v) * C)

    df = df.copy()
    df['score'] = df.apply(bayesian_rating, axis=1)
    return df.sort_values('score', ascending=False)

popular_products = weighted_rating(data)
popular_products.head()

Unnamed: 0.1,Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,desc,score
2600,3081,ESR Boost Kickstand Case for Samsung Galaxy S2...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/W/IMAGERENDE...,https://www.amazon.in/ESR-S23-Ultra-Military-G...,4.7,685.0,"₹1,329","₹2,099","tv, audio & cameras all electronics esr boost ...",4.436421
1510,4371,"Anker Usb C, 20W Pd Fast Powerport Iii Charger...","tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/213SOaOt0Q...,https://www.amazon.in/Charger-Anker-PowerPort-...,4.6,931.0,"₹1,299","₹1,699","tv, audio & cameras all electronics anker usb ...",4.4168
304,7270,Spigen Liquid Air Back Cover Case Compatible F...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/71TBlK2o0p...,https://www.amazon.in/Spigen-Liquid-Compatible...,4.6,804.0,"₹1,099","₹1,499","tv, audio & cameras all electronics spigen liq...",4.399083
2484,9192,Pentel Mechanical Pencil Graphgear500 - 0.5mm ...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/51CGmhM0Eo...,https://www.amazon.in/Pentel-Mechanical-Pencil...,4.6,777.0,,₹599,"tv, audio & cameras all electronics pentel mec...",4.394865
2611,940,SWAPKART 5-in-1 Cleaning Soft Brush Keyboard C...,"tv, audio & cameras",All Electronics,https://m.media-amazon.com/images/I/51L5QxMDop...,https://www.amazon.in/SWAPKART-Cleaning-Multi-...,4.6,702.0,₹149,₹499,"tv, audio & cameras all electronics swapkart 5...",4.382164
