In [18]:
import pandas as pd
import numpy as np

try:
  df = pd.read_csv('amazon.csv')
except:
  df = pd.read_csv('amazon.csv', encoding='ISO-8859-1')
df_clean = df[['product_id', 'product_name', 'category', 'actual_price', 'rating', 'rating_count', 'about_product', 'review_content', 'img_link']].copy()
df_clean.columns = ['amazon_id', 'name', 'category', 'price', 'rating', 'rating_count', 'description', 'review', 'image']

df_clean['price'] = df_clean['price'].astype(str).str.replace("â‚¹","", regex=False).str.replace(",","", regex=False)
df_clean['price'] = pd.to_numeric(df_clean['price'], errors='coerce').fillna(0)*290

df_clean['rating_count'] = df_clean['rating_count'].astype(str).str.replace(",","", regex=False)
df_clean['rating_count'] = pd.to_numeric(df_clean['rating_count'], errors='coerce').fillna(0)

df_clean['category'] = df_clean['category'].astype(str).apply(lambda x: x.split("|")[-1])
def fix_link(url):
  url = str(url)
  if '/images/I/' in url :
    url = "https://m.media-amazon.com/images/I/" + url.split("/")[-1]
  return url

df_clean['image'] = df_clean['image'].apply(fix_link)

df_clean['id'] = range(1, len(df_clean) + 1)
df_clean = df_clean[['id', 'name', 'price', 'category', 'description', 'review', 'image', 'rating_count', 'amazon_id']]
df_clean['tags'] = (df_clean['name'] + " " + df_clean['category'] + " " + df_clean['description'] + df_clean['review']).astype(str).str.lower()
df_clean.to_csv('final_dataset.csv', index = False)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn

In [20]:
app = FastAPI()
try:
  df = pd.read_csv('final_dataset.csv')
except:
  df = pd.read_csv('final_dataset.csv', encoding='ISO-8859-1')
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['tags'].fillna(''))
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [21]:
viewed_ids = [1,2,3]
valid_indices = df.index[df['id'].isin(viewed_ids)].tolist()
viewed_vectors = tfidf_matrix[valid_indices]
user_profile = np.asarray(viewed_vectors.mean(axis=0))
score = cosine_similarity(tfidf_matrix, user_profile).flatten()
top_indecies = np.argsort(score)[-15:][::-1]
result_ids = df['id'].iloc[top_indecies].tolist()

In [22]:
final_recs = [pid for pid in result_ids if pid not in viewed_ids][:10]
final_recs

[615, 624, 370, 221, 43, 90, 380, 81, 378, 623]

In [23]:
def clean_str(text):

    return str(text).replace("'", "''").replace("\\", "")

In [24]:
with open('insert_products.sql', 'w', encoding='utf-8') as f:
    f.write("SET FOREIGN_KEY_CHECKS = 0;\n")
    f.write("TRUNCATE TABLE product;\n")
    f.write("SET FOREIGN_KEY_CHECKS = 1;\n")

    for index, row in df_clean.iterrows():
        sql = (f"INSERT INTO product (id, product_name, price, category, image, description, rating_count, rating) "
               f"VALUES ({row['id']}, '{clean_str(row['name'])}', {row['price']}, "
               f"'{clean_str(row['category'])}', '{clean_str(row['image'])}', "
               f"'{clean_str(row['description'])}', 0, 0);\n")
        f.write(sql)