# Import Libraries

In [74]:
import pandas as pd
import json
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

from text_preprocessing import preprocess_text

# API Usage

In [75]:
# Base URL of your Flask server
BASE_URL = "http://127.0.0.1:5000"

# Example 1: Hello world
res = requests.get(f"{BASE_URL}/")
# print(res.text)

# Example 2: Get all products
res = requests.get(f"{BASE_URL}/getAllProduct")
# print(res.json())

# Example 3: Get all categories
res = requests.get(f"{BASE_URL}/getAllCategory")
# print(res.json())

# Example 4: Get all reviews
res = requests.get(f"{BASE_URL}/getAllReview")
# print(res.json())

# Example 5: Get products by category
category_id = "1"  # Replace with actual category ID
res = requests.get(f"{BASE_URL}/getAllProductByCategory", params={"category": category_id})
# print(res.json())

# Example 6: Get reviews by product
product_id = "4"  # Replace with actual product ID
res = requests.get(f"{BASE_URL}/getAllReviewByProduct", params={"product": product_id})
# print(res.json())

# Example 7: Get reviews by category
res = requests.get(f"{BASE_URL}/getAllReviewByCategory", params={"category": category_id})
# print(res.json())


## In this example, categoryID 1 is utilized

In [76]:
category_id = "1"  # Replace with actual category ID
products = requests.get(f"{BASE_URL}/getAllProductByCategory", params={"category": category_id})
print(len(products.json()['data']))

73


In [77]:
products_df = pd.DataFrame(products.json()['data'])

In [78]:
products_df.head()

Unnamed: 0,categoryId,currentPrice,discount,id,imgUrl,name,originalPrice,stock
0,1,46999.0,59.13,1,https://images.tokopedia.net/img/cache/500-squ...,(BELI 2pcs DAPAT HADIAH) BITZEN Ikat Pinggang ...,115000.0,376
1,1,212000.0,0.0,16,https://images.tokopedia.net/img/cache/500-squ...,Anting Emas Asli Model Mrican Polos (Bisa buat...,0.0,241
2,1,130000.0,77.59,33,https://images.tokopedia.net/img/cache/500-squ...,BIDEN Jam Tangan Wanita Anggun Otomatis Date B...,580000.0,207
3,1,34999.0,53.33,34,https://images.tokopedia.net/img/cache/500-squ...,BITZEN Ikat Pinggang Pria Sabuk Kulit Gesper K...,75000.0,1713
4,1,36200.0,54.18,35,https://images.tokopedia.net/img/cache/500-squ...,BITZEN Ikat Pinggang Wanita Kulit Imitasi Slim...,79000.0,1079


Set the 'originalPrice' to be the same as 'currentPrice' if there is no discount.

In [79]:
products_df['originalPrice'][products_df['discount'] == 0.00] = products_df['currentPrice']

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  products_df['originalPrice'][products_df['discount'] == 0.00] = products_df['currentPrice']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

In [80]:
products_df.head()

Unnamed: 0,categoryId,currentPrice,discount,id,imgUrl,name,originalPrice,stock
0,1,46999.0,59.13,1,https://images.tokopedia.net/img/cache/500-squ...,(BELI 2pcs DAPAT HADIAH) BITZEN Ikat Pinggang ...,115000.0,376
1,1,212000.0,0.0,16,https://images.tokopedia.net/img/cache/500-squ...,Anting Emas Asli Model Mrican Polos (Bisa buat...,212000.0,241
2,1,130000.0,77.59,33,https://images.tokopedia.net/img/cache/500-squ...,BIDEN Jam Tangan Wanita Anggun Otomatis Date B...,580000.0,207
3,1,34999.0,53.33,34,https://images.tokopedia.net/img/cache/500-squ...,BITZEN Ikat Pinggang Pria Sabuk Kulit Gesper K...,75000.0,1713
4,1,36200.0,54.18,35,https://images.tokopedia.net/img/cache/500-squ...,BITZEN Ikat Pinggang Wanita Kulit Imitasi Slim...,79000.0,1079


In [81]:
products_df.groupby('id')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f90fdf176e0>

In [82]:
print(products_df)

    categoryId  currentPrice  discount   id  \
0            1       46999.0     59.13    1   
1            1      212000.0      0.00   16   
2            1      130000.0     77.59   33   
3            1       34999.0     53.33   34   
4            1       36200.0     54.18   35   
..         ...           ...       ...  ...   
68           1        6500.0     67.50  374   
69           1       28400.0     60.28  384   
70           1        2900.0      0.00  401   
71           1      369000.0     38.40  405   
72           1      105000.0      0.00  410   

                                               imgUrl  \
0   https://images.tokopedia.net/img/cache/500-squ...   
1   https://images.tokopedia.net/img/cache/500-squ...   
2   https://images.tokopedia.net/img/cache/500-squ...   
3   https://images.tokopedia.net/img/cache/500-squ...   
4   https://images.tokopedia.net/img/cache/500-squ...   
..                                                ...   
68  https://images.tokopedia.net/img

# Pre-process

In [83]:
cleaned_df = products_df.copy()

In [84]:
cleaned_df['name'] = cleaned_df['name'].apply(preprocess_text)

# Feature Selection

In [85]:
new_df = cleaned_df.drop(columns=['categoryId', 'id'])

In [86]:
new_df['name'] = new_df['name'].fillna('').str.lower()

In [87]:
tfidf = TfidfVectorizer()

name_features = tfidf.fit_transform(new_df['name'])

In [88]:
# Combine with scaled numerical features
numerical_features = new_df[['currentPrice', 'originalPrice', 'discount', 'stock']].fillna(0)
scaler = MinMaxScaler()
scaled_numerical = scaler.fit_transform(numerical_features)

In [89]:
import scipy.sparse
from scipy.sparse import hstack

combined_features = hstack([name_features, scaled_numerical])

In [90]:
cos_sim = cosine_similarity(combined_features)

In [91]:
cos_sim.shape

(73, 73)

# Inference

### Test 1

In [92]:
product_id = 34
product_index = cleaned_df[cleaned_df['id'] == product_id].index[0]

print(f"Product: {cleaned_df[cleaned_df['id'] == product_id]["name"].values[0]}")

Product: bitzen ikat pinggang pria sabuk kulit gesper klasik belt hitam


In [93]:
# Get similarity scores for that product
similarities = list(enumerate(cos_sim[product_index]))

# Sort by similarity score (excluding itself)
similar_products = sorted(similarities, key=lambda x: x[1], reverse=True)

# Show top 5 similar products (excluding the original)
top_similar = similar_products[1:6]  # skip index 0 (itself)

# Print results
for idx, score in top_similar:
    print(f"Product: {cleaned_df.iloc[idx]['name']}, Similarity: {score:.4f}")

Product: guten inc belt ikat pinggang gesper sabuk kulit pria black, Similarity: 0.7396
Product: bitzen ikat pinggang pria model rel sabuk gesper kulit sintetis bbelt, Similarity: 0.6929
Product: bitzen ikat pinggang wanita kulit imitasi slim casual warna warni sesuai gesper belt outfit perempuan bbeltputih, Similarity: 0.5678
Product: beli pcs hadiah bitzen ikat pinggang pria premium bahan aluminium zinc alloy nylon canvas sabuk hitam, Similarity: 0.5433
Product: picalela ikat pinggang wanita kulit asli women fashion belt hitam, Similarity: 0.4844


### Test 2

In [94]:
product_id = 410
product_index = cleaned_df[cleaned_df['id'] == product_id].index[0]

print(f"Product: {cleaned_df[cleaned_df['id'] == product_id]["name"].values[0]}")

Product: tali ikat pinggang kulit asli lebar jual tali


In [95]:
# Get similarity scores for that product
similarities = list(enumerate(cos_sim[product_index]))

# Sort by similarity score (excluding itself)
similar_products = sorted(similarities, key=lambda x: x[1], reverse=True)

# Show top 5 similar products (excluding the original)
top_similar = similar_products[1:6]  # skip index 0 (itself)

# Print results
for idx, score in top_similar:
    print(f"Product: {cleaned_df.iloc[idx]['name']}, Similarity: {score:.4f}")

Product: tali ikat pinggang model rel kulit sapi asli kepala cokelat, Similarity: 0.4106
Product: picalela ikat pinggang wanita kulit asli women fashion belt hitam, Similarity: 0.3072
Product: ikat pinggang kulit asli impor bruno cavalli sims gesper pria cokelat, Similarity: 0.2610
Product: tali ikat pinggang rhodey canvas army military tactical abuabu, Similarity: 0.2193
Product: gelang tali kam anak child anti lost strap m m anti hilang tali tali kam gandeng anak bayi child safety live biru bunda biru m, Similarity: 0.2012


### Test 3

In [96]:
product_id = 374
product_index = cleaned_df[cleaned_df['id'] == product_id].index[0]

print(f"Product: {cleaned_df[cleaned_df['id'] == product_id]["name"].values[0]}")

Product: wds kacamata bingkai wanita retro jepang korea ins eropa amerika gradient color optical glass f kc hitam


In [97]:
# Get similarity scores for that product
similarities = list(enumerate(cos_sim[product_index]))

# Sort by similarity score (excluding itself)
similar_products = sorted(similarities, key=lambda x: x[1], reverse=True)

# Show top 5 similar products (excluding the original)
top_similar = similar_products[1:6]  # skip index 0 (itself)

# Print results
for idx, score in top_similar:
    print(f"Product: {cleaned_df.iloc[idx]['name']}, Similarity: {score:.4f}")

Product: kacamata korea kotak anti radiasi blueray photocromic in metal full black phtc normal, Similarity: 0.5070
Product: gray pink kacamata sunglasses anti uv tr metal fashion wanita black blueray, Similarity: 0.4846
Product: reckblud topi bisbol polos baseball simpel hitam unisex hitam, Similarity: 0.4842
Product: kacamata hitam sunglasses colorein tr polarized retro style s black sunglases minus, Similarity: 0.4670
Product: tali jam tangan wanita leather strap asli ukur mm black mm, Similarity: 0.4625


# Search Engine Idea

In [98]:
# Convert query to lowercase to match preprocessed names
query = "ikat pinggang".lower()

# Find the index of the product that contains the query string
matches = new_df[new_df['name'].str.contains(query, case=False, na=False)]
print(f"Query: {query}")
print(matches[:5])

Query: ikat pinggang
    currentPrice  discount                                             imgUrl  \
0        46999.0     59.13  https://images.tokopedia.net/img/cache/500-squ...   
3        34999.0     53.33  https://images.tokopedia.net/img/cache/500-squ...   
4        36200.0     54.18  https://images.tokopedia.net/img/cache/500-squ...   
7        25999.0     60.00  https://images.tokopedia.net/img/cache/500-squ...   
22       46250.0     78.98  https://images.tokopedia.net/img/cache/500-squ...   

                                                 name  originalPrice  stock  
0   beli pcs hadiah bitzen ikat pinggang pria prem...       115000.0    376  
3   bitzen ikat pinggang pria sabuk kulit gesper k...        75000.0   1713  
4   bitzen ikat pinggang wanita kulit imitasi slim...        79000.0   1079  
7   bitzen ikat pinggang pria model rel sabuk gesp...        65000.0    229  
22  guten inc belt ikat pinggang gesper sabuk kuli...       220000.0      0  


In [99]:
# Convert query to lowercase to match preprocessed names
query = "anting".lower()

# Find the index of the product that contains the query string
matches = new_df[new_df['name'].str.contains(query, case=False, na=False)]
print(matches[:3])

    currentPrice  discount                                             imgUrl  \
1       212000.0       0.0  https://images.tokopedia.net/img/cache/500-squ...   
34       35000.0       0.0  https://images.tokopedia.net/img/cache/500-squ...   

                                                 name  originalPrice  stock  
1   anting emas asli model mrican polos baby newbo...       212000.0    241  
34  kalung anak titanium awet anti gatal anti irit...        35000.0     45  


In [100]:
# Convert query to lowercase to match preprocessed names
query = "jam tangan wanita".lower()

# Find the index of the product that contains the query string
matches = new_df[new_df['name'].str.contains(query, case=False, na=False)]
print(matches[:3])

    currentPrice  discount                                             imgUrl  \
2       130000.0     77.59  https://images.tokopedia.net/img/cache/500-squ...   
30       70442.0     60.87  https://images.tokopedia.net/img/cache/500-squ...   
31       86900.0     51.72  https://images.tokopedia.net/img/cache/500-squ...   

                                                 name  originalPrice  stock  
2   biden jam tangan wanita anggun otomatis date b...       580000.0    207  
30  jam tangan wanita digital tali stainless skmei...       180000.0    174  
31  jam tangan wanita digital tali stainless skmei...       180000.0    145  
