In [1]:
# import nltk
# nltk.download("stopwords")
# nltk.download("punkt")

In [44]:
import os
from string import punctuation

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sqlalchemy import create_engine

In [45]:
# Read the connection URL from the MP_RECOMSYS_DB_URL environment variable so that
# credentials do not have to be committed with the notebook. The original
# local-development URL is kept as the fallback, so behaviour is unchanged when the
# variable is not set.
DB_URL = os.environ.get("MP_RECOMSYS_DB_URL", "mysql+pymysql://kopi:kopi@localhost/mp_recomsys")
engine = create_engine(DB_URL)
dbCon = engine.connect()  # kept open: the next cell reads the product table through it

In [46]:
# Load every column of the `product` table into a DataFrame through the open connection.
df = pd.read_sql("SELECT * FROM product", dbCon)

In [47]:
# Strip ASCII punctuation from every product description before vectorizing.
# Missing descriptions (NULL in the database) are replaced by empty strings first,
# so the cleaning step does not fail on non-string values.
_strip_punct = str.maketrans("", "", punctuation)
text = [s.translate(_strip_punct) for s in df.description.fillna("").astype(str)]

# Stop words (Indonesian)


In [48]:
# Indonesian stop-word list from NLTK's `stopwords` corpus. The corpus must already be
# downloaded (see the commented-out nltk.download calls in the first cell).
sw_indo = stopwords.words("indonesian")

# Step 1: Encode description

In [49]:
# TF-IDF encoder over 1- to 3-word n-grams, tokenized with NLTK's word_tokenize and
# configured with the Indonesian stop-word list.
tfidf = TfidfVectorizer(ngram_range=(1, 3), tokenizer=word_tokenize, stop_words=sw_indo)

In [50]:
# Learn the vocabulary from the cleaned descriptions and encode them in one pass;
# the result has one row per entry of `text`.
tfidf_matrix = tfidf.fit_transform(text)
# Uncomment to inspect the learned vocabulary:
# tfidf.get_feature_names()

# Step 2: Document Search

In [136]:
# Cosine similarity between the first product (row 0) and every product in the
# matrix, row 0 itself included. The result is a 2-D array with a single row.
sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix)
sim

array([[1.        , 0.07565987, 0.01401481, ..., 0.00601971, 0.00367563,
        0.00259445]])

**Get the row indices of the 10 most similar products (most similar first)**

In [137]:
# `sim` was computed for row 0, so that is the row to leave out of its own results.
query_idx = 0
top_n = 10

# Rank every row from most to least similar, then drop the query row explicitly.
# Slicing off "the last element" is not reliable: an exact duplicate of the query
# also scores 1.0, and the order of tied elements in argsort is not guaranteed.
ranked = sim.argsort()[0, ::-1]
rec_idx = ranked[ranked != query_idx][:top_n]
rec_idx

array([1507, 1646,  609, 1509, 1517, 1118, 1136,  225, 1137,  658])

**Look up the similarity score of each recommended index**

In [138]:
# Similarity score of each recommended row, in the same order as `rec_idx`.
score = sim[0, rec_idx]
score

array([1.        , 0.74319571, 0.74319571, 0.73371261, 0.7300676 ,
       0.38625081, 0.37826817, 0.15549031, 0.14701977, 0.14551412])

**Convert the similarity scores to percentage strings**

In [139]:
# Format each similarity score as a whole-number percentage string (0.743 -> '74%').
# `coba` keeps the same order as `score`.
coba = ["{:.0%}".format(value) for value in score]
coba

['100%', '74%', '74%', '73%', '73%', '39%', '38%', '16%', '15%', '15%']

# Step 3: Recommend

In [145]:
# `rec_idx` holds row positions (it comes from argsort over the similarity row), so
# select by position rather than by index label. Take an explicit copy because a
# `score` column is added to this frame in the next cell.
df1 = df.iloc[rec_idx].copy()

In [174]:
# Attach the formatted percentages as a new `score` column. `coba` and the rows of
# `df1` are both ordered by `rec_idx`, so they line up one-to-one.
df1['score'] = coba
df1.head(3)

Unnamed: 0,id_product,id_store,url,image,name,price,review,rate,sold,view,item_condition,keywords,description,created_at,score
1507,11107,12107,https://shopee.co.id/ORIGINAL-Charger-Adaptor-...,https://cf.shopee.co.id/file/4d7d40050f145b1cc...,ORIGINAL Charger Adaptor Lenovo T410 T420 T430...,189900,2,5.0,3,0,baru,thinkpad x230,KEUNGGULAN PRODUK KAMI :\nAdaptor Charger Lapt...,2021-01-26 14:08:12,100%
1646,21141,22141,https://www.bukalapak.com/p/komputer/aksesoris...,https://s1.bukalapak.com/img/17842044911/large...,Adaptor Charger Lenovo IBM Thinkpad SL410 L410...,119000,8,4.9,19,0,baru,thinkpad x230,Deskripsi\nSUDAH KOMPLIT DENGAN KABEL POWER\nO...,2021-01-27 07:03:36,74%
609,2119,2219,https://www.bukalapak.com/p/komputer/aksesoris...,https://s1.bukalapak.com/img/17842044911/large...,Adaptor Charger Lenovo IBM Thinkpad SL410 L410...,119000,8,4.9,19,0,baru,thinkpad x230,Deskripsi\nSUDAH KOMPLIT DENGAN KABEL POWER\nO...,2021-01-27 06:11:18,74%


# ML Engineering

In [175]:
from sklearn.metrics.pairwise import cosine_similarity

class RecommenderSystem:
    """Content-based recommender using TF-IDF features and cosine similarity.

    The constructor loads a table from the database, `fit` encodes one text
    column of that table, and `recommend` returns the rows whose text is most
    similar to a given row.

    Attributes:
        engine: SQLAlchemy engine created from the connection URL.
        dbCon: Connection opened on `engine`; it is not closed by this class.
        df: DataFrame holding the result of the `data` query.
        content_col: Name of the text column used to compute similarity.
        encoder: Fitted TfidfVectorizer, or None before `fit` is called.
        bank: TF-IDF matrix of `df[content_col]`, or None before `fit` is called.
    """

    # Connection URL used when neither `db_url` nor the environment variable is set.
    DEFAULT_DB_URL = "mysql+pymysql://kopi:kopi@localhost/mp_recomsys"
    # Environment variable that can override the connection URL.
    DB_URL_ENV_VAR = "MP_RECOMSYS_DB_URL"

    def __init__(self, data, content_col, db_url=None):
        """Connect to the database and load the rows to recommend from.

        Args:
            data: SQL query (or table name) passed to `pd.read_sql`.
            content_col: Name of the text column to compute similarity on.
            db_url: Optional SQLAlchemy connection URL. When omitted, the
                MP_RECOMSYS_DB_URL environment variable is used, falling back
                to the local-development default.
        """
        if db_url is None:
            db_url = os.environ.get(self.DB_URL_ENV_VAR, self.DEFAULT_DB_URL)

        self.engine = create_engine(db_url)
        self.dbCon = self.engine.connect()
        self.df = pd.read_sql(data, self.dbCon)

        self.content_col = content_col
        self.encoder = None
        self.bank = None

    def fit(self):
        """Fit the TF-IDF encoder on the content column and cache the encoded rows.

        Uses 1- and 2-word n-grams, NLTK's `word_tokenize`, and the
        module-level `sw_indo` stop-word list.
        """
        self.encoder = TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize, stop_words=sw_indo)
        self.bank = self.encoder.fit_transform(self.df[self.content_col])

    def recommend(self, idx, top_n=5):
        """Return the `top_n` rows most similar to the row labelled `idx`.

        Args:
            idx: Index label of the query row in `self.df`.
            top_n: Maximum number of recommendations to return (default 5).

        Returns:
            A new DataFrame with the columns id_product, url, image, name,
            price and score, ordered from most to least similar. `score` is
            the cosine similarity formatted as a whole-number percentage
            string. The query row itself is never included.

        Raises:
            RuntimeError: If `fit` has not been called yet.
            KeyError: If `idx` is not a label of `self.df`.
        """
        if self.encoder is None or self.bank is None:
            raise RuntimeError("RecommenderSystem.fit() must be called before recommend()")

        desc = self.df.loc[idx, self.content_col]
        desc = self.encoder.transform([desc])
        sim = cosine_similarity(desc, self.bank)

        # Rows of `self.bank` are positional, so translate the label into a position
        # before using it to filter the ranking.
        query_pos = self.df.index.get_loc(idx)

        # Rank all rows from most to least similar and drop the query row explicitly.
        # Relying on it being the single top hit breaks when an exact duplicate
        # ties with it at a similarity of 1.0.
        ranked = sim.argsort()[0, ::-1]
        rec_idx = ranked[ranked != query_pos][:top_n]

        score = ["{:.0%}".format(p) for p in sim[0, rec_idx]]

        # Select from this instance's own frame, by position, and only the columns
        # that exist; `score` is added afterwards rather than requested from the
        # source table.
        new_df = self.df.iloc[rec_idx][['id_product', 'url', 'image', 'name', 'price']].copy()
        new_df['score'] = score

        return new_df
    

In [176]:
# Build the recommender over the full product table and fit the TF-IDF encoder on
# the `description` column.
recsys = RecommenderSystem(data="SELECT * FROM product", content_col="description")
recsys.fit()

In [177]:
# Recommend products similar to the product at index 0.
recsys.recommend(0)

Unnamed: 0,id_product,url,image,name,price,score
1507,11107,https://shopee.co.id/ORIGINAL-Charger-Adaptor-...,https://cf.shopee.co.id/file/4d7d40050f145b1cc...,ORIGINAL Charger Adaptor Lenovo T410 T420 T430...,189900,100%
1517,11117,https://shopee.co.id/Adaptor-Charger-Lenovo-Th...,https://cf.shopee.co.id/file/9c37506ce0b597e04...,Adaptor Charger Lenovo Thinkpad T430 T420 T410...,120000,95%
1509,11109,https://shopee.co.id/Adaptor-Charger-Lenovo-Th...,https://cf.shopee.co.id/file/33cba35b8eff70ae4...,Adaptor Charger Lenovo Thinkpad X200 X220 X230...,109080,95%
1136,3196,https://www.tokopedia.com/firstaccsupplier/ada...,https://ecs7.tokopedia.net/img/cache/500-squar...,Adaptor Charger Lenovo Thinkpad X200 X220 X230...,110000,94%
1468,3568,https://www.tokopedia.com/jasminsport/gitar-ak...,https://ecs7.tokopedia.net/img/cache/500-squar...,Gitar Akustik Elektrik Yamaha APX 500ii paling...,520000,52%
