In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import warnings
from gensim import corpora, models, similarities
import jieba
import re

In [3]:
products = pd.read_csv("Product_clean.csv")
products.head(2)

Unnamed: 0,item_id,name,description,rating,price,list_price,brand,group,url,image,product_infomation
0,48102821,Tai nghe Bluetooth Inpods 12 - Cảm biến vân ta...,THÔNG TIN CHI TIẾT\nDung lượng pin 300\nThời g...,4.0,77000,300000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-inpods-12-cam-bien-...,https://salt.tikicdn.com/cache/280x280/ts/prod...,inpods cảm_biến vân chống nước_màu_sắc đa_dạng...
1,52333193,Tai nghe bluetooth không dây F9 True wireless ...,THÔNG TIN CHI TIẾT\nDung lượng pin 2000mah\nTh...,4.5,132000,750000,OEM,Thiết Bị Số - Phụ Kiện Số/Thiết Bị Âm Thanh và...,https://tai-nghe-bluetooth-khong-day-f9-true-w...,https://salt.tikicdn.com/cache/280x280/ts/prod...,không dây true wireless dock sạc báo képthông ...


In [4]:
products.columns

Index(['item_id', 'name', 'description', 'rating', 'price', 'list_price',
       'brand', 'group', 'url', 'image', 'product_infomation'],
      dtype='object')

In [5]:
# Tokenize (split) the sentiment into words
product_information_token = [[text for text in x.split()] for x in products.product_infomation]

In [6]:
# Obtain the number of features based on dictionary: use corpora.Dictionary
dictionary=corpora.Dictionary(product_information_token)

In [7]:
# List of features in dictionary
dictionary.token2id

{'.': 0,
 'airpod': 1,
 'bấm': 2,
 'chuẩn': 3,
 'chạm': 4,
 'chọnthông': 5,
 'chống': 6,
 'chờ': 7,
 'cuộc_gọi': 8,
 'cải_thiện': 9,
 'cảm_biến': 10,
 'cảm_ứng': 11,
 'cắm': 12,
 'cồng_kềnh': 13,
 'dock': 14,
 'dễ_dàng': 15,
 'giá': 16,
 'hiện_hành': 17,
 'huawei': 18,
 'hàng': 19,
 'hãng': 20,
 'inpod': 21,
 'inpods': 22,
 'kết_nối': 23,
 'lenovo': 24,
 'liên_tục': 25,
 'luật': 26,
 'lược': 27,
 'lựa': 28,
 'màu_sắc': 29,
 'mô_tả': 30,
 'nhiên': 31,
 'nhét': 32,
 'nhạc': 33,
 'nhạy_cảm_biến': 34,
 'nâng_cấp': 35,
 'nút': 36,
 'nước_màu_sắc': 37,
 'oem': 38,
 'oemthiết': 39,
 'oppo': 40,
 'phiên_bản': 41,
 'phát': 42,
 'phí': 43,
 'phương_thức': 44,
 'phụ_kiện': 45,
 'phụ_phí': 46,
 'quốc': 47,
 'sku': 48,
 'sạc': 49,
 'sạc_chất': 50,
 'sản_phẩm': 51,
 'tablet': 52,
 'thiết_bị': 53,
 'thiết_kế': 54,
 'thuận_tiện': 55,
 'thuế': 56,
 'thân': 57,
 'thương_hiệu': 58,
 'thời': 59,
 'thời_lượng': 60,
 'tiết': 61,
 'tiện_lợi': 62,
 'trọng_lượng': 63,
 'tuỳ': 64,
 'tác': 65,
 'tương_thích': 66

In [8]:
# Numbers of features (word) in dictionary
feature_cnt=len(dictionary.token2id)
feature_cnt

15102

In [9]:
# Obtain corpus based on dictionary (dense matrix: ma tran thua)
corpus=[dictionary.doc2bow(text) for text in product_information_token]
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 2),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 2),
 (15, 2),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 2),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 3),
 (24, 1),
 (25, 3),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 4),
 (34, 1),
 (35, 1),
 (36, 2),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 2),
 (44, 1),
 (45, 2),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 4),
 (50, 1),
 (51, 3),
 (52, 1),
 (53, 3),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 2),
 (59, 9),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 2),
 (70, 2),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 2),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 2)]

In [10]:
# Use TF-IDF Model to process corpus, obtaining index
tfidf = models.TfidfModel(corpus)
tfidf

<gensim.models.tfidfmodel.TfidfModel at 0x2ad80c90370>

In [11]:
# Tính toán sự tương tự trong ma trận thưa thớt
index = similarities.SparseMatrixSimilarity(tfidf[corpus],num_features = feature_cnt)

In [12]:
# When user choose one product: 1059892
product_ID = 1059892
product_selection = products[products.item_id == product_ID]
product_selection

Unnamed: 0,item_id,name,description,rating,price,list_price,brand,group,url,image,product_infomation
78,1059892,Dây Cáp Sạc Lightning Cho iPhone Anker PowerLi...,THÔNG TIN CHI TIẾT\nThương hiệu Anker\nKích th...,4.7,314000,330000,Anker,Thiết Bị Số - Phụ Kiện Số/Phụ Kiện Điện Thoại ...,https://day-cap-sac-lightning-cho-iphone-anker...,https://salt.tikicdn.com/cache/280x280/media/c...,dây_cáp sạc lightning anker powerline hàng hãn...


In [13]:
# sản phẩm đang xem
name_description_pre = product_selection['product_infomation'].to_string(index=False)
name_description_pre

'dây_cáp sạc lightning anker powerline hàng hãng...'

In [14]:
view_product = name_description_pre.lower().split()

In [15]:
# Suggest other products for customers
def recommendation (view_product, dictionary, tfidf, index):
    # Convert search words into Sparse Vectors
    view_product = view_product.lower().split()
    kw_vector = dictionary.doc2bow(view_product)
    print("View product 's vector:")
    print(kw_vector)
    # Similarity calculation
    sim = index[tfidf[kw_vector]]
    
    # print result
    list_id = []
    list_score = []
    for i in range(len(sim)):
        list_id.append(i)
        list_score.append(sim[i])
    
    df_result = pd.DataFrame({'id': list_id,
                              'score': list_score})
    
    # 10 highest scores
    five_highest_score = df_result.sort_values(by='score', ascending=False).head(11)
    print("Five highest scores:")
    print(five_highest_score)
    print("Ids to list:")
    idToList = list(five_highest_score['id'])
    print(idToList)
    
    products_find = products[products.index.isin(idToList)]
    results = products_find[['item_id','name']]
    results = pd.concat([results, five_highest_score], axis=1).sort_values(by='score', ascending=False)
    return results

In [16]:
results = recommendation(name_description_pre, dictionary, tfidf, index)

View product 's vector:
[(19, 1), (49, 1), (461, 1), (662, 1), (887, 1), (1616, 1)]
Five highest scores:
        id     score
120    120  0.854944
78      78  0.794256
643    643  0.714968
953    953  0.646477
154    154  0.641248
735    735  0.592818
659    659  0.584305
482    482  0.540879
2516  2516  0.505354
161    161  0.399520
943    943  0.393195
Ids to list:
[120, 78, 643, 953, 154, 735, 659, 482, 2516, 161, 943]


In [17]:
# Recommender 5 similarities products for the selected product
results = results[results.item_id!=product_ID]
results

Unnamed: 0,item_id,name,id,score
120,1060082,Dây Cáp Sạc Lightning Cho iPhone Anker PowerLi...,120,0.854944
643,49661643,Dây Cáp Sạc USB Type-C Anker PowerLine Select+...,643,0.714968
953,3220143,Dây Cáp Sạc Lightning Cho iPhone Anker PowerLi...,953,0.646477
154,60039593,Dây Cáp Sạc Anker PowerLine III USB-C to USB-C...,154,0.641248
735,234886,Dây Cáp Sạc Lightning Cho iPhone Anker PowerLi...,735,0.592818
659,54017205,Dây Cáp Sạc Lightning Cho iPhone Anker PowerLi...,659,0.584305
482,15682170,Dây Cáp Sạc USB-C to Lightning Chuẩn MFi Cho i...,482,0.540879
2516,512224,Bộ Chuyển Đổi Ethernet Powerline Nano AV600 TP...,2516,0.505354
161,249953,Dây Cáp Sạc Micro USB Anker PowerLine 0.9m - A...,161,0.39952
943,35460128,Cáp sạc cho iPhone 11 pro Max USB Type-C san...,943,0.393195


In [18]:
# Save Content_Based_Filtering_Gensim_Dictionary

In [19]:
dictionary.save("Content_Based_Filtering_Gensim_Dictionary.sav")

In [20]:
dictionary.load("Content_Based_Filtering_Gensim_Dictionary.sav")

<gensim.corpora.dictionary.Dictionary at 0x2adec4864c0>

In [21]:
#Solution 1 va Solution 2 co 2 danh sach goi y khac nhau