In [3]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors

### Step1：讀取 data 資料檔

In [5]:
data = pd.read_csv("data.csv")

In [6]:
data.head()

Unnamed: 0,user_id,product_id,reordered,product_name
0,1,196,10,Soda
1,15,196,4,Soda
2,19,196,2,Soda
3,21,196,0,Soda
4,31,196,1,Soda


### Step2：過濾商品，挑出購買次數超過200人的商品

In [7]:
df = data['product_id'].value_counts()
product_list = df[df >= 200].keys()

data_new = data[data['product_id'].isin(product_list)].drop(['product_name'],axis=1)

In [8]:
data_new.head()

Unnamed: 0,user_id,product_id,reordered
0,1,196,10
1,15,196,4
2,19,196,2
3,21,196,0
4,31,196,1


### Step3：準備item-user matrix

In [9]:
# (1) pivot ratings into product features
df_product_features = data_new.pivot(index='product_id', columns='user_id', values='reordered').fillna(0)

# (2) 標準化
min_max_scaler = MinMaxScaler(feature_range=(0, 5))
product_features = min_max_scaler.fit_transform(df_product_features)

# (3) convert dataframe of product features to scipy sparse matrix
mat_product_features = csr_matrix(product_features)

### Step4：建立KNN模型

In [10]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(mat_product_features)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

### Step5：測試模型結果

In [14]:
# （1）讀取 products 表格
products = pd.read_csv("instacart_2017_05_01/products.csv").set_index('product_id')

In [15]:
# （2）建立product_id 和 query index 的對照表
id_and_index = {}
for index, product_id in enumerate(df_product_features.index.to_list()):
    id_and_index[product_id] = index

In [16]:
# (3) 根據product_id測試推薦結果
product_id = 1
query_index= id_and_index[product_id]
distances, indices = model_knn.kneighbors(np.array(df_product_features.iloc[query_index, :]).reshape(1, -1), n_neighbors = 6)

for i in range(0,len(distances.flatten())):
    if i == 0 :
        print("Recommendation for: {} \n".format(products.iloc[product_id-1,0]))
    else:
        print("{}: {}, with distance of {}".format(i, products.iloc[df_product_features.index[indices.flatten()[i]],0], distances.flatten()[i]))

Recommendation for: Chocolate Sandwich Cookies 

1: Roasted Pine Nut Hummus, with distance of 0.9082108161440906
2: 7 Whole Grain Nuggets Cereal, with distance of 0.9143195521216906
3: Triple Distilled Irish Whiskey, with distance of 0.9206812944970674
4: Pain Reliever (NSAID)/Nighttime Sleep Aid, Ibuprofen 200mg, with distance of 0.9356391447492662
5: Valdosta Pecans With Cranberries, Black Pepper & Orange Zest, with distance of 0.9414971616070033
