# Recommender System Early Demo Code #

### import necessary libraries ###

In [11]:
import pymysql
import pandas as pd

# jaccard similarity 
from scipy.spatial.distance import pdist, squareform

# 引入套件計算以餘弦相似度
from sklearn.metrics.pairwise import cosine_similarity

### Making a sql connection to database ###

In [12]:
conn = pymysql.connect(host = 'localhost',
                                    database='project_v5',
                                    user='Aung',
                                    password='')

### Create a cursor ###

In [13]:
cursor = conn.cursor()

### Execute SQL ###

In [14]:
cursor.execute("""SELECT
	concat(dname,tpname) drink_name, dname,typename, dprice,tpname AS topping,  tpprice,  (dprice+tpprice) AS total 
FROM 
	(SELECT dname,dprice,typename FROM project_v5.dmenu 
			where dprice > 1  
			AND dname NOT LIKE '%元' and dname NOT LIKE '%特價%' and typename NOT LIKE '%外送%') d
		CROSS JOIN
	(SELECT tpname, tpprice FROM project_v5.toppings where tpprice > 1
			AND tpno < '000037' and tpno not in ('000024','000026','000027'))t  order by drink_name;""")

2268

### Creating DataFrame

In [15]:
# get column names
fields = [field_md[0] for field_md in cursor.description]
# zip column names and rows
result = [dict(zip(fields,row)) for row in cursor.fetchall()]
df = pd.DataFrame(result)

In [16]:
df

Unnamed: 0,drink_name,dname,typename,dprice,topping,tpprice,total
0,仙草凍奶茶(L)+仙草凍,仙草凍奶茶(L),奶茶(L),40,+仙草凍,5,45
1,仙草凍奶茶(L)+咖啡凍,仙草凍奶茶(L),奶茶(L),40,+咖啡凍,5,45
2,仙草凍奶茶(L)+布丁,仙草凍奶茶(L),奶茶(L),40,+布丁,5,45
3,仙草凍奶茶(L)+椰果,仙草凍奶茶(L),奶茶(L),40,+椰果,5,45
4,仙草凍奶茶(L)+檸檬,仙草凍奶茶(L),奶茶(L),40,+檸檬,10,50
...,...,...,...,...,...,...,...
2263,鮮葡萄柚綠(XL)咖啡凍*2,鮮葡萄柚綠(XL),特調(XL),50,咖啡凍*2,10,60
2264,鮮葡萄柚綠(XL)布丁*2,鮮葡萄柚綠(XL),特調(XL),50,布丁*2,10,60
2265,鮮葡萄柚綠(XL)椰果*2,鮮葡萄柚綠(XL),特調(XL),50,椰果*2,10,60
2266,鮮葡萄柚綠(XL)珍珠*2,鮮葡萄柚綠(XL),特調(XL),50,珍珠*2,10,60


### Load the ingredient data

In [17]:
ingredient = pd.read_csv("list_v5_dmenu.csv")
ingredient

Unnamed: 0,dname,總甜度,熱品/冷品,主要成分,次要成分
0,仙草凍奶茶(L),3,熱/冷,奶茶,仙草凍
1,仙草凍奶茶(XL),3,熱/冷,奶茶,仙草凍
2,仙草甘茶(L),1,熱/冷,茶類,仙草茶
3,仙草甘茶(XL),1,熱/冷,茶類,仙草茶
4,仙草蜜(L),3,冷,仙草茶,蜂蜜
...,...,...,...,...,...
157,鮮奶綠茶(XL),2,熱/冷,綠茶,鮮奶
158,鮮奶青茶(L),2,熱/冷,青茶,鮮奶
159,鮮奶青茶(XL),2,熱/冷,青茶,鮮奶
160,鮮葡萄柚綠(L),1,冷,綠茶,葡萄柚汁


### Merge two dfs (inner join)

In [18]:
df_final = pd.merge(df,ingredient, how='inner', on='dname')
df_final

Unnamed: 0,drink_name,dname,typename,dprice,topping,tpprice,total,總甜度,熱品/冷品,主要成分,次要成分
0,仙草凍奶茶(L)+仙草凍,仙草凍奶茶(L),奶茶(L),40,+仙草凍,5,45,3,熱/冷,奶茶,仙草凍
1,仙草凍奶茶(L)+咖啡凍,仙草凍奶茶(L),奶茶(L),40,+咖啡凍,5,45,3,熱/冷,奶茶,仙草凍
2,仙草凍奶茶(L)+布丁,仙草凍奶茶(L),奶茶(L),40,+布丁,5,45,3,熱/冷,奶茶,仙草凍
3,仙草凍奶茶(L)+椰果,仙草凍奶茶(L),奶茶(L),40,+椰果,5,45,3,熱/冷,奶茶,仙草凍
4,仙草凍奶茶(L)+檸檬,仙草凍奶茶(L),奶茶(L),40,+檸檬,10,50,3,熱/冷,奶茶,仙草凍
...,...,...,...,...,...,...,...,...,...,...,...
2263,鮮葡萄柚綠(XL)咖啡凍*2,鮮葡萄柚綠(XL),特調(XL),50,咖啡凍*2,10,60,1,冷,綠茶,葡萄柚汁
2264,鮮葡萄柚綠(XL)布丁*2,鮮葡萄柚綠(XL),特調(XL),50,布丁*2,10,60,1,冷,綠茶,葡萄柚汁
2265,鮮葡萄柚綠(XL)椰果*2,鮮葡萄柚綠(XL),特調(XL),50,椰果*2,10,60,1,冷,綠茶,葡萄柚汁
2266,鮮葡萄柚綠(XL)珍珠*2,鮮葡萄柚綠(XL),特調(XL),50,珍珠*2,10,60,1,冷,綠茶,葡萄柚汁


### Checking Null Values

In [19]:
df_final.isna().value_counts()

drink_name  dname  typename  dprice  topping  tpprice  total  總甜度    熱品/冷品  主要成分    次要成分
False       False  False     False   False    False    False  False  False  False  False    2268
dtype: int64

In [20]:
df_final.columns[1:]


Index(['dname', 'typename', 'dprice', 'topping', 'tpprice', 'total', '總甜度',
       '熱品/冷品', '主要成分', ' 次要成分'],
      dtype='object')

### One-hot Encoding Using pd.get_dummies

In [21]:
df_test = pd.get_dummies(df_final,columns=df_final.columns[1:])

### Set drink_name as Index

In [22]:
df_test.set_index("drink_name",inplace=True)
df_test

Unnamed: 0_level_0,dname_仙草凍奶茶(L),dname_仙草凍奶茶(XL),dname_仙草甘茶(L),dname_仙草甘茶(XL),dname_仙草蜜(L),dname_仙草蜜(XL),dname_仙草鮮奶凍(L),dname_仙草鮮奶凍(XL),dname_冬瓜仙草茶(L),dname_冬瓜仙草茶(XL),...,次要成分_薑汁,次要成分_蘆薈汁,次要成分_蜂蜜,次要成分_金桔,次要成分_青茶,次要成分_養樂多,次要成分_香草,次要成分_鮮奶,次要成分_鮮奶巧克力,次要成分_鮮奶茶
drink_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
仙草凍奶茶(L)+仙草凍,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
仙草凍奶茶(L)+咖啡凍,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
仙草凍奶茶(L)+布丁,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
仙草凍奶茶(L)+椰果,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
仙草凍奶茶(L)+檸檬,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
鮮葡萄柚綠(XL)咖啡凍*2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
鮮葡萄柚綠(XL)布丁*2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
鮮葡萄柚綠(XL)椰果*2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
鮮葡萄柚綠(XL)珍珠*2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Jaccard Similarity

In [23]:

# 計算所有商品間的 jaccard distance
jaccard_distances = pdist(df_test.values, metric='jaccard')

# 將原本 items 轉換成為 square matrix
# 原先距離最遠的會計算出 1，因此相似度就要以 1 - 原先值
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# 將資料轉換成為 DataFrame 的格式
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array,
                                     index=df_test.index,
                                     columns=df_test.index)


In [24]:
jaccard_similarity_df

drink_name,仙草凍奶茶(L)+仙草凍,仙草凍奶茶(L)+咖啡凍,仙草凍奶茶(L)+布丁,仙草凍奶茶(L)+椰果,仙草凍奶茶(L)+檸檬,仙草凍奶茶(L)+珍珠,仙草凍奶茶(L)+蘆薈,仙草凍奶茶(L)仙草凍*2,仙草凍奶茶(L)原料加量,仙草凍奶茶(L)咖啡凍*2,...,鮮葡萄柚綠(XL)+檸檬,鮮葡萄柚綠(XL)+珍珠,鮮葡萄柚綠(XL)+蘆薈,鮮葡萄柚綠(XL)仙草凍*2,鮮葡萄柚綠(XL)原料加量,鮮葡萄柚綠(XL)咖啡凍*2,鮮葡萄柚綠(XL)布丁*2,鮮葡萄柚綠(XL)椰果*2,鮮葡萄柚綠(XL)珍珠*2,鮮葡萄柚綠(XL)蘆薈*2
drink_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
仙草凍奶茶(L)+仙草凍,1.000000,0.818182,0.818182,0.818182,0.538462,0.818182,0.538462,0.538462,0.538462,0.538462,...,0.000000,0.052632,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
仙草凍奶茶(L)+咖啡凍,0.818182,1.000000,0.818182,0.818182,0.538462,0.818182,0.538462,0.538462,0.538462,0.538462,...,0.000000,0.052632,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
仙草凍奶茶(L)+布丁,0.818182,0.818182,1.000000,0.818182,0.538462,0.818182,0.538462,0.538462,0.538462,0.538462,...,0.000000,0.052632,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
仙草凍奶茶(L)+椰果,0.818182,0.818182,0.818182,1.000000,0.538462,0.818182,0.538462,0.538462,0.538462,0.538462,...,0.000000,0.052632,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
仙草凍奶茶(L)+檸檬,0.538462,0.538462,0.538462,0.538462,1.000000,0.538462,0.818182,0.818182,0.818182,0.818182,...,0.111111,0.000000,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.052632,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
鮮葡萄柚綠(XL)咖啡凍*2,0.000000,0.000000,0.000000,0.000000,0.052632,0.000000,0.052632,0.052632,0.052632,0.111111,...,0.818182,0.538462,0.818182,0.818182,0.818182,1.000000,0.818182,0.818182,0.818182,0.538462
鮮葡萄柚綠(XL)布丁*2,0.000000,0.000000,0.000000,0.000000,0.052632,0.000000,0.052632,0.052632,0.052632,0.052632,...,0.818182,0.538462,0.818182,0.818182,0.818182,0.818182,1.000000,0.818182,0.818182,0.538462
鮮葡萄柚綠(XL)椰果*2,0.000000,0.000000,0.000000,0.000000,0.052632,0.000000,0.052632,0.052632,0.052632,0.052632,...,0.818182,0.538462,0.818182,0.818182,0.818182,0.818182,0.818182,1.000000,0.818182,0.538462
鮮葡萄柚綠(XL)珍珠*2,0.000000,0.000000,0.000000,0.000000,0.052632,0.000000,0.052632,0.052632,0.052632,0.052632,...,0.818182,0.538462,0.818182,0.818182,0.818182,0.818182,0.818182,0.818182,1.000000,0.538462


### '仙草凍奶茶(L)+仙草凍'相似的前20個飲料

In [25]:
jaccard_similarity_df.loc['仙草凍奶茶(L)+仙草凍'].sort_values(ascending=False).head(20)

drink_name
仙草凍奶茶(L)+仙草凍    1.000000
仙草凍奶茶(L)+椰果     0.818182
仙草凍奶茶(L)+珍珠     0.818182
仙草凍奶茶(L)+咖啡凍    0.818182
仙草凍奶茶(L)+布丁     0.818182
焦糖奶茶(L)+仙草凍     0.666667
香草奶茶(L)+仙草凍     0.666667
草莓奶茶(L)+仙草凍     0.666667
咖啡凍奶茶(L)+仙草凍    0.666667
布丁奶茶(L)+仙草凍     0.666667
珍珠奶茶(L)+仙草凍     0.666667
榛果奶茶(L)+仙草凍     0.666667
椰果奶茶(L)+仙草凍     0.666667
芋香奶茶(L)+仙草凍     0.666667
珍珠奶茶(L)+咖啡凍     0.538462
珍珠奶茶(L)+布丁      0.538462
珍珠奶茶(L)+椰果      0.538462
芋香奶茶(L)+咖啡凍     0.538462
珍珠奶茶(L)+珍珠      0.538462
焦糖奶茶(L)+椰果      0.538462
Name: 仙草凍奶茶(L)+仙草凍, dtype: float64

### Cosine Similarity

In [26]:
# 計算餘弦相似度，並會以 np.array 形式輸出儲存
cosine_similarity_array = cosine_similarity(df_test)

# 將資料轉換為 DF 型態以方便進行資料檢視與處理
cosine_similarity_df = pd.DataFrame(cosine_similarity_array,
                                    index=df_test.index,
                                    columns=df_test.index)

In [27]:
cosine_similarity_df

drink_name,仙草凍奶茶(L)+仙草凍,仙草凍奶茶(L)+咖啡凍,仙草凍奶茶(L)+布丁,仙草凍奶茶(L)+椰果,仙草凍奶茶(L)+檸檬,仙草凍奶茶(L)+珍珠,仙草凍奶茶(L)+蘆薈,仙草凍奶茶(L)仙草凍*2,仙草凍奶茶(L)原料加量,仙草凍奶茶(L)咖啡凍*2,...,鮮葡萄柚綠(XL)+檸檬,鮮葡萄柚綠(XL)+珍珠,鮮葡萄柚綠(XL)+蘆薈,鮮葡萄柚綠(XL)仙草凍*2,鮮葡萄柚綠(XL)原料加量,鮮葡萄柚綠(XL)咖啡凍*2,鮮葡萄柚綠(XL)布丁*2,鮮葡萄柚綠(XL)椰果*2,鮮葡萄柚綠(XL)珍珠*2,鮮葡萄柚綠(XL)蘆薈*2
drink_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
仙草凍奶茶(L)+仙草凍,1.0,0.9,0.9,0.9,0.7,0.9,0.7,0.7,0.7,0.7,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
仙草凍奶茶(L)+咖啡凍,0.9,1.0,0.9,0.9,0.7,0.9,0.7,0.7,0.7,0.7,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
仙草凍奶茶(L)+布丁,0.9,0.9,1.0,0.9,0.7,0.9,0.7,0.7,0.7,0.7,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
仙草凍奶茶(L)+椰果,0.9,0.9,0.9,1.0,0.7,0.9,0.7,0.7,0.7,0.7,...,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
仙草凍奶茶(L)+檸檬,0.7,0.7,0.7,0.7,1.0,0.7,0.9,0.9,0.9,0.9,...,0.2,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
鮮葡萄柚綠(XL)咖啡凍*2,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.1,0.1,0.2,...,0.9,0.7,0.9,0.9,0.9,1.0,0.9,0.9,0.9,0.7
鮮葡萄柚綠(XL)布丁*2,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.1,0.1,0.1,...,0.9,0.7,0.9,0.9,0.9,0.9,1.0,0.9,0.9,0.7
鮮葡萄柚綠(XL)椰果*2,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.1,0.1,0.1,...,0.9,0.7,0.9,0.9,0.9,0.9,0.9,1.0,0.9,0.7
鮮葡萄柚綠(XL)珍珠*2,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.1,0.1,0.1,...,0.9,0.7,0.9,0.9,0.9,0.9,0.9,0.9,1.0,0.7


### '仙草凍奶茶(L)+仙草凍'相似的前20個飲料

In [28]:
cosine_similarity_df.loc['仙草凍奶茶(L)+仙草凍'].sort_values(ascending=False).head(20)

drink_name
仙草凍奶茶(L)+仙草凍    1.0
仙草凍奶茶(L)+椰果     0.9
仙草凍奶茶(L)+珍珠     0.9
仙草凍奶茶(L)+咖啡凍    0.9
仙草凍奶茶(L)+布丁     0.9
焦糖奶茶(L)+仙草凍     0.8
香草奶茶(L)+仙草凍     0.8
草莓奶茶(L)+仙草凍     0.8
咖啡凍奶茶(L)+仙草凍    0.8
布丁奶茶(L)+仙草凍     0.8
珍珠奶茶(L)+仙草凍     0.8
榛果奶茶(L)+仙草凍     0.8
椰果奶茶(L)+仙草凍     0.8
芋香奶茶(L)+仙草凍     0.8
珍珠奶茶(L)+咖啡凍     0.7
珍珠奶茶(L)+布丁      0.7
珍珠奶茶(L)+椰果      0.7
芋香奶茶(L)+咖啡凍     0.7
珍珠奶茶(L)+珍珠      0.7
焦糖奶茶(L)+椰果      0.7
Name: 仙草凍奶茶(L)+仙草凍, dtype: float64