# 1. 재료 추천 기반 컨텐츠 베이스 알고리즘 (Material Recommendation Based Content Base Algorithm)

# 칵테일 재료 데이터 읽기(Read cocktail ingredient data)

In [None]:
import pandas as pd

In [95]:
csv = pd.read_csv('data/cocktail_ingredient_new.csv') # 칵테일 재료 데이터(Cocktail ingredient data)

In [96]:
csv.head()

Unnamed: 0.1,Unnamed: 0,name_ko,type_string,ingredient_amount,ingredient_unit,type_string.1,cocktail_id,ingredient_id
0,0,"Cachaça, Sagatiba Velha",Rum,59.10,ml,Rum,1,386
1,1,Orgeat 아몬드 시럽,General,14.80,ml,General,1,1300
2,2,"Elderflower liqueur, St. Germain",Liqueur,14.80,ml,Liqueur,1,739
3,3,라임 (4조각으로 나눠진),General,1⁄2,,General,1,1114
4,4,"Light rum, Flor de Caña Dry 4",Rum,29.60,ml,Rum,2,1106


# TF-IDF 를 위한 데이터프레임 만들기(Creating a data frame for TF-IDF)

In [101]:
ingredient_list = []
id_list = csv.drop_duplicates(['cocktail_id'])['cocktail_id'] # 칵테일 아이디 중복 제거(Cocktail ID deduplication)
for cid in id_list: # 칵테일 id(Cocktail ID)
    ingredient = '' # 칵테일 재료 정보(Cocktail Ingredients Information)
    for row in range(len(csv[csv['cocktail_id'] == cid])):
        ingredient += csv[csv['cocktail_id'] == cid].iloc[row]['name_ko'] + ' '
    ingredient = ingredient.replace(',','')
    ingredient_list.append(ingredient)

In [102]:
ingredient_list[0]

'Cachaça Sagatiba Velha Orgeat 아몬드 시럽 Elderflower liqueur St. Germain 라임 (4조각으로 나눠진) '

In [103]:
data = pd.DataFrame({
    'cocktail_id' : id_list,
    'cocktail_ingredient' : ingredient_list
})

In [131]:
data = data.reset_index() # 인덱스 초기화(Index initialization)
data.head()
# len(data) => 3676

Unnamed: 0,index,cocktail_id,cocktail_ingredient
0,0,1,Cachaça Sagatiba Velha Orgeat 아몬드 시럽 Elderflow...
1,4,2,Light rum Flor de Caña Dry 4 Peach liqueur Rot...
2,12,3,Cognac VSOP 간단한 시럽 비터스 Bittermens 'Elemakule T...
3,17,4,Campari Eau de vie of Douglas Fir Gin Ginger l...
4,22,5,Rye Aquavit Cynar Herbal liqueur Green Chartre...


# TF-IDF 구현 (implement TF-IDF)

In [126]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [127]:
data['cocktail_ingredient'].isnull().sum() # 결측값 수(Number of missing values)

0

In [107]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['cocktail_ingredient'])
print('TF-IDF 행렬의 크기(shape) :',tfidf_matrix.shape)
# 출력: TF-IDF 행렬의 크기(shape) : (3676, 2036)
# TF-IDF 행렬의 크기는 3676의 행을 가지고 2036의 열을 가지는 행렬이다
# 3676개의 칵테일을 표현하기 위해서 총 2036개의 재료가 사용되었다
# Output: Size of TF-IDF matrix (shape): (3676, 2036)
# The size of the TF-IDF matrix is a matrix with rows of 3676 and columns of 2036
# A total of 2036 ingredients were used to express 3676 cocktails

TF-IDF 행렬의 크기(shape) : (3676, 2036)


# TF-IDF 행렬의 코사인 유사도 계산 (Calculate cosine similarity of TF-IDF matrix)

In [108]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print('코사인 유사도 연산 결과 :',cosine_sim.shape) # Cosine similarity calculation results

코사인 유사도 연산 결과 : (3676, 3676)


# 재료 기반 유사한 칵테일 추천 상위 6개(Top 6 similar cocktail recommendations based on ingredients)

In [134]:
def get_recommendations(cid, cosine_sim=cosine_sim):
    # 선택한 칵테일의 id로부터 해당 칵테일의 인덱스를 받아온다.
    # Gets the index of the cocktail from the id of the selected cocktail.
    idx = id_to_index[cid]
    
    # 해당 칵테일과 모든 칵테일의 유사도를 가져온다.
    # Bring the similarity of all cocktails to the corresponding cocktails
    sim_scores = list(enumerate(cosine_sim[idx]))

    # 유사도에 따라 칵테일들을 정렬한다.
    # Sort cocktails according to similarity.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 가장 유사한 6개의 칵테일를 받아온다.
    # I get the six most similar cocktails.
    sim_scores = sim_scores[1:7]

    # 가장 유사한 6개의 칵테일의 인덱스를 얻는다.
    # Get indexes of the six most similar cocktails.
    cocktail_indices = [idx[0] for idx in sim_scores]

    # 가장 유사한 6개의 칵테일의 인덱스를 리턴한다.
    # Returns the index of the six most similar cocktails.
    return cocktail_indices

In [135]:
id_to_index = dict(zip(data['cocktail_id'], data.index))
id_to_index[2]

1

In [136]:
get_recommendations(1)

[487, 920, 167, 1095, 769, 54]

# 결과 데이터프레임 생성 및 저장(Create and save the resulting data frame)

In [137]:
recommend_ingredient_1=[]
recommend_ingredient_2=[]
recommend_ingredient_3=[]
recommend_ingredient_4=[]
recommend_ingredient_5=[]
recommend_ingredient_6=[]
cocktail_id=[]

for cid in id_list: # 칵테일 id(Cocktail ID)
    re = get_recommendations(cid) # 재료 추천 칵테일 id (Ingredient recommendation cocktail id)
    recommend_ingredient_1.append(re[0]) 
    recommend_ingredient_2.append(re[1])
    recommend_ingredient_3.append(re[2])
    recommend_ingredient_4.append(re[3])
    recommend_ingredient_5.append(re[4])
    recommend_ingredient_6.append(re[5])
    cocktail_id.append(cid)

In [141]:
result = pd.DataFrame({
    'cocktail_id' : id_list,
    'recommend_ingredient_1' : recommend_ingredient_1,
    'recommend_ingredient_2' : recommend_ingredient_2,
    'recommend_ingredient_3' : recommend_ingredient_3,
    'recommend_ingredient_4' : recommend_ingredient_4,
    'recommend_ingredient_5' : recommend_ingredient_5,
    'recommend_ingredient_6' : recommend_ingredient_6,
    'cocktail_id' : cocktail_id
})

In [144]:
result.head()

Unnamed: 0,cocktail_id,recommend_ingredient_1,recommend_ingredient_2,recommend_ingredient_3,recommend_ingredient_4,recommend_ingredient_5,recommend_ingredient_6
0,1,487,920,167,1095,769,54
1,2,1392,2372,549,1555,2355,1557
2,3,332,666,201,1011,575,71
3,4,770,853,661,1014,228,293
4,5,49,604,1124,1015,591,114


In [145]:
result.to_csv('data/recommend_ingredient.csv')

# 2. 색상 추천 기반 컨텐츠 베이스 알고리즘 (Color Recommendation Based Content Base Algorithm)

In [146]:
csv = pd.read_csv('data/img_new28.csv')

In [148]:
csv.head()

Unnamed: 0.1,Unnamed: 0,cocktail_id,name,cocktail_color1,cocktail_color2,url
0,0,0,The Missionary,흰색,흰색,https://kindredcocktails.com/cocktail/missionary
1,1,1,Missionary's Downfall (Don the Beachcomber),노란색,노란색,https://kindredcocktails.com/cocktail/missiona...
2,2,2,Mode Pour les Jeunes,분홍색,흰색,https://kindredcocktails.com/cocktail/mode-pou...
3,3,3,Mizz Mazza,흰색,노란색,https://kindredcocktails.com/cocktail/mizz-mazza
4,4,4,"Mockingbird, Wish Me Luck",흰색,흰색,https://kindredcocktails.com/cocktail/mockingb...


In [149]:
csv = csv.dropna() # 결측치 제거(Remove missing values)

In [150]:
csv.head()

Unnamed: 0.1,Unnamed: 0,cocktail_id,name,cocktail_color1,cocktail_color2,url
0,0,0,The Missionary,흰색,흰색,https://kindredcocktails.com/cocktail/missionary
1,1,1,Missionary's Downfall (Don the Beachcomber),노란색,노란색,https://kindredcocktails.com/cocktail/missiona...
2,2,2,Mode Pour les Jeunes,분홍색,흰색,https://kindredcocktails.com/cocktail/mode-pou...
3,3,3,Mizz Mazza,흰색,노란색,https://kindredcocktails.com/cocktail/mizz-mazza
4,4,4,"Mockingbird, Wish Me Luck",흰색,흰색,https://kindredcocktails.com/cocktail/mockingb...


In [151]:
csv[csv['cocktail_id']==1].iloc[0]['cocktail_id']

1

# TF-IDF 를 위한 데이터프레임 만들기(Creating a data frame for TF-IDF)

In [152]:
csv['cocktail_id']

0          0
1          1
2          2
3          3
4          4
        ... 
3669    3669
3670    3670
3672    3672
3673    3673
3674    3674
Name: cocktail_id, Length: 3329, dtype: int64

In [154]:
color_list = []
for cid in csv['cocktail_id']: # 칵테일 id(Cocktail ID)
    # 칵테일 색상 2가지(Two cocktail colors)
    color = ''
    color += csv[csv['cocktail_id'] == cid].iloc[0]['cocktail_color1'] + ' '
    color += csv[csv['cocktail_id'] == cid].iloc[0]['cocktail_color2'] + ' '
    color_list.append(color)

In [155]:
color_list[2]

'분홍색 흰색 '

In [156]:
data = pd.DataFrame({
    'cocktail_id' : csv['cocktail_id'],
    'cocktail_color' : color_list
})

In [157]:
data = data.reset_index() # 인덱스 초기화(Index initialization)
data.head()

Unnamed: 0,index,cocktail_id,cocktail_color
0,0,0,흰색 흰색
1,1,1,노란색 노란색
2,2,2,분홍색 흰색
3,3,3,흰색 노란색
4,4,4,흰색 흰색


In [158]:
id_to_index = dict(zip(data['cocktail_id'], data.index))

# TF-IDF 구현 (implement TF-IDF)

In [159]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [160]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['cocktail_color'])
print('TF-IDF 행렬의 크기(shape) :',tfidf_matrix.shape)
# 출력: TF-IDF 행렬의 크기(shape) : (3676, 2036)
# TF-IDF 행렬의 크기는 3329의 행을 가지고 9의 열을 가지는 행렬이다
# 3329개의 칵테일을 표현하기 위해서 총 9개의 색상이 사용되었다
# Output: Size of TF-IDF matrix (shape): (3676, 2036)
# The size of the TF-IDF matrix is a matrix with rows of 3329 and columns of 9
# A total of nine colors were used to express 3329 cocktails

TF-IDF 행렬의 크기(shape) : (3329, 9)


# TF-IDF 행렬의 코사인 유사도 계산 (Calculate cosine similarity of TF-IDF matrix)

In [161]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print('코사인 유사도 연산 결과 :',cosine_sim.shape) # Cosine similarity calculation results

코사인 유사도 연산 결과 : (3329, 3329)


# 색상 기반 유사한 칵테일 추천 상위 6개(Top 6 color-based similar cocktails recommendations)

In [162]:
def get_recommendations(cid, cosine_sim=cosine_sim):
    # 선택한 칵테일의 id로부터 해당 칵테일의 인덱스를 받아온다.
    # Gets the index of the cocktail from the id of the selected cocktail..
    idx = id_to_index[cid]
    
    # 해당 칵테일과 모든 칵테일의 유사도를 가져온다.
    # Bring the similarity of all cocktails to the corresponding cocktails
    sim_scores = list(enumerate(cosine_sim[idx]))

    # 유사도에 따라 칵테일들을 정렬한다.
    # Sort cocktails according to similarity.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 가장 유사한 6개의 칵테일를 받아온다.
    # I get the six most similar cocktails.
    sim_scores = sim_scores[1:7]

    # 가장 유사한 6개의 칵테일의 인덱스를 얻는다.
    # Get indexes of the six most similar cocktails
    cocktail_indices = [idx[0] for idx in sim_scores]

    # 가장 유사한 6개의 칵테일의 인덱스를 리턴한다.
    # Returns the index of the six most similar cocktails.
    return cocktail_indices

In [163]:
get_recommendations(1)

[43, 70, 76, 152, 201, 256]

# 결과 데이터프레임 생성 및 저장(Create and save the resulting data frame)

In [165]:
recommend_color_1=[]
recommend_color_2=[]
recommend_color_3=[]
recommend_color_4=[]
recommend_color_5=[]
recommend_color_6=[]
cocktail_id=[]

for cid in csv['cocktail_id']: # 칵테일 id(Cocktail ID)
    re = get_recommendations(cid) # 색상 추천 칵테일 id (Color recommendation cocktail id)
    recommend_color_1.append(re[0])
    recommend_color_2.append(re[1])
    recommend_color_3.append(re[2])
    recommend_color_4.append(re[3])
    recommend_color_5.append(re[4])
    recommend_color_6.append(re[5])
    cocktail_id.append(cid)

In [166]:
result = pd.DataFrame({
    'cocktail_id' : id_list,
    'recommend_color_1' : recommend_color_1,
    'recommend_color_2' : recommend_color_2,
    'recommend_color_3' : recommend_color_3,
    'recommend_color_4' : recommend_color_4,
    'recommend_color_5' : recommend_color_5,
    'recommend_color_6' : recommend_color_6,
    'cocktail_id' : cocktail_id
})

In [167]:
result.head()

Unnamed: 0,cocktail_id,recommend_color_1,recommend_color_2,recommend_color_3,recommend_color_4,recommend_color_5,recommend_color_6
0,0,4,5,7,9,13,15
1,1,43,70,76,152,201,256
2,2,26,32,41,51,52,57
3,3,17,20,25,31,36,40
4,4,4,5,7,9,13,15


In [168]:
result.to_csv('data/recommend_color.csv')