In [2]:
import pandas as pd

# Read the recipe table from the CSV that sits next to this notebook, then
# print a per-column summary (dtype and non-null count).
DATA_PATH = './foodData4_24.csv'
food = pd.read_csv(DATA_PATH)
food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165897 entries, 0 to 165896
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             165897 non-null  int64  
 1   Name                   165897 non-null  object 
 2   Images                 165896 non-null  object 
 3   RecipeCategory         165715 non-null  object 
 4   Keywords               161515 non-null  object 
 5   RecipeIngredientParts  165897 non-null  object 
 6   Calories               165897 non-null  float64
 7   FatContent             165897 non-null  float64
 8   SaturatedFatContent    165897 non-null  float64
 9   CholesterolContent     165897 non-null  float64
 10  SodiumContent          165897 non-null  float64
 11  CarbohydrateContent    165897 non-null  float64
 12  FiberContent           165897 non-null  float64
 13  SugarContent           165897 non-null  float64
 14  ProteinContent         165897 non-nu

In [4]:
# Distinct raw values of the Keywords column. Each value is one recipe's
# complete comma-separated keyword string, not an individual tag.
keywords_column = food['Keywords']
keywords_column.unique()

array(['Dessert, Low Protein, Low Cholesterol, Healthy, Free Of..., Summer, Weeknight, Freezer, Easy',
       'Chicken Thigh & Leg, Chicken, Poultry, Meat, Asian, Indian, Weeknight, Stove Top',
       'Low Protein, Low Cholesterol, Healthy, Summer, < 60 Mins', ...,
       'Meat, South American, < 30 Mins, For Large Groups',
       'High In..., Weeknight, < 30 Mins, Easy, Inexpensive, From Scratch',
       'Nuts, Chinese, Asian, < 60 Mins'], dtype=object)

In [12]:
import numpy as np
# Work on the raw Keywords column. Every value is a recipe's complete
# comma-separated keyword string, so the statistics below are per string,
# not per individual tag.
raw_keywords = food['Keywords']

# How often each distinct keyword string occurs, most frequent first.
keyword_counts = raw_keywords.value_counts()
print(keyword_counts)

# Array of the distinct keyword strings, and how many there are.
unique_keywords = raw_keywords.unique()
num_unique_keywords = len(unique_keywords)

Keywords
< 15 Mins, Easy                                                                                             3094
Easy                                                                                                        3021
< 60 Mins                                                                                                   2767
< 4 Hours                                                                                                   1827
< 30 Mins                                                                                                   1825
                                                                                                            ... 
Dessert, Berries, Fruit, Low Protein, Brunch, < 60 Mins, Easy                                                  1
Ham, Pork, Poultry, Meat, < 30 Mins, Refrigerator                                                              1
Cookie & Brownie, Grains, Kid Friendly, Christmas, Mixer, Oven, Refrigerator, Small App

In [24]:
# Select the recipes whose Keywords value is exactly the string 'Oven'
# (an equality test on the whole field, not a "contains" search) and
# print their names.
is_oven_only = food['Keywords'].eq('Oven')
dessert = food.loc[is_oven_only]
print(dessert['Name'])

37                Appetizers-Easy Stuffed Mushrooms
38                                   Anzac Biscuits
74                               Blue Jimmy Pillows
151                               Golden Baked Fish
198       Vegetarian Baked Stuffed Red Bell Peppers
                            ...                    
107634                         Chili Cornbread Bake
120833              Oven Baked  Spicy Wedgie  Fries
127500                         Chow Mein Hot Dish I
157598             Troops BBQ Pizza Oven Taco Pizza
163228             Garlic &amp; Lemon Tri-Tip Roast
Name: Name, Length: 109, dtype: object


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Replace missing Keywords with an empty string so every row holds text
# before the column is fed to the TF-IDF step.
keywords_filled = food['Keywords'].fillna('')
food['Keywords'] = keywords_filled

In [30]:
# Weight every word of the keyword strings with TF-IDF (English stop words
# removed). The result has one row per recipe and one column per word.
keyword_docs = food['Keywords']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(keyword_docs)
print('TF-IDF 행렬의 크기(shape) :', tfidf_matrix.shape)

TF-IDF 행렬의 크기(shape) : (165897, 339)


In [32]:
# cosine_similarity(docs_a, docs_b) returns the pairwise cosine similarity
# between the rows of docs_a and the rows of docs_b as an array.
# Only the first SIM_ROWS recipes are compared with each other, so the
# result is a SIM_ROWS x SIM_ROWS matrix.
SIM_ROWS = 20000
tfidf_subset = tfidf_matrix[:SIM_ROWS]
cosine_sim = cosine_similarity(tfidf_subset, tfidf_subset)
print('코사인 유사도 연산 결과 :', cosine_sim.shape)

코사인 유사도 연산 결과 : (20000, 20000)


In [40]:
# Map each recipe name to its row position in cosine_sim.
# cosine_sim only covers the first cosine_sim.shape[0] rows of `food`, so the
# lookup is restricted to those rows. Building it over the whole frame lets a
# name (or a later duplicate of a name) beyond that range resolve to an index
# that cannot be used with cosine_sim. Duplicate names inside the covered
# range still resolve to their last occurrence, as before.
n_covered = cosine_sim.shape[0]
covered_names = food['Name'].iloc[:n_covered]
name_to_index = dict(zip(covered_names, range(len(covered_names))))

# Row index of the recipe named 'Biryani'.
# TODO: lookup is by recipe name; needs to be reworked to be food-/keyword-based.
idx = name_to_index['Biryani']
print(idx)

1


In [41]:
def get_recommendations(Name, cosine_sim=cosine_sim, top_n=10):
    """Return the names of the recipes most similar to the recipe `Name`.

    Parameters
    ----------
    Name : str
        Recipe name; looked up in the module-level `name_to_index` mapping.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row/column i corresponds to row i of
        `food`. The default is bound when this function is defined, so the
        defining cell must be re-run after `cosine_sim` is recomputed.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        `food['Name']` entries of the `top_n` most similar recipes, ordered
        from most to least similar. The queried recipe itself is never
        included.

    Raises
    ------
    KeyError
        If `Name` is not present in `name_to_index`.
    IndexError
        If the recipe's row is not covered by `cosine_sim`.
    """
    # Row position of the requested recipe.
    idx = name_to_index[Name]

    # The similarity matrix may cover only a prefix of `food`; fail with an
    # explicit message instead of an opaque out-of-bounds error.
    n_rows = len(cosine_sim)
    if idx >= n_rows:
        raise IndexError(
            f"Recipe {Name!r} is at row {idx}, but the similarity matrix "
            f"only covers rows 0..{n_rows - 1}."
        )

    # Pair every other recipe with its similarity to the query. The query is
    # excluded by index rather than by dropping the first sorted entry:
    # recipes with identical keywords tie at the top score, so the query is
    # not guaranteed to be sorted first.
    scored = [
        (row, score)
        for row, score in enumerate(cosine_sim[idx])
        if row != idx
    ]

    # Highest similarity first (the sort is stable, so ties keep row order).
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar recipes.
    food_indices = [row for row, _ in scored[:top_n]]

    # Names of those recipes.
    return food['Name'].iloc[food_indices]

In [37]:
# Show the recipes most similar to "Biryani".
query_name = "Biryani"
get_recommendations(query_name)

10995           Crock Pot Cider-Braised Chicken
18696            Hoisin Five-Spice Chicken Legs
525                    Slow Cooker Chicken Stew
5993                Chipotle Oven Fried Chicken
184                     Curried Chicken Muffins
16621        Chicken Ceylon with Massalla Gravy
18165    Uncle Bill's Barbecue Tandoori Chicken
18666                           Caramel Chicken
16786                 Sweet and Sour Drumsticks
36                                Chicken Curry
Name: Name, dtype: object

In [38]:
# Display the full row(s) of the recipe named 'Biryani'.
is_biryani = food['Name'].eq('Biryani')
food.loc[is_biryani]

Unnamed: 0.1,Unnamed: 0,Name,Images,RecipeCategory,Keywords,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeInstructions
1,1,Biryani,https://img.sndimg.com/food/image/upload/w_555...,Chicken Breast,"Chicken Thigh & Leg, Chicken, Poultry, Meat, A...","saffron, milk, hot green chili peppers, onions...",1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4,Soak saffron in warm milk for 5 minutes and pu...
