In [1]:
import pandas as pd
import numpy as np
import csv
import sklearn
import ast
import re
df = open("/content/drive/MyDrive/raw-data_recipe.csv", encoding='UTF8')
read_csv = pd.read_csv(df)

In [2]:
read_csv = read_csv.drop(columns=['reviews', 'recipe_id', 'image_url'])

In [4]:
def time_to_num(time_var):
  time_arr = re.split(' ', time_var)
  total_time = 0
  time_type = set(['m','h','d'])
  for element in time_arr:
    if element not in time_type:
      time = int(element)
    else:
      if element == 'd':
        total_time = 24*60*time
      elif element == 'h':
        total_time = 60*time
      else:
        total_time += time

  return total_time

In [5]:
def get_time(element):
  res = ast.literal_eval(element)
  arr = re.split('\n', res['directions'])
  if len(arr) > 5 and arr[4] == 'Ready In':
    time = time_to_num(arr[5])
    direction = ' '.join(arr[6:])
  elif len(arr) > 4 and arr[0] == 'Prep':
    time = time_to_num(arr[1]) + time_to_num(arr[3])
    direction = ' '.join(arr[4:])
  elif arr[0] == 'Cook':
    time = time_to_num(arr[3])
    direction = ' '.join(arr[4:])
  elif arr[0] == 'Ready In':
    time = time_to_num(arr[1])
    direction = ' '.join(arr[2:])
  elif arr[0] == 'Prep' :
    time = -1
    direction = 'None'
  else:
    time = -1
    direction = ' '.join(arr)
  return time, direction

In [6]:
read_csv["time"] = read_csv["cooking_directions"].apply(lambda x : get_time(x)[0])
read_csv["recipe"] = read_csv["cooking_directions"].apply(lambda x : get_time(x)[1])
read_csv = read_csv.drop(columns=['cooking_directions'])

In [7]:
import nltk
nltk.download('wordnet')
wnl = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
def word_tokenize(element):
  x = element.replace('^', ' ').replace('(', ' ').replace(')', ' ').replace('®','').lower()
  arr = [wnl.lemmatize(word) for word in nltk.wordpunct_tokenize(x)]
  return ' '.join(arr)

In [9]:
read_csv['name'] = read_csv['recipe_name'].apply(lambda x: word_tokenize(x))

In [10]:
read_csv["ingredients_token"] = read_csv["ingredients"].apply(lambda x : word_tokenize(x))

In [11]:
read_csv["tool"] = read_csv["recipe"].apply(lambda x : word_tokenize(x))

In [12]:
read_csv["ing_tool"] = read_csv[['ingredients_token','tool']].agg(' '.join, axis=1)

In [13]:
read_csv = read_csv.drop(columns=['tool', 'ingredients_token'])

In [14]:
read_csv["tokens"] = read_csv[['name','ing_tool']].agg(' '.join, axis=1)

In [15]:
def str_set(element):
  return set(re.split(' ', element))

In [16]:
read_csv['name'] = read_csv['name'].apply(lambda x: str_set(x))

In [17]:
read_csv["ing_tool"] = read_csv["ing_tool"].apply(lambda x : str_set(x))

In [18]:
nut_dict = {'sugars':0, 'fat':0, 'calories':0, 'sodium':0}
for element in read_csv.nutritions:
  res = ast.literal_eval(element)
  if res['sugars']['name']:
    nut_dict['sugars'] += res['sugars']['amount']
  if res['fat']['name']:
    nut_dict['fat']  += res['fat']['amount']
  if res['calories']['name']:
    nut_dict['calories']  += res['calories']['amount']
  if res['sodium']['name']:
    nut_dict['sodium'] += res['sodium']['amount']

In [19]:
def low_check(element, nutrition):
  res = ast.literal_eval(element)
  if res[nutrition]['name'] and nut_dict[nutrition] > res[nutrition]['amount']*len_df:
    return True
  else:
    return False

In [20]:
len_df = len(read_csv)
for nut in nut_dict.keys():
  read_csv[nut] = read_csv["nutritions"].apply(lambda x : low_check(x, nut))

In [None]:
# In average, 20 ingredients are used in each dish. 
# Similar : 100 among 50000 -> 500 topics -> 200 topics -> 100 topics

In [21]:
count = 0
for element in read_csv['tokens']:
  count += len(re.split(' ', element))
print(count//len_df)

148


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = 0.95, stop_words ='english')
cv_text = cv.fit_transform(read_csv['tokens'])
features = cv.get_feature_names()

In [23]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components = 100)
lda.fit(cv_text)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=100, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [None]:
# for index, topic in enumerate(lda.components_):
#     print('Top 10 ingredients for topic',index)
#     print([cv.get_feature_names()[i] for i in topic.argsort()[-10:]])
#     print('\n')

In [None]:
test = lda.transform(cv_text)

In [None]:
temp = []
for element in test:
    temp.append(np.argmax(element))
topic_class = temp

In [None]:
read_csv['topic'] = topic_class

In [None]:
read_csv = read_csv.drop(columns=['nutritions'])

In [None]:
read_csv.to_csv("/content/drive/My Drive/recipe_data.csv")

In [None]:
token_list = set()

for element in read_csv['ingredients']:
  x = element.replace('^', ' ').replace('(', ' ').replace(')', ' ').replace('®','').lower()
  arr = [wnl.lemmatize(word) for word in nltk.wordpunct_tokenize(x)]
  for ing in arr:
    token_list.add(ing)
for element in read_csv["recipe"]:
  x = ' '.join(re.split(r"\W+", element)).lower()
  arr = [wnl.lemmatize(word) for word in nltk.wordpunct_tokenize(x)]
  for tool in arr:
    token_list.add(tool)
for element in read_csv['recipe_name']:
  x = element.replace('^', ' ').replace('(', ' ').replace(')', ' ').replace('®','').lower()
  arr = [wnl.lemmatize(word) for word in nltk.wordpunct_tokenize(x)]
  for ing in arr:
    token_list.add(ing)
token_list = pd.DataFrame(token_list)

In [None]:
token_list.to_csv("/content/drive/My Drive/token_list.csv")