In [2]:
import pandas as pd
import numpy as np
import re
import requests
import os
pd.options.display.max_rows=500
from IPython.display import clear_output, display
from fuzzywuzzy import fuzz
from pymongo import MongoClient
import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors
from google.oauth2.credentials import Credentials

from dotenv import load_dotenv
load_dotenv()

True

# set up

## spoonacular API (through RapidAPI)

In [3]:
url = "https://spoonacular-recipe-food-nutrition-v1.p.rapidapi.com/recipes/parseIngredients"
headers = {
    'x-rapidapi-host': "spoonacular-recipe-food-nutrition-v1.p.rapidapi.com",
    'x-rapidapi-key': os.getenv("SPOON_API_KEY"), 
    'content-type': "application/x-www-form-urlencoded"
    }

## google authentication + youtube

In [4]:
scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"
api_service_name = "youtube"
api_version = "v3"
client_secrets_file = "client_secret.json"

# check if previously saved credentials exist 
if os.path.exists("credentials.txt"):
    credentials = Credentials.from_authorized_user_file('credentials.txt')

# gather new credentials & write to file
else:
    flow = google_auth_oauthlib.flow.InstalledAppFlow.from_client_secrets_file(
    client_secrets_file, scopes)
    creds = flow.run_console()

    creds_data = {
        "token": creds.token,
        "refresh_token": creds.refresh_token,
        "token_uri": creds.token_uri,
        "client_id": creds.client_id,
        "client_secret": creds.client_secret,
        "scopes": creds.scopes,
    }

    del creds_data["token"]

    with open("credentials.txt", "w") as outfile:
        json.dump(creds_data, outfile)

    print("credentials saved")

youtube = googleapiclient.discovery.build(
api_service_name, api_version, credentials=credentials)

## mongoDB configuration

In [5]:
client = MongoClient(os.getenv("MONGO_URI"))
db = client.desserts

# get new uploads

In [11]:
# get ids of uploads that have already been processed
upload_ids = db.videos.distinct('id')

In [12]:
# set up channel ids
hanse = 'UCZTavrg2A43lQMWxiK3yu7g'
cookingtree = 'UCtby6rJtBGgUm-2oD_E7bzw'
hida = 'UCcp9uRaBwInxl_SZqGRksDA'

channels = [hanse, cookingtree, hida]

In [18]:
newUploads = pd.DataFrame()

# go through each channel, appending new uploads to newUploads dataframe
for channel in channels:
    request = youtube.channels().list(
        part="contentDetails,snippet",
        id=channel
    )
    response = request.execute()
    uploadsId = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']
    
    hasNext = True
    nextPage = ''
    
    # paginate through results
    while hasNext:
        request = youtube.playlistItems().list(
            part="contentDetails, snippet",
            playlistId=uploadsId,
            pageToken=nextPage
        )
        response = request.execute()
        videos = pd.json_normalize(response['items'])
        
        # check if video is a new upload
        for i in videos.index:
            if videos.loc[i,'contentDetails.videoId'] in upload_ids:
                print("no more new videos")
                hasNext = False
                break
            else: 
                newUploads = newUploads.append(videos.loc[i])
        try:
            nextPage = response['nextPageToken']
        except:
            hasNext = False

no more new videos
no more new videos
no more new videos


In [20]:
newUploads.reset_index(drop=True, inplace=True)

In [21]:
# parse out foreign language from titles and other miscellaneous things
def format_title(x):
    x = re.sub(r'[^a-zA-Z0-9\,()| \-]', '', x)
    x = re.sub(r'^,|[^a-zA-Z]\,','',x)
    x = re.sub(' +', ' ', x)
    x = x.replace("()", "").replace("( )", "").replace("Cooking ASMR","")
    x = x.replace("()", "").replace("( )", "")
    x = x.strip()
    
    if x == '#NAME?':
        x = "unknown"
    else:
        try:
            if math.isnan(x):
                x = "unknown"
        except:
            pass
    return x

newUploads['snippet.title'] = newUploads['snippet.title'].apply(lambda x: format_title(x))

# get ingredients

In [17]:
# library for detecting plurality
import inflect
p = inflect.engine()

In [18]:
# helper method to filter out instructional steps
def filterOutSteps(item): 
    if re.search(r'\b\d[.]', item) or re.search(r'\b\d[)]', item):
        return False
    else: 
        return True

In [19]:
# helper method to filter out items that are unlikely to be ingredients
def filterOutIngred(item): 
    if len(item)<3 or len(item)>50:
        return False
    elif re.search('[a-zA-Z]{3,}', item) and re.search(r'\b\d+[a-zA-z| ]', item) :
        return True
    elif item in ingredList:
        return True
    else: 
        return False

In [20]:
# helper method to clean the parsed out ingredient names
def cleanParsed(item):
    if item['name'] == '':
         item['name'] = re.sub(r'\d[a-zA-Z]?', '', item['original']).strip()    
    
    # for catching typos
    elif ingredDf[0].apply(lambda x: fuzz.ratio(x,item['name'])).max() >= 80:
        item['name'] = ingredDf[0][ingredDf[0].apply(lambda x: fuzz.ratio(x,item['name'])).idxmax(axis=0)]
        
    # for phrase-like ingredients
    cleaned_names = list(ingredDf[ingredDf[0].apply(lambda x: True if x in item['name'] else False)][0])
    if(cleaned_names):
        item['name'] = max(cleaned_names,key=len)

    # for eliminating plurality
    try:
        if p.singular_noun(item['name']):
            item['name'] = p.singular_noun(item['name'])
    except:
        print(item['name'])

    return item

In [21]:
# helper method to filter out parsed items that are unlikely to be ingredients
def filterParsed(item):
    if len(item['originalName']) < 3 or 'pan' in item['original']:
        return False
    elif item['name'] not in ingredList:
        print(item['name'])
    else:
        return True

In [22]:
# get list of ingredients from local file
ingredDf = pd.read_csv('ingredList.csv',header=None)
ingredList = list(ingredDf[0])

In [23]:
# get ingredient index from mongodb -- must continue to build upon this and then upsert what's in the database
IngredientIndex = pd.DataFrame(list(db.ingredients.find())).set_index('name').drop(columns=['_id'])

In [24]:
masterIngredientNames = []
masterIngredientDicts = []
newRecipe = True

In [25]:
# iterate through each newly uploaded video and parse out its ingredients
for i in range(len(newUploads)):
    clear_output(wait=True)
    print("index: ", i)
    newRecipe = True
    
    # prepare structures to collect recipe info
    ingredientNames = []
    ingredientDict = {}
    
    # get ingredient section, if exists
    desc = newUploads['snippet.description'][i].lower()
    if "ingredient" in desc:
        section = desc.split("ingredient",1)[1] 
    else:
        print("error at index",i)
        masterIngredientDicts.append(ingredientDict)
        masterIngredientNames.append(ingredientNames)
        continue
    
    # format out foreign language / some symbols -- account for fractions
    formatted = re.sub(r'[^a-zA-Z| |0-9|\n|,|\+|\|.|/|\(|\)]', '', section)
    formatted = re.sub(r'/[^0-9]', '',formatted)
    
    # filter out instructional steps
    ingredients = re.split('\n',formatted)
    ingredients = list(filter(filterOutSteps, ingredients))
    newLine = "\n"
    ingredients = newLine.join(ingredients)

    # break up multi-line ingredients and filter out invalid ones
    ingredients = re.split(',|\n|\+|\(|\)',ingredients)
    ingredients = list(map(lambda x: x.strip(),ingredients))
    ingredients = list(filter(filterOutIngred, ingredients))
    ingredients = newLine.join(ingredients)
    
    # call spoonacular API to parse ingredients
    payload = {'ingredientList': ingredients, 'servings': 1}
    response = requests.request("POST", url, data=payload, headers=headers)
    data = response.json()   
    
    # cleaning
    data = list(map(cleanParsed,data))
    
    # apply another filter
    dataTest = list(filter(filterParsed, data))

    # iterate through each entry returned (each ingredient parsed out)
    for ingredient in dataTest:
        ingredientName = ingredient['name']

        # check if ingredient has been used in recipe before (if so, then aggregate)
        if ingredientName in ingredientDict.keys():    
            unitName = ingredient['unitShort']
            amount = ingredient['amount']

            if unitName in ingredientDict[ingredientName].keys():
                ingredientDict[ingredientName][unitName] += amount
            else:
                ingredientDict[ingredientName][unitName] = amount
       
        # if new ingredient, create new entry in recipe
        else:
            ingredientInfo = dict((k, ingredient[k]) for k in ('amount', 'unitShort'))
            ingredientNames.append(ingredientName)

            unitName = ingredientInfo['unitShort']
            amount = ingredientInfo['amount']

            ingredientDict[ingredientName] = dict()
            ingredientDict[ingredientName][unitName] = amount
          
        # update the master ingredient index 
        if ingredientName not in list(IngredientIndex.index):
            IngredientIndex.loc[ingredientName, 'count'] = 1
            try:
                IngredientIndex.loc[ingredientName, 'aisle'] = ingredient['aisle']
            except:
                IngredientIndex.loc[ingredientName, 'aisle'] = "Other"
                
        elif newRecipe:
            IngredientIndex.loc[ingredientName, 'count'] += 1
            newRecipe = False
            
    masterIngredientDicts.append(ingredientDict)
    masterIngredientNames.append(ingredientNames)

index:  1
cm round mold
cmhario
cmiwaki


In [26]:
# add ingredient details and names to dataframe
newUploads['ingredientDetails'] = masterIngredientDicts
newUploads['ingredientNames'] = masterIngredientNames

# only keep those that had ingredients
newUploads = newUploads[newUploads['ingredientNames'].apply(lambda x: True if len(x)!= 0 else False)]

In [27]:
IngredientIndex.reset_index(inplace=True)
IngredientIndex.columns = ['name','count','aisle']

# df to mongo

In [29]:
# pull out key columns & rename
videoDf = newUploads[['snippet.resourceId.videoId','snippet.publishedAt','snippet.title','snippet.channelTitle','ingredientNames','ingredientDetails']]

videoDf.rename(columns={'snippet.resourceId.videoId':
                      'id', 'snippet.publishedAt':'published', 'snippet.title':'title',
                      'snippet.channelTitle':'channel'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [30]:
# insert into mongodb
videos = db.videos
videoDict = videoDf.to_dict(orient = 'records')
videos.insert_many(videoDict)

<pymongo.results.InsertManyResult at 0x1129af6c8>

In [31]:
# clear existing ingredient index
ingredients = db.ingredients
ingredients.drop()

In [32]:
# insert updated ingredient index into mongodb
ingredientDict = IngredientIndex.to_dict(orient = 'records')
ingredients.insert_many(ingredientDict)

<pymongo.results.InsertManyResult at 0x1103100c8>