In [107]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import pickle
import numpy as np

import nltk
import gensim
from unidecode import unidecode
import string

# Getting the data

In [None]:
t0 = time.time()

food_categories = ['american','american-new','german','crepes','french','burgers','deli']#'asianfusion','californian','chinese','dim-sum','sandwiches'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

base_url = url = 'http://www.allmenus.com/ca/san-francisco/'

rest_url_list = []
category_list = []

print 'Scraping for urls'
for fcat in food_categories:
    # Get all the restaurants that food category falls into
    top_page = requests.get(base_url + '-/' + fcat + '/',headers=headers)
    
    # Turn it into a BeautifulSoup object
    top_soup= BeautifulSoup(top_page.text, "lxml")
    all_rest_links = top_soup.findAll("p",{"class","restaurant_name"})
    
    for rest in all_rest_links:
        rest_url_list.append(rest.find('a')['href'])
        category_list.append(fcat)
    
print 'URLs obtained, time to scrape for menus'

In [50]:
def scrape_menus(rest_url_list,category_list):

    t0 = time.time()

    item_list = []
    i=1
    for url,fcat in zip(rest_url_list,category_list):
        #print "Scraping restaurant %d"%(i)

        page = requests.get('http://www.allmenus.com'+url,headers=headers)
        soup = BeautifulSoup(page.text, "lxml")

        # Extract restaurant name
        name = unidecode(soup.find("h1", {"itemprop":"name"}).text)

        # Extract street address
        saddr = unidecode(soup.find("span",{"itemprop":"streetAddress"}).text)

        # Extract the city
        city = unidecode(soup.find("span",{"itemprop":"addressLocality"}).text)

        # Extract the state
        state = unidecode(soup.find("span",{"itemprop":"addressRegion"}).text)

        # Extract the zip code
        zipc = unidecode(soup.find("span",{"itemprop":"postalCode"}).text)

        # Extract yelp rating
        try:
            yelp_rating = float(soup.find("meta",{"itemprop":"ratingValue"})['content'])
        except:
            yelp_rating = None

        # Extract number of yelp reviews
        try:
            num_yelp_reviews = int(soup.find("meta",{"itemprop":"reviewCount"})['content'])
        except:
            num_yelp_reviews = None

        # Get the yelp link
        try:
            yelp_link = soup.find("span",{"class":"review_count"}).find('a')['href']
        except:
            yelp_link = None

        all_categories = soup.find_all("div",{"class":"category"})

        for cat in all_categories:
            category_name = unidecode(cat.find("div",{"class":"category_head"}).h3.text)
            category_description = unidecode(cat.find("div",{"class":"category_head"}).p.text)

            all_menu_items_in_category = cat.find_all("li",{"class":"menu_item"})

            for menu_item in all_menu_items_in_category:
                item_name = unidecode(menu_item.find("span",{"class":"name"}).text)
                item_description = unidecode(menu_item.find("p",{"class":"description"}).text)
                try:
                    item_price = unidecode(menu_item.find("span",{"class":"price"}).text)
                except:
                    item_price = []

                new_item = {'restaurant_name':name, 'item_name':item_name,'item_description':item_description,'item_price':item_price,'category_name'
                        :category_name,'category_description':category_description,'street_address':saddr,'city':city,
                        'state':state,'zip':zipc,'full_address':", ".join([saddr,city,state,zipc]),"yelp_rating":yelp_rating,
                       'num_reviews':num_yelp_reviews,'yelp_link':yelp_link,'restaurant_category':fcat}
                item_list.append(new_item)
        i+=1

    all_menus_rest_df = pd.DataFrame(item_list)
    del item_list
    return all_menus_rest_df
    t1 = time.time()
    print str(t1-t0) + 'seconds'

Load the pickled dataframe

In [2]:
all_menus_rest_df = pd.read_pickle('all_menus_rest_df.p')

Remove all the duplicated entries (same restaurant, same address, same menu item)

In [46]:
all_menus_drop_dup = all_menus_rest_df.drop_duplicates(['restaurant_name','item_name','street_address'])

Get the menu for Ti Couz

In [56]:
ti_couz_df = scrape_menus(['/ca/san-francisco/157991-ti-couz/menu/'],'crepes')

Do some basic clean up for my example

In [79]:
ti_couz_df.item_description[(ti_couz_df.category_name=='Krampouz Ble Noir - Savory Crepe')] = ti_couz_df.item_description[(ti_couz_df.category_name=='Krampouz Ble Noir - Savory Crepe')] + " savory crepe" 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [89]:
ti_couz_df.item_description[ti_couz_df.category_name=='Krampouz Froment - Sweet Crepe'] = ti_couz_df.item_description[ti_couz_df.category_name=='Krampouz Froment - Sweet Crepe'] +  " sweet crepe"  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Merge the main dataframe with the Ti couz one

In [93]:
all_menus_drop_dup = pd.concat([all_menus_drop_dup,ti_couz_df])

In [104]:
all_menus_drop_dup['item_name_and_description'] = all_menus_drop_dup.item_name.map(str) + " " + all_menus_drop_dup.item_description.map(str)

In [151]:
def removePunctuation(df,col_name):
    out_df = df[col_name].apply(lambda x: x.translate(string.maketrans("",""), string.punctuation))
    return out_df

In [152]:
def tokenize(df,col_name):
    out_df = df[col_name].apply(lambda x: x.split())
    return out_df

In [153]:
def removeStopWordsMakeLowerCase(df,col_name):
    out_df = df[col_name].apply(lambda x: [i.lower() for i in x if i.lower() not in nltk.corpus.stopwords.words('english')]) 
    return out_df

In [154]:
def lemmatize(df,col_name):
    from nltk.stem.wordnet import WordNetLemmatizer
    lmtzr = WordNetLemmatizer()
    out_df = [[lmtzr.lemmatize(unicode(i)) for i in x] for x in df[col_name]]
    return out_df

In [155]:
def removeDuplicateWords(df,col_name):
    out_df = [list(set(i)) for i in df[col_name]]
    return out_df

In [165]:
all_menus_drop_dup.item_name_and_description = removePunctuation(all_menus_drop_dup,'item_name_and_description')
all_menus_drop_dup.item_name_and_description = tokenize(all_menus_drop_dup,'item_name_and_description')
all_menus_drop_dup.item_name_and_description = removeStopWordsMakeLowerCase(all_menus_drop_dup,'item_name_and_description')
all_menus_drop_dup.item_name_and_description = lemmatize(all_menus_drop_dup,'item_name_and_description')
all_menus_drop_dup.item_name_and_description = removeDuplicateWords(all_menus_drop_dup,'item_name_and_description')

In [167]:
tokens = all_menus_drop_dup.item_name_and_description.tolist()

In [168]:
# Vectorize the tokens using tf-idf

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_model = TfidfVectorizer(tokenizer=lambda i:i, lowercase=False)
tfidf_mat = tfidf_model.fit_transform(tokens)

In [169]:
tfidf_mat

<75105x20924 sparse matrix of type '<type 'numpy.float64'>'
	with 582394 stored elements in Compressed Sparse Row format>

In [172]:
all_menus_drop_dup[all_menus_drop_dup.restaurant_name=='Ti-couz - CLOSED']

Unnamed: 0,category_description,category_name,city,full_address,item_description,item_name,item_price,num_reviews,restaurant_category,restaurant_name,state,street_address,yelp_link,yelp_rating,zip,item_name_and_description
0,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",Plain savory crepe,Nature,$2.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[savory, plain, crepe, nature]"
1,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",ham savory crepe,Jambon,$4.50,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[savory, jambon, crepe, ham]"
2,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",butter savory crepe,Beurre,$2.50,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[butter, savory, beurre, crepe]"
3,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",flavored butter (basil or parsley or garlic or...,Beurre Parfume,$3.50,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[butter, beurre, crepe, parfume, parsley, savo..."
4,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",egg savory crepe,Oeuf,$3.50,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[savory, crepe, egg, oeuf]"
5,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",cheese savory crepe,Fromage,$4.50,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[savory, cheese, crepe, fromage]"
6,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",caramelized onions savory crepe,Oignon Caramalise,$5.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[caramalise, crepe, onion, oignon, savory, car..."
7,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",organic tomatoes (with basil butter) savory crepe,Tomate Organique,$5.25,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[tomato, butter, organic, tomate, savory, basi..."
8,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",ratatouille savory crepe,Ratatouille,$6.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[savory, crepe, ratatouille]"
9,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",sausage (with basil butter) savory crepe,Saucisse,$6.25,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[butter, sausage, crepe, saucisse, savory, basil]"


In [175]:
all_menus_drop_dup.iloc[-79]

category_description         can all be garnished with homemade creme fraic...
category_name                                 Krampouz Ble Noir - Savory Crepe
city                                                             San Francisco
full_address                            3108 16th St, San Francisco, CA, 94103
item_description                   mushroom (with mushroom sauce) savory crepe
item_name                                                           Champignon
item_price                                                               $6.25
num_reviews                                                               1148
restaurant_category                                                          c
restaurant_name                                               Ti-couz - CLOSED
state                                                                       CA
street_address                                                    3108 16th St
yelp_link                        http://www.yelp.com

In [176]:
all_menus_drop_dup.shape

(75105, 16)

In [180]:
all_menus_drop_dup.iloc[75105-79]

category_description         can all be garnished with homemade creme fraic...
category_name                                 Krampouz Ble Noir - Savory Crepe
city                                                             San Francisco
full_address                            3108 16th St, San Francisco, CA, 94103
item_description                   mushroom (with mushroom sauce) savory crepe
item_name                                                           Champignon
item_price                                                               $6.25
num_reviews                                                               1148
restaurant_category                                                          c
restaurant_name                                               Ti-couz - CLOSED
state                                                                       CA
street_address                                                    3108 16th St
yelp_link                        http://www.yelp.com

In [181]:
75105-79

75026

In [182]:
# 75026 is the index for crepe with mushroom sauce

In [184]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

cosine_similarities = cosine_similarity(tfidf_mat[75026], tfidf_mat).flatten()
related_food_idcs = cosine_similarities.argsort()[::-1][1:20]

#cosine_similarities[related_food_idcs]

print related_food_idcs

all_menus_drop_dup.iloc[related_food_idcs]

[75036 75042 75050 48333 75051 39611 60097 75024 75021 75055 75018 56500
 75017 59531 75020 45648 75016 45645 75028]


Unnamed: 0,category_description,category_name,city,full_address,item_description,item_name,item_price,num_reviews,restaurant_category,restaurant_name,state,street_address,yelp_link,yelp_rating,zip,item_name_and_description
20,,Additions,San Francisco,"3108 16th St, San Francisco, CA, 94103",mushroom sauce,Sauce Champignon,$2.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[champignon, sauce, mushroom]"
26,,Additions,San Francisco,"3108 16th St, San Francisco, CA, 94103",mushroom (with mushroom sauce),Champignon,$3.75,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[sauce, champignon, mushroom]"
34,"basket of bread - $2.50, (complimentary with s...",Our Recommendations,San Francisco,"3108 16th St, San Francisco, CA, 94103",mushroom and cheese,Champignon and Fromage,$0.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[cheese, champignon, mushroom, fromage]"
53989,,Seafood Specialties,San Francisco,"Pier 2 Embarcadero St, San Francisco, CA, 94111",with shrimp and mushroom in white wine sauce,Grilled Halibut Champignon,$26.50,568.0,american-new,Sinbad's,CA,Pier 2 Embarcadero St,http://www.yelp.com/biz/sinbads-pier-ii-restau...,2.0,94111,"[mushroom, shrimp, champignon, halibut, grille..."
35,"basket of bread - $2.50, (complimentary with s...",Our Recommendations,San Francisco,"3108 16th St, San Francisco, CA, 94103",shrimp and mushroom,Crevettes and Champignon,$0.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[champignon, mushroom, crevettes, shrimp]"
43098,Cage Free Eggs,Omelets,San Francisco,"1155 Folsom St, San Francisco, CA, 94103","mushroom, baby spinach, mozzarella cheese, avo...",Champignon,$10.00,955.0,american-new,Triptych,CA,1155 Folsom St,http://www.yelp.com/biz/triptych-san-francisco,3.5,94103,"[cheese, mushroom, spinach, avocado, champigno..."
68025,"Pick Your Fruits, Filling, Sauce, Ice Cream, T...",Create Your Own Crepe,San Francisco,"Location Varies, San Francisco, CA, 94103",,Crepe,$3.50,,crepes,J Shack,CA,Location Varies,,,94103,[crepe]
8,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",ratatouille savory crepe,Ratatouille,$6.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[savory, crepe, ratatouille]"
5,can all be garnished with homemade creme fraic...,Krampouz Ble Noir - Savory Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",cheese savory crepe,Fromage,$4.50,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[savory, cheese, crepe, fromage]"
39,"basket of bread - $2.50, (complimentary with s...",Our Recommendations,San Francisco,"3108 16th St, San Francisco, CA, 94103",mushroom and cheese and almonds,Champignon Fromage and Amandes,$0.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,"[cheese, mushroom, almond, amandes, champignon..."
