In [107]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import pickle
import numpy as np

import nltk
import gensim
from unidecode import unidecode
import string

# Getting the data

In [None]:
t0 = time.time()

food_categories = ['american','american-new','german','crepes','french','burgers','deli']#'asianfusion','californian','chinese','dim-sum','sandwiches'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

base_url = url = 'http://www.allmenus.com/ca/san-francisco/'

rest_url_list = []
category_list = []

print 'Scraping for urls'
for fcat in food_categories:
    # Get all the restaurants that food category falls into
    top_page = requests.get(base_url + '-/' + fcat + '/',headers=headers)
    
    # Turn it into a BeautifulSoup object
    top_soup= BeautifulSoup(top_page.text, "lxml")
    all_rest_links = top_soup.findAll("p",{"class","restaurant_name"})
    
    for rest in all_rest_links:
        rest_url_list.append(rest.find('a')['href'])
        category_list.append(fcat)
    
print 'URLs obtained, time to scrape for menus'

In [50]:
def scrape_menus(rest_url_list,category_list):

    t0 = time.time()

    item_list = []
    i=1
    for url,fcat in zip(rest_url_list,category_list):
        #print "Scraping restaurant %d"%(i)

        page = requests.get('http://www.allmenus.com'+url,headers=headers)
        soup = BeautifulSoup(page.text, "lxml")

        # Extract restaurant name
        name = unidecode(soup.find("h1", {"itemprop":"name"}).text)

        # Extract street address
        saddr = unidecode(soup.find("span",{"itemprop":"streetAddress"}).text)

        # Extract the city
        city = unidecode(soup.find("span",{"itemprop":"addressLocality"}).text)

        # Extract the state
        state = unidecode(soup.find("span",{"itemprop":"addressRegion"}).text)

        # Extract the zip code
        zipc = unidecode(soup.find("span",{"itemprop":"postalCode"}).text)

        # Extract yelp rating
        try:
            yelp_rating = float(soup.find("meta",{"itemprop":"ratingValue"})['content'])
        except:
            yelp_rating = None

        # Extract number of yelp reviews
        try:
            num_yelp_reviews = int(soup.find("meta",{"itemprop":"reviewCount"})['content'])
        except:
            num_yelp_reviews = None

        # Get the yelp link
        try:
            yelp_link = soup.find("span",{"class":"review_count"}).find('a')['href']
        except:
            yelp_link = None

        all_categories = soup.find_all("div",{"class":"category"})

        for cat in all_categories:
            category_name = unidecode(cat.find("div",{"class":"category_head"}).h3.text)
            category_description = unidecode(cat.find("div",{"class":"category_head"}).p.text)

            all_menu_items_in_category = cat.find_all("li",{"class":"menu_item"})

            for menu_item in all_menu_items_in_category:
                item_name = unidecode(menu_item.find("span",{"class":"name"}).text)
                item_description = unidecode(menu_item.find("p",{"class":"description"}).text)
                try:
                    item_price = unidecode(menu_item.find("span",{"class":"price"}).text)
                except:
                    item_price = []

                new_item = {'restaurant_name':name, 'item_name':item_name,'item_description':item_description,'item_price':item_price,'category_name'
                        :category_name,'category_description':category_description,'street_address':saddr,'city':city,
                        'state':state,'zip':zipc,'full_address':", ".join([saddr,city,state,zipc]),"yelp_rating":yelp_rating,
                       'num_reviews':num_yelp_reviews,'yelp_link':yelp_link,'restaurant_category':fcat}
                item_list.append(new_item)
        i+=1

    all_menus_rest_df = pd.DataFrame(item_list)
    del item_list
    return all_menus_rest_df
    t1 = time.time()
    print str(t1-t0) + 'seconds'

Load the pickled dataframe

In [2]:
all_menus_rest_df = pd.read_pickle('all_menus_rest_df.p')

Remove all the duplicated entries (same restaurant, same address, same menu item)

In [46]:
all_menus_drop_dup = all_menus_rest_df.drop_duplicates(['restaurant_name','item_name','street_address'])

Get the menu for Ti Couz

In [56]:
ti_couz_df = scrape_menus(['/ca/san-francisco/157991-ti-couz/menu/'],'crepes')

Do some basic clean up for my example

In [79]:
ti_couz_df.item_description[(ti_couz_df.category_name=='Krampouz Ble Noir - Savory Crepe')] = ti_couz_df.item_description[(ti_couz_df.category_name=='Krampouz Ble Noir - Savory Crepe')] + " savory crepe" 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [89]:
ti_couz_df.item_description[ti_couz_df.category_name=='Krampouz Froment - Sweet Crepe'] = ti_couz_df.item_description[ti_couz_df.category_name=='Krampouz Froment - Sweet Crepe'] +  " sweet crepe"  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Merge the main dataframe with the Ti couz one

In [93]:
all_menus_drop_dup = pd.concat([all_menus_drop_dup,ti_couz_df])

In [104]:
all_menus_drop_dup['item_name_and_description'] = all_menus_drop_dup.item_name.map(str) + " " + all_menus_drop_dup.item_description.map(str)

In [132]:
def removePunctuation(df,col_name):
    out_df = df[col_name].apply(lambda x: x.translate(string.maketrans("",""), string.punctuation))
    return out_df

In [133]:
def tokenize(df,col_name):
    out_df = df[col_name].apply(lambda x: x.split())
    return out_df

In [131]:
def removeStopWordsMakeLowerCase(df,col_name):
    out_df = df[col_name].apply(lambda x: [i.lower() for i in x if i.lower() not in nltk.corpus.stopwords.words('english')]) 
    return out_df

In [134]:
def lemmatize(df,col_name):
    from nltk.stem.wordnet import WordNetLemmatizer
    lmtzr = WordNetLemmatizer()
    out_df = [[lmtzr.lemmatize(unicode(i)) for i in x] for x in df[col_name]]
    return out_df

In [135]:
def removeDuplicateWords(df,col_name):
    out_df = [list(set(i)) for i in df[col_name]]
    return out_df

In [122]:
test = [[lmtzr.lemmatize(unicode(i)) for i in x] for x in test]

In [146]:
test = all_menus_drop_dup.tail()

In [138]:
test.item_description = removePunctuation(test,'item_description')
test.item_description = tokenize(test,'item_description')
test.item_description = removeStopWordsMakeLowerCase(test,'item_description')
test.item_description = lemmatize(test,'item_description')
test.item_description = removeDuplicateWords(test,'item_description')
test.item_description

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


AttributeError: 'NoneType' object has no attribute 'split'

In [140]:
test.item_description = removePunctuation(test,'item_description')


In [143]:
test.item_description =  tokenize(test,'item_description')


AttributeError: 'NoneType' object has no attribute 'split'

In [144]:
test['item_description']

84    None
85    None
86    None
87    None
88    None
Name: item_description, dtype: object

In [147]:
test

Unnamed: 0,category_description,category_name,city,full_address,item_description,item_name,item_price,num_reviews,restaurant_category,restaurant_name,state,street_address,yelp_link,yelp_rating,zip,item_name_and_description
84,"garnished with homemade chantilly, to create y...",Krampouz Froment - Sweet Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",banana or apple or pear sweet crepe,Banane Ou Pomme Ou Poire,$5.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,Banane Ou Pomme Ou Poire banana or apple or pe...
85,"garnished with homemade chantilly, to create y...",Krampouz Froment - Sweet Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",chestnut sweet crepe,Chataigne,$5.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,Chataigne chestnut sweet crepe
86,"garnished with homemade chantilly, to create y...",Krampouz Froment - Sweet Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",nutella sweet crepe,Nutella,$5.00,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,Nutella nutella sweet crepe
87,"garnished with homemade chantilly, to create y...",Krampouz Froment - Sweet Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",compote (cooked fruit) sweet crepe,Compote,$5.25,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,Compote compote (cooked fruit) sweet crepe
88,"garnished with homemade chantilly, to create y...",Krampouz Froment - Sweet Crepe,San Francisco,"3108 16th St, San Francisco, CA, 94103",berries (with berry butter) sweet crepe,Baies,$5.75,1148.0,c,Ti-couz - CLOSED,CA,3108 16th St,http://www.yelp.com/biz/ti-couz-san-francisco,4.0,94103,Baies berries (with berry butter) sweet crepe


In [148]:
test2 = test.item_name_and_description

In [149]:
test2

84    Banane Ou Pomme Ou Poire banana or apple or pe...
85                       Chataigne chestnut sweet crepe
86                          Nutella nutella sweet crepe
87          Compote  compote (cooked fruit) sweet crepe
88        Baies berries (with berry butter) sweet crepe
Name: item_name_and_description, dtype: object

In [150]:
test2.apply(lambda x: x.translate(string.maketrans("",""), string.punctuation))

84    Banane Ou Pomme Ou Poire banana or apple or pe...
85                       Chataigne chestnut sweet crepe
86                          Nutella nutella sweet crepe
87            Compote  compote cooked fruit sweet crepe
88          Baies berries with berry butter sweet crepe
Name: item_name_and_description, dtype: object