In [11]:
from IPython.display import display, clear_output
import os
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import datetime
import pandas as pd
import numpy as np
import requests
import time
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import gmplot
from geopy.geocoders import Nominatim
%matplotlib inline

In [4]:
# define the class cheese
class RawCheese:
    def __init__(self,cheese_name):
        self.url_name = cheese_name
        self.cheese_com_url = 'https://www.cheese.com/{}/'.format(cheese_name)
        self.dump = []
        self.soup = None
    
    def get_soup(self):
        if not self.soup:
            r = requests.get(self.cheese_com_url)
            self.soup = BeautifulSoup(r.content, 'html.parser')
                
    @property
    def dict(self):
        self.get_soup()
        cheese_dict = {'name': self.url_name}
        summary_points = self.soup.find_all("ul", {"class": "summary-points"})
        features_list = [x.text for x in summary_points[0].find_all("p")]
        for feature in features_list:
            if ":" in feature:
                split_feature = feature.split(":")
                feature_key = split_feature[0].lower().replace(" ","_")
                feature_val = split_feature[1].strip().lower()
                cheese_dict[feature_key]=feature_val
            elif "Made from" in feature:
                feature_key = 'ingredients'
                feature_val = feature.split("Made from")[1].lower()
                cheese_dict[feature_key]=feature_val.strip()
            else:
                self.dump.append(feature)
        return cheese_dict

def get_cheeses_url_names(soup):
    """
        given a soup, extract all the cheeses url names present in the corresponding page
    """
    cheeses_url_names = []
    cheeses_divs = soup.find_all("div", {"class": "col-sm-6 col-md-4 cheese-item text-center"})
    for cheese_div in cheeses_divs:
        cheeses_url_names.append(cheese_div.find("h3").findChild().attrs['href'].replace("/",""))
    return cheeses_url_names

def get_all_cheeses_url_names():
    all_cheeses_url_names = set()
    # Create alphabet list of lowercase letters
    alphabet = []
    for letter in range(97,123):
        alphabet.append(chr(letter))
    for letter in alphabet:
        keep_going=True
        n=0
        print(letter)
        while keep_going and n<5:
            n+=1
            letter_url = "https://www.cheese.com/alphabetical/?per_page=100&i={}&page={}#top".format(letter,n)
            clear_output(wait=True)
            display("getting letter {} page #{}".format(letter,n))
            time.sleep(0.1) 
            r = requests.get(letter_url)
            soup = BeautifulSoup(r.content,'html.parser')
            page_cheeses_url_names = set(get_cheeses_url_names(soup))
            diff = page_cheeses_url_names.difference(set(all_cheeses_url_names))
            if not diff:
                keep_going = False
            else:
                all_cheeses_url_names.update(diff)
    return sorted(list(all_cheeses_url_names))

def get_raw_cheeses_df(all_cheeses_url_names):
    # creates the cheese data frame                
    columns = ["name"]
    raw_cheeses_df = pd.DataFrame(columns=columns)
    raw_cheeses_df.set_index("name")               
    for cheese_url in all_cheeses_url_names:
        clear_output(wait=True)
        display("getting cheese {}".format(cheese_url))
        raw_cheese = RawCheese(cheese_url)
        time.sleep(0.1)
        raw_cheeses_df = raw_cheeses_df.append(raw_cheese.dict, ignore_index=True)
    return raw_cheeses_df
    

In [5]:
all_cheeses_url_names = get_all_cheeses_url_names()

'getting letter z page #2'

In [6]:
raw_cheeses_df = get_raw_cheeses_df(all_cheeses_url_names)

'getting cheese zwitser'

In [15]:
raw_cheeses_df.to_csv('raw_cheese_df.csv')

In [8]:
set(raw_cheeses_df['ingredients'])

{nan,
 "goat's and sheep's milk",
 "pasteurized goat's or sheep's milk",
 "unpasteurized goat's and sheep's milk",
 "pasteurized or unpasteurized cow's, goat's and sheep's milk",
 "moose's milk",
 "donkey's milk",
 "cow's and sheep's milk",
 "pasteurized goat's milk",
 "goat's milk",
 "mare's milk",
 "yak's milk",
 "unpasteurized cow's and sheep's milk",
 "pasteurized buffalo's or cow's milk",
 "cow's and goat's milk",
 "buffalo's milk",
 "unpasteurized cow's or goat's milk",
 "pasteurized cow's or water buffalo's milk",
 "pasteurized cow's, goat's and water buffalo's milk",
 "cow's or sheep's milk",
 "water buffalo's milk",
 "unpasteurized cow's, goat's and sheep's milk",
 "cow's, goat's or sheep's milk",
 "pasteurized sheep's milk",
 "pasteurized water buffalo's milk",
 "pasteurized or unpasteurized cow's, goat's, sheep's and water buffalo's milk",
 "pasteurized or unpasteurized cow's or sheep's milk",
 "pasteurized or unpasteurized cow's or water buffalo's milk",
 "cow's, goat's and

In [9]:
set(raw_cheeses_df['rind'])

{nan,
 'mold ripened',
 'waxed',
 'artificial',
 'cloth wrapped',
 'bloomy',
 'plastic',
 'washed',
 'leaf wrapped',
 'rindless',
 'natural',
 'ash coated'}

In [16]:
raw_cheeses_df.keys()

Index(['name', 'aroma', 'colour', 'country_of_origin', 'family', 'flavour',
       'ingredients', 'producers', 'region', 'rind', 'texture', 'type',
       'vegetarian', 'synonyms', 'fat_content', 'alternative_spellings',
       'calcium_content', 'fat_content_(in_dry_matter)'],
      dtype='object')

In [37]:
flavours = set()
for these_flavors in raw_cheeses_df['flavour']:
    if these_flavors:
        print(these_flavors)
        these_flavors_list = [these_flavors.strip() for flavour in these_flavors.split(',')]
        flavours.update(these_flavors_list)
    

buttery, salty, tangy
burnt caramel
nan


AttributeError: 'float' object has no attribute 'split'

In [36]:
these_flavors = [flavour.strip() for flavour in raw_cheeses_df['flavour'][0].split(',')]

In [28]:
raw_cheeses_df['flavour'][0].split(',')
            

['buttery', ' salty', ' tangy']

In [31]:
flavours.update(these_flavors)

In [32]:
flavours

{'buttery', 'salty', 'tangy'}

In [39]:
these_flavors.__class__

float