Web-scrape the website c-spot.com to collect chocolate bar reviews: the overall rating, component scores such as CQ, sweetness, acidity and bitterness, the manufacturer, the chocolate origin, etc.

In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
#gets the name of the choco bar, name of the company, user rating
def get_header_info(soup_tables):
    """Return the bar name, company name and star rating from the page header.

    soup_tables: the parsed header element; it is searched for the
        'census-product-title' div.

    Returns a dict with the keys 'choco_bar_name', 'company_name' and
    'rating' (a float read from the 'data-score' attribute).

    Nothing is caught here: a missing element or a heading without ' by '
    raises and is left to the caller.
    """
    title_div = soup_tables.find('div', class_='census-product-title')

    #the heading is split on ' by ': first piece is the bar, second the company
    heading = title_div.next_element.next_element.get_text()
    pieces = heading.split(' by ')
    bar = pieces[0]
    maker = pieces[1]

    #the element holding the star rating is two siblings after the title div
    star_tag = title_div.next_sibling.next_sibling.find(id='star-rating')
    score = float(star_tag.attrs['data-score'])

    return {
        'choco_bar_name': bar,
        'company_name': maker,
        'rating': score
    }
    
#get_header_info(soup_tables)

In [3]:
#gets the chocolate bar company location, cocoa percentage in bar,
#bean type, where bean originally came from, and intended flavor by maker
def get_table_info(soup_tables):
    """Read the details table of the product header.

    The 'census-product-header-content' cells are read by position:
    0 company location, 1 cocoa percentage, 2 bean type, 3 bean origin,
    4 flavor. Every field is best-effort: if its cell is missing or cannot
    be parsed the field is set to None instead of failing the whole page.

    soup_tables: the parsed header element that contains the details table.

    Returns a dict with the keys 'comapany_loc', 'cocoa_perc' (float or
    None), 'bean_type', 'bean_origin' and 'company_flavor'.

    A missing table container is not handled here; that error is left to
    the caller.
    """
    #go to table
    table_div = soup_tables.find('div', class_="et_pb_column et_pb_column_100 census-product-header-details-rating")
    #go to each row
    cells = table_div.find_all('td', class_="census-product-header-content")

    #can't loop - need info from different places
    #get company location (first word of the cell)
    try:
        company_loc = cells[0].get_text().split()[0]
    except Exception:
        company_loc = None

    #get cocoa percentage - out of 100
    #take the whole number in front of the first '%' so that '75%', '100%'
    #and '72.5%' are all read correctly; no '%' in the cell means no value
    try:
        perc_match = re.search(r'(\d+(?:\.\d+)?)\s*%', cells[1].get_text())
        cocoa_perc = float(perc_match.group(1)) if perc_match else None
    except Exception:
        cocoa_perc = None

    #get bean type
    bean_varieties = ['Criollo', 'Trinitario']
    try: #not all have the general bean type
        bean_text = cells[2].get_text()
        bean_type = bean_text.split()[0]
        if bean_type not in bean_varieties:
            #first word inside the parentheses, without the closing character
            bean_type = bean_text.split("(")[1][:-1].split()[0].strip(';')
    except Exception: #no parenthesised details - fall back to the first word
        try:
            bean_type = cells[2].get_text().split()[0]
        except Exception:
            bean_type = None

    #get bean origin (text before the first non-breaking space)
    try:
        bean_origin = cells[3].get_text().split("\xa0")[0]
        if bean_origin == '':
            bean_origin = None
    except Exception:
        bean_origin = None

    #get flavor
    try:
        flavor = cells[4].get_text().split()[0].strip(";").strip('(').strip(')')
    except Exception:
        flavor = None

    #combine data
    #NOTE: 'comapany_loc' is misspelled but is the column name of the
    #produced dataset, so it is kept as is
    table_dict = {
        'comapany_loc': company_loc,
        'cocoa_perc': cocoa_perc,
        'bean_type': bean_type,
        'bean_origin': bean_origin,
        'company_flavor': flavor #flavor intended by company
    }

    return table_dict

#get_table_info(soup_tables)

In [4]:
#get user ratings of different aspects of chocolate bar
def get_perc_info(soup_tables):
    """Read the percentage meter bars of the product header.

    The 'meter-bar' divs are matched by position to the category names in
    percent_cats, and the value of each bar is the number in front of the
    first '%' in its 'style' attribute.

    soup_tables: the parsed header element that contains the meter bars.

    Returns a dict mapping category name to its percentage as a float.

    Raises ValueError when a bar's style holds no percentage, KeyError when
    a bar has no 'style' attribute and IndexError when there are more bars
    than category names; none of these are caught here.
    """
    #got to the percent bars table (two siblings after the details column)
    details_col = soup_tables.find('div', class_="et_pb_column et_pb_column_100 census-product-header-details-rating")
    meter_col = details_col.next_sibling.next_sibling
    meter_bars = meter_col.find_all('div', class_="meter-bar")
    #state the names for each variable
    percent_cats = ['CQ', 'sweetness', 'acidity', 'bitterness',
                    'roast', 'intensity', 'complexity', 'structure',
                    'length']
    #create space to store info
    all_percents = {}
    #get the percentage for each bar
    for i, bar in enumerate(meter_bars):
        style = bar.attrs['style']
        #read the full number, so '100%' is 100 and not the trailing '00'
        perc_match = re.search(r'(\d+(?:\.\d+)?)\s*%', style)
        if perc_match is None:
            raise ValueError("no percentage found in meter-bar style %r" % (style,))
        all_percents[percent_cats[i]] = float(perc_match.group(1))

    return all_percents

#get_perc_info(soup_tables)

In [5]:
#gets the appearance, aroma, mouthfeel, flavor, and quality ratings
def get_lower_info(soup):
    """Return the lower-page scores, each rescaled to a 0-100 range.

    Every 'census-product-spec-label' div except the first is matched by
    position to appearance, aroma, mouthfeel, flavor and quality. From the
    whitespace-separated text of a label the second token is read as the
    score and the fourth as the maximum (presumably text of the form
    'Label 4.1 / 5' - verify against the page).

    soup: the parsed page.

    Returns a dict mapping the category name to the score scaled to be out
    of 100 and rounded to one decimal.
    """
    #state the names for each variable
    labels = ['appearance', 'aroma', 'mouthfeel', 'flavor', 'quality']
    #get the info at the bottom of the page
    spec_tags = soup.find_all('div', class_="census-product-spec-label")

    lower_dict = {}
    #the first label is skipped; the rest line up with the names above
    for idx, tag in enumerate(spec_tags[1:]):
        tokens = tag.get_text().split()
        score = float(tokens[1])
        out_of = float(tokens[3])
        #scale to out of 100 and store
        lower_dict[labels[idx]] = round(score * (100/out_of), 1)

    return lower_dict

#get_lower_info(soup)

In [6]:
#gets the information from page and combines it into one dictionary
def get_page_info(soup):
    """Collect every scraped field of one bar page into a single dict.

    The passed-in soup is used directly. Previously the argument was
    ignored and the page was parsed again from a global variable, so the
    function only worked when that global happened to be set.

    soup: the parsed page.

    Returns a dict mapping each field name to a one-element list, which is
    the shape needed to build a single-row DataFrame from it.

    Errors raised by the section parsers are not caught here; the caller
    decides whether to skip the page.
    """
    #go to the where the charts and name of choco is
    soup_tables = soup.find('div', id='census-product-header')

    #get infos from different parts of page
    sections = [
        get_header_info(soup_tables),
        get_table_info(soup_tables),
        get_perc_info(soup_tables),
        get_lower_info(soup),
    ]

    #combine all the dicts into one; later sections win on duplicate keys
    combined = {}
    for section in sections:
        for key, value in section.items():
            #need values in a list to turn in Dataframe
            combined[key] = [value]

    return combined

#get_page_info(soup)

In [10]:
#Go through the different chocolate webpages and gather information about each bar

#create page numbers
page_num = list(range(441, 2048))
pre_url = 'https://www.c-spot.com/chocolate-census/bars/bar/?pid='
#seconds to wait on one request; without a timeout a stalled connection
#would block the whole run
request_timeout = 30
#single-row DataFrames, one per parsed page; joined once at the end instead
#of re-copying the growing DataFrame on every page
page_frames = []
#create DataFrame for all chocolates
all_data = pd.DataFrame()
try:
    for pg_num in page_num:
        print('page_num: ', pg_num)
        #combine url
        url = pre_url+str(pg_num)
        try:
            page = requests.get(url, timeout=request_timeout)
        except requests.RequestException:
            #a network problem on one page should not lose everything scraped so far
            print('Skipped page_num: ', pg_num)
            page = None
        #check if page exist
        if page is not None and page.status_code == 200:
            #name the parser explicitly so the tree does not depend on which
            #parser libraries happen to be installed
            soup = BeautifulSoup(page.content, 'html.parser')
            #get the information from page
            try:
                choco_dict = get_page_info(soup)
                #create a dataframe
                choco_df = pd.DataFrame(choco_dict)
                page_frames.append(choco_df)
            #some webpages are missing sooooo much - just skip it
            #(Exception rather than a bare except, so Ctrl-C still stops the run)
            except Exception:
                print('Skipped page_num: ', pg_num)

        #optional checkpoint: save what has been collected so far to csv
        #if pg_num in [1000, 1500, 2000, 2047]:
        #    file_name = "choco_rating_component_"+str(pg_num)+".csv"
        #    pd.concat(page_frames, ignore_index=True).to_csv(file_name, encoding='utf-8', index=False)
finally:
    #combine all dataframes together, also when the run is interrupted, so
    #the pages scraped so far are kept
    if page_frames:
        all_data = pd.concat(page_frames, ignore_index=True)

page_num:  441
page_num:  442
page_num:  443
page_num:  444
page_num:  445
page_num:  446
page_num:  447
page_num:  448
page_num:  449
page_num:  450
page_num:  451
page_num:  452
page_num:  453
page_num:  454
page_num:  455
page_num:  456
page_num:  457
page_num:  458
page_num:  459
page_num:  460
page_num:  461
page_num:  462
page_num:  463
page_num:  464
page_num:  465
page_num:  466
page_num:  467
page_num:  468
page_num:  469
page_num:  470
Skipped page_num:  470
page_num:  471
page_num:  472
page_num:  473
page_num:  474
page_num:  475
page_num:  476
page_num:  477
page_num:  478
page_num:  479
page_num:  480
page_num:  481
page_num:  482
page_num:  483
page_num:  484
page_num:  485
page_num:  486
Skipped page_num:  486
page_num:  487
page_num:  488
page_num:  489
page_num:  490
Skipped page_num:  490
page_num:  491
page_num:  492
page_num:  493
page_num:  494
Skipped page_num:  494
page_num:  495
Skipped page_num:  495
page_num:  496
page_num:  497
Skipped page_num:  497
page_nu

Skipped page_num:  730
page_num:  731
Skipped page_num:  731
page_num:  732
Skipped page_num:  732
page_num:  733
Skipped page_num:  733
page_num:  734
Skipped page_num:  734
page_num:  735
Skipped page_num:  735
page_num:  736
Skipped page_num:  736
page_num:  737
Skipped page_num:  737
page_num:  738
Skipped page_num:  738
page_num:  739
Skipped page_num:  739
page_num:  740
Skipped page_num:  740
page_num:  741
Skipped page_num:  741
page_num:  742
Skipped page_num:  742
page_num:  743
page_num:  744
page_num:  745
Skipped page_num:  745
page_num:  746
Skipped page_num:  746
page_num:  747
page_num:  748
Skipped page_num:  748
page_num:  749
Skipped page_num:  749
page_num:  750
Skipped page_num:  750
page_num:  751
Skipped page_num:  751
page_num:  752
Skipped page_num:  752
page_num:  753
Skipped page_num:  753
page_num:  754
page_num:  755
page_num:  756
page_num:  757
Skipped page_num:  757
page_num:  758
Skipped page_num:  758
page_num:  759
Skipped page_num:  759
page_num:  76

page_num:  1040
page_num:  1041
page_num:  1042
page_num:  1043
page_num:  1044
page_num:  1045
page_num:  1046
Skipped page_num:  1046
page_num:  1047
page_num:  1048
page_num:  1049
page_num:  1050
page_num:  1051
page_num:  1052
page_num:  1053
page_num:  1054
page_num:  1055
Skipped page_num:  1055
page_num:  1056
page_num:  1057
page_num:  1058
page_num:  1059
page_num:  1060
page_num:  1061
page_num:  1062
page_num:  1063
page_num:  1064
page_num:  1065
page_num:  1066
page_num:  1067
page_num:  1068
page_num:  1069
page_num:  1070
page_num:  1071
page_num:  1072
page_num:  1073
page_num:  1074
page_num:  1075
page_num:  1076
page_num:  1077
page_num:  1078
page_num:  1079
page_num:  1080
page_num:  1081
page_num:  1082
page_num:  1083
page_num:  1084
page_num:  1085
page_num:  1086
page_num:  1087
page_num:  1088
page_num:  1089
page_num:  1090
page_num:  1091
page_num:  1092
page_num:  1093
page_num:  1094
page_num:  1095
page_num:  1096
page_num:  1097
page_num:  1098
page_num

page_num:  1461
page_num:  1462
page_num:  1463
Skipped page_num:  1463
page_num:  1464
page_num:  1465
page_num:  1466
Skipped page_num:  1466
page_num:  1467
Skipped page_num:  1467
page_num:  1468
Skipped page_num:  1468
page_num:  1469
page_num:  1470
page_num:  1471
page_num:  1472
page_num:  1473
page_num:  1474
page_num:  1475
page_num:  1476
page_num:  1477
page_num:  1478
page_num:  1479
page_num:  1480
page_num:  1481
page_num:  1482
page_num:  1483
page_num:  1484
page_num:  1485
page_num:  1486
page_num:  1487
Skipped page_num:  1487
page_num:  1488
Skipped page_num:  1488
page_num:  1489
page_num:  1490
page_num:  1491
page_num:  1492
page_num:  1493
page_num:  1494
page_num:  1495
page_num:  1496
page_num:  1497
page_num:  1498
page_num:  1499
page_num:  1500
page_num:  1501
page_num:  1502
page_num:  1503
page_num:  1504
page_num:  1505
page_num:  1506
page_num:  1507
page_num:  1508
page_num:  1509
page_num:  1510
page_num:  1511
page_num:  1512
page_num:  1513
Skipped 

page_num:  1914
page_num:  1915
page_num:  1916
page_num:  1917
page_num:  1918
page_num:  1919
page_num:  1920
page_num:  1921
page_num:  1922
page_num:  1923
page_num:  1924
page_num:  1925
page_num:  1926
page_num:  1927
page_num:  1928
page_num:  1929
page_num:  1930
page_num:  1931
page_num:  1932
page_num:  1933
page_num:  1934
page_num:  1935
page_num:  1936
page_num:  1937
page_num:  1938
page_num:  1939
page_num:  1940
page_num:  1941
page_num:  1942
page_num:  1943
Skipped page_num:  1943
page_num:  1944
page_num:  1945
page_num:  1946
page_num:  1947
page_num:  1948
page_num:  1949
Skipped page_num:  1949
page_num:  1950
page_num:  1951
page_num:  1952
page_num:  1953
page_num:  1954
page_num:  1955
page_num:  1956
page_num:  1957
page_num:  1958
page_num:  1959
page_num:  1960
page_num:  1961
page_num:  1962
page_num:  1963
page_num:  1964
Skipped page_num:  1964
page_num:  1965
page_num:  1966
page_num:  1967
page_num:  1968
page_num:  1969
Skipped page_num:  1969
page_num

In [15]:
#display the collected dataset as the cell output
all_data

Unnamed: 0,choco_bar_name,company_name,rating,comapany_loc,cocoa_perc,bean_type,bean_origin,company_flavor,CQ,sweetness,...,roast,intensity,complexity,structure,length,appearance,aroma,mouthfeel,flavor,quality
0,Nib Brittle,Theo,2.5,USA,65,Hybrid,Madagascar,Earthen,57.0,55.0,...,62.0,51.0,43.0,41.0,29.0,74.0,72.0,72.0,81.4,83.0
1,Vanilla Milk Choc,Theo,4.0,USA,,Blend,,Earthen,77.0,80.0,...,62.0,28.0,46.0,81.0,34.0,100.0,89.0,92.0,89.0,87.0
2,Ivory Coast,Theo,4.0,USA,75,Amazon,Ivory Coast,Earthen,53.0,42.0,...,71.0,37.0,36.0,80.0,82.0,92.0,82.0,85.3,88.4,86.5
3,Ghana-Panama-Ecuador,Theo,3.5,USA,75,Blend,(Ghana [Kumasi]; Panama [Bocas del Toro]; Ecua...,Crossover,44.0,48.0,...,63.0,45.0,52.0,39.0,81.0,100.0,80.0,82.0,86.4,87.0
4,Mad 65,Theo,4.0,USA,65,Criollo,Madagascar,Fruits,60.0,56.0,...,68.0,64.0,62.0,83.0,51.0,100.0,73.0,90.7,89.0,90.0
5,Pure White,Venchi,3.0,Italy,,,,Sugar,13.0,91.0,...,52.0,41.0,7.0,37.0,24.0,82.0,74.0,77.3,85.8,86.5
6,85% Blend,Venchi,3.5,Italy,85,Blend,,Earthen,23.0,21.0,...,76.0,52.0,10.0,78.0,69.0,92.0,84.0,83.3,84.2,82.5
7,Le Noir 56,Valrhona,3.5,France,56,Blend,,Naked,81.0,80.0,...,44.0,42.0,21.0,77.0,26.0,94.0,76.0,80.7,87.0,89.0
8,Noir 71,Valrhona,4.0,France,71,Criollo,,Crossover,61.0,60.0,...,62.0,31.0,86.0,81.0,35.0,100.0,85.0,74.7,88.0,92.5
9,Le Noir Extra Amer,Valrhona,3.0,France,85,Blend,(Africa),Earthen,22.0,21.0,...,33.0,67.0,26.0,77.0,91.0,92.0,72.0,69.3,86.2,86.5


In [16]:
#turn dataframe to csv
# all_data.to_csv('choco_rating_dataset.csv', index=False)