In [79]:
# from selenium import webdriver
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

# OCR for tasting notes (if applicable)
from PIL import Image
import numpy as np
from io import BytesIO
import easyocr

reader = easyocr.Reader(['en']) # English-only OCR reader; build it once and reuse it, since constructing it loads the model into memory

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


# Functions

In [3]:
def get_products(url, platform = "Shopify", timeout = 30):
    """Download a store's JSON listing and return the products it contains.

    Parameters
    ----------
    url : str
        Address of the JSON endpoint to request.
    platform : str, default "Shopify"
        Selects which key of the decoded payload holds the products:
        ``"products"`` for Shopify, ``"items"`` for Squarespace.
    timeout : float, default 30
        Seconds to wait for the server; passed straight to ``requests.get``.

    Returns
    -------
    The value stored under the platform's product key in the decoded JSON.

    Raises
    ------
    ValueError
        If ``platform`` is neither "Shopify" nor "Squarespace".
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If the decoded payload does not contain the expected key.
    """
    product_keys = {"Shopify": "products", "Squarespace": "items"}
    # Reject unsupported platforms before touching the network; the old code
    # only failed afterwards, with an UnboundLocalError on `products`.
    if platform not in product_keys:
        raise ValueError(
            f"Unsupported platform {platform!r}; expected one of {sorted(product_keys)}"
        )

    response = requests.get(url, timeout = timeout)
    # Surface HTTP errors here instead of as a confusing JSON decode failure.
    response.raise_for_status()
    return response.json()[product_keys[platform]]

In [4]:
# Wix and WooCommerce are special cases that need their own scraping methods, so build_url returns their links unchanged.
def build_url(link, platform = "Shopify"):
    """Turn a store link into the address of its JSON product listing.

    Parameters
    ----------
    link : str
        Store or collection URL.
    platform : str, default "Shopify"
        "Shopify" appends ``/products.json``; "Squarespace" appends
        ``/?format=json``; any other value (including "Wix") returns
        ``link`` unchanged.

    Returns
    -------
    str
        The URL to request.
    """
    json_suffixes = {"Shopify": "/products.json", "Squarespace": "/?format=json"}
    suffix = json_suffixes.get(platform)
    if suffix is None:
        # No JSON endpoint is known for this platform: pass the link through.
        return link
    # Drop trailing slashes so "https://shop/" does not become "https://shop//products.json".
    return link.rstrip("/") + suffix

In [5]:
def extract_flavor_notes(text, profile):
    """Return every occurrence of the given flavour keywords in ``text``.

    Parameters
    ----------
    text : str
        Text to search. Any non-string value (for example None or NaN)
        yields an empty list instead of raising.
    profile : iterable of str
        Keywords to look for. They are joined with ``|`` and used as a
        regular expression, so each entry is treated as a regex fragment.
        Empty entries are ignored.

    Returns
    -------
    list
        The result of ``re.findall`` for the combined, case-insensitive
        pattern, in order of appearance; ``[]`` when there is nothing to
        search or nothing to search for.
    """
    if not isinstance(text, str):
        return []

    # An empty alternative matches at every position, which would make every
    # text look like a hit, so empty keywords are dropped first.
    terms = [term for term in profile if term]
    if not terms:
        return []

    keywords = "|".join(terms)
    return re.findall(keywords, text, re.IGNORECASE)


In [6]:
def get_product_details(storeJsonProducts, platform = "Shopify"):
    """Flatten per-store product listings into one table.

    Parameters
    ----------
    storeJsonProducts : dict
        Maps a store name to a DataFrame of that store's products. Each
        non-empty frame must have a ``title`` column plus the platform's
        HTML description column (``body_html`` for Shopify, ``excerpt`` for
        Squarespace).
    platform : str, default "Shopify"
        Chooses the description column. Any other value yields an empty
        DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ``store``, ``product`` and ``description`` (HTML stripped to
        plain text) with a fresh 0..n-1 index, or an empty DataFrame when no
        store contributed any product.
    """
    # The two platforms differ only in which column carries the HTML blurb.
    description_columns = {"Shopify": "body_html", "Squarespace": "excerpt"}
    html_column = description_columns.get(platform)

    # Collect one frame per store and concatenate once at the end, instead of
    # growing the result inside the loop.
    store_frames = []
    if html_column is not None:
        for store, listing in storeJsonProducts.items():
            if listing.shape[0] == 0:
                continue
            # A product without a description arrives as a non-string
            # (None/NaN); keep the row with an empty description rather than
            # passing that value to the HTML parser.
            descriptions = [
                BeautifulSoup(html, "html.parser").get_text() if isinstance(html, str) else ""
                for html in listing[html_column]
            ]
            store_frames.append(pd.DataFrame(data = {"store": store,
                                                     "product": listing["title"],
                                                     "description": descriptions
                                                     }))

    if not store_frames:
        return pd.DataFrame()
    return pd.concat(store_frames, axis = 0, ignore_index = True)

In [80]:
def parse_image(url, timeout = 30, min_confidence = 0.5):
    """Download an image and return the text fragments OCR is confident about.

    Parameters
    ----------
    url : str
        Address of the image to download.
    timeout : float, default 30
        Seconds to wait for the server; passed to ``requests.get``.
    min_confidence : float, default 0.5
        A fragment is kept only if its confidence is strictly greater than
        this value.

    Returns
    -------
    list of str
        Recognised text fragments, in the order the module-level ``reader``
        reports them.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout = timeout)
    # Fail on HTTP errors here rather than handing an error page to the image decoder.
    response.raise_for_status()

    # Mode 'L' collapses the picture to a single channel before OCR.
    img = np.array(Image.open(BytesIO(response.content)).convert('L'))

    # Each detection unpacks as (bounding box, text, confidence).
    detections = reader.readtext(img)
    return [t for bbox, t, prob in detections if prob > min_confidence]

# Parse Coffee Website List

In [60]:
# Store directory: a tab-separated file whose first column becomes the index.
SGcoffee = pd.read_csv("sg_coffee_websites.txt", sep = "\t", index_col = 0)

# Shopify

In [61]:
# Rows of the store directory whose Platform is Shopify.
shopify_sites = SGcoffee.loc[SGcoffee["Platform"] == "Shopify"]

In [62]:
# For every Shopify store: remember its JSON endpoint and download its
# product listing as a DataFrame, both keyed by store name.
shopifyJsonLink = {}
shopifyJsonProducts = {}
for store_name in shopify_sites.index:
    endpoint = build_url(shopify_sites.loc[store_name, "Link"])
    shopifyJsonLink[store_name] = endpoint
    shopifyJsonProducts[store_name] = pd.DataFrame.from_dict(get_products(endpoint))

In [63]:
# Flatten the per-store Shopify listings into one table of store, product and plain-text description.
all_shopify_products = get_product_details(shopifyJsonProducts, platform="Shopify")

In [64]:
# Show the Shopify stores whose Scrape_Image flag is set (boolean-mask filter on that column).
shopify_sites[shopify_sites['Scrape_Image']]

Unnamed: 0_level_0,Link,Platform,Scrape_Image
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bettr,https://bettr.coffee/collections/coffee-beans,Shopify,True
Bloom Coffee,https://bloomwithcoffee.com/collections/all,Shopify,True
Cata Coffee,https://catacoffee.com/collections/coffee-beans,Shopify,True
Cumulo Coffee,https://www.cumulocoffee.com/collections/coffe...,Shopify,True
Doubleup Coffee,https://doubleup-coffee.com/collections/coffee...,Shopify,True
Dutch Colony Coffee,https://www.dutchcolony.sg/collections/coffee-2,Shopify,True
Fluid Collective,https://fluidcollective.co/collections/retail-...,Shopify,True
Glyph Supply Co,https://www.glyphsupply.co/collections/all,Shopify,True
Homeground Coffee Roasters,https://homegroundcoffeeroasters.com/collectio...,Shopify,True
Kurasu,https://kurasu.kyoto/collections/whole-beans,Shopify,True


In [65]:
# product_type values that are filtered out before product images are sent to OCR.
nonolist = ['Equipment', 'Brewer', 'Merchandise']

In [82]:
# OCR the product images of the Shopify stores flagged with Scrape_Image.
# This works in theory but it's too slow, so the [0:1] slice restricts the
# run to the first flagged store.
image_description_records = []
for i in shopify_sites[shopify_sites['Scrape_Image']].index[0:1]:
    json_pdt = shopifyJsonProducts[i]
    # Drop products whose product_type is listed in nonolist.
    mask_noscrape = json_pdt[~json_pdt['product_type'].isin(nonolist)]

    for prod_name, img_set in zip(mask_noscrape['title'], mask_noscrape['images']): # for each product
        product_text = []
        for img in img_set: # each image in each product
            url = img['src']
            print(i, prod_name, url)
            product_text += parse_image(url) # OCR fragments found in this image
        # Join fragments with a space: an empty separator fused words coming
        # from adjacent text boxes into one unreadable token.
        clean_product_text = " ".join(product_text).replace("\n", " ")
        image_description_records.append({'store': i,
                                          'title': prod_name,
                                          'image_description': clean_product_text})

# Build the table once from the collected records (previously it was grown
# with one concat per product, which also left every row with index 0).
all_shopify_product_image_description = pd.DataFrame(
    image_description_records,
    columns = ['store', 'title', 'image_description'],
)

Bettr Eureka https://cdn.shopify.com/s/files/1/0164/9429/7142/files/1_7a8bbcf9-0dcf-4a8e-8f6f-eb6ed4952247.png?v=1701934204
Bettr Eureka https://cdn.shopify.com/s/files/1/0164/9429/7142/files/MadeWithPassionSG_9b52628e-a7d8-4a0a-8bc2-69276c125396.png?v=1707987386


KeyboardInterrupt: 

# Squarespace

In [18]:
# Rows of the store directory whose Platform is Squarespace.
sqspace_sites = SGcoffee.loc[SGcoffee["Platform"] == "Squarespace"]

In [19]:
# Display the Squarespace-hosted stores selected above.
sqspace_sites

Unnamed: 0_level_0,Link,Platform
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alchemist,https://alchemist.com.sg/coffee-beans,Squarespace
Kyuukei Coffee,https://kyuukeicoffee.com/store/coffee-beans,Squarespace
Maxi Coffee Bar,https://maxicoffeebar.com/shop/coffee,Squarespace
Nylon Coffee Roasters,https://nylon.coffee/shop,Squarespace
Parallel Coffee Roastes,https://www.parallelcoffeeroasters.com/shop/fi...,Squarespace
Skill Issue,https://skillissue.coffee,Squarespace


In [22]:
# For every Squarespace store: print its name as progress, remember its JSON
# endpoint and download its product listing as a DataFrame.
sqspaceJsonLink = {}
sqspaceJsonProducts = {}
for store_name in sqspace_sites.index:
    print(store_name)
    endpoint = build_url(sqspace_sites.loc[store_name, "Link"], platform="Squarespace")
    sqspaceJsonLink[store_name] = endpoint
    sqspaceJsonProducts[store_name] = pd.DataFrame.from_dict(
        get_products(endpoint, platform="Squarespace")
    )


Alchemist
Kyuukei Coffee
Maxi Coffee Bar
Nylon Coffee Roasters
Parallel Coffee Roastes
Skill Issue


In [70]:
# Inspect the raw listing downloaded for a single Squarespace store.
sqspaceJsonProducts['Kyuukei Coffee']

0    <p>Varietal: Green-Tip Gesha</p><p>Region: Tiz...
1    <p>Varietal: Green-Tip Gesha</p><p>Region: Vol...
2    <p>Varietal: Green-Tip Gesha</p><p>Region: Tiz...
3    <p class="" style="white-space:pre-wrap;">Vari...
Name: excerpt, dtype: object

In [115]:
# Flatten the per-store Squarespace listings into one table of store, product and plain-text description.
all_squarespace_products = get_product_details(sqspaceJsonProducts, platform="Squarespace")

In [116]:
# Display the combined Squarespace product table.
all_squarespace_products

Unnamed: 0,store,product,description
0,Alchemist,Mandela,Mandela is the perfect example of Cafe Granja ...
1,Alchemist,Haru Suke,Yirgacheffe lies in the southern part of Ethio...
2,Alchemist,Koke Shalaye,Koke Shalaye is from a collection of smallhold...
3,Alchemist,Abu GW1,"Situated in Boquete, Cañas Verdes, a humid and..."
4,Alchemist,El Arenal,"Nicaragua, known for its lakes and volcanoes i..."
...,...,...,...
69,Skill Issue,Southern Star #5,Impressed by the quality of coffee on the nati...
70,Skill Issue,Finca Bella Vista,An Ethiopian lite. But without the fines that ...
71,Skill Issue,Capim Branco,"A coffee family spanning 5 generations, Capim ..."
72,Skill Issue,Kirinyaga AA,A classic washed Kenyan from smallholders in N...


# Writing data to Excel

In [152]:
# Stack the Shopify and Squarespace product tables and renumber the rows 0..n-1.
all_data = pd.concat(
    [all_shopify_products, all_squarespace_products],
    axis=0,
    ignore_index=True,
)

In [153]:
# Save the combined product table as an Excel workbook, leaving out the DataFrame index.
all_data.to_excel("database_products.xlsx", index=False)

# Extract Flavour Notes

In [169]:
# Keep the products whose description mentions any of the flavour terms.
flavour_terms = ['yogurt', 'yoghurt']

# One boolean per row, in row order. Selecting with this mask avoids feeding
# index labels to .iloc (only correct while the index is exactly 0..n-1) and
# avoids growing the result with one concat per matching row.
has_flavour = [
    len(extract_flavor_notes(text, flavour_terms)) > 0
    for text in all_data["description"]
]

# Matching rows keep their original index labels; with no match the result is
# an empty frame that still carries all_data's columns.
matches_found = all_data.loc[has_flavour]

In [22]:
# List the store names for which a Shopify listing was downloaded.
shopifyJsonProducts.keys()

dict_keys(['2 Degrees North', '20 grams', 'Asylum Coffeehouse', 'Bettr', 'Bloom Coffee', 'Brawn and Brains', 'Cata Coffee', 'Cumulo Coffee', 'Doubleup Coffee', 'Dutch Colony Coffee', 'Flip Coffee Roasters', 'Fluid Collective', 'Glyph Supply Co', 'Homeground Coffee Roasterse', 'Kurasu', 'Luli Roasts', 'Narrative Coffee Stand', 'Parchmen & Co', 'PPP Coffee', 'Quarter Life Coffee', 'Rookies Coffee', 'Round Boy', 'Shake Coffee', 'Small Waves', 'The Community', 'Tiong Hoe Coffee', 'Upside Down Coffee', 'Zerah Coffee Roasters'])