In [45]:
import json
import requests
from typing import List
from bs4 import BeautifulSoup

# Scrape Data from truesake.com

# Collection slugs on truesake.com; each one is substituted into the
# "collections/<slug>?view=all" listing URL by the scraping loop.
collections = [
    "junmai-honjozo",
    "junmai-ginjo",
    "junmai-daiginjo-daiginjo",
    "seasonal-nama-seasonal",
    "nama",
    "futsushu",
    "sparkling",
    "nigori",
    "unique",
]

# Scraped records keyed by product name; filled in by the scraping loop.
sake_data = {}

def get_four_hot_words(text: str) -> List[str]:
    """Extract the individual descriptor words from a "Four Hot Words" blurb.

    Args:
        text: Raw text of the blurb, optionally starting with the label
            "Four Hot Words:" and surrounded by whitespace.

    Returns:
        The descriptors with ampersands removed. When the blurb is
        comma-separated into at least four parts, each part is one entry
        (internal whitespace collapsed to single spaces). Otherwise the
        blurb is assumed to be space-separated and every whitespace-delimited
        token becomes its own entry. An empty blurb yields an empty list.
    """
    label = 'Four Hot Words:'
    sentence = text.strip()
    # Remove the label as a prefix. str.lstrip() takes a *set of characters*,
    # so lstrip('Four Hot Words: ') would also eat leading letters of the
    # first descriptor (e.g. "Fruity" -> "ity").
    if sentence.startswith(label):
        sentence = sentence[len(label):]
    sentence = sentence.lstrip()

    removed_ampersand = sentence.replace("&", "")
    words = removed_ampersand.split(', ')
    if len(words) < 4:
        # Not comma-separated into four parts: fall back to one entry per token.
        return [token for word in words for token in word.split()]
    # Collapse the doubled space left behind where an "&" was removed.
    return [" ".join(word.split()) for word in words]

# Crawl each collection listing on truesake.com, follow every product link to
# its detail page, and record the product's attributes in `sake_data`, keyed by
# the name taken from the listing tile's `data-alpha` attribute.
# NOTE(review): a product listed in more than one collection is fetched again
# and its entry overwritten, so 'collection' holds the last collection seen.
BASE_URL = "https://www.truesake.com/"
REQUEST_TIMEOUT = 30  # seconds; without a timeout requests can block indefinitely


def _node_text(node):
    """Return the stripped text of a parsed HTML node, or None if the node is missing."""
    return node.text.strip() if node is not None else None


for sake_type in collections:
    print("\n"+sake_type)
    COLLECTIONS_URL = BASE_URL + "collections/{sake_type}?view=all".format(sake_type=sake_type)
    response = requests.get(COLLECTIONS_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error rather than parsing an error page as an
    # empty catalogue.
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")
    catalogue = soup.find_all("div", class_="category-1")

    for sake in catalogue:
        sake_name = sake['data-alpha']
        print("\t",sake_name)

        # Build the detail URL once; it is used for the request and the record.
        detail_url = BASE_URL+sake.a['href'].lstrip('/')
        detail_response = requests.get(detail_url, timeout=REQUEST_TIMEOUT)
        detail_response.raise_for_status()
        detail_page = BeautifulSoup(detail_response.content, "html.parser")

        description = detail_page.find("div", class_="product_desc")
        hot_words = detail_page.find("div", class_="four_hot_words")

        record = {
            'url': detail_url,
            # Fields absent from the page are stored as None instead of
            # aborting the whole crawl with an AttributeError.
            'size': _node_text(detail_page.find("div", class_="size")),
            'price': _node_text(detail_page.find("div", class_="price")),
            'description': (
                description.text.replace('\n','').replace('\xa0','')
                if description is not None else None
            ),
            'collection': sake_type,
        }

        # The 'words' key is only present when the page has a hot-words block.
        if hot_words:
            record['words'] = get_four_hot_words(hot_words.text)

        sake_data[sake_name] = record
        

# Persist everything collected in `sake_data` as one JSON document.
with open('sake.json', mode='w') as out_file:
    out_file.write(json.dumps(sake_data))


junmai-honjozo
	 Akashi-Tai Tokubetsu Honjozo
	 Akishika Junmai 
	 Akitabare Koshiki Junzukuri Junmai 
	 Amabuki Yamahai Junmai Omachi Cup 
	 Asamurasake Junmai Red Rice
	 Bunraku Nihonjin No Wasuremono Yamahai Junmai 
	 Dewatsuru Junmai 
	 Dewatsuru Kimoto Junmai
	 ENTER.SAKE Black Dot Honjozo
	 ENTER.SAKE Sookuu Junmai
	 Fuku Chitose Yamahai Junmai 
	 Gekkeikan Ace Cup
	 Genbei Onikoroshi Honjozo 
	 Gozenshu 9 Junmai
	 Hakkaisan Honjozo 
	 HeavenSake Junmai 12
	 Huchuhomare Taiheikai Tokubetsu Junmai “Pacific Ocean”
	 Ichinokura Mukansa Super-Dry Honjozo 
	 Ine Mankai Junmai Genshu “Ine’s Full Bloom”
	 Izumo Fuji 
	 Joppari Honjozo 
	 Joto Honjozo 
	 Kaika Sanomaru Cup Tokubetsu Junmai
	 Katafune Honjozo Cup 
	 Katafune Junmai Genshu
	 Katafune Koshino Sesshu Junmai Nigori
	 Kenbishi Kuromatsu Honjozo 
	 Kibo Junmai 
	 Kirin Koshi No Takumi Tokubesu Junmai 
	 Kirinzan Junmai 
	 Kiuchi Kikusakari Junmai Yamahai Genshu
	 Kiuchi Tarusake Junmai
	 Mana 1751 Tokubetsu Yamahai Junmai Muro