# Imports

In [None]:
import os, json, time

# 00. Clear all folders

In [None]:
# Set to True if you want the *.json files in every pipeline stage folder
# under ./data/ to be deleted before the pipeline runs.
clear_all = True

In [None]:
# Directory holding the raw book dumps that feed the pipeline.
input_folder = "./../dump"

# One working directory per pipeline stage, listed in processing order.
folders = ["./../data/" + stage_name for stage_name in (
    "00_dump",
    "01_fixed-category",
    "02_applied-manual-fixes",
    "03_with-amazon-id",
    "04_scraped-with-review",
)]

In [None]:
# Create every stage folder up front.
# os.makedirs also creates a missing parent directory (os.mkdir raises
# FileNotFoundError when ./../data/ does not exist yet), and exist_ok=True
# keeps the cell safe to re-run when the folders are already there.
for folder in folders:
    os.makedirs(folder, exist_ok=True)

In [None]:
# Optional reset: remove the *.json files from every stage folder.
# Anything that is not a *.json file is left untouched.
if clear_all:
    for folder in folders:
        for file in filter(lambda entry: entry.endswith(".json"), os.listdir(folder)):
            os.remove(os.path.join(folder, file))

# 00. Load Dump Folder

In [None]:
# Set to True if you want to copy all *.json files from the dump folder
# into the first stage folder.
load_dump=True

In [None]:
# Source and destination for the dump-loading step.
# The self-assignment does not change the value of input_folder; it only
# spells out the same input/output pair that every other stage declares.
input_folder = input_folder
# Destination: the first entry of `folders`.
output_folder = folders[0]

In [None]:
from shutil import copyfile

In [None]:
# Copy every *.json file of the dump folder into the first stage folder.
if load_dump:
    for filename in os.listdir(input_folder):
        if not filename.endswith(".json"):
            continue
        copyfile(os.path.join(input_folder, filename),
                 os.path.join(output_folder, filename))

# 01. Translate Category Names

In [None]:
# When True, each book gets its 'category' name filled in from its
# 'category_id' during this step.
fix_category = True
# Key used to pick the name out of each entry of `categories`.
category_language = "en"

In [None]:
# Step 01 reads from the first stage folder and writes to the second.
input_folder = folders[0]
output_folder = folders[1]

In [None]:
# Lookup table mapping a numeric category id to its display name(s).
# Id -1 is the only entry that also carries a German ('de') name; ids 0..26
# are numbered by their position in the list of English names below.
categories = [{'id': -1, 'en': 'Uncategorized', 'de': 'Unkategorisiert'}] + [
    {'id': category_id, 'en': english_name}
    for category_id, english_name in enumerate([
        'Entrepreneurship',
        'Politics',
        'Marketing & Sales',
        'Science',
        'Health & Nutrition',
        'Personal Development',
        'Economics',
        'History',
        'Communication Skills',
        'Corporate Culture',
        'Management & Leadership',
        'Motivation & Inspiration',
        'Money & Investments',
        'Psychology',
        'Productivity',
        'Sex & Relationships',
        'Technology & the Future',
        'Mindfulness & Happiness',
        'Parenting',
        'Society & Culture',
        'Nature & Environment',
        'Biography & Memoir',
        'Career & Success',
        'Education',
        'Religion & Spirituality',
        'Creativity',
        'Philosophy',
    ])
]

In [None]:
def translate_category_name(book):
    """Set ``book['category']`` to the readable name of the book's category.

    The book's ``'category_id'`` is looked up in the module-level
    ``categories`` table and the name stored under ``category_language`` is
    written to ``book['category']``. Entries that have no name for that
    language fall back to their English (``'en'``) name instead of raising
    ``KeyError``. If no entry matches the id, the book is left unchanged.

    Args:
        book: dict with a ``'category_id'`` key; it is modified in place.

    Returns:
        The same ``book`` object that was passed in.

    Raises:
        KeyError: if ``book`` has no ``'category_id'`` key.
    """
    # Named category_id (not `id`) so the builtin id() is not shadowed.
    category_id = book['category_id']
    for category in categories:
        if category['id'] == category_id:
            name = category.get(category_language)
            if name is None:
                # Not every entry is translated into every language.
                name = category['en']
            book['category'] = name
            break

    return book

In [None]:
# Move every book from the input stage to the output stage, filling in the
# category name on the way when fix_category is enabled.
for filename in os.listdir(input_folder):
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(input_folder, filename), 'r') as fp:
        book = json.load(fp)

    if fix_category:
        book = translate_category_name(book)

    with open(os.path.join(output_folder, filename), 'w') as fp:
        json.dump(book, fp, indent=4)

    # The book now lives in the output stage; drop the processed input file.
    os.remove(os.path.join(input_folder, filename))

# 02. Apply Manual Fixes

In [None]:
# When True, a fixes file with the same filename as a book (looked up in the
# fixes folder) is merged into that book in this step.
apply_fixes = True

In [None]:
# Step 02 reads from the second stage folder and writes to the third.
input_folder = folders[1]
output_folder = folders[2]
# Folder that is searched for a per-book fixes file of the same filename.
fixes_folder = "./../data/99_manual-fixes"

In [None]:
# `index` counts the books for which a fixes file was found and applied.
index = 0
for filename in os.listdir(input_folder):
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(input_folder, filename), 'r') as fp:
        book = json.load(fp)

    # A fixes file carries the same filename as the book it patches; each of
    # its top-level keys overwrites (or adds) the matching key of the book.
    if apply_fixes and os.path.isfile(os.path.join(fixes_folder, filename)):
        with open(os.path.join(fixes_folder, filename), 'r') as fp:
            fixes = json.load(fp)
        book.update(fixes)
        index += 1

    with open(os.path.join(output_folder, filename), 'w') as fp:
        json.dump(book, fp, indent=4)

    # Removal of the processed input file is currently commented out:
    #os.remove(input_folder + '/' + filename)

In [None]:
# Report how many books had a fixes file applied.
print(f"{index} books fixed")

# 03. Process Books with Amazon ID

In [None]:
# Step 03 reads from the third stage folder and writes to the fourth.
input_folder = folders[2]
output_folder = folders[3]

In [None]:
# Move every book that carries a non-empty 'amazon_id' on to the next stage;
# all other books stay behind in the input folder.
book = {}
for filename in os.listdir(input_folder):
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(input_folder, filename), 'r') as fp:
        book = json.load(fp)

    if 'amazon_id' in book and book['amazon_id'] != "":
        os.rename(os.path.join(input_folder, filename),
                  os.path.join(output_folder, filename))

In [None]:
# Book files still sitting in the input folder have no 'amazon_id' key or an
# empty one. Only *.json files are counted, so other directory entries
# (hidden files, sub-folders) no longer inflate the reported number.
num_without_id = sum(1 for name in os.listdir(input_folder) if name.endswith(".json"))
if num_without_id > 0:
    print("Warning! " + str(num_without_id) + " books found without Amazon ID")

# 04. Scrape Amazon

In [None]:
import chromedriver_autoinstaller
from selenium import webdriver

In [None]:
# Step 04 reads from the fourth stage folder and writes to the fifth.
input_folder = folders[3]
output_folder = folders[4]

In [None]:
# Install chromedriver through chromedriver_autoinstaller before a browser
# session is created.
chromedriver_autoinstaller.install()
# Options object that is handed to webdriver.Chrome when the driver is built.
chrome_options = webdriver.chrome.options.Options()
# Exclude Chrome's "enable-automation" switch.
# NOTE(review): presumably meant to hide the automation infobar / make the
# session look less like a bot - confirm.
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])

In [None]:
# Amazon storefront base URL, keyed by the value of a book's 'language' field.
amazon_url_prefix = dict(
    en='https://www.amazon.com',
    de='https://www.amazon.de',
)

In [None]:
def get_amazon_review_avg_and_num(driver):
    """Read the average rating and the number of ratings from a product page.

    Args:
        driver: Selenium WebDriver (or any object offering
            ``find_element(by, value)``) that already shows the product page.

    Returns:
        ``None`` when the page has no ``averageCustomerReviews_feature_div``
        element (a message is printed in that case). Otherwise a dict
        ``{'avg': float, 'num': int}``; both values are 0 when the rating
        elements are missing or their text cannot be parsed.
    """
    # The locator strategy is passed as the plain string "id" (the value of
    # Selenium's By.ID). find_element(by, value) exists in Selenium 3 and 4,
    # whereas the find_element_by_id shortcut was removed in Selenium 4.3.
    try:
        review_box = driver.find_element("id", "averageCustomerReviews_feature_div")
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt still aborts a long scraping run.
        print("Strange! This page has no 'averageCustomerReviews_feature_div'")
        return None

    try:
        # The first word of the popover title is the average; a decimal comma
        # is normalised to a decimal point before parsing.
        avg_element = review_box.find_element("id", "acrPopover")
        avg_string = avg_element.get_attribute('title').split(" ")[0].replace(",", ".")
        avg = float(avg_string)
        # The first word of the review text is the count; thousands
        # separators ('.' or ',') are stripped before parsing.
        num_element = review_box.find_element("id", "acrCustomerReviewText")
        num_string = num_element.text.split(" ")[0].replace(".", "").replace(",", "")
        num = int(num_string)
    except Exception:
        # Best effort: any missing element or unparsable text resets both.
        avg, num = 0, 0

    return {'avg': avg, 'num': num}

In [None]:
# Create the Chrome WebDriver session used by the scraping loop, configured
# with the chrome_options built earlier.
driver = webdriver.Chrome(options=chrome_options)

In [None]:
# Visit the Amazon product page of every book in the input folder and store
# the scraped rating data. Books are moved to the output folder only when a
# review container was found on their page.
filelist = os.listdir(input_folder)
for index, filename in enumerate(filelist):
    if not filename.endswith(".json"):
        continue

    with open(os.path.join(input_folder, filename), 'r') as fp:
        book = json.load(fp)

    if 'amazon_id' not in book:
        continue

    # Product URL: storefront for the book's language + "/dp/" + Amazon id.
    amazon_url = amazon_url_prefix[book['language']] + "/dp/" + book['amazon_id']
    driver.get(amazon_url)

    # Disabled search-based fallback, kept as it was found:
    #try:
    #    driver.find_element_by_xpath('//a[@href="/ref=cs_404_logo"]')
    #except:
    #    pass
    #finally:
    #    driver.get(amazon_url_prefix[book['language']])
    #    driver.find_element_by_id("twotabsearchtextbox").send_keys(book['title'] + " - " + book['subtitle'])
    #    driver.find_element_by_id("nav-search-submit-button").click()
    #    time.sleep(1)
    #    driver.find_element_by_xpath("//*[@data-component-type='s-search-result']").click()
    #    print(book['author'] + " - " + book['title'] + " - " + book['subtitle'])
    #    input()
    #    amazon_url = driver.current_url

    # Fixed pause after loading the page, before the review data is read.
    time.sleep(3)

    amazon_review = get_amazon_review_avg_and_num(driver)
    if amazon_review is None:
        # No review container on the page: report it and leave the file
        # where it is.
        print("".join([str(index), "/", str(len(filelist)), " : ", book['slug'],
                       "                 <- Book or reviews not found on Amazon"]))
        continue

    book.update(
        amazon_avg=amazon_review['avg'],
        amazon_num=amazon_review['num'],
        amazon_url=amazon_url,
    )
    with open(os.path.join(output_folder, filename), 'w') as fp:
        json.dump(book, fp, indent=4)
    os.remove(os.path.join(input_folder, filename))

    print("".join([str(index), "/", str(len(filelist)), " : ", book['slug'],
                   "  ", str(book['amazon_avg']), " / ", str(book['amazon_num'])]))

In [None]:
# Book files still left in the input folder were not moved on by the
# scraping loop. Only *.json files are counted, and the warning is printed
# only when there is something to report (same pattern as the Amazon-ID
# check in step 03), so a clean run no longer emits "Warning! 0 books ...".
num_not_scraped = sum(1 for name in os.listdir(input_folder) if name.endswith(".json"))
if num_not_scraped > 0:
    print("Warning! " + str(num_not_scraped) + " books not found / without reviews")

# 05. Convert *.json to single books.json

In [None]:
# Step 05 reads the books of the last stage folder ...
input_folder = folders[4]
# ... and writes them into this single combined file.
output_file = "./../data/books.json"

In [None]:
# Collect every book of the last stage folder into one list and write it out
# as a single JSON array.
books = []

# os.listdir returns entries in arbitrary order; iterating the sorted names
# keeps the order of books.json reproducible between runs and machines.
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith(".json"):
        with open(os.path.join(input_folder, filename)) as json_file:
            book = json.load(json_file)
        books.append(book)

with open(output_file, 'w') as fp:
    json.dump(books, fp, indent=4)