In [1]:
# scraper.py
#
# Scrapes data about guitars and reviews from a few different sources,
# then uses that data to populate an EdgeDB instance
#
# Much of the scraper code is just refactoring of Joe's GuitarCenter code, 
# put into a little more organized piece of code


from selenium import webdriver
from selenium.webdriver.firefox.options import Options
# import time
from bs4 import BeautifulSoup
import edgedb
import os
from os import path

import numpy as np
# import json


In [2]:

# local import
import scrape_utils
# import class_definitions


In [3]:


# create new instance of firefox driver -- this should be the geckodriver
options = Options()
options.binary_location = r"C:\\Program Files\\Mozilla Firefox\\firefox.exe"
options.headless = True
# options.set_headless(True)
driver = webdriver.Firefox(executable_path="C:\Program Files\GeckoDriver\geckodriver.exe", options=options)

driver.manage()

AttributeError: 'WebDriver' object has no attribute 'manage'

In [None]:

# ---------------------------------------------------
# start with guitarcenter
# ---------------------------------------------------

url_list = [] #  list of guitar urls
guitars = [] # list of guitars
# iterate over the range of "Nao" values, get links to all guitars
# for ii in range(0, 4400, 100):
for ii in range(0,100,100): # temp testing
    html = scrape_utils.gc_get_browsing_pages(driver, ii) # get the html doc

    url_list.append(scrape_utils.gc_extract_links(html)) # append the list of matches

# make it unique -- without numpy
url_list = list(set([item for sublist in url_list for item in sublist]))

In [4]:
client = edgedb.create_client(dsn='MSDS_459')

In [None]:
client.query(""" INSERT ReviewSource {
                    name := <str>'Guitar Center',
                    sourceType := <default::SourceType>'Vendor',
            } UNLESS CONFLICT """)

# create a "Guitar Center" vendor
client.query(""" INSERT Vendor {
                    name := <str>'Guitar Center',
            } UNLESS CONFLICT """)

In [None]:
from importlib import reload
reload(scrape_utils)
reload(scrape_utils.class_definitions)
# reload(class_definitions)

In [None]:
for url_i,url in enumerate(url_list):
    if url_i < 100:
        html = scrape_utils.gc_get_all_reviews(driver, url)  
        reviews = scrape_utils.gc_extract_review_info(html) # parse the review info
        guitar = scrape_utils.gc_extract_guitar_info(url, html) # parse the specs for the guitar

        guitar_id = guitar.insert(client)


        for review in reviews:
            review.insert(guitar_id, client)

In [None]:
url = url_list[1]
html = scrape_utils.gc_get_all_reviews(driver, url)  
reviews = scrape_utils.gc_extract_review_info(html) # parse the review info
guitar = scrape_utils.gc_extract_guitar_info(url, html) # parse the specs for the guitar

In [None]:
guitar_id = guitar.insert(client)

In [None]:
for review in reviews:
    review.insert(guitar_id, client)

In [None]:
rating = review.rating if review.rating is not None else float()
rev_date = review.date if review.date is not None else str()
pros = review.pros if len(review.pros) > 0 else edgedb.Set()
cons = review.cons if len(review.cons) > 0 else edgedb.Set()
best_for = review.best_for if len(review.best_for) > 0 else edgedb.Set()
text = review.text if review.text is not None else str()

query_str = """INSERT Review {
                normalized_rating := <float64>$rating,
                date := <std::datetime>$rev_date, 
                pros := <array<str>>$pros,
                cons := <array<str>>$cons,
                best_for := <array<str>>$best_for,
                written_review := <str>$text,
                guitar := (
                    SELECT Guitar
                    filter .id = <uuid>$guitar_id
                    ),
                source :=(
                    SELECT ReviewSource
                    filter .name = <str>$review_source
                ),
                }
                """
return_val = client.query(query_str, rating=rating, rev_date=rev_date, pros=pros,\
             cons = cons, best_for = best_for, text = text, guitar_id = guitar_id[0].id, review_source = review.review_source)


In [None]:
review.review_source

In [None]:
query_str = """INSERT Review {
                normalized_rating := <float64>$rating,
                date := <std::datetime>$rev_date, 
                pros := <array<str>>$pros,
                cons := <array<str>>$cons,
                best_for := <array<str>>$best_for,
                written_review := <str>$text,
                guitar := (
                    SELECT Guitar
                    filter .id = <uuid>$guitar_id
                    ),
                source :=(
                    SELECT ReviewSource
                    filter .name = <str>$review_source
                ),
                }
                """
return_val = client.query(query_str, rating=rating, rev_date=rev_date, pros=pros,\
             cons = cons, best_for = best_for, text = text, guitar_id = guitar_id, review_source = review.review_source)


In [None]:
import re

In [None]:
match = re.search(r'[C|c]lassical',guitar.model)

In [None]:
match.group(0)

In [None]:

driver.close()
client.close()

In [None]:
client = edgedb.create_client(dsn='MSDS_459')

query_string = '''WITH pickups := 
            (SELECT Guitar.pickups filter Guitar.pickups != '')
            
            SELECT DISTINCT pickups;
            '''
cutaways = client.query(query_string)

In [None]:
resp = client.query(query_string)

In [None]:
if len(client.query(f"SELECT Guitar filter .model = 'Ernie' ")):
    print('Responses!')
else:
    print('Nothing!')

In [None]:
search_dict = dict()

guitar_props = ['body_shape','cutaway','pickups']

for prop in guitar_props:
    query_string = f"""WITH property :=
            (SELECT Guitar.{prop} filter Guitar.{prop} != '')
            SELECT DISTINCT property"""

    search_dict[prop] = client.query(query_string)    


In [None]:
# find valid entries for all "string" fields
str_props = ['body_shape','cutaway','pickups', 'type', 'country_of_origin']
for prop in str_props:
    query_string = f'''WITH prop :=
            (SELECT str_lower(Guitar.{prop}) filter Guitar.{prop} != '')
            SELECT DISTINCT prop'''
    search_dict[prop] = client.query(query_string)

In [None]:
search_dict

In [None]:
num_props = ['num_strings','scale_length','num_frets']
for prop in num_props:
    query_string = f'''WITH prop :=
            (SELECT Guitar.{prop} filter Guitar.{prop} != 0)
            SELECT DISTINCT prop'''
    search_dict[prop] = client.query(query_string)

search_dict

In [None]:
test_str = 'Acoustic-electric'
match = re.search('[A|a]coustic[ |-][E|e]lectric', test_str).group(0)
match

In [None]:
search_dict

In [None]:
query_str = 'SELECT Guitar {model, description} filter .type in {"Electric"} limit 5;'
with edgedb.create_client(dsn='MSDS_459') as client:

    guitars = client.query(query_str)

In [8]:
saveDir = '../product_pages_full'
url_file = 'product_urls.txt'
with open(url_file, "r") as file:
    url_list = file.read().splitlines()
print(f'List of Guitars and URLs read from {url_file}')

List of Guitars and URLs read from product_urls.txt


In [6]:
save_name = f"{saveDir}{path.splitext(url_partial)[0]}.html"
# not path.exists(save_name)
save_name
subDir = path.split(url_partial)[0]
saveDir + subDir

NameError: name 'url_partial' is not defined

In [None]:
n_urls = len(url_list)
scrape_start = n_urls-1000
scrape_end = n_urls
len_scrape = scrape_end-scrape_start

print(f'Downloading HTML pages for {len_scrape} pages')
status_steps = np.ceil(len_scrape/20)
for i_url,url_partial in enumerate(url_list[scrape_start:scrape_end]): # can modify this to only download a subset of the urls at a time

    # a nice little status bar :)
    curr_status = int(np.ceil(i_url/status_steps))
    print(f"[{curr_status*'-'}{(20-curr_status)*' '}]   {url_partial}",end='\r')    

    # check if file exists, skip if it's around
    save_name = f"{saveDir}{path.splitext(url_partial)[0]}.html"
    if not path.exists(save_name): # skip if the file exists

        # Construct the full URL
        url = "https://www.guitarcenter.com" + url_partial
        driver.get(url)
        html = scrape_utils.gc_get_all_reviews(driver, url)

        # create a directory as needed
        subDir = path.split(url_partial)[0]
        if not path.exists(saveDir + subDir):
            os.makedirs(f"{saveDir}{subDir}")
        with open(save_name, "w", encoding='utf-8') as file:
            file.write(html)




In [None]:
driver.close()

In [9]:
print("Adding guitars to the database")
add_list = []
skip_list = []
miss_list = []
fail_list = []
for url in url_list:

    # check to make sure that we have scraped the html
    url_file = f"{saveDir}/{url[1:-3]}.html"
    if path.exists(url_file):
        with open(url_file, "r", encoding='utf-8') as file:
            html = file.read()

        reviews = scrape_utils.gc_extract_review_info(html) # parse the review info
        guitar = scrape_utils.gc_extract_guitar_info(url, html) # parse the specs for the guitar
        # print(guitar.scale_length)

        if len(client.query(f"SELECT Guitar filter .model = '{guitar.model}'")):
            try:
                guitar_id = guitar.insert(client) # insert the guitar, get the uuid

                for review in reviews: # insert all reviews
                    review.insert(guitar_id, client)
                
                add_list.append(url) # keep track of what has been added

            except:
                # print(f'Could not insert guitar {guitar.model}')
                fail_list.append(url)
        else:
            skip_list.append(url)

    else:
        # print(f"{url_file} has not been downloaded")
        miss_list.append(url)

print('Upload Statistics:')
print(f"\t{len(add_list)} Guitars added")
print(f"\t{len(skip_list)} Guitars were already in the database")
print(f"\t{len(miss_list)} html files were missing")
print(f"\t{len(fail_list)} attempts failed for unknown reasons")


Adding guitars to the database
../product_pages_full/Gibson/Les-Paul-Axcess-Custom-Floyd-Rose-Electric-Guitar-Ebony-1500000321134.html has not been downloaded
../product_pages_full/ESP/LTD-MH-1000HT-Electric-Guitar-Black-Fade-1500000314172.html has not been downloaded
../product_pages_full/Fender-Custom-Shop/Limited-Edition-CuNiFe-Telecaster-Custom-Journeyman-Relic-Electric-Guitar-Aged-Amber-Natural-1500000366850.html has not been downloaded
../product_pages_full/Guild/OM-240CE-Orchestra-Acoustic-Electric-Guitar-Oxblood-Burst-1500000348858.html has not been downloaded
../product_pages_full/Martin/D-35-Left-Handed-Dreadnought-Acoustic-Guitar-Natural-1500000244831.html has not been downloaded
../product_pages_full/EVH/Limited-Edition-5150-Deluxe-Electric-Guitar-Natural-Ash-1500000375498.html has not been downloaded
../product_pages_full/Chapman/ML2-Electric-Guitar-Deep-Red-Satin-1500000367279.html has not been downloaded
../product_pages_full/Gretsch-Guitars/G6119T-62-Vintage-Select-Edit

AttributeError: 'NoneType' object has no attribute 'group'