In [12]:
# scraper.py
#
# Scrapes data about guitars and reviews from a few different sources,
# then uses that data to populate an EdgeDB instance

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
# import time
from bs4 import BeautifulSoup
import edgedb
import os
from os import path

import numpy as np
import random
import json


In [2]:

# local import
import scrape_utils
# import class_definitions


In [None]:
# create new instance of firefox driver -- this should be the geckodriver
options = Options()
options.binary_location = r"C:\\Program Files\\Mozilla Firefox\\firefox.exe"
options.headless = True
# options.set_headless(True)
driver = webdriver.Firefox(executable_path="C:\Program Files\GeckoDriver\geckodriver.exe", options=options)

# driver.manage()

In [None]:

# ---------------------------------------------------
# start with guitarcenter
# ---------------------------------------------------

url_list = [] #  list of guitar urls
guitars = [] # list of guitars
# iterate over the range of "Nao" values, get links to all guitars
# for ii in range(0, 4400, 100):
for ii in range(0,100,100): # temp testing
    html = scrape_utils.gc_get_browsing_pages(driver, ii) # get the html doc

    url_list.append(scrape_utils.gc_extract_links(html)) # append the list of matches

# make it unique -- without numpy
url_list = list(set([item for sublist in url_list for item in sublist]))

In [3]:
client = edgedb.create_client(dsn='MSDS_459')

In [None]:
client.query(""" INSERT ReviewSource {
                    name := <str>'Guitar Center',
                    sourceType := <default::SourceType>'Vendor',
            } UNLESS CONFLICT """)

# create a "Guitar Center" vendor
client.query(""" INSERT Vendor {
                    name := <str>'Guitar Center',
            } UNLESS CONFLICT """)

In [None]:
from importlib import reload
reload(scrape_utils)
reload(scrape_utils.class_definitions)
# reload(class_definitions)

In [None]:
for url_i,url in enumerate(url_list):
    if url_i < 100:
        html = scrape_utils.gc_get_all_reviews(driver, url)  
        reviews = scrape_utils.gc_extract_review_info(html) # parse the review info
        guitar = scrape_utils.gc_extract_guitar_info(url, html) # parse the specs for the guitar

        guitar_id = guitar.insert(client)


        for review in reviews:
            review.insert(guitar_id, client)

In [None]:
url = url_list[1]
html = scrape_utils.gc_get_all_reviews(driver, url)  
reviews = scrape_utils.gc_extract_review_info(html) # parse the review info
guitar = scrape_utils.gc_extract_guitar_info(url, html) # parse the specs for the guitar

In [None]:
rating = review.rating if review.rating is not None else float()
rev_date = review.date if review.date is not None else str()
pros = review.pros if len(review.pros) > 0 else edgedb.Set()
cons = review.cons if len(review.cons) > 0 else edgedb.Set()
best_for = review.best_for if len(review.best_for) > 0 else edgedb.Set()
text = review.text if review.text is not None else str()

query_str = """INSERT Review {
                normalized_rating := <float64>$rating,
                date := <std::datetime>$rev_date, 
                pros := <array<str>>$pros,
                cons := <array<str>>$cons,
                best_for := <array<str>>$best_for,
                written_review := <str>$text,
                guitar := (
                    SELECT Guitar
                    filter .id = <uuid>$guitar_id
                    ),
                source :=(
                    SELECT ReviewSource
                    filter .name = <str>$review_source
                ),
                }
                """
return_val = client.query(query_str, rating=rating, rev_date=rev_date, pros=pros,\
             cons = cons, best_for = best_for, text = text, guitar_id = guitar_id[0].id, review_source = review.review_source)


In [None]:
query_str = """INSERT Review {
                normalized_rating := <float64>$rating,
                date := <std::datetime>$rev_date, 
                pros := <array<str>>$pros,
                cons := <array<str>>$cons,
                best_for := <array<str>>$best_for,
                written_review := <str>$text,
                guitar := (
                    SELECT Guitar
                    filter .id = <uuid>$guitar_id
                    ),
                source :=(
                    SELECT ReviewSource
                    filter .name = <str>$review_source
                ),
                }
                """
return_val = client.query(query_str, rating=rating, rev_date=rev_date, pros=pros,\
             cons = cons, best_for = best_for, text = text, guitar_id = guitar_id, review_source = review.review_source)


In [None]:

driver.close()
client.close()

In [4]:
saveDir = '../product_pages_full'
url_file = 'product_urls.txt'
with open(url_file, "r") as file:
    url_list = file.read().splitlines()
print(f'List of Guitars and URLs read from {url_file}')

List of Guitars and URLs read from product_urls.txt


In [None]:
n_urls = len(url_list)
scrape_start = n_urls-1000
scrape_end = n_urls
len_scrape = scrape_end-scrape_start

print(f'Downloading HTML pages for {len_scrape} pages')
status_steps = np.ceil(len_scrape/20)
vpn_count = 1
for i_url,url_partial in enumerate(url_list[scrape_start:scrape_end]): # can modify this to only download a subset of the urls at a time

    # a nice little status bar :)
    curr_status = int(np.ceil(i_url/status_steps))
    print(f"[{curr_status*'-'}{(20-curr_status)*' '}] {i_url} of {len_scrape}",end='\r')    

    # check if file exists, skip if it's around
    save_name = f"{saveDir}{path.splitext(url_partial)[0]}.html"
    if not path.exists(save_name): # skip if the file exists

        try:
            # Construct the full URL
            url = "https://www.guitarcenter.com" + url_partial
            driver.get(url)
            html = scrape_utils.gc_get_all_reviews(driver, url)

            # create a directory as needed
            subDir = path.split(url_partial)[0]
            if not path.exists(saveDir + subDir):
                os.makedirs(f"{saveDir}{subDir}")
            with open(save_name, "w", encoding='utf-8') as file:
                file.write(html)

        except:
            print('Time to change the VPN')
            os.system(f'mullvad relay set hostname us-chi-00{vpn_count%5+1}')
            vpn_count += 1






In [None]:
driver.close()

In [5]:
print("Adding guitars to the database")
add_list = []
skip_list = []
miss_list = []
fail_list = []


BERT_model = scrape_utils.BERT_embed_model()
status_steps = np.ceil(len(url_list)/20)
for i_url,url in enumerate(url_list):

    # a nice little status bar :)
    curr_status = int(np.ceil(i_url/status_steps))
    print(f"[{curr_status*'-'}{(20-curr_status)*' '}]   {i_url}/{len(url_list)}",end='\r')

    # check to make sure that we have scraped the html
    url_file = f"{saveDir}/{url[1:-3]}.html"
    if path.exists(url_file):
        with open(url_file, "r", encoding='utf-8') as file:
            html = file.read()

        reviews = scrape_utils.gc_extract_review_info(html) # parse the review info
        guitar = scrape_utils.gc_extract_guitar_info(url, html, BERT_model=BERT_model) # parse the specs for the guitar
        # print(guitar.scale_length)

        if len(client.query("SELECT Guitar filter .model = <str>$model", model=guitar.model)) == 0:
            # try:
            guitar_id = guitar.insert(client) # insert the guitar, get the uuid

            for review in reviews: # insert all reviews
                review.insert(guitar_id, client)
                
            add_list.append(url) # keep track of what has been added

            # except:
                # print(f'Could not insert guitar {guitar.model}')
                # fail_list.append(url)
        else:
            skip_list.append(url)

    else:
        # print(f"{url_file} has not been downloaded")
        miss_list.append(url)

print('\n')
print('Upload Statistics:')
print(f"\t{len(add_list)} Guitars added")
print(f"\t{len(skip_list)} Guitars were already in the database")
print(f"\t{len(miss_list)} html files were missing")
print(f"\t{len(fail_list)} attempts failed for unknown reasons")


Adding guitars to the database
[--------------------]   6581/6582

Upload Statistics:
	1238 Guitars added
	163 Guitars were already in the database
	5181 html files were missing
	0 attempts failed for unknown reasons


In [None]:
with edgedb.create_client('MSDS_459') as client:
    num_gs = client.query('SELECT count(Guitar);')

In [28]:
with edgedb.create_client('MSDS_459') as client:
    # randomly choose an offset
    num_gs = client.query('SELECT count(Guitar);')[0]
    offset_gs = random.randint(0,num_gs-201)
    
    # Get a random subset of embeddings
    embeds = client.query(f'SELECT Guitar {{ embedding, id }} OFFSET {offset_gs} limit 200;')

In [30]:
embedding = BERT_model.predict([guitar.description])['encoder_outputs'][0]

dist = np.ndarray([200,])
for i_embed,embed in enumerate(embeds):
    dist[i_embed] = np.linalg.norm(np.array(json.loads(embed.embedding))-embedding)



In [60]:
closest_inds = np.argsort(dist)[0:5] # get the five closest

# create the query. I'm sure I could use arrays here.
sql_query = '''SELECT Guitar { model, brand: {name}, description}
                FILTER .id in 
                {  <uuid>$id0, <uuid>$id1, <uuid>$id2, <uuid>$id3, <uuid>$id4}'''

guitars = client.query(sql_query,\
                    id0=embeds[closest_inds[0]].id,\
                    id1=embeds[closest_inds[1]].id,\
                    id2=embeds[closest_inds[2]].id,\
                    id3=embeds[closest_inds[3]].id,\
                    id4=embeds[closest_inds[4]].id )


In [27]:
np.argsort(dist)[0:5]

array([110, 144,  87, 162, 121], dtype=int64)

In [33]:
dir(embeds[0])

['embedding', 'id']

In [45]:
print(sql_query)

SELECT Guitar { model, brand, description}
        FILTER .id in { e3f46c76-c12a-11ed-ad19-4f5fb29004f2 }


In [62]:
type(guitars)

list

In [75]:
for guitar in guitars:
    for key in guitar:
        print(key)

AttributeError: 'str' object has no attribute 'ident'