# NewEgg.Com WebScraping Program For Laptops - Beta v1.0

###  - April 2020

---

In [1]:
# Import dependencies.
import os
import glob
import time
import datetime
import re
from re import search
import pandas as pd
from splinter import Browser
import requests
from bs4 import BeautifulSoup as soup
import random
from playsound import playsound

In [None]:
# Reminder to self.
#import this

## Functions & Classes Setup
---

In [2]:
# Build a function to return date throughout the program.

def return_dt():
    
    global current_date
    
    current_date = str(datetime.datetime.now()).replace(':','.').replace(' ','_')[:-7]
    
    return current_date

#return_dt()

In [3]:
"""
Main NewEgg WebScraper function.

"""

def newegg_page_scraper(containers, turn_page):
    
    page_nums = []
    general_category = []
    product_categories = []
    images = []
    product_brands = []
    product_models = []
    product_links = []
    item_numbers = []
    promotions = []
    prices = []
    shipping_terms = []
    
    # Put this to avoid error that was being generated
    global gen_category
    
    """ 
    Loop through all the containers on the HTML, and scrap the following content into the following lists
    
    """
    for con in containers:
        
        try:
            page_counter = turn_page
            page_nums.append(int(turn_page))
            
            gen_category = target_page_soup.find_all('div', class_="nav-x-body-top-bar fix")[0].text.split('\n')[5]
            general_category.append(gen_category)
            
            prod_category = target_page_soup.find_all('h1', class_="page-title-text")[0].text
            product_categories.append(prod_category)
            
            image = con.a.img["src"]
            #print(image)
            images.append(image)

            prd_title = con.find_all('a', class_="item-title")[0].text
            product_models.append(prd_title)

            product_link = con.find_all('a', class_="item-title")[0]['href']
            product_links.append(product_link)
            
            shipping = con.find_all('li', class_='price-ship')[0].text.strip().split()[0]
            
            if shipping != "Free":
                shipping = shipping.replace('$', '')
                shipping_terms.append(shipping)
            else:
                shipping = 0.00
                shipping_terms.append(shipping)

            brand_name = con.find_all('a', class_="item-brand")[0].img["title"]
            product_brands.append(brand_name)

        except (IndexError, ValueError) as e:
            
            # If there are no item_brand container, take the Brand from product details.
            product_brands.append(con.find_all('a', class_="item-title")[0].text.split()[0])
            #print(f"{e} block 1")

        try:
            current_promo = con.find_all("p", class_="item-promo")[0].text
            promotions.append(current_promo)
            
        except:
            promotions.append('null')
            #print(f"{e} block 2")
        try:
            price = con.find_all('li', class_="price-current")[0].text.split()[0].replace('$','').replace(',', '')
            prices.append(price)
            
        except:
            price = 'null / out of stock'
            prices.append(price)
            #print(f"{e} block 3")
        
        try:
            item_num = con.find_all('a', class_="item-title")[0]['href'].split('p/')[1].split('?')[0]
            item_numbers.append(item_num)
        except (IndexError) as e:
            item_num = con.find_all('a', class_="item-title")[0]['href'].split('p/')[1]
            item_numbers.append(item_num)    
    
    # Convert all of the lists into a dataframe
    df = pd.DataFrame({
    'item_number': item_numbers,
    'general_category': general_category,
    'product_category': product_categories,
    'brand': product_brands,
    'model_specifications': product_models,
    'price': prices,
    'current_promotions': promotions,
    'shipping': shipping_terms,
    'page_number': page_nums,
    'product_links': product_links,
    'image_link': images
    })
    
    # Rearrange the dataframe columns into the following order.
    df = df[['item_number', 'general_category','product_category', 'page_number' ,'brand','model_specifications' ,'current_promotions' ,'price' ,'shipping' ,'product_links','image_link']]
    
    # Convert the dataframe into a dictionary.
    global scraped_dict
    scraped_dict = df.to_dict('records')
    
    # Grab the subcategory "Laptop/Notebooks" and eliminate any special characters that may cause errors.
    global pdt_category
    pdt_category = df['product_category'].unique()[0]
    # Eliminate special characters in a string if it exists.
    pdt_category = ''.join(e for e in pdt_category if e.isalnum())
    
    """ Count the number of items scraped by getting the length of a all the models for sale.
        This parameter is always available for each item-container in the HTML
    """

    global items_scraped
    items_scraped = len(df['model_specifications'])

    """
    Save the results into a csv file using Pandas
    """
    df.to_csv(f'./processing/{current_date}_{pdt_category}_{items_scraped}_scraped_page{turn_page}.csv')
    
    # Return these variables as they will be used.
    return scraped_dict, items_scraped, pdt_category
    
#df.head()
#newegg_page_scraper(containers, turn_page)

In [4]:
# Function to return the total results pages.

def results_pages(target_page_soup):
    
    # Use BeautifulSoup to extract the total results page number
    results_pages = target_page_soup.find_all('span', class_="list-tool-pagination-text")[0].text.strip()
    #print(results_pages)
    
    # Find and extract total pages + and add 1 to ensure proper length of total pages.
    global total_results_pages
    total_results_pages = int(re.split("/", results_pages)[1]) # need to add 1 b/c 'range(inclusive, exclusive)'
    #========================================= need to remember to +2, and remove -30
    #print(total_results_pages)
    
    return total_results_pages
#results_pages()

In [5]:
"""
Build a function to concatenate all pages that were scraped and saved in the processing folder.
Save the final output (1 csv file) all the results
    
"""
def concatenate(total_results_pages):
    
    path = f'./processing\\'
    
    scraped_pages = glob.glob(path + "/*.csv")
    
    concatenate_pages = []
    
    counter = 0
    
    for page in scraped_pages:
        
        df = pd.read_csv(page, index_col=0, header=0)
        
        concatenate_pages.append(df)

    compiled_data = pd.concat(concatenate_pages, axis=0, ignore_index=True)
    
    total_items_scraped = len(compiled_data['brand']) # can replace this counter by creating class objects everytime it scrapes
    
    concatenated_output = compiled_data.to_csv(f"./finished_outputs/{current_date}_{total_items_scraped}_scraped_{total_results_pages}_pages_.csv")
    
    return

In [6]:
"""
Built a function to clear out the entire processing files folder to avoid clutter.
Or the user can keep the processing files (page by page) for their own analysis.

"""
def clean_processing_fldr():
    
    path = f'./processing\\'
    
    scraped_pages = glob.glob(path + "/*.csv")
    
    if len(scraped_pages) < 1:
        print("There are no files in the folder to clear. \n")
        
    else:
        print(f"Clearing out a total of {len(scraped_pages)} scraped pages in the processing folder... \n")
        
        clear_processing_files = []
        
        for page in scraped_pages:
            
            os.remove(page)
        
    print('Clearing of "Processing" folder complete. \n')
    
    return

In [7]:
def random_a_tag_mouse_over3():
    
    x = random.randint(4, 8)
    
    def rdm_slp_5_9(x):

        time.sleep(x)

        print(f"Mimic Humans - Sleeping for {x} seconds. ")

        return x    

    working_try_atags = []

    #working_except_atags = []

    # This will all be working except the last one.
    finally_atags = []

    working_atags = []

    not_working_atags = []

    try_counter = 0

    finally_counter = 0

    number_of_a_tags = len(browser.find_by_tag("a"))
    
    random_90_percent_plug = (random.randint(90, 94)/100.00)
    
    ninety_pct_a_tag = int(round((number_of_a_tags * random_90_percent_plug))) 

    ninety_96th_a_tag = int(round((number_of_a_tags * .96)))
    
    # Mouse over to header of the page "Laptops"
    browser.find_by_tag("h1").mouse_over()
    
    #rdm_slp_5_9(x)
    
    step = random.randint(14, 25)
    
    for i in range(ninety_pct_a_tag, ninety_96th_a_tag, step):

        try: # try this as normal part of the program - SHORT
            
            rdm_slp_5_9(x)
            
            browser.find_by_tag("a")[i+2].mouse_over()
            
            time.sleep(1)
            
        except: # Execute this when there is an exception
            
            print("EXCEPTION raised during mouse over. Going to break loop and proceed with moving to the next page. \n")
            
            break

        else: # execute this only if no exceptions are raised
            
            working_try_atags.append(i+2)
            
            working_atags.append(i+2)
            
            try_counter += 1
            
            print(f"<a> number = {i+2} | Current Attempts / Try Count: {try_counter} \n")

    print("Sleeping for 3 seconds.. \n")
    time.sleep(3)

    return #working_atags

In [None]:
#working_atags

In [None]:
#########################################
# NOTES

x = random.randint(5, 9)
    
def rdm_slp_5_9(x):

    time.sleep(x)

    #print(f"Slept for {x} seconds. ")

    return x


working_try_atags = []

#working_except_atags = []

# This will all be working except the last one.
finally_atags = []
global working_atags
working_atags = []

not_working_atags = []

try_counter = 0

#except_counter = 0

finally_counter = 0

number_of_a_tags = len(browser.find_by_tag("a"))

ninety_pct_a_tag = int(round((number_of_a_tags * .90))) 

ninety_96th_a_tag = int(round((number_of_a_tags * .94)))


browser.find_by_tag("h1")
rdm_slp_5_9(x)
print(f"Mimic Humans - try - Sleep for {x} seconds. ")



step = random.randint(14, 25)
    
for i in range(ninety_pct_a_tag, ninety_96th_a_tag, step):
    
    wildcard = random.randint(2, 50)
    
    try: # try this as normal part of the program - SHORT
        rdm_slp_5_9(x)
        print(f"Mimic Humans - try - Sleep for {x} seconds. ")
        browser.find_by_tag("a")[i+wildcard].mouse_over()

    except: # Execute this when there is an exception
        print("Exception / error has occurred")
        break
#             not_working_atags.append(i+2)
#             print(f"{working_atags[-1]} last working <a> tag")
#             print(f"All working <a> tags: {working_atags} \n")
#             print(f"All working <a> tags: {not_working_atags} \n")
#             print("breaking loop and will return working list of <a> tag's ")
            

    else: # execute this only if no exceptions are raised
        working_try_atags.append(i+wildcard)
        working_atags.append(i+wildcard)
        try_counter += 1
        print(f"<a> number = {i+wildcard} | Current Try Count: {try_counter} ")



#     finally: # execute this always after "try" - GO LONG ON THE PAGE - this one may not catch exceptions
#         print("")
#         # if there's an error, pull the last item in the list from finally_atags list
#         rdm_slp_5_9(x)
#         finally_atags.append(i+20)
#         browser.find_by_tag("a")[i+20].mouse_over()
#         working_atags.append(i+20)
#         finally_counter += 1
#         print(f"{finally_counter} - Current finally Count")
#         count += 1



print(f"Mousing over to <a> tag: {working_atags[0]} ")
browser.find_by_tag("a")[working_atags[0]].mouse_over()
print("Sleeping for 3 seconds")
time.sleep(3)
print()

# later take this piece out after test
print("Going to next page now. ")
browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button').mouse_over()
time.sleep(1)        
browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button').click()



In [None]:
#len(browser.find_by_tag("a"))


In [None]:
#ninety_pct_a_tag

In [None]:
# What you need to do:

"""
- Come up with an algorithm that will <a> hop big (assume) 2k plus <a> tag links
- to jump big first, and then except and pause, and then for finally, break the loop after gathering about 5 workable
links, and then set up a another random.randint() function to pick one from the list, mouse_over, and click, and back
to emulate human behavior OR just mouse over and then scrape. and then next page

- Post are you human test <a> links drop down significantly - come up with an algorithm too

- if <a> length is greater than 2k then use 90% method

if <a> is less than 200 use 35% method

"""

In [8]:
def g_recaptcha_check():
    
    if browser.is_element_present_by_id('g-recaptcha') == True:
        
        for sound in range(0, 2):
            
            playsound('./sounds/user_alert.wav')
        
        print("recaptcha - Check Alert! \n")
        
        continue_scrape = input("Newegg system suspects you are a bot. \n Complete the recaptcha test to prove you're not a bot. After, enter in any key and press ENTER to continue the scrape. \n")
        
        print("Continuing with scrape... \n")
        
    return


In [9]:
def random_xpath_top_bottom():
    
    x = random.randint(3, 7)
    
    def rdm_slp_5_9(x):

        time.sleep(x)

        print(f"Slept for {x} seconds. \n")

        return x
    
    
    # Check if there are working links on the screen, otherwise alert the user.
    
    if (browser.is_element_present_by_tag('h1')) == True:
        
        print("(Check 1 - Random Xpath Top Bottom) Header is present and hoverable on page. \n")
    
    else:
        
        print("(Check 1 - ERROR - Random Xpath Top Bottom) Header is NOT present on page. \n")
        
        for s in range(0, 1):
            
            playsound('./sounds/user_alert.wav')
        
        red_light = input("Program could not detect a clickable links to hover over, and click. Please use your mouse to refresh the page, and enter 'y' to continue the scrape. \n")
        
    if (browser.is_element_present_by_tag("a")) == True:
        
        print("(Check 2- Random Xpath Top Bottom) <a> tags are present on page. Will begin mouse-over thru the page, and click a link. \n")
        
    else:
        # If there isn't, pause the program. Have user click somewhere on the screen.
        
        for s in range(0, 1):
            
            playsound('./sounds/user_alert.wav')
        
        red_light = input("Program could not detect a clickable links to hover over, and click. Please use your mouse to refresh the page, and enter 'y' to continue the scrape. \n")
    
    # There are clickable links, then 'flip the coin' to choose top or bottom button
    coin_toss_top_bottom = random.randint(0,1)
    
    next_page_button_results = []
    
    # If the coin toss is even, mouse_over and click the top page link.
    if (coin_toss_top_bottom == 0):
        
        print('Heads - Clicking "Next Page" Top Button. \n')
        
        rdm_slp_5_9(x)
        
        print(f"Mimic human behavior by randomly sleeping for {x}. \n")
        
        browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button').mouse_over()
        
        time.sleep(1)
        
        browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button').click()
        
        next_page_button_results.append(coin_toss_top_bottom)
        
        print('Heads - SUCCESSFUL "Next Page" Top Button. \n')
        
        return
    
    else:
        
        print('Tails - Clicking "Next Page" Bottom Button. \n')
        
        rdm_slp_5_9(x)
        
        print(f"Mimic human behavior by randomly sleeping for {x}. \n")
        
        browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[11]/button').mouse_over()
        
        time.sleep(1)
        
        browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[11]/button').click()
        
        next_page_button_results.append(coin_toss_top_bottom)
        
        print('Tails - SUCCESSFUL "Next Page" Bottom Button. \n')
        
        return
    

In [None]:
def are_you_human_backend():
    
    if target_page_soup.find_all("title")[0].text == 'Are you a human?':
        
        playsound('./sounds/user_alert.wav')

        #continue_scrape = input("Newegg notices you're a robot on the backend when requesting. REFRESH THE PAGE and you may have to perform a test to prove you're human. After you refresh, enter in any key, and press ENTER to continue the webscrape. \n")

        print("Newegg suspects you're a bot on the backend. Automatically will refresh the page 2 times, and target new URL. ")

        for i in range(0, 1):

            browser.reload()
            
            time.sleep(2)
            
        print("Targeting new url... ")
            
        # After user passes test, target the new url.
        target_url = browser.url

        response_target = requests.get(target_url)

        target_page_soup = soup(response_target.text, 'html.parser')
        
        return target_page_soup
        
    else:
        
        print("Passed the 'Are you human?' check when requesting and parsing the html. Continuing with scrape ... ")
    

In [None]:
# browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[10]/button').mouse_over()
# time.sleep(1)
# browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[10]/button').click()

In [None]:


# browser.find_by_xpath(/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[11]/button
#         /html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[11]/button
#         /html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[11]/button
#         /html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button

In [10]:
"""
This class takes in the dictionary from the webscraper function, and will be used in a list comprehension

to produce class "objects"

"""
class Laptops:
    
    def __init__(self, **entries):
        
        self.__dict__.update(entries)


"""
Originally modeled out parent/child inheritance object structure.

After careful research, I found it much easier to export the Pandas Dataframe of the results to a dictionary,

and then into a class object, which I will elaborate more down below.

"""
# class Product_catalog:
    
#     all_prod_count = 0
    
#     def __init__(self, general_category): # computer systems
#         self.general_category = general_category
        
#         Product_catalog.all_prod_count += 1
        
#     def count_prod(self):
#         return int(self.all_prod_count)
#         #return '{}'.format(self.general_category)

# Sub_category was later changed to Laptops due to the scope of this project.
# class Sub_category(Product_catalog): # laptops/notebooks, gaming
    
#     sub_category_ct = 0
    
#     def __init__(self, general_category, sub_categ, item_num, brand, price, img_link, prod_link, model_specifications, current_promotions):
#         super().__init__(general_category)
#         Sub_category.sub_category_ct += 1
        
#         self.sub_categ = sub_categ
#         self.item_num = item_num
#         self.brand = brand
#         self.price = price
#         self.img_link = img_link
#         self.prod_link = prod_link
#         self.model_specifications = model_specifications
#         self.current_promotions = current_promotions



'\nOriginally modeled out parent/child inheritance object structure.\n\nAfter careful research, I found it much easier to export the Pandas Dataframe of the results to a dictionary,\n\nand then into a class object, which I will elaborate more down below.\n\n'

In [None]:
# ask if you're in another country
# id="popup_overlay"
# /html/body/div[13]/div/div[2]/div[2]/button[1]

In [None]:
#browser.is_element_present_by_id()

## Main Program Logic
---

In [11]:
""" Welcome to the program message!
"""
print("=== NewEgg.Com Laptop WebScraper Beta v1.0 ===")
print("=="*30)
print('Scope: This project is a beta and is only built to scrape the laptop section of NewEgg.com due to limited time. \n')
print("Instructions: \n")
return_dt()
print(f'Current Date And Time: {current_date} \n')
print("(1) Go to www.newegg.com, go to the laptop section, select your requirements (e.g. brand, screensize, and specifications - SSD size, processor brand and etc...) ")
print("(2) Copy and paste the url from your exact search when prompted ")
print('(3) After the webscraping is successful, you will have an option to concatenate all of the pages you scraped together into one csv file')
print('(4) Lastly, you will have an option to clear out the processing folder (data scraped by each page)')
print('(5) If you have any issues or errors, "PRESS CTRL + C" to quit the program in the terminal ')
print('(6) You may run the program in the background as the program will make an alert noise to flag when Newegg suspects there is a bot, and will pause the scrape until you finish proving you are human. ')
print('(7) Disclaimer: Newegg may ban you for a 24 - 48 hours for webscraping their data, then you may resume. \n Also, please consider executing during the day, with tons of web traffic to their site in your respective area. \n')
print('Happy Scraping!')

# Set up Splinter requirements.
executable_path = {'executable_path': './chromedriver.exe'}

# Ask user to input in the laptop query link they would like to scrape.
url = input("Please copy and paste your laptop query that you want to webscrape, and press enter: \n")

browser = Browser('chrome', **executable_path, headless=False, incognito=True)

browser.visit(url)

current_url = browser.url

# Allocating loading time.
time.sleep(4)

#current_url = browser.url
        
response = requests.get(current_url)

print(f"{response} \n")

target_page_soup = soup(response.text, 'html.parser')

# Run the results_pages function to gather the total pages to be scraped.
results_pages(target_page_soup)

"""
This is the loop that performs the page by page scraping of data / results
of the user's query.
"""
# List set up for where class Laptop objects will be stored.

print("Beginning webscraping and activity log below... ")
print("="*40)

product_catalog = []

for turn_page in range(1, total_results_pages):
    
    """
    If "reCAPTCHA" pops up, pause the program using an input. This allows the user to continue
    to scrape after they're done completing the quiz by inputting any value.
    """
    # Allocating loading time.
    time.sleep(3)
    
    g_recaptcha_check()

    print(f"Beginning mouse over activity... \n")
    
    # Set up "containers" to be passed into main scraping function. 
    
    if turn_page == 1:
        
        containers = target_page_soup.find_all("div", class_="item-container")
        
    else:
    
        target_url = browser.url

        # Use Request.get() - throw the boomerang at the target, retrieve the info, & return back to requestor
        
        response_target = requests.get(target_url)
        #response
        
        # Use BeautifulSoup to read grab all the HTML using the lxml parser
        target_page_soup = soup(response_target.text, 'html.parser')
        
        # Newegg may suspect we're a bot.
#         if target_page_soup.find_all("title")[0].text == 'Are you a human?':
        
#             playsound('./sounds/user_alert.wav')

#             #continue_scrape = input("Newegg notices you're a robot on the backend when requesting. REFRESH THE PAGE and you may have to perform a test to prove you're human. After you refresh, enter in any key, and press ENTER to continue the webscrape. \n")
            
#             print("Newegg suspects you're a bot on the backend. Automatically will refresh the page 2 times, and target new URL. ")
            
#             for i in range(0, 1):
                
#                 browser.reload()
#                 time.sleep(2)
        
#             print("Targeting new url... ")
            
#             # After user passes test, target the new url.
#             target_url = browser.url
            
#             response_target = requests.get(target_url)
            
#             target_page_soup = soup(response_target.text, 'html.parser')
        
        are_you_human_backend()

        containers = target_page_soup.find_all("div", class_="item-container")
    
    print(f"Scraping page: {turn_page} \n")
    
    # Execute webscraper function. Output is a csv file in the processing folder and dictionary.
    
    #screenshot_path = browser.screenshot(f'{turn_page}screen_shot.png')
    
    newegg_page_scraper(containers, turn_page)
    
    print("Creating laptop objects for this page... \n")
    
    # Create instances of class objects of the laptops/notebooks using a list comprehension.
    objects = [Laptops(**prod_obj) for prod_obj in scraped_dict]
    
    print(f"Finished creating Laptop objects for page {turn_page} ... \n")
    
    # Append all of the objects to the main product_catalog list (List of List of Objects).
    
    print(f"Adding {len(objects)} to laptop catalog... \n")
    
    product_catalog.append(objects)
    
    random_a_tag_mouse_over3()

    #print("Will scrape pages, but will need to randomly sleep for max 35 seconds to emulate human behavior. \n")
    
    if turn_page == total_results_pages:
        
        print(f"Completed scraping {turn_page} / {total_results_pages} pages. \n ")
        
        # Exit the broswer once complete webscraping.
        
        browser.quit()
        
    else:
        
        try:
            
            y = random.randint(3, 5)
            
            print(f"Current Page: {turn_page}) | SLEEPING FOR {y} SECONDS THEN will click next page. \n")
            
            time.sleep(y)
            
            random_xpath_top_bottom()

        except:
            z = random.randint(2, 5)
            
            print(f" (EXCEPTION) Current Page: {turn_page}) | SLEEPING FOR {z} SECONDS - Will click next page, if applicable. \n")
            
            time.sleep(z)
            
            random_xpath_top_bottom()
            
            time.sleep(1)

# Prompt the user if they would like to concatenate all of the pages into one csv file
concat_y_n = input(f'All {total_results_pages} pages have been saved in the "processing" folder (1 page = csv files). Would you like for us concatenate all the files into one? Enter "y", if so. Otherwise, enter anykey to exit the program. \n')

if concat_y_n == 'y':
    
    concatenate(total_results_pages)
    
    print(f'WebScraping Complete! All {total_results_pages} have been scraped and saved as {current_date}_{pdt_category}_scraped_{total_results_pages}_pages_.csv in the "finished_outputs" folder \n')

# Prompt the user to if they would like to clear out processing folder function here - as delete everything to prevent clutter
clear_processing_y_n = input(f'The "processing" folder has {total_results_pages} csv files of each page that was scraped. Would you like to clear the files? Enter "y", if so. Otherwise, enter anykey to exit the program. \n')

if clear_processing_y_n == 'y':
    
    clean_processing_fldr()

print('Thank you checking out my project, and hope you found this useful! \n')

=== NewEgg.Com Laptop WebScraper Beta v1.0 ===
Scope: This project is a beta and is only built to scrape the laptop section of NewEgg.com due to limited time. 

Instructions: 

Current Date And Time: 2020-04-23_21.18.29 

(1) Go to www.newegg.com, go to the laptop section, select your requirements (e.g. brand, screensize, and specifications - SSD size, processor brand and etc...) 
(2) Copy and paste the url from your exact search when prompted 
(3) After the webscraping is successful, you will have an option to concatenate all of the pages you scraped together into one csv file
(4) Lastly, you will have an option to clear out the processing folder (data scraped by each page)
(5) If you have any issues or errors, "PRESS CTRL + C" to quit the program in the terminal 
(6) You may run the program in the background as the program will make an alert noise to flag when Newegg suspects there is a bot, and will pause the scrape until you finish proving you are human. 
(7) Disclaimer: Newegg may

(Check 1 - Random Xpath Top Bottom) Header is present and hoverable on page. 

(Check 2- Random Xpath Top Bottom) <a> tags are present on page. Will begin mouse-over thru the page, and click a link. 

Tails - Clicking "Next Page" Bottom Button. 

Slept for 3 seconds. 

Mimic human behavior by randomly sleeping for 3. 

Tails - SUCCESSFUL "Next Page" Bottom Button. 

Beginning mouse over activity... 

Scraping page: 9 

Creating laptop objects for this page... 

Finished creating Laptop objects for page 9 ... 

Adding 36 to laptop catalog... 

Mimic Humans - Sleeping for 9 seconds. 
EXCEPTION raised during mouse over. Going to break loop and proceed with moving to the next page. 

Sleeping for 3 seconds.. 

Current Page: 9) | SLEEPING FOR 4 SECONDS THEN will click next page. 

(Check 1 - Random Xpath Top Bottom) Header is present and hoverable on page. 

(Check 2- Random Xpath Top Bottom) <a> tags are present on page. Will begin mouse-over thru the page, and click a link. 

Tails - Clic

(Check 1 - Random Xpath Top Bottom) Header is present and hoverable on page. 

(Check 2- Random Xpath Top Bottom) <a> tags are present on page. Will begin mouse-over thru the page, and click a link. 

Heads - Clicking "Next Page" Top Button. 

Slept for 4 seconds. 

Mimic human behavior by randomly sleeping for 4. 

Heads - SUCCESSFUL "Next Page" Top Button. 

Beginning mouse over activity... 

Scraping page: 18 

Creating laptop objects for this page... 

Finished creating Laptop objects for page 18 ... 

Adding 36 to laptop catalog... 

Mimic Humans - Sleeping for 7 seconds. 
EXCEPTION raised during mouse over. Going to break loop and proceed with moving to the next page. 

Sleeping for 3 seconds.. 

Current Page: 18) | SLEEPING FOR 5 SECONDS THEN will click next page. 

(Check 1 - Random Xpath Top Bottom) Header is present and hoverable on page. 

(Check 2- Random Xpath Top Bottom) <a> tags are present on page. Will begin mouse-over thru the page, and click a link. 

Heads - Clickin

IndexError: index 0 is out of bounds for axis 0 with size 0

In [13]:
target_url = browser.url

In [14]:
response_target = requests.get(target_url)
response

<Response [200]>

In [15]:
target_page_soup = soup(response_target.text, 'html.parser')

In [16]:
target_page_soup

<!DOCTYPE HTML>

<html lang="en-us">
<head>
<title>$1000 - $2000, $1500 - $2000, 250 GB - 499 GB, 100 GB - 249 GB, 15" - 15.6", 14" - 14.5", Intel Core i7, Laptops / Notebooks, Laptops / Notebooks, Co... - Newegg.com</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="always" name="referrer"/>
<meta content='Newegg.com - $1000 - $2000, $1500 - $2000, 250 GB - 499 GB, 100 GB - 249 GB, 15" - 15.6", 14" - 14.5", Intel Core i7, Laptops / Notebooks, Laptops / Notebooks, Co...' name="keywords"/>
<meta content="Newegg.com offers the best prices on computer products, laptop computers, LED LCD TVs, digital cameras, electronics, unlocked phones, office supplies, and more with fast shipping and top-rated customer service. Newegg shopping upgraded ™" name="description"/>
<meta content="https://c1.neweggimages.com/WebResource/Themes/2005/Nest/logo_424x210.png" property="og:image"/>
<meta content="Newegg.com offers the best prices on computer products, laptop 

In [17]:
 browser.reload()

In [None]:
print(total_results_pages)

In [None]:
# 10 items
#https://www.newegg.com/p/pl?N=100006740%20601286795%20600004343%20600004344%20601296065%20601296066%204085%20600136700&LeftPriceRange=1000%201250

In [None]:
# Example when Newegg sends you bogus information when you request from their server
target_page_soup.find_all

In [None]:
current_page_soup

In [None]:
#/html/body/div[5]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button


In [None]:
print(browser.find_by_tag('a')[ninety_pct_a_tag].__dict__)

In [None]:
len(browser.find_by_tag('a')[ninety_pct_a_tag])

In [None]:
mimipopupsweepstakes

In [None]:
#browser.find_by_tag('button')#[0].click()

In [None]:
len(current_page_soup.find_all("div", class_="item-container"))

In [None]:
#id="rc-imageselect"

In [None]:
#browser.is_element_present_by_id("btn_InnerSearch")
#browser.is_element_present_by_id("rc-imageselect")

In [None]:
print(browser.__dict__)

In [None]:
browser.find_by_tag('body')[0].click()

In [None]:
len(browser.find_by_tag('body'))

In [None]:
browser.find_by_tag('a')[1958].click()#['href']

In [None]:
browser.back()

In [None]:
number_of_a_tags = len(browser.find_by_tag('a'))
ninety_pct_a_tag = round((number_of_a_tags * .902))
ninety_pct_a_tag

In [None]:
browser.find_by_tag('a')[1962].click()

In [None]:
browser.find_by_tag('a')[1970].mouse_over()

In [None]:
browser.find_by_tag('h1')[0].click()

In [None]:
#//*[@id="btn_InnerSearch"]

In [None]:
browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button').click()

In [None]:
browser.is_element_present_by_id('g-recaptcha')

In [None]:
browser.is_element_present_by_id('mimipopupsweepstakes')


In [None]:
browser.find_by_tag('a')[2263].click()

In [None]:
browser.is_element_present_by_id('sweepstakespopup')
/html/body/div[16]/div/a

In [None]:
browser.is_element_present_by_id('sweepstakespopup_close')

In [None]:
browser.find_by_id('sweepstakespopup_close').click()

In [None]:
# browser.find_by_xpath('/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[4]/div/div/div[10]/button').click()

In [None]:
#current_page_soup.is_element_present_by_xpath("/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button")


In [None]:
#is_element_present_by_xpath('')

In [None]:
# Test to view objects that were created from the data we collected
#product_catalog[1][0].__dict__

In [None]:
#/html/body/div[4]/section/div/div/div[2]/div/div/div/div[2]/div[1]/div[2]/div[1]/div[2]/div/div[2]/button/i

In [None]:
#/html/body/div[4]/section/div/div/div[2]/div/div/div[2]/div[2]/div[1]/div[2]/div[4]/div/div/div[11]/button/i

In [None]:
counter = 0
prices = []
for con in containers:
    counter += 1
    try:
        #print(str(counter) + " | " + con.find_all('li', class_="price-current")[0].text.split()[0].replace('$','').replace(',', ''))
        price = con.find_all('li', class_="price-current")[0].text.split()[0].replace('$','').replace(',', '')
        prices.append(price)
    except:
        #print(str(counter) + " | " + "null")
        
        price = 'null'
        prices.append(price)
        
print(prices)