# Scraping Laptops data from bestbuy canada website


This is an example of scraping data from the Best Buy Canada website. This notebook is an exercise I did to help me decide which computer to buy for my personal data science project. I decided to focus on one website and see what it could tell me. Essentially, I make some requests to the Best Buy website, use BeautifulSoup to parse the HTML returned by those requests, and automatically collect data about the computers listed on the site.
After getting my CSV file, I clean it, take a look at the descriptive statistics, and make some visualizations.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from IPython.core.display import clear_output

from time import sleep
from datetime import datetime
from random import randint

In [2]:
# Import warn and emit a sample warning to check how warnings are displayed
from warnings import warn

warn("Warning Simulation")

In [3]:
from requests import get, ReadTimeout, ConnectTimeout, HTTPError, Timeout, ConnectionError, RequestException

In [4]:
# I use get from requests package to read the url I selected with text function
url = 'https://www.bestbuy.ca/en-ca/category/laptops-macbooks/20352.aspx?type=product&page=1&filter=category%253aComputers%2B%2526%2BTablets%253bcategory%253aLaptops%2B%2526%2BMacBooks%253bcustom0ramsize%253a4'
# url = 'https://www.bestbuy.ca/en-ca/category/laptops/36711?path=category%253AComputers%2B%2526%2BTablets%253Bcategory%253ALaptops%2B%2526%2BMacBooks%253Bcategory%253ALaptops%253Bcustom0ramsize%253A4'

In [11]:
# Fetch the listing page and report network problems in a readable way.
try:
    # A timeout keeps the cell from hanging forever on an unresponsive server
    response = get(url, timeout=30)
    # Turn 4xx/5xx answers into an HTTPError so they are reported below
    response.raise_for_status()
    print(response.text[:200])
except ConnectionError as e:
    # NOTE: ConnectTimeout derives from both ConnectionError and Timeout,
    # so a connection timeout is reported by this first branch.
    print("OOPS!! Connection Error. Make sure you are connected to Internet. Technical Details given below.\n")
    print(str(e))
except Timeout as e:
    print("OOPS!! Timeout Error")
except RequestException as e:
    # Base class of every requests error (includes HTTPError from raise_for_status)
    print("OOPS!! General Error")
except KeyboardInterrupt:
    print("Someone closed the program")

OOPS!! Timeout Error


## Use beautifulSoup to parse the HTML content

In [None]:
html_soup = BeautifulSoup(response.text, 'html.parser')
type(html_soup)

In [None]:
# First candidate selector: the <li> elements of the product listing
laptop_containers = html_soup.find_all('li', class_='listing-item equal-height-container')
print(type(laptop_containers))
# The label and the count are separate print arguments (the comma was missing)
print('length of laptopcontainer :', len(laptop_containers))

In [None]:
laptop_containers = html_soup.find_all('div', class_='item-inner clearfix')
print(type(laptop_containers))
print(len(laptop_containers))

## Extracting the data for a single laptop
We can access the first container, which contains information about a single laptop, by using list notation on laptop_containers.

In [None]:
# Take a look at the first element
first_laptop = laptop_containers[0]
first_laptop

## The laptop name
Here I check every tag to see what it contains in order to retrieve the information I need.

In [None]:
first_laptop.div

In [None]:
first_laptop.a

In [None]:
first_laptop.h4

In [None]:
first_laptop.h4.a

In [None]:
first_name = first_laptop.h4.a.text

In [None]:
first_name

## Laptop price

In [None]:
first_price = first_laptop.find('span', class_='amount')

In [None]:
first_price

In [None]:
first_price = first_price.text

In [None]:
first_price

## Laptop rating

In [None]:
# I investigated the rating markup the same way as the price and derived the following function, which retrieves the laptop rating
def Rating(c):
    """Return the rating of one laptop container as a float.

    The function looks up the ``div`` with class ``rating-stars-yellow``
    inside ``c``, reads its ``style`` attribute, keeps the second
    whitespace-separated token and converts it to ``float`` after dropping
    its last two characters.

    NOTE(review): presumably the style looks like ``"width: 80%;"`` so the
    result is a percentage -- confirm against the page markup.
    """
    stars = c.find('div', class_="rating-stars-yellow")
    width_token = stars['style'].split()[1]
    return float(width_token[:-2])

In [None]:
# Rating of first laptop
Rating(first_laptop)

## Number of votes

In [None]:
first_vote = first_laptop.find('div', class_="rating-num").text

## The script for a single page


In [None]:
# Lists to store the scraped data in
# Lists to store the scraped data in
names = []
prices = []
ratings = []
votes = []

# Extract data from each individual laptop container
for container in laptop_containers:

    # Laptops without a rating block are skipped so the four lists stay aligned
    if container.find('div',  class_='rating-stars-yellow') is None:
        continue

    # The name (read from the current container, not from first_laptop)
    name = container.h4.a.text
    names.append(name)

    # The price
    price = container.find('span', class_='amount').text
    prices.append(price)

    # The rating
    rating = Rating(container)
    ratings.append(rating)

    # The number of votes
    vote = container.find('div', class_="rating-num").text
    votes.append(vote)

In [None]:
# I put the collected data in pandas dataframe

In [None]:
test_df = pd.DataFrame({'laptop': names,
                       'price': prices,
                       'rating': ratings,
                        'votes': votes})
print(test_df.info())
test_df

In [None]:
headers = {"Accept-Language": "en-US, en;q=0.5"}

## The script for multiple pages

In [None]:
pages = [str(i) for i in range(1,4)]
Ram_url = ['2', '4',  '8', '12', '16','32']

In [None]:
def URL(s,t):
    """Build, print and return the listing URL for one page and RAM size.

    Parameters
    ----------
    s : str
        Page number, inserted after ``page=``.
    t : str
        RAM size, appended after the ``custom0ramsize`` filter.

    Returns
    -------
    str
        The assembled URL (also printed, as before).
    """
    # The RAM size is no longer hard-coded to 8 before appending ``t``;
    # this matches the URL built in the multi-page scraping loop.
    url = ('https://www.bestbuy.ca/en-ca/category/laptops-macbooks/20352.aspx?type=product&page='
           + s
           + '&filter=category%253aComputers%2B%2526%2BTablets%253bcategory%253aLaptops%2B%2526%2BMacBooks%253bcustom0ramsize%253a'
           + t)
    print(url)
    return url

In [None]:
URL('s','t')

In [None]:
pages = [str(i) for i in range(1,30)]
Ram_url = ['2', '4',  '8', '12', '16','32', '64']

In [None]:
# Redeclaring the lists to store data in
# Redeclaring the lists to store data in
names = []
prices = []
ratings = []
votes = []

# Preparing the monitoring of the loop
start_time = datetime.now()
requests = 0
# Set when the request budget is exceeded so that BOTH loops stop
stop_scraping = False

# For every RAM size in Ram_url
for ram_url in Ram_url:

    # For every page number in pages
    for page in pages:

        # Make a get request
        response = get('https://www.bestbuy.ca/en-ca/category/laptops-macbooks/20352.aspx?type=product&page='+page+'&filter=category%253aComputers%2B%2526%2BTablets%253bcategory%253aLaptops%2B%2526%2BMacBooks%253bcustom0ramsize%253a'+ ram_url, headers = headers)

        # Pause the loop to avoid hammering the server
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        # start_time is a datetime, so the elapsed time is a timedelta
        # that has to be converted to seconds before dividing
        elapsed_time = (datetime.now() - start_time).total_seconds()
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Stop if the number of requests is greater than expected
        if requests > 300:
            warn('Number of requests was greater than expected.')
            stop_scraping = True
            break

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the laptop containers from a single page
        laptops_containers = page_html.find_all('div',  class_='item-inner clearfix')

        # For every computer in laptops_containers
        for container in laptops_containers:
            # Computers without a rating are skipped so the lists stay aligned
            if container.find('div',  class_='rating-stars-yellow') is None:
                continue

            # Scrape the name
            name = container.h4.a.text
            names.append(name)

            # Scrape the price
            price = container.find('span', class_='amount').text
            prices.append(price)

            # Scrape the rating
            rating = Rating(container)
            ratings.append(rating)

            # Scrape the number of votes
            vote = container.find('div', class_="rating-num").text
            votes.append(vote)

    # A plain break above only leaves the page loop; leave the RAM loop too
    if stop_scraping:
        break

In [None]:
laptops_rating = pd.DataFrame({'laptops': names,
                       'prices': prices,
                       'ratings': ratings,
                        'votes': votes})

In [None]:
laptops_rating.info()

In [None]:
laptops_rating.head()

In [None]:
laptops_rating.to_csv('laptops_rating2020.csv')

# Data cleaning

In [None]:
laptops_rating['prices'].iloc[0]

In [None]:
def prix(x):
    """Convert a price string to a float.

    Every comma is removed from ``x``, then the first character is dropped
    (presumably the currency symbol, e.g. ``'$1,299.99'``) and the rest is
    parsed with ``float``.
    """
    without_commas = x.replace(',', '')
    return float(without_commas[1:])

In [None]:
laptops_rating['prices'] = [prix(x) for x in laptops_rating['prices']]

In [None]:
laptops_rating['prices'].iloc[0]

In [None]:
def Remove_parenthese(x):
    """Drop the first and last character of ``x`` and return the rest as a float.

    Intended for vote counts wrapped in parentheses, e.g. ``'(120)'`` gives
    ``120.0``.
    """
    inner = x[1:-1]
    return float(inner)

In [None]:
laptops_rating['votes'] = [Remove_parenthese(x) for x in laptops_rating['votes']]

In [None]:
float(laptops_rating['votes'].iloc[120])

# Descriptive statistics of the data

In [None]:
laptops_rating[['prices', 'ratings', 'votes']].describe()

In [None]:
# Rows whose vote count equals 5977; the mask must be built from laptops_rating
# itself (no DataFrame named df exists in this notebook)
dfmax = laptops_rating[laptops_rating['votes']==5977.000000]

In [None]:
dfmax[['laptops', 'prices', 'ratings']]

In [None]:
# Laptops whose rating equals 100.0, selected from laptops_rating
# (no DataFrame named df exists in this notebook)
dfmax_ratings = laptops_rating[laptops_rating['ratings']==100.0]

In [None]:
dfmax_ratings[['laptops', 'prices', 'votes']]

In [None]:
dfmax_ratings['votes'].max()

# Plotting and analyzing the distributions

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
laptops_rating[['prices', 'ratings', 'votes']].hist(bins=20, figsize = (16,8))
plt.show()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# One box plot per numeric column, side by side on a single row
fig, axes = plt.subplots(nrows = 1, ncols = 3, figsize = (16,4))
ax1, ax2, ax3 = fig.axes

panels = (
    (ax1, 'prices', 'Prices'),
    (ax2, 'ratings', 'Ratings'),
    (ax3, 'votes', 'Votes'),
)

for panel, column, title in panels:
    panel.boxplot(laptops_rating[column])
    panel.set_title(title)

# Hide the top and right borders of every subplot
for ax in fig.axes:
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

plt.show()