# Scraping and Parsing URLs (Intro)

In [1]:
# Importing the required packages - Beautiful Soup from bs4, get from requests and time
from bs4 import BeautifulSoup
from requests import get
import time

In [2]:
# Search-results URL to be scraped (query "samsung tv", results page 01)
url = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=samsung+tv&_sacat=0&LH_TitleDesc=0&_pgn=01"
# A browser-like user agent, so the request does not go out with the default requests one
user_agent = 'Mozilla/5.0'
# Request headers carrying the user agent
headers = {'User-Agent': user_agent}
# Fetches the page; the timeout (in seconds) keeps the cell from hanging forever
# if the server stops responding
res = get(url, headers=headers, timeout=30)
# Raises on a 4xx/5xx reply instead of silently parsing and saving an error page
res.raise_for_status()

In [3]:
# Builds a BeautifulSoup tree from the body of the response using the 'html.parser' parser
page_html = res.text
data = BeautifulSoup(page_html, 'html.parser')

In [4]:
# Saves the parsed html of the first results page to disk as UTF-8 text
html_text = str(data)
with open("ebay_samsung_tv_01.htm", mode="w", encoding='utf-8') as out_file:
    out_file.write(html_text)

In [5]:
# Builds the zero-padded page numbers '01' through '10', in the form they appear in the url
page_numbers = [str(page).zfill(2) for page in range(1, 11)]
page_numbers

['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']

In [6]:
# Downloads every results page listed in page_numbers and saves each one to its own file
# Part of the url shared by all result pages; only the page number at the end differs
base_url = "https://www.ebay.com/sch/i.html?_from=R40&_nkw=samsung+tv&_sacat=0&LH_TitleDesc=0&_pgn="
file_name = []
# Iterates over the list itself rather than range(10), so the loop stays correct
# if the number of entries in page_numbers changes
for page in page_numbers:
    # Concatenates the page number with the base url to create the url for this result page
    url = base_url + page
    # Fetches the page; the timeout (in seconds) keeps the loop from hanging on a stalled connection
    res = get(url, headers=headers, timeout=30)
    # Raises on a 4xx/5xx reply instead of saving an error page as if it were results
    res.raise_for_status()
    # Parsing the response and storing the resulting html
    data = BeautifulSoup(res.text, 'html.parser')
    # Creates the file name by concatenating the page number
    out_name = "ebay_samsung_tv_" + page + ".htm"
    # Writes the output to the html file
    with open(out_name, "w", encoding='utf-8') as file:
        file.write(str(data))
    # Records the name only after the write succeeded
    file_name.append(out_name)
    # Waits idle for 10 secs before moving on to the next request
    time.sleep(10)

In [7]:
# Collects the matching title tags from every saved results page into one list
products = []
for name in file_name:
    # Reads the saved page; the file is closed as soon as its text is in memory
    with open(name, "r", encoding='utf-8') as file:
        html = file.read()
    # Parses the file contents and stores the resulting html
    content = BeautifulSoup(html, 'html.parser')
    # Finds the h3 tags carrying this class string (the notebook treats them as the
    # sponsored items from the results shown - not verifiable from this cell alone).
    # find_all is the current name of the legacy findAll alias.
    titles = content.find_all("h3", class_="s-item__title s-item__title--has-tags")
    # Adds every title tag found on this page to the list
    products.extend(titles)

In [8]:
# Prints the web link of each collected title
for each in products:
    # The link is read from the href attribute of the tag enclosing the h3
    # (expected to be an <a> tag)
    link = each.parent.get('href')
    # get() returns None when the attribute is missing; such titles are skipped
    # instead of crashing on None + str
    if link is None:
        continue
    print(link + "\n\n")

https://www.ebay.com/itm/Samsung-55-Class-2160-4K-UHD-LED-Smart-TV-UN55RU7300FXZA/123977682848?epid=18032471557&_trkparms=ispr%3D1&hash=item1cdda54fa0:g:b0EAAOSw1B5d0Ael&enc=AQAEAAACQBPxNw%2BVj6nta7CKEs3N0qV8eBzhvJ19JQQfkviwmSQ4tWN%2BdZPci08ipuExeZouAjDxmvpDi5nPZ%2Bn1zIlrkpO4Xwiu9W38lyIdEweLox4rKSYXyIKKsU8kGRh6iKwxKSlOoQra3mwhGMqpeX7yrcO6Qhyzd66paBmpJ3Fq5BdhVuOA53TGWERDFC4Jniq6KxKpVpJSTpwatxpqhXwSE6f8LGp2oKDB9mGPOpO8Ekxz0zWRtS%2B4hp%2BF8jTg5P5F5o0A8gZ3TJZPv9xP1DSIdL9poGkdYOWogJloqkbLqq3%2BsxQhsS3cuHW0IwpRW290tSTPYOwPAm2jbwpSn%2FTizE8bXG%2BYxQkhewrizUhT8gUNIZWSo3tEe009rcw6uZf3kM4U0p2z0YWLE7JIGe1fVC6abesDbACnzWNhwPi6MN%2BNMv4G7loiKl8LXpb7MnJQ9mrtx63vEDes2OA4Q9qdlwmIAGTgavWvKZZwBkG%2BosY4AeqLZCpNG1rV9gsGVIn5GiY6dFJoz0NhT%2FN2J8MoGd%2BbZMM%2FeS3yiI2A7%2BUTnYTpmcJrTokcfjwc0A0ayAKvpVu25goGthIhJAaXpQ%2F0R75SsiwKrgY2KgFljI%2FvYKe6Bf%2FHP%2BD%2FTsRDjpWbgrXeZuVyqF0G7%2F3UNIBcH6dooLoWk5KUTkHYkPKqDGnkGi3IAAi%2BM8hAbpRLpob%2B8AzGTDu7X7jUtl%2FNNwDayk9B%2Fe7eG8IeVzB8Y7QPJpcxz9miavyyVCnsnFcXexozUIIzqw

In [9]:
# Number of title tags collected in products over the 10 saved result pages
# (the notebook treats these as the sponsored items - not verifiable from this cell alone)
len(products)

130