# Amazon Web Scraping

In this project, I developed a web scraping tool using Python and the BeautifulSoup library to extract product information from Amazon. The tool retrieves data such as product titles, prices, and ratings from Amazon product pages. It navigates through the HTML structure, identifies the key data elements, and stores the extracted information for further analysis. This project demonstrates the application of web scraping techniques in e-commerce, enabling the automated collection of data for market research, price comparison, and trend analysis while adhering to ethical scraping practices and Amazon's terms of service.

In [1]:
# Importing libraries

from bs4 import BeautifulSoup
import requests
import time
import datetime
import smtplib

In [12]:
# Connecting to the website, pulling out title / price / rating and cleaning them up

URL = 'https://www.amazon.co.uk/Console-Built-Gaming-Handheld-Immersive/dp/B01H7JD64S/ref=sr_1_8?crid=3EL7W5BETJ42G&dib=eyJ2IjoiMSJ9.BywvmPUALh3yuishmxxrc5TbQRpAN_CIbtlpiSn1DfvTSjijgChujDp5AgAJqgPzAbJUg4KIX2zbNScvg_gjPi2NmUkM1Ln4PJGY6t8ott04YgtE4ZaSpqUht23qXxa-iUseHZaQwOOKUo2L2waeCSOYOFZN_zCR0eo6lhTT8szQ8rimYs9AtgbHHULikoWMPZ2scQ6al3XneyikYrcAbok668Jv-jaoTsTmbcrpMmBJBxOOv-GEc6YreD6K6HHXljRzEKm7cPuLgY0L3MmascViXx_pOD89XzjoZpq7EVE.4PJ-XMS02Ql4xEPCFh0YHh5mN1w-JqWKfbrG1UYwpow&dib_tag=se&keywords=tech%2Bgadgets&qid=1723553215&sprefix=tech%2Caps%2C152&sr=8-8&th=1'

# A browser-like User-Agent is sent with the request
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}

# requests has no default timeout, so set one to avoid hanging forever on a
# stalled connection, and fail loudly on an HTTP error instead of parsing an
# error page as if it were the product page.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

soup1 = BeautifulSoup(page.content, 'html.parser')

soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

title = soup2.find(id='productTitle').get_text(strip=True)

# Drop the first character of the price text (the currency symbol)
price = soup2.find("span", class_="a-price aok-align-center reinventPricePriceToPayMargin priceToPay").get_text(strip=True)
price = price.strip()[1:]

# Keep only the first three characters of the rating text (e.g. "4.2")
rating = soup2.find("span", class_="a-size-medium a-color-base").get_text(strip=True)
rating = rating.strip()[:3]

print(title)
print(price)
print(rating)

Mini Retro Games Console, 150 In-Built Games, 8-Bit Retro Gaming Handheld Console, 1.8” Full Colour LCD Screen Pocket Console, Immersive Sound Games Console - ThumbsUp!
16.99
4.2


In [13]:
# Record the local calendar date on which this row of data was collected

import datetime

today = datetime.datetime.now().date()
print(today)

2024-08-13


In [14]:
# Start a fresh csv: the column headings first, then the first scraped row

import csv

header = ['Title', 'Price', 'Rating', 'Date']
data = [title, price, rating, today]

# Mode 'w' creates the file, or truncates it if it already exists
with open('AWSDataset.csv', 'w', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerows([header, data])

In [15]:
# Print the current working directory, which is where the relative csv path resolves

import os

current_dir = os.getcwd()
print(current_dir)

C:\Users\gusta


In [5]:
import pandas as pd

# Read the dataset back through the same relative path the writer cells use,
# rather than an absolute path that only exists on one particular machine.
df = pd.read_csv('AWSDataset.csv')
df

Unnamed: 0,Title,Price,Rating,Date
0,"Mini Retro Games Console, 150 In-Built Games, ...",16.99,4.2,2024-08-13
1,"Mini Retro Games Console, 150 In-Built Games, ...",16.99,4.2,2024-08-13
2,"Mini Retro Games Console, 150 In-Built Games, ...",16.99,4.2,2024-08-13


In [None]:
# Add the current data row to the end of the existing csv

with open('AWSDataset.csv', 'a+', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerow(data)

In [None]:
# Automating the collection of the data and appending to the csv

def check_price():
    """Scrape the product page once, log the result, and alert on a low price.

    Fetches the hard-coded Amazon product URL, extracts the title, price and
    rating, appends ``[title, price, rating, today]`` as one row to
    ``AWSDataset.csv``, and calls ``send_mail()`` when the price is below 10.

    ``send_mail`` must already be defined when this function runs.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        ValueError: if an expected element is missing from the page, or the
            price text cannot be converted to a number.
    """
    import csv
    import datetime

    URL = 'https://www.amazon.co.uk/Console-Built-Gaming-Handheld-Immersive/dp/B01H7JD64S/ref=sr_1_8?crid=3EL7W5BETJ42G&dib=eyJ2IjoiMSJ9.BywvmPUALh3yuishmxxrc5TbQRpAN_CIbtlpiSn1DfvTSjijgChujDp5AgAJqgPzAbJUg4KIX2zbNScvg_gjPi2NmUkM1Ln4PJGY6t8ott04YgtE4ZaSpqUht23qXxa-iUseHZaQwOOKUo2L2waeCSOYOFZN_zCR0eo6lhTT8szQ8rimYs9AtgbHHULikoWMPZ2scQ6al3XneyikYrcAbok668Jv-jaoTsTmbcrpMmBJBxOOv-GEc6YreD6K6HHXljRzEKm7cPuLgY0L3MmascViXx_pOD89XzjoZpq7EVE.4PJ-XMS02Ql4xEPCFh0YHh5mN1w-JqWKfbrG1UYwpow&dib_tag=se&keywords=tech%2Bgadgets&qid=1723553215&sprefix=tech%2Caps%2C152&sr=8-8&th=1'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"}

    # requests never times out by default; bound the wait so one stalled
    # connection cannot block the daily loop forever, and reject error pages.
    page = requests.get(URL, headers=headers, timeout=30)
    page.raise_for_status()

    soup1 = BeautifulSoup(page.content, 'html.parser')
    soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

    title_tag = soup2.find(id='productTitle')
    price_tag = soup2.find("span", class_="a-price aok-align-center reinventPricePriceToPayMargin priceToPay")
    rating_tag = soup2.find("span", class_="a-size-medium a-color-base")
    if title_tag is None or price_tag is None or rating_tag is None:
        # find() returns None when nothing matches; report that clearly
        # instead of failing later with an AttributeError on None.
        raise ValueError("Expected product elements were not found in the page")

    title = title_tag.get_text(strip=True)
    # Drop the first character of the price text (the currency symbol)
    price = price_tag.get_text(strip=True).strip()[1:]
    # Keep only the first three characters of the rating text (e.g. "4.2")
    rating = rating_tag.get_text(strip=True).strip()[:3]

    today = datetime.date.today()
    data = [title, price, rating, today]
    with open('AWSDataset.csv', 'a+', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(data)

    # The scraped price is text; convert it before the numeric comparison
    # (comparing str with int raises TypeError). Thousands separators are
    # removed so values such as "1,299.00" parse as well.
    price_value = float(price.replace(',', ''))
    if price_value < 10:
        send_mail()
    

In [None]:
# Run the check once a day, forever; each pass appends a new row to the csv

SECONDS_PER_DAY = 86400

while True:
    check_price()
    time.sleep(SECONDS_PER_DAY)

In [None]:
import pandas as pd

# Read the dataset back through the same relative path the writer cells use,
# rather than an absolute path that only exists on one particular machine.
df = pd.read_csv('AWSDataset.csv')
print(df)

In [None]:
# Sending myself an email when a price hits below a certain level

def send_mail():
    """Email a price alert to the account owner via Gmail over SMTP/SSL.

    Logs in to ``smtp.gmail.com`` on port 465 and sends a plain-text message
    from the account to itself containing the product link.

    Raises:
        smtplib.SMTPException: if login or delivery fails.
        OSError: if the connection to the mail server cannot be established.
    """
    # NOTE(review): the password is a hard-coded placeholder; load the real
    # credential from outside the source before using this.
    account = 'adrianapaiva.gvu@gmail.com'

    subject = "The product you want is below $10! Now is your chance to buy!"
    body = "Adriana, This is the moment we have been waiting for. Now is your chance to buy it. Link here: https://www.amazon.co.uk/Console-Built-Gaming-Handheld-Immersive/dp/B01H7JD64S/ref=sr_1_8?crid=3EL7W5BETJ42G&dib=eyJ2IjoiMSJ9.BywvmPUALh3yuishmxxrc5TbQRpAN_CIbtlpiSn1DfvTSjijgChujDp5AgAJqgPzAbJUg4KIX2zbNScvg_gjPi2NmUkM1Ln4PJGY6t8ott04YgtE4ZaSpqUht23qXxa-iUseHZaQwOOKUo2L2waeCSOYOFZN_zCR0eo6lhTT8szQ8rimYs9AtgbHHULikoWMPZ2scQ6al3XneyikYrcAbok668Jv-jaoTsTmbcrpMmBJBxOOv-GEc6YreD6K6HHXljRzEKm7cPuLgY0L3MmascViXx_pOD89XzjoZpq7EVE.4PJ-XMS02Ql4xEPCFh0YHh5mN1w-JqWKfbrG1UYwpow&dib_tag=se&keywords=tech%2Bgadgets&qid=1723553215&sprefix=tech%2Caps%2C152&sr=8-8&th=1"

    msg = f"Subject: {subject}\n\n{body}"

    # The context manager closes the connection even if login or sending
    # fails. The connection is already encrypted (SMTP_SSL), so no STARTTLS
    # is needed, and login() performs the EHLO handshake itself.
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(account, 'xxxxxxxxxxxxxx')
        # sendmail() requires both a sender and a recipient; the alert goes
        # back to the same account.
        server.sendmail(account, account, msg)