<a href="https://colab.research.google.com/github/lettymoon/amazon-data-scrapping-ETL/blob/main/amazon_web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple ETL Project - Amazon Web Scraping

In [21]:
# @title Libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import time
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

In [13]:
# @title Product Extraction
def get_title(soup):
    """Return the product title from a parsed product page.

    Args:
        soup: Parsed product page; must expose a BeautifulSoup-style
            ``find(name, id=...)`` method.

    Returns:
        The whitespace-stripped text of ``<span id="productTitle">``, or an
        empty string when the element is missing or cannot be read.
    """
    try:
        title_tag = soup.find("span", id="productTitle")
        if title_tag is None:
            return ""
        return title_tag.get_text(strip=True)
    except Exception:
        # Best-effort scraping: any lookup/parsing failure yields an empty title.
        return ""

def get_price(soup):
    """Return the displayed product price as text.

    The selectors are tried in priority order; the first one whose element
    can be read wins.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style
            ``select_one(css)`` method.

    Returns:
        The whitespace-stripped price text, or an empty string when neither
        selector yields a readable element.
    """
    price_selectors = (
        "span.a-price > span.a-offscreen",
        "#priceblock_ourprice, #priceblock_dealprice",
    )
    for css in price_selectors:
        try:
            return soup.select_one(css).get_text(strip=True)
        except Exception:
            # Selector missing (or page unreadable): fall through to the next one.
            continue
    return ""

def _strip_bidi_marks(text):
    """Remove invisible LRM/RLM marks (U+200E/U+200F) and outer whitespace."""
    return text.replace("\u200e", "").replace("\u200f", "").strip()


def get_brand(soup):
    """Return the product brand from a parsed product page.

    Lookup order:
      1. The first table row whose header cell contains "marca"
         (case-insensitive); that row's data cell is returned.
      2. The text of the ``#bylineInfo`` element.

    Invisible left-to-right / right-to-left marks are removed from the
    result: ``get_text(strip=True)`` only strips whitespace, so they would
    otherwise survive as an invisible prefix on the brand value.

    Args:
        soup: Parsed product page exposing BeautifulSoup-style ``select`` and
            ``select_one`` methods.

    Returns:
        The cleaned brand text, or an empty string when no brand is found.
    """
    try:
        for row in soup.select("table tr"):
            try:
                th = row.find("th")
                td = row.find("td")
                if th is None or td is None:
                    continue
                if "marca" in th.get_text(strip=True).lower():
                    return _strip_bidi_marks(td.get_text(strip=True))
            except Exception:
                # A malformed row must not abort the scan of the remaining rows.
                continue
    except Exception:
        # No usable tables: fall back to the byline lookup below.
        pass

    try:
        byline = soup.select_one("#bylineInfo")
        if byline is not None:
            return _strip_bidi_marks(byline.get_text(" ", strip=True))
    except Exception:
        # Best-effort scraping: an unreadable byline means "brand unknown".
        pass

    return ""

def get_rating(soup):
    """Return the product's star rating as text.

    Reads the first ``span.a-icon-alt`` element and extracts the first run of
    digits, dots and commas from its text.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style
            ``select_one(css)`` method.

    Returns:
        The matched numeric text; the full element text when it contains no
        digits/dots/commas; or an empty string when the element is missing or
        cannot be read.
    """
    try:
        alt_text = soup.select_one("span.a-icon-alt").get_text(strip=True)
        number = re.search(r"([\d.,]+)", alt_text)
    except Exception:
        return ""
    if number:
        return number.group(1)
    return alt_text

def get_review_count(soup):
    """Return the customer-review count text from a parsed product page.

    Args:
        soup: Parsed product page exposing a BeautifulSoup-style
            ``find(name, id=...)`` method.

    Returns:
        The whitespace-stripped text of ``<span id="acrCustomerReviewText">``
        exactly as it appears on the page, or an empty string when the
        element is missing or cannot be read.
    """
    try:
        review_tag = soup.find("span", id="acrCustomerReviewText")
        review_text = review_tag.get_text(strip=True)
    except Exception:
        review_text = ""
    return review_text

In [23]:
if __name__ == '__main__':
    # Extract -> transform -> load: fetch a search-results page, visit every
    # product link found on it, collect the fields, and write them to a CSV.

    # Browser-like headers with a pt-BR language preference.
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36', 'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7'})

    URL = "https://www.amazon.com.br/s?k=cafeteira+barista&__mk_pt_BR=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=224BYB5B97ZD6&sprefix=cafeteira+barista%2Caps%2C246&ref=nb_sb_noss"

    BASE_URL = "https://www.amazon.com.br"
    # requests has no default timeout; without one a stalled connection hangs the cell.
    REQUEST_TIMEOUT = 30  # seconds

    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of silently writing an empty CSV.
    webpage.raise_for_status()

    soup = BeautifulSoup(webpage.content, "html.parser")

    links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})

    # Anchors without an href cannot be turned into a product URL, so skip them.
    links_list = [link.get('href') for link in links if link.get('href')]

    d = {"title":[], "price":[], "brand":[], "rating":[], "reviews":[]}

    for link in links_list:
        # Search results normally carry site-relative hrefs; keep absolute ones as-is.
        product_url = link if link.startswith("http") else BASE_URL + link

        try:
            new_webpage = requests.get(product_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # One unreachable product page should not discard the rest of the run.
            print(f"Skipping {product_url}: {exc}")
            continue

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        d['title'].append(get_title(new_soup))
        d['price'].append(get_price(new_soup))
        d['brand'].append(get_brand(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['reviews'].append(get_review_count(new_soup))

    amazon_df = pd.DataFrame.from_dict(d)
    # Assign the result back: a chained `inplace=True` replace on a column
    # selection is deprecated and does not modify the frame under Copy-on-Write.
    amazon_df['title'] = amazon_df['title'].replace('', np.nan)
    # Rows with no title are pages that could not be parsed as a product.
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)


In [24]:
# Display the scraped product table built in the previous cell
# (rows whose title was empty have already been dropped).
amazon_df

Unnamed: 0,title,price,brand,rating,reviews
0,"MONDIAL Cafeteira Espresso Dolce Crema 20 Bar,...","R$499,99",‎MONDIAL,45,(563)
1,"MONDIAL Cafeteira Espresso Dolce Latte 20 Bar,...","R$1.009,90",‎MONDIAL,41,(228)
2,"Cafeteira, Coffe Express 15 Bar, 2 xicaras, Pr...","R$599,90",‎PHILCO,45,(1.836)
3,"MONDIAL Cafeteira Espresso Dolce Crema 20 Bar,...","R$524,99",‎MONDIAL,45,(562)
4,Cafeteira Espresso Automática Série 1200 Phili...,"R$2.362,00",‎Philips Walita,45,(1.095)
6,"Cafeteira Espresso Eos Premium 1,2 Litros com ...","R$379,90",‎EOS,50,(1)
7,Cafeteira Espresso Oster Compacta Perfect Brew...,"R$1.099,00",‎Oster,47,(54)
8,Cafeteira Espresso Oster PrimaLatte Touch - 127V,"R$890,01",‎Oster,46,(1.334)
9,"Oster CAFETEIRA ESPRESSO OCAF900 DIGITAL, 127V","R$899,99",‎Oster,40,(455)
10,Cafeteira Espresso Automática Série 1200 Phili...,"R$2.374,90",‎Philips Walita,45,(1.095)
