# Kabum Gaming Chairs Webscraping v1.0

Olá, pessoal!

Estou animado para compartilhar com vocês o projeto que acabei de concluir: um notebook que faz webscraping da seção de cadeiras gamer do site da Kabum utilizando Selenium, BeautifulSoup e Pandas! Eu sei, eu sei, talvez não seja exatamente o que você espera que um programador faça com seu tempo livre, mas, aliada à vontade de fazer um projeto de webscraping, eu tinha a necessidade de comprar uma boa cadeira para meu home office!


# Importing libs

In [197]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import re 
from time import sleep
import math
import requests
from datetime import datetime

# Data Collection

In [198]:
# Configuring and starting the driver

# Local imports: explicit-wait helpers, needed only by this scraping cell.
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Products per listing page. The same value is sent as `page_size` in the
# listing URL and used to derive the number of pages, so keep it in one place.
PAGE_SIZE = 20
# Maximum number of seconds to wait for the search results to be present.
SEARCH_TIMEOUT = 10

options = Options()
options.add_argument('--headless=new')
options.add_argument('--disable-blink-features=AutomationControlled')
driver = webdriver.Chrome(options=options)
url = 'https://www.kabum.com.br'
# NOTE: driver.get() returns None, so `navegador` never holds a browser
# handle. The name is kept only for backward compatibility.
navegador = driver.get(url)
sleep(0.5)
# NOTE(review): `headers` is not used anywhere in this cell (Selenium does not
# read it) - confirm whether another cell still needs it.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
    (KHTML, like Gecko) Chrome / 86.0.4240.198Safari / 537.36"}

# Looking for the search box, typing the query and submitting it:

search_input = driver.find_element(By.TAG_NAME, 'input')
search_input.click()
search_input.send_keys('Cadeira Gamer')
search_input.submit()

# submit() does not guarantee that the results page has been loaded. Wait for
# the listing counter explicitly, otherwise the home page may be parsed.
WebDriverWait(driver, SEARCH_TIMEOUT).until(
    EC.presence_of_element_located((By.ID, 'listingCount'))
)

# Collecting the html from the page:

site = driver.page_source
soup = BeautifulSoup(site, 'html.parser')

# Calculating the number of pages to scrape:

listing_count = soup.find('div', {'id': 'listingCount'})
if listing_count is None:
    raise RuntimeError("Could not find the 'listingCount' element on the results page")

# Take the first number in the counter text. A regex is used instead of
# slicing up to the first space, which would drop the last digit when the text
# has no space and would fail on a thousands separator such as '1.929'.
qtd_match = re.search(r'\d[\d.]*', listing_count.get_text())
if qtd_match is None:
    raise RuntimeError("The 'listingCount' element does not contain a product count")
qtd = qtd_match.group(0).replace('.', '')
ultima_pag = math.ceil(int(qtd) / PAGE_SIZE)

# Creating the dictionary (one list per output column)

dic_products = {'name': [], 'preco': [], 'url':[],'url_img':[],'descricao':[], 'scrapy_time':[]}

# Compile the class matchers once instead of once per product card.
card_pattern = re.compile('productCard')
name_pattern = re.compile('nameCard')
price_pattern = re.compile('priceCard')

for i in range(1, ultima_pag + 1):
    url_pag = f'https://www.kabum.com.br/espaco-gamer/cadeiras-gamer?page_number={i}&page_size={PAGE_SIZE}&facet_filters=&sort=most_searched'
    driver.get(url_pag)
    site = driver.page_source
    soup = BeautifulSoup(site, 'html.parser')
    produtos = soup.find_all('div', class_=card_pattern)

    for produto in produtos:
        name_tag = produto.find('span', class_=name_pattern)
        price_tag = produto.find('span', class_=price_pattern)
        link_tag = produto.find('a')
        img_tag = produto.find('img')

        # Skip blocks that are not complete product cards instead of
        # aborting the whole run with an AttributeError on None.
        if any(tag is None for tag in (name_tag, price_tag, link_tag, img_tag)):
            continue

        produto_link = link_tag.get('href')
        if not produto_link:
            continue

        name = name_tag.get_text().strip()
        preco = price_tag.get_text().strip()
        url_img = img_tag.get('src')
        descricao = img_tag.get('alt')
        # Build the absolute URL of each product
        url_product = 'https://www.kabum.com.br' + produto_link

        # Ingesting the scraped data

        dic_products['name'].append(name)
        dic_products['preco'].append(preco)
        dic_products['url'].append(url_product)
        dic_products['url_img'].append(url_img)
        dic_products['descricao'].append(descricao)
        dic_products['scrapy_time'].append(datetime.now())

# NOTE: the driver is intentionally left open because later cells may reuse
# it. Call driver.quit() once scraping is finished to release headless Chrome.
df_raw = pd.DataFrame(dic_products)


# Transforming data

## Observing data types

In [199]:
# Display the dtype of every column of the raw scraped DataFrame.
df_raw.dtypes

name                   object
preco                  object
url                    object
url_img                object
descricao              object
scrapy_time    datetime64[ns]
dtype: object

## Creating regex extraction functions:

_These helper functions extract specific patterns from strings._

In [200]:
def extract_two_first_words(coluna):
    """Return the first two words at the very start of ``coluna``.

    A "word" is a run of ``\\w`` characters, and the two words must be
    separated by exactly one whitespace character. Returns ``None`` when the
    string does not start with that shape.
    """
    # re.match only matches at position 0, which is what the `^` anchor did.
    found = re.match(r'\w+\s\w+', coluna)
    return found.group(0) if found else None

def extract_price(coluna):
    """Return the amount part of a Brazilian-real price string.

    The first run of characters that are neither ``R`` nor ``$`` is taken and
    stripped of surrounding whitespace, so ``'R$ 1.299,90'`` yields
    ``'1.299,90'`` and ``'R$ ---'`` yields ``'---'``.

    Returns ``None`` when ``coluna`` is not a string (for example a missing
    value) or when nothing but the currency symbol or whitespace is present.
    """
    if not isinstance(coluna, str):
        return None
    match = re.search(r'[^R\$]+', coluna)
    if match is None:
        return None
    # The separator between 'R$' and the amount (plain or non-breaking space)
    # is part of the match; drop it so callers get a clean amount.
    amount = match.group(0).strip()
    return amount or None

def extract_currency(coluna):
    """Return ``'R$'`` when ``coluna`` starts with that symbol, else ``None``."""
    # re.match only matches at position 0, which is what the `^` anchor did.
    found = re.match(r'R\$', coluna)
    if found is None:
        return None
    return found.group(0)
def extract_product_code(coluna):
    """Return the product code found in a product URL, or ``None``.

    The digits that follow ``/produto/`` are preferred. The previous
    heuristic (first digit run followed by word characters) is kept only as a
    fallback for strings without a ``/produto/`` segment. On its own it needs
    at least two characters, so it would skip a one-digit code and return a
    slug token such as ``'120kg'`` instead.
    """
    match = re.search(r'/produto/(\d+)', coluna)
    if match:
        return match.group(1)
    # Fallback: original, looser pattern for non-standard inputs.
    match = re.search(r'(\d+\w+)', coluna)
    if match:
        return match.group(0)
    return None

## Creating custom columns for further analysis:



- Product Type:
    - Classifies each product as a gaming chair (_Cadeira Gamer_ in Brazilian Portuguese) or an office chair (_Cadeira de Escritório_ in Brazilian Portuguese).
- Product Code:
    - Extract the product code from the specific product url.
- Price:
    - Extracts the price of the item.
- In stock?
    - It's a binary column that says if the item is available in stock.
- Currency:
    - Extracts the currency of the item.
- Year of webscrap
- Month of webscrap
- Month name
- Day of webscrap


In [201]:
# Creating custom columns for further analysis

# The first two words of the product name decide the product type: the
# prefixes in `temp_list` are labelled as office chairs and every other value
# (including names that did not match at all) is labelled 'Cadeira Gamer'.
df_raw['product_type'] = df_raw['name'].apply(extract_two_first_words)
temp_list = ['Cadeira Office', 'Cadeira Escritório', 'Cadeira De', 'Cadeira de', 'Cadeira Ergonomic']
df_raw['product_type'] = df_raw['product_type'].apply( lambda x: 'Cadeira de Escritório' if x in temp_list else 'Cadeira Gamer')

# Split the scraped price text into amount and currency symbol.
df_raw['price'] = df_raw['preco'].apply(extract_price)
df_raw['currency'] = df_raw['preco'].apply(extract_currency)

# Build the DatetimeIndex once and reuse it for every date part.
scrapy_index = pd.DatetimeIndex(df_raw['scrapy_time'])
df_raw['year_scrapy'] = scrapy_index.year
df_raw['month_scrapy'] = scrapy_index.month
# English month abbreviations. February used to be written 'Fev'
# (Portuguese), which was inconsistent with the other eleven labels.
month_names = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec',
}
df_raw['month_name'] = df_raw['month_scrapy'].map(month_names)
df_raw['day_scrapy'] = scrapy_index.day

# Keep only the analysis columns (drops the raw 'preco' text). The explicit
# copy avoids SettingWithCopyWarning on the assignments below.
df_raw = df_raw[['product_type','name','price','currency','descricao', 'year_scrapy', 'month_scrapy', 'month_name', 'day_scrapy', 'scrapy_time', 'url', 'url_img']].copy()

# Convert the Brazilian-formatted amount into a float:
#   '---'  -> '0'   (placeholder amount, mapped to a price of 0)
#   '.'    -> ''    (thousands separator)
#   ','    -> '.'   (decimal separator)
# regex=False makes every replacement literal; otherwise '.' could be read as
# the regex wildcard, depending on the pandas version's default.
df_raw['price'] = (
    df_raw['price']
    .str.replace('---', '0', regex=False)
    .str.replace('.', '', regex=False)
    .str.replace(',', '.', regex=False)
    .astype(float)
)


In [202]:
# Derive the product code of every row from its product URL.
df_raw['product_code'] = df_raw['url'].map(extract_product_code)

In [203]:
# A price of exactly 0 marks the item as out of stock ('no'); any other value
# is treated as available ('yes').
price_is_zero = df_raw['price'] == 0
df_raw['in_stock'] = price_is_zero.map({True: 'no', False: 'yes'})

# Final column order of the dataset.
final_columns = [
    'product_type', 'product_code', 'name', 'price', 'in_stock', 'currency',
    'descricao', 'year_scrapy', 'month_scrapy', 'month_name', 'day_scrapy',
    'scrapy_time', 'url', 'url_img',
]
df_raw = df_raw[final_columns]

# Keep only rows whose product type mentions 'Cadeira'.
# NOTE(review): product_type is assigned either 'Cadeira de Escritório' or
# 'Cadeira Gamer' for every row earlier in the notebook, so this filter
# currently keeps all rows - confirm whether it should test `name` instead.
is_chair = df_raw['product_type'].str.contains('Cadeira', na=False)
df_raw = df_raw[is_chair]


In [204]:
# Display the transformed DataFrame.
df_raw

Unnamed: 0,product_type,product_code,name,price,in_stock,currency,descricao,year_scrapy,month_scrapy,month_name,day_scrapy,scrapy_time,url,url_img
0,Cadeira Gamer,313528,"Cadeira Gamer Prizi PZ1005, Peso Suportado até...",379.90,yes,R$,Cadeira Gamer Prizi Amarela - PZ1005A Cadeira ...,2023,5,May,8,2023-05-08 10:49:16.687476,https://www.kabum.com.br/produto/313528/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
1,Cadeira Gamer,320746,"Cadeira Gamer Prizi Pz1005, Capacidade de peso...",379.90,yes,R$,Cadeira Gamer Prizi Roxa - Pz1005A Cadeira Pz1...,2023,5,May,8,2023-05-08 10:49:16.688483,https://www.kabum.com.br/produto/320746/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
2,Cadeira Gamer,320739,"Cadeira Gamer Prizi, PZ1005, Suporta Até 120kg...",379.90,yes,R$,Cadeira Gamer Prizi Vermelho - Pz1005A Cadeira...,2023,5,May,8,2023-05-08 10:49:16.688483,https://www.kabum.com.br/produto/320739/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
3,Cadeira Gamer,265788,"Cadeira Gamer Mymax Mx5, Suportado até 150Kg, ...",710.15,yes,R$,"A nova linha de Cadeira Gamer Mymax, são as ma...",2023,5,May,8,2023-05-08 10:49:16.688483,https://www.kabum.com.br/produto/265788/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
4,Cadeira Gamer,313526,"Cadeira Gamer Prizi Runner, 180Kg, Giratória, ...",359.90,yes,R$,Cadeira Gamer Prizi Runner - Azul Desenvolvida...,2023,5,May,8,2023-05-08 10:49:16.689478,https://www.kabum.com.br/produto/313526/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1926,Cadeira Gamer,374439,Cadeira Gamer Evolut Eg-905 Tanker V2 Laranja,0.00,no,R$,CADEIRA GAMER EVOLUT EG-905 TANKER V2 Confo...,2023,5,May,8,2023-05-08 10:53:33.957546,https://www.kabum.com.br/produto/374439/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
1927,Cadeira Gamer,374440,Cadeira Gamer Evolut Eg-905 Tanker V2 Branco,0.00,no,R$,CADEIRA GAMER EVOLUT EG-905 TANKER V2 Confo...,2023,5,May,8,2023-05-08 10:53:33.957546,https://www.kabum.com.br/produto/374440/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
1928,Cadeira Gamer,374441,Cadeira Gamer Evolut Eg-905 Tanker V2 Azul,0.00,no,R$,CADEIRA GAMER EVOLUT EG-905 TANKER V2 Confo...,2023,5,May,8,2023-05-08 10:53:33.957546,https://www.kabum.com.br/produto/374441/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...
1929,Cadeira Gamer,374451,Cadeira Gamer Evolut Eg-905 Tanker V2 Verde,0.00,no,R$,CADEIRA GAMER EVOLUT EG-905 TANKER V2 Confo...,2023,5,May,8,2023-05-08 10:53:33.957546,https://www.kabum.com.br/produto/374451/cadeir...,https://images.kabum.com.br/produtos/fotos/syn...


# Data Loading

In [205]:
# Save the transformed dataset as one CSV snapshot per day:
# raw-data/cadeira-gamer-YYYY-MM-DD.csv
import os

string = 'raw-data/cadeira-gamer-' + str(datetime.today().date()) + '.csv'
try:
    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(os.path.dirname(string), exist_ok=True)
    # 'utf-8-sig' writes a BOM at the start of the file.
    df_raw.to_csv(path_or_buf=string, sep=',', encoding='utf-8-sig', index=False)
except OSError as exc:
    # Saving stays best-effort, but the failure is reported instead of being
    # silently swallowed, so a missing snapshot does not go unnoticed.
    print(f'Could not save {string}: {exc}')