# Desafio Estágio em Análise de Dados - SEAZONE

## Web scraping com Python e Beautiful Soup

In [1]:
# Importando as bibliotecas necessárias
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup

In [2]:
# Load the listings spreadsheet into a DataFrame
df = pd.read_excel(io="planilha_de_anncios_-_florianpolis_-_igor_almeida.xlsx")

In [3]:
# Dimensions of the loaded sheet as (rows, columns)
df.shape

(80, 7)

In [4]:
# Preview the first rows of the loaded sheet
df.head()

Unnamed: 0,Listing,Bedrooms,Days Avaliable,Avg Daily Rate,Occ.,Revenue,# Ratings
0,https://www.airbnb.com/rooms/901903,1.0,313.0,268.0,0.56,47200.0,62.0
1,https://www.airbnb.com/rooms/901903,1.0,313.0,268.0,0.56,47200.0,62.0
2,https://www.airbnb.com/rooms/690060,1.0,272.0,268.0,0.33,24400.0,50.0
3,https://www.airbnb.com/rooms/5288261,1.0,312.0,200.0,0.21,12800.0,12.0
4,https://www.airbnb.com/rooms/50133767,1.0,213.0,394.0,0.5,42100.0,22.0


In [5]:
# Some rows are repeated in the sheet; keep only the first occurrence of each
df = df.drop_duplicates(keep="first")

In [6]:
# Rename the "# Ratings" column to "Reviews", which describes it better
df = df.rename({'# Ratings': 'Reviews'}, axis="columns")

In [7]:
# Check the frame after removing duplicates and renaming the column
df.head()

Unnamed: 0,Listing,Bedrooms,Days Avaliable,Avg Daily Rate,Occ.,Revenue,Reviews
0,https://www.airbnb.com/rooms/901903,1.0,313.0,268.0,0.56,47200.0,62.0
2,https://www.airbnb.com/rooms/690060,1.0,272.0,268.0,0.33,24400.0,50.0
3,https://www.airbnb.com/rooms/5288261,1.0,312.0,200.0,0.21,12800.0,12.0
4,https://www.airbnb.com/rooms/50133767,1.0,213.0,394.0,0.5,42100.0,22.0
5,https://www.airbnb.com/rooms/49910078,1.0,235.0,418.0,0.57,56000.0,23.0


In [8]:
# (rows, columns) after the duplicated rows were removed
df.shape

(78, 7)

In [9]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 0 to 79
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Listing         78 non-null     object 
 1   Bedrooms        78 non-null     float64
 2   Days Avaliable  78 non-null     float64
 3   Avg Daily Rate  78 non-null     float64
 4   Occ.            78 non-null     float64
 5   Revenue         78 non-null     float64
 6   Reviews         78 non-null     float64
dtypes: float64(6), object(1)
memory usage: 4.9+ KB


In [10]:
# Build a new frame with only the listings that have >= 200 available days
# and >= 20 reviews.
# Both conditions are combined into a single boolean mask: chaining
# df[mask_a][mask_b] forces pandas to reindex the second mask and emits a
# UserWarning. The .copy() makes the result independent of `df`, so columns
# can be added to it later without a SettingWithCopyWarning.
df_anuncios_validos = df[(df['Days Avaliable'] >= 200) & (df['Reviews'] >= 20)].copy()

  df_anuncios_validos = df[df['Days Avaliable'] >= 200][df['Reviews'] >= 20]


In [11]:
# Renumber the rows from 0 and expose that position as an `Id` column.
# The non-inplace chain returns a brand-new frame, so the cell can be re-run
# safely and adding the column never writes into a slice of `df`.
df_anuncios_validos = (
    df_anuncios_validos
    .reset_index(drop=True)
    .assign(Id=lambda frame: frame.index)
)
df_anuncios_validos

Unnamed: 0,Listing,Bedrooms,Days Avaliable,Avg Daily Rate,Occ.,Revenue,Reviews,Id
0,https://www.airbnb.com/rooms/901903,1.0,313.0,268.0,0.56,47200.0,62.0,0
1,https://www.airbnb.com/rooms/690060,1.0,272.0,268.0,0.33,24400.0,50.0,1
2,https://www.airbnb.com/rooms/50133767,1.0,213.0,394.0,0.5,42100.0,22.0,2
3,https://www.airbnb.com/rooms/49910078,1.0,235.0,418.0,0.57,56000.0,23.0,3
4,https://www.airbnb.com/rooms/47422405,1.0,298.0,251.0,0.72,54200.0,38.0,4
5,https://www.airbnb.com/rooms/47221270,1.0,245.0,299.0,0.6,43700.0,21.0,5
6,https://www.airbnb.com/rooms/45802524,1.0,302.0,193.0,0.54,31500.0,28.0,6
7,https://www.airbnb.com/rooms/45676264,1.0,365.0,153.0,0.56,31300.0,40.0,7
8,https://www.airbnb.com/rooms/45077589,1.0,354.0,257.0,0.42,38000.0,26.0,8
9,https://www.airbnb.com/rooms/44322051,1.0,245.0,367.0,0.69,62000.0,20.0,9


In [12]:
# Export the filtered table; it is the one used from here on
df_anuncios_validos.to_excel(excel_writer='tab_seazone_tratados.xlsx', index=False)

PermissionError: [Errno 13] Permission denied: 'tab_seazone_tratados.xlsx'

In [13]:
# Collect the URLs of the valid listings only, then show them and their count
list_anuncios_validos = df_anuncios_validos["Listing"].values
print(list_anuncios_validos, len(list_anuncios_validos), sep="\n")

['https://www.airbnb.com/rooms/901903'
 'https://www.airbnb.com/rooms/690060'
 'https://www.airbnb.com/rooms/50133767'
 'https://www.airbnb.com/rooms/49910078'
 'https://www.airbnb.com/rooms/47422405'
 'https://www.airbnb.com/rooms/47221270'
 'https://www.airbnb.com/rooms/45802524'
 'https://www.airbnb.com/rooms/45676264'
 'https://www.airbnb.com/rooms/45077589'
 'https://www.airbnb.com/rooms/44322051'
 'https://www.airbnb.com/rooms/44321789'
 'https://www.airbnb.com/rooms/44114093'
 'https://www.airbnb.com/rooms/4148993'
 'https://www.airbnb.com/rooms/41406424'
 'https://www.airbnb.com/rooms/4075739'
 'https://www.airbnb.com/rooms/40701296'
 'https://www.airbnb.com/rooms/40426283'
 'https://www.airbnb.com/rooms/39055969'
 'https://www.airbnb.com/rooms/37110904'
 'https://www.airbnb.com/rooms/36598119'
 'https://www.airbnb.com/rooms/36453155'
 'https://www.airbnb.com/rooms/35574792'
 'https://www.airbnb.com/rooms/34749528'
 'https://www.airbnb.com/rooms/34749170'
 'https://www.airbnb.c

### TESTANDO

In [13]:
# Single listing page used to try out the selectors before scraping every listing
url = 'https://www.airbnb.com/rooms/901903'

In [14]:
# Download the test listing page and parse its HTML.
# The timeout stops the cell from hanging forever if the server never answers.
soup = BeautifulSoup(requests.get(url, timeout=30).content, 'html.parser')

In [45]:
# Rating
soup.find("span", class_="_17p6nbba").get_text()

'4.72 ·'

In [15]:
# Neighbourhood
soup.find("span", class_="_9xiloll").get_text()

'Florianopolis, Santa Catarina, Brazil'

In [49]:
# Attractions / location
soup.find("div", class_="_1jlr81g").get_text()

'91% of recent guests gave the location a 5-star rating.'

In [64]:
# Physical quality (amenities), joined with commas
soup.find("div", class_="_1byskwn").get_text(separator=",")

'Kitchen,Wifi,Free parking on premises,Pool,TV,Air conditioning,Hair dryer,Refrigerator,Unavailable: Carbon monoxide alarm,Carbon monoxide alarm,Unavailable: Smoke alarm,Smoke alarm'

### Fim da testagem

## Criando função para capturar as informações necessárias das páginas

def _text_or_not_available(soup, tag, css_class, separator=""):
    """Return the text of the first ``tag`` element carrying ``css_class``.

    Returns the string "Not Available" when the page has no such element,
    instead of failing on the ``None`` that ``soup.find`` gives back.
    """
    node = soup.find(tag, {"class": css_class})
    if node is None:
        return "Not Available"
    return node.get_text(separator)


def extract_basic_features(listing_html, request_timeout=30):
    """Scrape the basic features of every listing page.

    Parameters
    ----------
    listing_html : iterable of str
        URLs of the listing pages to download.
    request_timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.

    Returns
    -------
    dict
        Keys ``'bairro'``, ``'rating'``, ``'localizacao'`` and
        ``'quali_fisica'``; each value is a list with one entry per URL, in
        input order. A field missing from a page is recorded as
        "Not Available". An empty input yields empty lists.

    Notes
    -----
    The previous version filled the dictionary only after the loop had
    finished, so it kept just the last listing and failed on empty input.
    Network errors raised by ``requests`` are not caught.
    """
    features_dict = {'bairro': [], 'rating': [], 'localizacao': [], 'quali_fisica': []}
    for link in listing_html:
        soup = BeautifulSoup(requests.get(link, timeout=request_timeout).content, 'html.parser')

        features_dict['bairro'].append(_text_or_not_available(soup, "span", "_9xiloll"))
        features_dict['rating'].append(_text_or_not_available(soup, "span", "_17p6nbba"))
        features_dict['localizacao'].append(_text_or_not_available(soup, "div", "_1jlr81g"))
        # Amenities are spread over several child nodes; join them with commas.
        features_dict['quali_fisica'].append(_text_or_not_available(soup, "div", "_1byskwn", ","))

        # Pause between requests to avoid hammering the site.
        time.sleep(2)

    return features_dict

In [14]:
# Markup sample for the third field: <span class="_4oybiu" aria-hidden="true">4,9</span>

# Module-level list holding the rows of the most recent extract_info() call;
# later cells read it directly.
dados_extraidos = []


def extract_info(list_anuncios, request_timeout=30):
    """Scrape four text fields from every listing page.

    Parameters
    ----------
    list_anuncios : iterable of str
        URLs of the listing pages to download.
    request_timeout : float, optional
        Seconds to wait for each HTTP response before ``requests`` raises.

    Returns
    -------
    list of list of str
        The module-level ``dados_extraidos`` list, holding one row per URL in
        input order. Each row contains the text of the ``_9xiloll``,
        ``_17p6nbba`` and ``_4oybiu`` spans and of the ``_1byskwn`` div
        (comma-joined), with "Not Available" for any element missing from
        the page.

    Notes
    -----
    ``dados_extraidos`` is emptied at the start of every call, so re-running
    the call replaces the previous rows instead of appending duplicates.
    Network errors raised by ``requests`` are not caught.
    """
    # (tag, CSS class, get_text separator), in output column order.
    selectors = (
        ("span", "_9xiloll", ""),
        ("span", "_17p6nbba", ""),
        ("span", "_4oybiu", ""),
        ("div", "_1byskwn", ","),
    )

    dados_extraidos.clear()
    for link in list_anuncios:
        soup = BeautifulSoup(requests.get(link, timeout=request_timeout).content, 'html.parser')

        anuncio = []
        for tag, css_class, separator in selectors:
            node = soup.find(tag, {"class": css_class})
            # soup.find returns None when the element is absent from the page.
            anuncio.append("Not Available" if node is None else node.get_text(separator))
        dados_extraidos.append(anuncio)

        # Pause between requests to avoid hammering the site.
        time.sleep(2)

    return dados_extraidos

In [15]:
# Scrape every valid listing; the rows are also kept in the module-level
# `dados_extraidos` list that the following cells read.
extract_info(list_anuncios_validos)

[['Florianopolis, Santa Catarina, Brazil',
  '4.72 ·',
  '4.6',
  'Kitchen,Wifi,Free parking on premises,Pool,TV,Air conditioning,Hair dryer,Refrigerator,Unavailable: Carbon monoxide alarm,Carbon monoxide alarm,Unavailable: Smoke alarm,Smoke alarm'],
 ['Florianopolis, Santa Catarina, Brazil',
  '4.91 ·',
  '4.9',
  'Beach access,Kitchen,Wifi,Free street parking,Pool,TV,Washer,Air conditioning,Unavailable: Carbon monoxide alarm,Carbon monoxide alarm,Unavailable: Smoke alarm,Smoke alarm'],
 ['Jurerê Internacional, Santa Catarina, Brazil',
  '4.94 ·',
  '4.8',
  'Park view,Beach access,Kitchen,Wifi,Free parking on premises,TV with standard cable,Elevator,Washer,Unavailable: Carbon monoxide alarm,Carbon monoxide alarm,Unavailable: Smoke alarm,Smoke alarm'],
 ['Jurerê Internacional, Santa Catarina, Brazil',
  '4.93 ·',
  '4.9',
  'Ocean view,Garden view,Beach access – Beachfront,Kitchen,Wifi,Free parking on premises,Shared pool,TV,Elevator,EV charger'],
 ['Lagoa da Conceição, Santa Catarina

In [75]:
# Number of scraped rows
len(dados_extraidos)

46

In [76]:
# Inspect the first scraped row
dados_extraidos[0]

['Florianopolis, Santa Catarina, Brazil',
 '4.72 ·',
 '91% of recent guests gave the location a 5-star rating.',
 'Kitchen,Wifi,Free parking on premises,Pool,TV,Air conditioning,Hair dryer,Refrigerator,Unavailable: Carbon monoxide alarm,Carbon monoxide alarm,Unavailable: Smoke alarm,Smoke alarm']

In [77]:
# Turn the scraped rows into a table, one named column per extracted field
df_anuncios = pd.DataFrame(
    data=dados_extraidos,
    columns=[
        'Bairro',
        'Rating',
        'Localizacao',
        'Quali_fisica',
    ],
)

In [78]:
# Display the scraped table
df_anuncios

Unnamed: 0,Bairro,Rating,Localizacao,Quali_fisica
0,"Florianopolis, Santa Catarina, Brazil",4.72 ·,91% of recent guests gave the location a 5-sta...,"Kitchen,Wifi,Free parking on premises,Pool,TV,..."
1,"Florianopolis, Santa Catarina, Brazil",4.91 ·,Check yourself in with the smartlock.,"Beach access,Kitchen,Wifi,Free street parking,..."
2,"Jurerê Internacional, Santa Catarina, Brazil",4.94 ·,"Superhosts are experienced, highly rated hosts...","Park view,Beach access,Kitchen,Wifi,Free parki..."
3,"Jurerê Internacional, Santa Catarina, Brazil",4.93 ·,95% of recent guests gave the location a 5-sta...,"Ocean view,Garden view,Beach access – Beachfro..."
4,"Lagoa da Conceição, Santa Catarina, Brazil",4.88 ·,Check yourself in with the lockbox.,"Garden view,Sea view,Beach access – Beachfront..."
5,"Centro, Santa Catarina, Brazil",5.0 ·,A private room with wifi that’s well-suited fo...,"Beach access – Beachfront,Kitchen,Wifi,Dedicat..."
6,"Centro, Santa Catarina, Brazil",4.78 ·,"Superhosts are experienced, highly rated hosts...","Kitchen,Wifi,Free parking on premises,Pool,Ele..."
7,Not Available,Not Available,Not Available,Not Available
8,"Barra da Lagoa, Santa Catarina, Brazil",4.87 ·,You can check in with the doorman.,"Lake access,Kitchen,Wifi,Free parking on premi..."
9,"Centro, Santa Catarina, Brazil",4.56 ·,This is one of the few places in the area with...,"Beach access – Beachfront,Kitchen,Wifi,Dedicat..."


In [79]:
# (rows, columns) of the scraped table
df_anuncios.shape

(46, 4)

In [80]:
# Attach each listing's Id to its scraped row.
# The Id column is matched to the scraped rows by index label, so both frames
# must describe the same listings in the same order; the assertion catches a
# scrape whose row count drifted from the listings table.
assert len(df_anuncios) == len(df_anuncios_validos), (
    "scraped rows and valid listings differ in length; re-run the scraping cell"
)
# .assign returns a new frame, so `df_anuncios` is not modified through an alias.
dados_airbnb = df_anuncios.assign(Id=df_anuncios_validos['Id'])
dados_airbnb

Unnamed: 0,Bairro,Rating,Localizacao,Quali_fisica,Id
0,"Florianopolis, Santa Catarina, Brazil",4.72 ·,91% of recent guests gave the location a 5-sta...,"Kitchen,Wifi,Free parking on premises,Pool,TV,...",0
1,"Florianopolis, Santa Catarina, Brazil",4.91 ·,Check yourself in with the smartlock.,"Beach access,Kitchen,Wifi,Free street parking,...",1
2,"Jurerê Internacional, Santa Catarina, Brazil",4.94 ·,"Superhosts are experienced, highly rated hosts...","Park view,Beach access,Kitchen,Wifi,Free parki...",2
3,"Jurerê Internacional, Santa Catarina, Brazil",4.93 ·,95% of recent guests gave the location a 5-sta...,"Ocean view,Garden view,Beach access – Beachfro...",3
4,"Lagoa da Conceição, Santa Catarina, Brazil",4.88 ·,Check yourself in with the lockbox.,"Garden view,Sea view,Beach access – Beachfront...",4
5,"Centro, Santa Catarina, Brazil",5.0 ·,A private room with wifi that’s well-suited fo...,"Beach access – Beachfront,Kitchen,Wifi,Dedicat...",5
6,"Centro, Santa Catarina, Brazil",4.78 ·,"Superhosts are experienced, highly rated hosts...","Kitchen,Wifi,Free parking on premises,Pool,Ele...",6
7,Not Available,Not Available,Not Available,Not Available,7
8,"Barra da Lagoa, Santa Catarina, Brazil",4.87 ·,You can check in with the doorman.,"Lake access,Kitchen,Wifi,Free parking on premi...",8
9,"Centro, Santa Catarina, Brazil",4.56 ·,This is one of the few places in the area with...,"Beach access – Beachfront,Kitchen,Wifi,Dedicat...",9


In [82]:
# Save the scraped data (with the listing Id) to a spreadsheet
dados_airbnb.to_excel(excel_writer='dados_airbnb.xlsx', index=False)