In [1]:
# Inject a CSS rule that sets the notebook ".container" element to 100% width.
from IPython.display import HTML, display

full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

# Índice<a id="ind"></a>

- [1. Bibliotecas](#bib)
- [2. Leitura dos dados](#leitura)
- [3. Processos de limpeza](#limp)
    - [3.1 Transforma as variáveis com datas](#data) 
    - [3.2 Tratamento dos valores ausentes](#val_nan)
    - [3.3 Remove cidades que não são Rio de Janeiro](#remove_cidade)
    - [3.4 Remove hosts repetidos](#hosts_rep)
    - [3.5 Divide a coluna "host_verifications"](#divide_col)

# 1. Bibliotecas <a id='bib'></a>
[Índice](#ind)

In [2]:
# bibliotecas
import pandas as pd
import numpy as np

import importlib    
from pathlib import Path
import os

# Dataframe display: never truncate the column list of wide frames.
# `pd.set_option` is the documented entry point; the previous `pd.pandas`
# spelling relied on an attribute that is not part of the public API.
pd.set_option('display.max_columns', None)

# 2. Leitura dos dados<a  id="leitura"></a>
[Índice](#ind)

In [12]:
# Locate the "dados" folder under the current working directory
current_path = os.getcwd()
path_dados = Path(current_path).joinpath("dados")

# Read the listings file into the working dataframe
arquivo_listings = path_dados.joinpath("1_listings_opt.feather")
df = pd.read_feather(arquivo_listings)

# Report the frame dimensions and preview the first rows
print("Tamanho dos dados: ", df.shape)
df.head(5)


Tamanho dos dados:  (26366, 37)


Unnamed: 0,id,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,53344884,431412286,2021-11-11,,within an hour,1.0,1.0,0.0,"['email', 'phone']",1.0,1.0,Copacabana,-22.98299,-43.189041,Entire home/apt,6,4.5 baths,3.0,5.0,3500,3,365,0,,,,,,,,,,10,8,2,0,
1,7801456,40650139,2015-08-05,"Rio de Janeiro, Brazil",a few days or more,0.13,1.0,0.0,"['email', 'phone']",1.0,1.0,Ipanema,-22.98723,-43.204521,Entire home/apt,4,2 baths,2.0,3.0,5000,2,900,0,,,,,,,,,,5,5,0,0,
2,14333905,87749071,2016-08-03,"Rio de Janeiro, Brazil",,,,0.0,"['email', 'phone']",1.0,0.0,Pavuna,-22.808689,-43.386421,Entire home/apt,12,1 bath,2.0,4.0,681,1,1125,0,,,,,,,,,,1,1,0,0,
3,44708736,97164727,2016-09-28,"Rio de Janeiro, Brazil",within an hour,1.0,1.0,0.0,"['email', 'phone']",1.0,1.0,Glória,-22.91988,-43.170341,Entire home/apt,12,2 baths,2.0,3.0,8999,1,1,2,2021-01-10,2021-08-02,5.0,5.0,5.0,5.0,5.0,5.0,5.0,2,2,0,0,0.1
4,35351763,249666532,2019-03-18,"State of Rio de Janeiro, Brazil",,,,0.0,"['email', 'phone']",1.0,0.0,Pavuna,-22.81321,-43.385731,Entire home/apt,1,1 bath,2.0,3.0,500,1,2,0,,,,,,,,,,1,1,0,0,


# 3. Processos de limpeza<a id="limp"></a>
[Índice](#ind)

## 3.1 Transforma as variáveis com datas<a id="data"></a>

In [13]:
# Columns that hold dates
colunas_sel = ['host_since', 'first_review', 'last_review']

# Cast each of them to datetime64[ns] through a per-column dtype mapping
df = df.astype({coluna: 'datetime64[ns]' for coluna in colunas_sel})

# Check: dtypes of the converted columns
df.dtypes.loc[colunas_sel]

host_since      datetime64[ns]
first_review    datetime64[ns]
last_review     datetime64[ns]
dtype: object

## 3.2 Tratamento dos valores ausentes<a id="val_nan"></a>

Para a maioria das colunas os valores ausentes foram mantidos, quando existentes. As colunas que sofreram alterações quanto a estes valores foram:    

- "host_location": elimina linhas
- "host_since": elimina linhas
- "host_response_time": substitui por 'não informado'
- "host_response_rate": substitui por 0

Etapas para o tratamento:

**Etapa 1:** elimina valores ausentes de host_location e host_since

**Etapa 2:** substitui NaN por valores adequados nas colunas 'host_response_time' e 'host_response_rate'. Para esta substituição, vale ressaltar que a coluna 'host_response_time' é do tipo categorical; para preencher os valores ausentes com o valor desejado (string 'não informado'), devemos antes criar uma nova categoria para esta coluna.

In [14]:
# Number of missing values per column, keeping only the columns that have any
colunas_null = df.isna().sum()
cols_com_na = colunas_null.loc[colunas_null.gt(0)]
cols_com_na

host_since                        5
host_location                  4788
host_response_time             3509
host_response_rate             3509
host_acceptance_rate           3529
host_is_superhost                 7
host_has_profile_pic              5
host_identity_verified            5
bathrooms_text                   50
bedrooms                       1311
beds                            236
first_review                   7023
last_review                    7023
review_scores_rating           7023
review_scores_accuracy         7229
review_scores_cleanliness      7229
review_scores_checkin          7229
review_scores_communication    7229
review_scores_location         7230
review_scores_value            7229
reviews_per_month              7023
dtype: int64

In [15]:
# --- Step 1 ---
# Discard every row that has a missing value in
# 'host_location' or 'host_since'.

colunas_obrigatorias = ['host_location', 'host_since']
df = df.dropna(subset=colunas_obrigatorias)

In [16]:
# --- Step 2 ---
# Replace the NaN values in 'host_response_time' and 'host_response_rate'.

categoria_ausente = 'não informado'

# 'host_response_time' is categorical, so the placeholder has to exist as a
# category before it can be used as a fill value. Adding a category that is
# already present raises ValueError, so the guard keeps this cell re-runnable.
tempo_resposta = df['host_response_time']
if categoria_ausente not in tempo_resposta.cat.categories:
    tempo_resposta = tempo_resposta.cat.add_categories([categoria_ausente])

# Fill the missing values
df['host_response_time'] = tempo_resposta.fillna(categoria_ausente)
df['host_response_rate'] = df['host_response_rate'].fillna(0)



In [17]:
# Check: columns that still contain missing values after the treatment
colunas_null = df.isna().sum()
colunas_null.loc[colunas_null.gt(0)]


host_acceptance_rate           2518
host_is_superhost                 2
bathrooms_text                   43
bedrooms                       1104
beds                            177
first_review                   5146
last_review                    5146
review_scores_rating           5146
review_scores_accuracy         5308
review_scores_cleanliness      5308
review_scores_checkin          5308
review_scores_communication    5308
review_scores_location         5309
review_scores_value            5308
reviews_per_month              5146
dtype: int64

## 3.3 Remove cidades que não são Rio de Janeiro<a id="remove_cidade"></a>
[Índice](#ind)

Dentre os valores da variável 'host_location', há 4 que podem se referir à cidade do Rio de Janeiro, mas nenhum destes 4 valores faz referência explícita à cidade. Os 4 valores são:
- "Rio de Janeiro, Brazil"
- "State of Rio de Janeiro, Brazil"
- "Rio, Brazil"
- "Brazil"

Para identificar se estas localizações realmente são relativas à cidade do Rio de Janeiro, foi criada uma rotina que identifica se o bairro do host pertence a esta cidade. A lista com os bairros oficiais do Rio de Janeiro foi coletada da internet e está armazenada na pasta "dados". As linhas com localizações cujos bairros não estavam contidos na lista de bairros oficiais foram eliminadas.

Os demais valores da variável 'host_location' não são da cidade do Rio de Janeiro. As linhas com estes valores foram descartadas. Alguns destes valores são:
- São Paulo, Brazil
- Itápolis, Brazil
- Itupeva, Brazil
- Marabá, Brazil

Etapa 1: dentre as localizações que podem ser do Rio de Janeiro (os 4 valores listados acima), identificar as linhas que não são desta cidade

Etapa 2: eliminar todas as linhas que não contêm dados da cidade do Rio de Janeiro

In [18]:
# Frequency of each host location in the dataset
host_locations = (
    df['host_location']
    .value_counts()
)

host_locations

Rio de Janeiro, Brazil             15792
State of Rio de Janeiro, Brazil     1628
Rio, Brazil                         1008
Brazil                               553
São Paulo, Brazil                    496
                                   ...  
Jacareí, Brazil                        1
Itápolis, Brazil                       1
Itupeva, Brazil                        1
Itu, Brazil                            1
Marabá, Brazil                         1
Name: host_location, Length: 484, dtype: int64

### Identifica localizações que não são do Rio de Janeiro

In [19]:
# Load the Rio de Janeiro neighbourhood list from the "dados" folder
bairros_csv = pd.read_csv(path_dados/"lista_bairros_rio.csv")

# Squeeze along the columns axis and convert the result to a plain Python list
bairros = bairros_csv.squeeze("columns").to_list()

In [20]:
# host_location values that may refer to the city of Rio de Janeiro
host_location_sel=["Rio de Janeiro, Brazil",
                   "State of Rio de Janeiro, Brazil",
                   "Rio, Brazil",
                   "Brazil"]

# host_location values that are not the city of Rio de Janeiro: every other
# value present in `host_locations`. Built by exclusion rather than by slicing
# off the first four entries, so it no longer depends on the candidates above
# being exactly the four most frequent locations.
host_location_descart = [
    local for local in host_locations.index
    if local not in host_location_sel
]
        

In [21]:

# Neighbourhoods that triggered a removal and the row labels they belong to.
bairro_drop = []
index_drop = []

# Only the host locations that may refer to Rio de Janeiro are inspected.
for location_name in host_location_sel:

    # Neighbourhoods of the rows registered under this host location
    # (single .loc lookup instead of chained indexing).
    neighbourhood_sel = df.loc[df['host_location'] == location_name,
                               'neighbourhood_cleansed']

    # Vectorised membership test against the neighbourhood list; keeps only
    # the rows whose neighbourhood is NOT in `bairros`.
    fora_da_lista = neighbourhood_sel[~neighbourhood_sel.isin(bairros)]

    # Store the information needed to remove these rows from the dataset,
    # in the same order the row-by-row loop produced.
    bairro_drop.extend(fora_da_lista.tolist())
    index_drop.extend(fora_da_lista.index.tolist())


print("Linhas de devem ser eliminadas: ",index_drop)
print("Bairro das linhas eliminadas: ", bairro_drop)
            

Linhas de devem ser eliminadas:  [134, 4015, 5002]
Bairro das linhas eliminadas:  ['Gericinó', 'Gericinó', 'Jacaré']


### Elimina localizações que não são do rio de janeiro

In [22]:

# Remove the rows whose neighbourhood is not in the Rio de Janeiro list
df = df.drop(index=index_drop)

# Remove the rows whose host_location is one of the discarded locations.
# A single boolean mask replaces the previous loop, which scanned and copied
# the whole frame once per discarded location.
df = df[~df['host_location'].isin(host_location_descart)]

# Renumber the index
df = df.reset_index(drop=True)

# The labels stored in `index_drop` refer to the index as it was before the
# reset; clear them so re-running this cell cannot drop unrelated rows.
index_drop = []

# Remove the categories of 'host_location' that no longer occur
df['host_location'] = df['host_location'].cat.remove_unused_categories()

# Check
print(df['host_location'].value_counts())


Rio de Janeiro, Brazil             15789
State of Rio de Janeiro, Brazil     1628
Rio, Brazil                         1008
Brazil                               553
Name: host_location, dtype: int64


In [23]:
# Drop the 'id' and 'host_location' columns
colunas_descartadas = ['id', 'host_location']
df = df.drop(colunas_descartadas, axis='columns')

# Check
print("tamanho do dataframe: ",df.shape)

tamanho do dataframe:  (18978, 35)


## 3.4 Remove hosts repetidos<a id="hosts_rep"></a>
[Índice](#ind)

In [24]:
# Count the rows per host and keep the hosts that appear more than once
host_group = df['host_id'].value_counts()
host_repetidos = host_group.loc[host_group.gt(1)]
host_repetidos

91654021     189
6000862      109
47584281      97
1982737       90
371026651     77
            ... 
23719360       2
38561318       2
43614996       2
12000002       2
35082628       2
Name: host_id, Length: 2578, dtype: int64

In [56]:
%%time

moda =lambda x: x.value_counts().index[0]\
                    if not pd.isna(x).sum()==len(x)\
                    else float('Nan')

round_mean = lambda x: round(x.mean())\
                        if not pd.isna(x.mean())\
                        else float('Nan')


df = df.groupby("host_id", as_index=False).agg( 
    
    host_since=('host_since', moda),
    host_response_time=('host_response_time',moda),
    host_response_rate=('host_response_rate',moda),    
    host_acceptance_rate=('host_acceptance_rate','mean'),
    
    host_is_superhost=('host_is_superhost',moda),
    host_verifications=('host_verifications',moda),
    host_has_profile_pic=('host_has_profile_pic',moda),
    host_identity_verified=('host_identity_verified',moda),
    
    neighbourhood_cleansed=('neighbourhood_cleansed',moda),
    latitude=('latitude',moda),
    longitude=('longitude',moda),
    room_type=('room_type',moda),
    accommodates=('accommodates',round_mean),
    
    bathrooms_text=('bathrooms_text',moda),
    bedrooms=('bedrooms',round_mean),
    beds=('beds',round_mean),
    price=('price',round_mean),
    minimum_nights=('minimum_nights',round_mean),
    maximum_nights=('maximum_nights',round_mean),
    
    number_of_reviews=('number_of_reviews','max'),
    first_review=('first_review','min'),
    last_review=('last_review','max'),
    review_scores_rating=('review_scores_rating',moda),
    review_scores_accuracy=('review_scores_accuracy',round_mean),
    review_scores_cleanliness=('review_scores_cleanliness',round_mean),
    
    review_scores_checkin=('review_scores_checkin',round_mean),
    review_scores_communication=('review_scores_communication',round_mean),
    review_scores_location=('review_scores_location',round_mean),
    review_scores_value=('review_scores_value',round_mean),
    calculated_host_listings_count=('calculated_host_listings_count',round_mean),
    
    calculated_host_listings_count_entire_homes=('calculated_host_listings_count_entire_homes',round_mean),
    calculated_host_listings_count_private_rooms=('calculated_host_listings_count_private_rooms',round_mean),
    calculated_host_listings_count_shared_rooms=('calculated_host_listings_count_shared_rooms',round_mean),
    reviews_per_month=('reviews_per_month',moda)
)

CPU times: user 1min 51s, sys: 420 ms, total: 1min 52s
Wall time: 1min 51s


In [57]:
# Check: dimensions and per-column dtypes of the dataframe
dimensoes, tipos = df.shape, df.dtypes
print("Tamanho do dataframe: ", dimensoes)
print("\nTipos dos dados:\n", tipos)

Tamanho do dataframe:  (11222, 35)

Tipos dos dados:
 host_id                                                 uint32
host_since                                      datetime64[ns]
host_response_time                                      object
host_response_rate                                     float64
host_acceptance_rate                                   float32
host_is_superhost                                      float64
host_verifications                                      object
host_has_profile_pic                                   float64
host_identity_verified                                 float64
neighbourhood_cleansed                                  object
latitude                                               float64
longitude                                              float64
room_type                                               object
accommodates                                             int64
bathrooms_text                                          object
b

In [58]:
# Check: preview the first four rows
df.iloc[:4]

Unnamed: 0,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,11739,2009-03-29,within a few hours,0.82,1.0,0.0,"['email', 'phone']",1.0,1.0,Copacabana,-22.97242,-43.18898,Entire home/apt,4,1 bath,1.0,2.0,225,3,291,36,2016-03-20,2022-09-07,5.0,5.0,5.0,5.0,5.0,5.0,5.0,10,10,0,0,0.04
1,34105,2009-08-26,within a day,1.0,1.0,1.0,"['email', 'phone']",1.0,1.0,Copacabana,-22.96467,-43.175529,Entire home/apt,2,1 bath,1.0,1.0,184,3,90,78,2014-01-06,2022-09-12,4.62,5.0,4.0,5.0,5.0,5.0,4.0,2,2,0,0,0.74
2,37072,2009-09-08,within a day,0.83,1.0,0.0,"['email', 'phone']",1.0,0.0,Laranjeiras,-22.93598,-43.192928,Entire home/apt,14,3.5 baths,4.0,5.0,2553,1,365,0,NaT,NaT,,,,,,,,1,1,0,0,
3,48024,2009-10-24,within an hour,1.0,0.79,1.0,"['email', 'phone']",1.0,1.0,Copacabana,-22.985531,-43.2248,Entire home/apt,5,1 bath,2.0,2.0,865,5,239,114,2014-06-22,2022-09-15,5.0,5.0,5.0,5.0,5.0,5.0,5.0,8,7,1,0,1.0


In [59]:
# Save the result obtained so far under the current working directory.
# NOTE(review): the last cell of this notebook writes to this same file.
current_path = os.getcwd()
path_root = Path(current_path)
arquivo_saida = path_root.joinpath("dados/1_listings_cl.feather")
df.to_feather(arquivo_saida)

## 3.5 Divide a coluna "host_verifications"<a id="divide_col"></a>
[Índice](#ind)

In [60]:
# Reload the most recently saved file from the current working directory
current_path = os.getcwd()
path_root = Path(current_path)
arquivo_entrada = path_root.joinpath("dados/1_listings_cl.feather")
df = pd.read_feather(arquivo_entrada)

In [91]:
# Distinct values of 'host_verifications' and how often each one occurs
contagem_verificacoes = df['host_verifications'].value_counts()
contagem_verificacoes

['email', 'phone']                  9258
['phone']                           1100
['email', 'phone', 'work_email']     800
['phone', 'work_email']               29
['email']                             28
[]                                     4
['email', 'work_email']                3
Name: host_verifications, dtype: int64

In [127]:
%%time

list_email = len(df)*[0]
list_phone = len(df)*[0]
list_work_mail = len(df)*[0]


index_values = zip(
    df['host_verifications'].index, 
    df['host_verifications'].values
   )


for i,valor  in index_values:    
    
    if valor == "['email', 'phone']":
        list_email[i] = 1
        list_phone[i] = 1
        
    elif valor == "['phone']":
        list_phone[i] = 1
        
    elif valor == "['email', 'phone', 'work_email']":
        list_email[i] = 1
        list_phone[i] = 1
        list_work_mail[i] = 1
        
    elif valor == "['phone', 'work_email']":        
        list_phone[i] = 1
        list_work_mail[i] = 1
        
    elif valor == "['email']":
        list_email[i] = 1
    
    elif valor == "['email', 'work_email']":
        list_email[i] = 1        
        list_work_mail[i] = 1
    
    

CPU times: user 24.6 ms, sys: 9 µs, total: 24.6 ms
Wall time: 24.7 ms


In [136]:
# Append the three verification flag columns, then drop the original
# 'host_verifications' column they were derived from
df = df.assign(
    verifications_email=list_email,
    verifications_phone=list_phone,
    verifications_work_email=list_work_mail,
).drop(columns=['host_verifications'])


In [139]:
# Check: preview the first four rows
df.iloc[:4]

Unnamed: 0,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,longitude,room_type,accommodates,bathrooms_text,bedrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,verifications_email,verifications_phone,verifications_work_email
0,11739,2009-03-29,within a few hours,0.82,1.0,0.0,1.0,1.0,Copacabana,-22.97242,-43.18898,Entire home/apt,4,1 bath,1.0,2.0,225,3,291,36,2016-03-20,2022-09-07,5.0,5.0,5.0,5.0,5.0,5.0,5.0,10,10,0,0,0.04,1,1,0
1,34105,2009-08-26,within a day,1.0,1.0,1.0,1.0,1.0,Copacabana,-22.96467,-43.175529,Entire home/apt,2,1 bath,1.0,1.0,184,3,90,78,2014-01-06,2022-09-12,4.62,5.0,4.0,5.0,5.0,5.0,4.0,2,2,0,0,0.74,1,1,0
2,37072,2009-09-08,within a day,0.83,1.0,0.0,1.0,0.0,Laranjeiras,-22.93598,-43.192928,Entire home/apt,14,3.5 baths,4.0,5.0,2553,1,365,0,NaT,NaT,,,,,,,,1,1,0,0,,1,1,0
3,48024,2009-10-24,within an hour,1.0,0.79,1.0,1.0,1.0,Copacabana,-22.985531,-43.2248,Entire home/apt,5,1 bath,2.0,2.0,865,5,239,114,2014-06-22,2022-09-15,5.0,5.0,5.0,5.0,5.0,5.0,5.0,8,7,1,0,1.0,1,1,0


In [140]:
# Save the result under the current working directory.
# NOTE(review): this writes to the same file as the earlier checkpoint cell.
current_path = os.getcwd()
path_root = Path(current_path)
arquivo_saida = path_root.joinpath("dados/1_listings_cl.feather")
df.to_feather(arquivo_saida)