## Cleaning data extracted from Imovirtual

In [1]:
import pandas as pd

file_path = 'data/raw/raw_imovirtual_2024-07-25.parquet'
df = pd.read_parquet(file_path)
df.head()

Unnamed: 0,title,price_euro,location,link,info_agg,site,date_extracted,offer_type_search,property_type_search,location_search,sub_location_search
0,"Sobralinho, T2, 78m2, 100% renovado materiais ...",225 000 €,"Rua 1º de Maio - Bom Sucesso, Sobralinho, Alve...",/pt/anuncio/sobralinho-t2-78m2-100-renovado-ma...,"[T2, 78 m², 2885 €/m², rés do chão]",imovirtual,2024-07-25T19:52:26.974135,comprar,apartamento,lisboa,
1,T2 NOVO :: Odivelas - Paiã : Salão 58m2 : Equi...,397 000 €,"Rua do Casal da Serrinha - Paiã, Pontinha e Fa...",/pt/anuncio/t2-novo-odivelas-paia-salao-58m2-e...,"[T2, 131 m², 3031 €/m²]",imovirtual,2024-07-25T19:52:26.974135,comprar,apartamento,lisboa,
2,Apartamento T3 inserido em condomínio privado,695 000 €,"Centro, Loures, Loures, Lisboa",/pt/anuncio/apartamento-t3-inserido-em-condomi...,"[T3, 177 m², 3927 €/m²]",imovirtual,2024-07-25T19:52:26.974135,comprar,apartamento,lisboa,
3,Apartamento T3 em Oeiras e São Julião da Barra...,425 000 €,"Terrugem - Q.ta do Torneiro - Q.ta da Fonte, O...",/pt/anuncio/apartamento-t3-em-oeiras-e-sao-jul...,"[T3, 142 m², 2993 €/m²]",imovirtual,2024-07-25T19:52:26.974135,comprar,apartamento,lisboa,
4,Apartamento RECENTE DUPLEX com 5 assoalhadas á...,298 000 €,"Santa Maria, São Pedro e Matacães, Torres Vedr...",/pt/anuncio/apartamento-recente-duplex-com-5-a...,"[T3, 175 m², 1703 €/m²]",imovirtual,2024-07-25T19:52:26.974135,comprar,apartamento,lisboa,


In [2]:
# df.duplicated().sum()

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31265 entries, 0 to 31264
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   title                 31265 non-null  object
 1   price_euro            31265 non-null  object
 2   location              31264 non-null  object
 3   link                  31265 non-null  object
 4   info_agg              31265 non-null  object
 5   site                  31265 non-null  object
 6   date_extracted        31265 non-null  object
 7   offer_type_search     31265 non-null  object
 8   property_type_search  31265 non-null  object
 9   location_search       31265 non-null  object
 10  sub_location_search   31265 non-null  object
dtypes: object(11)
memory usage: 2.6+ MB


## Splitting information

### `location`
- Each location stays in its own column, separated by colons. 
- None of the columns that were split were removed.

In [4]:
# Split location info
df_location_info_splitted = (
    df['location']
    .str.split(',', expand=True)
)
df_location_info_splitted.columns = [
    f'location_zone_{n}' for n in df_location_info_splitted.columns
]
df = df.join(df_location_info_splitted)

### `info_agg`
- The records in this column come as a list with varying sizes. Only two fields are important:
    - `info_0`: New column name: `num_bedrooms`.
    - `info_1`: New column name: `area_m2`.

In [5]:
# split information aggregated
max_length = df['info_agg'].apply(len).max()
df_info_agg_splitted = pd.DataFrame(
    df.info_agg.to_list(), 
    columns = [f'info_{i}' for i in range(max_length)] 
)
df = df.join(df_info_agg_splitted)

# remove columns with no interest
df = df.drop(
    columns= [
        f'info_{n}' for n in range(max_length)
        if n not in (0,1)
    ]
)

# rename columns
df.rename(
    columns={
        'info_0': 'num_bedrooms',
        'info_1': 'area_m2',
    }, 
    inplace= True
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31265 entries, 0 to 31264
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   title                 31265 non-null  object
 1   price_euro            31265 non-null  object
 2   location              31264 non-null  object
 3   link                  31265 non-null  object
 4   info_agg              31265 non-null  object
 5   site                  31265 non-null  object
 6   date_extracted        31265 non-null  object
 7   offer_type_search     31265 non-null  object
 8   property_type_search  31265 non-null  object
 9   location_search       31265 non-null  object
 10  sub_location_search   31265 non-null  object
 11  location_zone_0       31264 non-null  object
 12  location_zone_1       31264 non-null  object
 13  location_zone_2       31264 non-null  object
 14  location_zone_3       28113 non-null  object
 15  location_zone_4       9901 non-null 

#### `num_bedrooms`
- Values must be in `[T0, T1, ..., T9, T9+]`. 
- When they are not in this list, it indicates the area, so this change will be made.

In [6]:
values_accepted = [ f'T{n}' for n in range(10)]
values_accepted.append('T9+')

# Define function to alter dataframe
def correct_num_bedrooms_values(row):
    if (
        row['num_bedrooms'] 
        and row['num_bedrooms'] not in values_accepted
    ):
        row['area_m2'] = row['num_bedrooms']
        row['num_bedrooms'] = None
    return row

# Define variables to test
total_rows = df.shape[0]
rows_to_change = df[~df.num_bedrooms.str.contains('T')].shape[0]
rows_not_null = total_rows - rows_to_change
records_with_wrong_area = df.loc[
    (df.area_m2.str.contains('€', na=False)) &
    (~df.num_bedrooms.str.contains('T'))
].shape[0]


print('Values accepted: ',values_accepted)
print('Number of records with no accepted values:', rows_to_change)
print('Rows with wrong area: ', records_with_wrong_area)

Values accepted:  ['T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'T9+']
Number of records with no accepted values: 74
Rows with wrong area:  67


In [7]:
df = df.apply(correct_num_bedrooms_values, axis = 1)

**Test**

In [8]:
df.num_bedrooms.value_counts(dropna= False)

num_bedrooms
T3      9325
T2      8801
T4      4865
T1      3525
T5      1733
T9+     1266
T0       647
T6       625
T7       189
T8       122
T9        93
None      74
Name: count, dtype: int64

In [9]:
assert df.num_bedrooms.notna().sum() == rows_not_null
print('Passed')

Passed


### `area_m2`
- Keep only the numbers and convert them to integers.

In [10]:
records_with_wrong_area = df.loc[
    (df.area_m2.str.contains('€', na=False)) 
].shape[0]

print(f'There are {records_with_wrong_area} rows with wrong data!')


There are 0 rows with wrong data!


In [11]:
df['area_m2'] = df['area_m2'].str.replace(r'\D', '', regex=True)
df['area_m2'] = pd.to_numeric(df['area_m2'], errors= 'coerce')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31265 entries, 0 to 31264
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   title                 31265 non-null  object
 1   price_euro            31265 non-null  object
 2   location              31264 non-null  object
 3   link                  31265 non-null  object
 4   info_agg              31265 non-null  object
 5   site                  31265 non-null  object
 6   date_extracted        31265 non-null  object
 7   offer_type_search     31265 non-null  object
 8   property_type_search  31265 non-null  object
 9   location_search       31265 non-null  object
 10  sub_location_search   31265 non-null  object
 11  location_zone_0       31264 non-null  object
 12  location_zone_1       31264 non-null  object
 13  location_zone_2       31264 non-null  object
 14  location_zone_3       28113 non-null  object
 15  location_zone_4       9901 non-null 

## Change data types

In [13]:
# Change the wrong column name
try:
    df.rename(columns={'prince': 'price_euro'}, inplace= True)
except:
    pass

try:
    df.rename(columns={'price': 'price_euro'}, inplace= True)
except:
    pass

# Change price column
df['price_euro'] = df['price_euro'].str.replace(r'\D', '', regex=True)
df['price_euro'] = pd.to_numeric(df['price_euro'], errors= 'coerce')

# remove time in date
df['date_extracted'] = pd.to_datetime(df['date_extracted']).dt.date

# Set none in empty strings in sub_location_search
df['sub_location_search'] = (
    df['sub_location_search']
    .apply(lambda x: None if x == '' else x)
)

## Drop columns and duplicates

In [14]:
# Remove location and info_agg
df = df.drop(
    columns= ['location', 'info_agg']
)

# Drop Duplicated
df.drop_duplicates(inplace=True)

## Set ID from link
- There is an ID in the end of each link, like this: `/pt/anuncio/studio-t0-c-varanda-zona-nobreqt-grande-3-mints-carro-colombo-IDZVjT`
- The ID will be used to identify each advertisement.
- We must check for duplicated ID's and ensure that all of them start with `ID`.

In [15]:
# check IDs duplicated
link_serires = df[['link']].value_counts() 
link_serires[ link_serires == 2]

Series([], Name: count, dtype: int64)

In [16]:
df[df.link == '/pt/anuncio/duplex-t0-para-venda-ID1cOqD']

Unnamed: 0,title,price_euro,link,site,date_extracted,offer_type_search,property_type_search,location_search,sub_location_search,location_zone_0,location_zone_1,location_zone_2,location_zone_3,location_zone_4,location_zone_5,location_zone_6,num_bedrooms,area_m2
15356,Duplex T0 para venda,260000.0,/pt/anuncio/duplex-t0-para-venda-ID1cOqD,imovirtual,2024-07-25,comprar,apartamento,lisboa,,Rua Carlos Saraiva,Linda-a-Velha,Algés,Linda-a-Velha e Cruz Quebrada-Dafundo,Oeiras,Lisboa,,T0,489


- The duplicated values are caused because in the search for `apartamento` (apartment), the results also include the `t0` property type.
- Therefore, we can exclude the second occurrence of the records.

In [17]:
# excluding link duplicated
df = df.drop_duplicates(subset= 'link', keep= 'first')

In [18]:
# function to extract the last part of the link
def extract_last_part(url):
    return url.rsplit('-', 1)[-1]

df['link_id'] = df['link'].apply(extract_last_part)

Now, we test for duplicates

In [19]:
assert df.shape[0] == df.link_id.nunique()

Test to see if all the link_ids atart with `ID`

In [20]:
assert df[~df.link_id.str.startswith('ID')].shape[0] == 0

## Reindex the dataframe

In [21]:
# Reindex the columns of the DataFrame
desired_columns = [
    'title', 
    'price_euro', 
    'area_m2',
    'num_bedrooms',
    'property_status',
    'location_zone_0',
    'location_zone_1',
    'location_zone_2',
    'location_zone_3', 
    'location_zone_4',
    'location_zone_5', 
    'location_zone_6', 
    'site',
    'link_id',
    'link', 
    'offer_type_search', 
    'property_type_search',
    'location_search',
    'sub_location_search',
    'date_extracted'
]
df = df.reindex(columns=desired_columns)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29957 entries, 0 to 31264
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 29957 non-null  object 
 1   price_euro            29731 non-null  float64
 2   area_m2               29957 non-null  int64  
 3   num_bedrooms          29883 non-null  object 
 4   property_status       0 non-null      float64
 5   location_zone_0       29956 non-null  object 
 6   location_zone_1       29956 non-null  object 
 7   location_zone_2       29956 non-null  object 
 8   location_zone_3       26895 non-null  object 
 9   location_zone_4       9421 non-null   object 
 10  location_zone_5       1251 non-null   object 
 11  location_zone_6       204 non-null    object 
 12  site                  29957 non-null  object 
 13  link_id               29957 non-null  object 
 14  link                  29957 non-null  object 
 15  offer_type_search     29

## Checking transformation

In [23]:
df.shape

(29957, 20)

In [24]:
df.duplicated().sum()

np.int64(0)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29957 entries, 0 to 31264
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 29957 non-null  object 
 1   price_euro            29731 non-null  float64
 2   area_m2               29957 non-null  int64  
 3   num_bedrooms          29883 non-null  object 
 4   property_status       0 non-null      float64
 5   location_zone_0       29956 non-null  object 
 6   location_zone_1       29956 non-null  object 
 7   location_zone_2       29956 non-null  object 
 8   location_zone_3       26895 non-null  object 
 9   location_zone_4       9421 non-null   object 
 10  location_zone_5       1251 non-null   object 
 11  location_zone_6       204 non-null    object 
 12  site                  29957 non-null  object 
 13  link_id               29957 non-null  object 
 14  link                  29957 non-null  object 
 15  offer_type_search     29

In [26]:
(df.isna().sum() / df.shape[0]) * 100

title                     0.000000
price_euro                0.754415
area_m2                   0.000000
num_bedrooms              0.247021
property_status         100.000000
location_zone_0           0.003338
location_zone_1           0.003338
location_zone_2           0.003338
location_zone_3          10.221317
location_zone_4          68.551591
location_zone_5          95.824014
location_zone_6          99.319024
site                      0.000000
link_id                   0.000000
link                      0.000000
offer_type_search         0.000000
property_type_search      0.000000
location_search           0.000000
sub_location_search     100.000000
date_extracted            0.000000
dtype: float64

In [27]:
df.sample(10)

Unnamed: 0,title,price_euro,area_m2,num_bedrooms,property_status,location_zone_0,location_zone_1,location_zone_2,location_zone_3,location_zone_4,location_zone_5,location_zone_6,site,link_id,link,offer_type_search,property_type_search,location_search,sub_location_search,date_extracted
10558,Apartamento T3 para venda,240000.0,143,T3,,A dos Cunhados e Maceira,Torres Vedras,Lisboa,,,,,imovirtual,ID1eqV2,/pt/anuncio/apartamento-t3-para-venda-ID1eqV2,comprar,apartamento,lisboa,,2024-07-25
24530,"Moradia em Sintra, São Pedro Penaferrim",450000.0,175,T4,,Vila de Sintra - Monserrate,S. Maria,S. Miguel,S. Martinho e S. Pedro de Penaferrim,Sintra,Lisboa,,imovirtual,ID1cdVi,/pt/anuncio/moradia-em-sintra-sao-pedro-penafe...,comprar,moradia,lisboa,,2024-07-25
19440,"Excelente T2 com grande Varanda e boa vista, j...",1988800.0,11645,T2,,Campo de Santana - Santa Marta,Santo António,Lisboa,Lisboa,,,,imovirtual,ID176vn,/pt/anuncio/excelente-t2-com-grande-varanda-e-...,comprar,apartamento,lisboa,,2024-07-25
15021,T1 | APARTAMENTO | NOVA CONSTRUÇÃO | ...,410000.0,5575,T1,,Prazeres,Estrela,Lisboa,Lisboa,,,,imovirtual,ID1d4PZ,/pt/anuncio/t1-apartamento-nova-construcao-ID1...,comprar,apartamento,lisboa,,2024-07-25
21698,Moradia V2 em São Pedro da Cadeira,195000.0,83,T2,,São Pedro da Cadeira,Torres Vedras,Lisboa,,,,,imovirtual,ID1f5XI,/pt/anuncio/moradia-v2-em-sao-pedro-da-cadeira...,comprar,moradia,lisboa,,2024-07-25
13009,"Apartamento, 480 m², Cascais e Estoril",4990000.0,480,T5,,Quinta da Marinha,Cascais e Estoril,Cascais,Lisboa,,,,imovirtual,ID1dHbs,/pt/anuncio/apartamento-480-m-cascais-e-estori...,comprar,apartamento,lisboa,,2024-07-25
17222,Prédio para venda,1150000.0,12342,T9+,,Restelo,Belém,Lisboa,Lisboa,,,,imovirtual,ID1bEZo,/pt/anuncio/predio-para-venda-ID1bEZo,comprar,apartamento,lisboa,,2024-07-25
8289,T2 em Cascais,250000.0,77,T2,,Torre,Cascais e Estoril,Cascais,Lisboa,,,,imovirtual,ID1eOdO,/pt/anuncio/t2-em-cascais-ID1eOdO,comprar,apartamento,lisboa,,2024-07-25
28709,T3 em Príncipe Real - Lisboa,2000.0,83,T3,,Alto dos Moinhos,São Domingos de Benfica,Lisboa,Lisboa,,,,imovirtual,ID1ferF,/pt/anuncio/t3-em-principe-real-lisboa-ID1ferF,arrendar,apartamento,lisboa,,2024-07-25
23371,Moradia em Construção - Empreendimento Villas ...,525000.0,157,T3,,Apelação,Camarate,Unhos e Apelação,Loures,Lisboa,,,imovirtual,ID1eOdN,/pt/anuncio/moradia-em-construcao-empreendimen...,comprar,moradia,lisboa,,2024-07-25
