# **DATA CLEANING**

## IMPORTS

In [120]:
# Imports
import numpy as np
import pandas as pd

## LOADING

In [121]:
# We load the data and check the first rows
data = pd.read_csv('../data/raw/idealista_viviendas_detalle.csv', )

# We change the names of the columns to make them easier to english and to avoid special characters
data.columns = ["url", "price", "zone","neighborhood", "built_area", "usable_area", "bedrooms", "bathrooms", "floor", "orientation", "elevator", "garage", "storage_room", "balcony", "new", "condition", "year", "heating", "agency", "consumption_value", "consumption_label", "emisions_value", "emissions_label", "description", "error"]
data.head()

Unnamed: 0,url,price,zone,neighborhood,built_area,usable_area,bedrooms,bathrooms,floor,orientation,...,condition,year,heating,agency,consumption_value,consumption_label,emisions_value,emissions_label,description,error
0,https://www.idealista.com/inmueble/109356873/,440.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",76.0,70.0,2.0,2.0,3.0,exterior,...,Buen estado,2025.0,Individual,ORDUNTE Inmobiliaria,23.0,A,4.0,A,"GASTOS DE GESTIÓN de 1,5 por ciento + IVA a ca...",
1,https://www.idealista.com/inmueble/106221410/,442.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",82.0,,2.0,2.0,1.0,exterior,...,,,Central,LOIOLA GESTIÓN INMOBILIARIA,,A,,A,La nueva promoción de Loiola se compone de 42 ...,
2,https://www.idealista.com/inmueble/107750109/,381.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",58.0,,1.0,1.0,4.0,exterior,...,,,Central,LOIOLA GESTIÓN INMOBILIARIA,,A,,A,La nueva promoción de Loiola se compone de 42 ...,
3,https://www.idealista.com/inmueble/106221526/,575.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",104.0,,3.0,2.0,1.0,exterior,...,,,Central,LOIOLA GESTIÓN INMOBILIARIA,,A,,A,La nueva promoción de Loiola se compone de 42 ...,
4,https://www.idealista.com/inmueble/108491309/,306.0,deusto,"La Ribera-Ibarrekolanda, Bilbao",62.0,,1.0,1.0,1.0,exterior,...,,,,Loiola,,A,,A,"60 viviendas de obra nueva de 1, 2 y 3 habitac...",


## CLEANING

In [122]:
# We check the summarize information of the dataset, to see the types of the columns and if there are null values
data.info()

# We remove the column that we will not use
data = data.drop(columns=["error"]) # Url is not useful for our analysis, but we will keep it for now in case we want to check some house manually

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   url                1490 non-null   object 
 1   price              1417 non-null   object 
 2   zone               1490 non-null   object 
 3   neighborhood       1417 non-null   object 
 4   built_area         1417 non-null   float64
 5   usable_area        968 non-null    float64
 6   bedrooms           1407 non-null   float64
 7   bathrooms          1415 non-null   float64
 8   floor              1271 non-null   float64
 9   orientation        1249 non-null   object 
 10  elevator           1392 non-null   object 
 11  garage             371 non-null    object 
 12  storage_room       381 non-null    object 
 13  balcony            780 non-null    object 
 14  new                1407 non-null   object 
 15  condition          1286 non-null   object 
 16  year               759 n

In [123]:
# We check the rows that contain all null values (except the zona column, which is the only one that has no null values)
null_columns = data[data.drop(columns=["zone", "url"]).isnull().all(axis=1)]
# The rows that contain all null values are the houses that have been removed from the website, probably because they 
# have been sold in just a day (the time between getting the url of the houses and getting the detailed information). 

# We get how many houses have been sold in one day per zone, just out of curiosity
print(null_columns['zone'].value_counts())

# As there are more houses in a zone than in others, we get the percentage of houses that have been sold in one day per zone
print((null_columns['zone'].value_counts() / data['zone'].value_counts()).sort_values(ascending=False))
# In some zones, more than 10% of the houses have been sold in one day.

# We drop these rows, as they do not provide any useful information
data = data.dropna(how='all', subset=data.columns.difference(['zone', "url"]))

zone
abando-albia             23
begona-santutxu          10
uribarri                  9
indautxu                  9
rekalde                   6
basurto-zorroza           5
ibaiondo                  5
casco-viejo               3
deusto                    2
otxarkoaga-txurdinaga     1
Name: count, dtype: int64
zone
begona-santutxu          0.113636
abando-albia             0.082143
uribarri                 0.066176
indautxu                 0.053892
casco-viejo              0.046154
basurto-zorroza          0.033113
rekalde                  0.028037
otxarkoaga-txurdinaga    0.027778
ibaiondo                 0.023585
deusto                   0.020619
san-adrian-la-pena            NaN
Name: count, dtype: float64


In [124]:
# We check again the summarize information of the dataset
data.info()

# We check the percentage of null values per column
null_percentage = data.isnull().mean().sort_values(ascending=False)
print(null_percentage)

# We will tranform some columns to make them easier to use, changing their format, data type and filling null values.
# Let's go column by column:

#? Price: we will change the thousands separator and convert it to a numeric value (no null values)
data['price'] = data['price'].str.replace('.', '', regex=False).astype(float)

#? Zone and neighborhood: will transform them to categorical variables (no null values)
data['zone'] = data['zone'].astype('category')
data['neighborhood'] = data['neighborhood'].astype('category')

#? Built area: we will leave it as is, since there data type is correct (no null values)

#? Usable area: we will not do anything for now (null values: 32%)

#? Bedrooms: as there are just 10 null values, we will check them manually
data[data['bedrooms'].isnull()]["url"]
# All of them are studios where people can not live, so we do not take them into account and we will drop them
data = data.dropna(subset=['bedrooms'])
# We convert it to an integer. Finally (no null values)
data['bedrooms'] = data['bedrooms'].astype(int) 

# As we have drop some rows, we check the percentage and number of null values per column again
null_amount = data.isnull().sum().sort_values(ascending=False)
print(null_percentage, null_amount)

#? Bathrooms: will transform them to integer variables (no null values)
data['bathrooms'] = data['bathrooms'].astype(int)

#? Floor:

#? Orientation:

#? Elevator, garage, storage_room, balcony: we will transform these columns into binary ones and fill the null values with False, 
#? as it means that the house does not have these features
for col in ['garage', 'storage_room', 'balcony', 'elevator']:
    data[col] = data[col].notnull()
    data[col] = data[col].fillna(False)

#? New: there are just two possible values: "Obra nueva" and "Segunda Mano", so we will transform this column into a binary one
data['new'] = data['new'].map({"Obra nueva": True, "Segunda Mano": False})
# Also, the new column, as there are just 10 null values, we will check them manually and fill them with the correct value
print(data[data['new'].isnull()]["url"])
# After checking the urls, we see that all the houses are new, so we fill the null values with True
data['new'] = data['new'].fillna(True).infer_objects(copy=False)

#? Condition

#? Year:

#? Heating:

#? Agency: in the case of null values, it means that the house is being sold by the owner, so we will fill the null values with "owner"
data['agency'] = data['agency'].fillna("owner")
data['agency'] = data['agency'].astype('category')

#? Consumption value and label:

#? Emisions value and label:

#? Description: in the case of null values, it means that there is no description, so we will fill the null values with an empty string
data['description'] = data['description'].fillna("")


print(data.info())



<class 'pandas.core.frame.DataFrame'>
Index: 1417 entries, 0 to 1489
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   url                1417 non-null   object 
 1   price              1417 non-null   object 
 2   zone               1417 non-null   object 
 3   neighborhood       1417 non-null   object 
 4   built_area         1417 non-null   float64
 5   usable_area        968 non-null    float64
 6   bedrooms           1407 non-null   float64
 7   bathrooms          1415 non-null   float64
 8   floor              1271 non-null   float64
 9   orientation        1249 non-null   object 
 10  elevator           1392 non-null   object 
 11  garage             371 non-null    object 
 12  storage_room       381 non-null    object 
 13  balcony            780 non-null    object 
 14  new                1407 non-null   object 
 15  condition          1286 non-null   object 
 16  year               759 non-nu

  data['new'] = data['new'].fillna(True).infer_objects(copy=False)


In [119]:
# We almost have full information of the following columns (less than 1% of null values):
# - new
# We check the distribution of the 'new' column
print(data['new'].value_counts())
# There are just two possible values: "Obra nueva" and "Segunda Mano", so we will transform this column into a binary one
data['new'] = data['new'].map({"Obra nueva": True, "Segunda Mano": False})
# Also, the new column, as there are just 10 null values, we will check them manually and fill them with the correct value
print(data[data['new'].isnull()]["url"])
# After checking the urls, we see that all the houses are new, so we fill the null values with True
data['new'] = data['new'].fillna(True)

# 
# 
# 
# 
# bedrooms, description (null value means that there is no description) and bathrooms


# For some of the remaining columns, null values mean that the feature is not present in the house:
# - garage, storage_room, balcony and elevator. 
# We will transform these columns into binary ones and fill the null values with False, as it means that the house does not have these features
for col in ['garage', 'storage_room', 'balcony', 'elevator']:
    data[col] = data[col].notnull()
    data[col] = data[col].fillna(False)


### Format corrections

# We will change the data types of some columns to the correct ones


# We change the thousands format in the price
# data['precio'] = data['precio'].str.replace('.', '', regex=False).astype(float)


new
True    1407
Name: count, dtype: int64
0       https://www.idealista.com/inmueble/109356873/
1       https://www.idealista.com/inmueble/106221410/
2       https://www.idealista.com/inmueble/107750109/
3       https://www.idealista.com/inmueble/106221526/
4       https://www.idealista.com/inmueble/108491309/
                            ...                      
1485    https://www.idealista.com/inmueble/107389969/
1486    https://www.idealista.com/inmueble/101540464/
1487    https://www.idealista.com/inmueble/109302834/
1488    https://www.idealista.com/inmueble/108040948/
1489    https://www.idealista.com/inmueble/105151119/
Name: url, Length: 1407, dtype: object


  data['new'] = data['new'].fillna(True)


## STORE