In [10]:
import pandas as pd

# Read the raw Madrid housing CSV (path is relative to the notebook's
# working directory) into the frame every later cell works from.
raw_csv_path = "../data/raw/madrid_housing.csv"
df = pd.read_csv(raw_csv_path)


In [11]:
# Structural overview of `df`: dimensions, per-column summary, and a sample.
frame_shape = df.shape            # (n_rows, n_columns) tuple
print("Shape:", frame_shape)

# info() prints its own report: column names, non-null counts and dtypes.
df.info()

# Keep the preview as the final expression so the notebook renders it.
df.head(n=5)


Shape: (21742, 58)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21742 entries, 0 to 21741
Data columns (total 58 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Unnamed: 0                    21742 non-null  int64  
 1   id                            21742 non-null  int64  
 2   title                         21742 non-null  object 
 3   subtitle                      21742 non-null  object 
 4   sq_mt_built                   21616 non-null  float64
 5   sq_mt_useful                  8228 non-null   float64
 6   n_rooms                       21742 non-null  int64  
 7   n_bathrooms                   21726 non-null  float64
 8   n_floors                      1437 non-null   float64
 9   sq_mt_allotment               1432 non-null   float64
 10  latitude                      0 non-null      float64
 11  longitude                     0 non-null      float64
 12  raw_address                   16277 non-n

Unnamed: 0.1,Unnamed: 0,id,title,subtitle,sq_mt_built,sq_mt_useful,n_rooms,n_bathrooms,n_floors,sq_mt_allotment,...,energy_certificate,has_parking,has_private_parking,has_public_parking,is_parking_included_in_price,parking_price,is_orientation_north,is_orientation_west,is_orientation_south,is_orientation_east
0,0,21742,"Piso en venta en calle de Godella, 64","San Cristóbal, Madrid",64.0,60.0,2,1.0,,,...,D,False,,,,,False,True,False,False
1,1,21741,Piso en venta en calle de la del Manojo de Rosas,"Los Ángeles, Madrid",70.0,,3,1.0,,,...,en trámite,False,,,,,,,,
2,2,21740,"Piso en venta en calle del Talco, 68","San Andrés, Madrid",94.0,54.0,2,2.0,,,...,no indicado,False,,,,,,,,
3,3,21739,Piso en venta en calle Pedro Jiménez,"San Andrés, Madrid",64.0,,2,1.0,,,...,en trámite,False,,,,,False,False,True,False
4,4,21738,Piso en venta en carretera de Villaverde a Val...,"Los Rosales, Madrid",108.0,90.0,2,2.0,,,...,en trámite,True,,,True,0.0,True,True,True,True


In [12]:
# Per-column missing-value summary: absolute null count and the share of
# all rows that count represents (as a percentage).
null_counts = df.isnull().sum()
null_pct = null_counts / len(df) * 100

# Put both measures side by side, indexed by column name, with the
# percentage rounded to two decimals and the emptiest columns on top.
missing_df = (
    null_counts.to_frame(name="null_count")
    .assign(null_pct=null_pct.round(2))
    .sort_values(by="null_pct", ascending=False)
)

missing_df


Unnamed: 0,null_count,null_pct
has_private_parking,21742,100.0
door,21742,100.0
portal,21742,100.0
is_furnished,21742,100.0
rent_price_by_area,21742,100.0
longitude,21742,100.0
latitude,21742,100.0
has_public_parking,21742,100.0
is_kitchen_equipped,21742,100.0
are_pets_allowed,21742,100.0


In [13]:
# 1. Dtype of every column, echoed as plain text
dtypes = df.dtypes
print(dtypes)

# 2. Distinct-value count per column. dropna=False makes NaN count as a
#    value of its own, so an all-null column reports 1 rather than 0.
uniques = df.nunique(dropna=False)

# Pair each column's dtype with its cardinality, then order the table so
# the lowest-cardinality (constant / empty) columns come first.
unique_df = pd.DataFrame(dict(dtype=dtypes, unique_vals=uniques))
unique_df = unique_df.sort_values(by="unique_vals")

unique_df


Unnamed: 0                        int64
id                                int64
title                            object
subtitle                         object
sq_mt_built                     float64
sq_mt_useful                    float64
n_rooms                           int64
n_bathrooms                     float64
n_floors                        float64
sq_mt_allotment                 float64
latitude                        float64
longitude                       float64
raw_address                      object
is_exact_address_hidden            bool
street_name                      object
street_number                    object
portal                          float64
floor                            object
is_floor_under                   object
door                            float64
neighborhood_id                  object
operation                        object
rent_price                        int64
rent_price_by_area              float64
is_rent_price_known                bool


Unnamed: 0,dtype,unique_vals
are_pets_allowed,float64,1
door,float64,1
is_buy_price_known,bool,1
has_public_parking,float64,1
has_private_parking,float64,1
is_rent_price_known,bool,1
rent_price_by_area,float64,1
portal,float64,1
is_kitchen_equipped,float64,1
latitude,float64,1


In [14]:
# 1. Names of the columns pandas stored with a boolean dtype
bool_cols = list(df.select_dtypes(include=["bool"]).columns)
print("Boolean columns:", bool_cols)

# 2. Cardinality of each boolean column (dropna=False counts NaN as a value)
bool_uniques = df[bool_cols].nunique(dropna=False)
print("Boolean unique counts:", bool_uniques, sep="\n")

# 3. Every remaining column label; Index.difference drops the boolean
#    names (pandas sorts the resulting labels by default).
other_cols = df.columns.difference(bool_cols)

# 4. Dtype and cardinality for the non-boolean columns
dtypes_other = df[other_cols].dtypes
uniques_other = df[other_cols].nunique(dropna=False)

# Combine both into one table, lowest cardinality first.
unique_df_other = (
    dtypes_other.to_frame(name="dtype")
    .assign(unique_vals=uniques_other)
    .sort_values(by="unique_vals")
)

unique_df_other


Boolean columns: ['is_exact_address_hidden', 'is_rent_price_known', 'is_buy_price_known', 'is_renewal_needed', 'has_parking']
Boolean unique counts:
is_exact_address_hidden    2
is_rent_price_known        1
is_buy_price_known         1
is_renewal_needed          2
has_parking                2
dtype: int64


Unnamed: 0,dtype,unique_vals
is_furnished,float64,1
has_public_parking,float64,1
has_private_parking,float64,1
latitude,float64,1
longitude,float64,1
is_kitchen_equipped,float64,1
operation,object,1
portal,float64,1
are_pets_allowed,float64,1
rent_price_by_area,float64,1


In [15]:
# Column-wise missing %: share of null cells per column, most-missing first.
missing_pct = (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)
# Pass sep="\n" instead of embedding "\n" in the label: with the default
# separator, print() emits a space right after the newline, which pushes the
# first row of the Series one character to the right of the rows below it.
print("Top 10 most‐missing columns:", missing_pct.head(10), sep="\n")

# Row-wise missing count: number of null cells in each row.
row_missing = df.isnull().sum(axis=1)
# Count the rows in which strictly more than half of the columns are null.
print("Rows with >50% features missing:",
      (row_missing > (df.shape[1] / 2)).sum())


Top 10 most‐missing columns:
 has_private_parking    100.0
door                   100.0
portal                 100.0
is_furnished           100.0
rent_price_by_area     100.0
longitude              100.0
latitude               100.0
has_public_parking     100.0
is_kitchen_equipped    100.0
are_pets_allowed       100.0
dtype: float64
Rows with >50% features missing: 3682
