In [118]:
#Installing and libraries

import pandas as pd
import unicodedata
import os

file_path = "../data/raw-data/raw_data.parquet"


df = pd.read_parquet(file_path, engine='pyarrow')

FIRST OVERVIEW

In [119]:
def general_overview(df):
    print(f"{'Column Name':<40} | {'Type':<10} | {'Missing':<15} | {'Unique':<10} | {'Sample Values'}")
    print("-" * 110)

    for col in df.columns:
        dtype = str(df[col].dtype)
        
        missing_count = df[col].isnull().sum()
        missing_pct = (missing_count / len(df)) * 100
        missing_str = f"{missing_count} ({missing_pct:.1f}%)"
        
        n_unique = df[col].nunique()
        
        if len(df[col].dropna()) > 0:
            sample_vals = df[col].dropna().unique()
            if len(sample_vals) > 3:
                sample_vals = sample_vals[:3]
            examples = ", ".join(str(v) for v in sample_vals)
        else:
            examples = "All Empty"
            
        print(f"{col:<40} | {dtype:<10} | {missing_str:<15} | {n_unique:<10} | {examples}")
        
general_overview(df)


Column Name                              | Type       | Missing         | Unique     | Sample Values
--------------------------------------------------------------------------------------------------------------
ad_url                                   | object     | 0 (0.0%)        | 98824      | https://www.idealista.com/inmueble/102834499/, https://www.idealista.com/inmueble/105893546/, https://www.idealista.com/inmueble/103962048/
reference_id                             | int64      | 0 (0.0%)        | 98824      | 102834499, 105893546, 103962048
property_type                            | object     | 0 (0.0%)        | 9          | vivienda, viviendas, locales
ad_title                                 | object     | 0 (0.0%)        | 46182      | Alquiler de Piso en calle del Amparo, Lavapiés-Embajadores, Madrid, Piso en venta en Arapiles, Chamberí, Madrid,  Chalet pareado en venta en Coimbra - Guadarrama, Móstoles
ad_description                           | object     | 2549 (1.3%)

FIRST, WE ELIMINATE SOME USELESS FEATURES

In [120]:
#Choose the columns that we want to eliminate
cols_to_remove = ['country', 'country_code',]
df = df.drop(columns=cols_to_remove, errors='ignore')

WE LOOK FOR DUPLICATES

In [121]:
exact_dupes = df.duplicated().sum()

id_dupes = 0
if 'reference_id' in df.columns:
    id_dupes = df.duplicated(subset=['reference_id']).sum()

print(f"Dataset Shape: {df.shape}")
print(f"Exact Row Duplicates: {exact_dupes}")
print(f"Duplicates by Reference ID: {id_dupes}")

Dataset Shape: (200000, 37)
Exact Row Duplicates: 0
Duplicates by Reference ID: 101176


We will check the duplicated ID later. It can turn to be useful to make predictions on the housing prices case.

LET US START BY LOOKING AT THE OBJECT-TYPE FEATURES

In [122]:
def object_overview(df):

    #Select only object columns
    obj_cols = df.select_dtypes(include=['object']).columns

    #Loop through them and print key stats
    print(f"{'Column Name':<30} | {'Unique Values':<15} | {'Example Values'}")
    print("-" * 80)

    for col in obj_cols:
        unique_vals = df[col].unique()
        n_unique = len(unique_vals)
        # Show first 3 examples
        examples = ", ".join(str(v) for v in unique_vals[:5])
        print(f"{col:<30} | {n_unique:<15} | {examples}")

object_overview(df)

Column Name                    | Unique Values   | Example Values
--------------------------------------------------------------------------------
ad_url                         | 98824           | https://www.idealista.com/inmueble/102834499/, https://www.idealista.com/inmueble/105893546/, https://www.idealista.com/inmueble/103962048/, https://www.idealista.com/inmueble/104707721/, https://www.idealista.com/inmueble/105865316/
property_type                  | 9               | vivienda, viviendas, locales, garajes, oficinas
ad_title                       | 46182           | Alquiler de Piso en calle del Amparo, Lavapiés-Embajadores, Madrid, Piso en venta en Arapiles, Chamberí, Madrid,  Chalet pareado en venta en Coimbra - Guadarrama, Móstoles, Piso en venta en calle de Campo Real, Berruguete, Madrid, Piso en venta en avenida de la Reina Victoria, Vallehermoso, Madrid
ad_description                 | 90919           | UBK-493404(Disponible 1-11 meses) - Sin posibilidad a visitar!Homeli

FIRST WE ELIMINATE ACCENTS AND LIMIT THE TEXT TO UNICODE, EXCEPT FROM IS_EXTERIOR, AD_URL AND VIEWED_AT

In [123]:

text_cols = df.select_dtypes(include=['object']).columns
cols_to_exclude = [
    'is_exterior', 
    'ad_url', 
    'viewed_at'
]
text_cols = [col for col in text_cols if col not in cols_to_exclude]

#We define a function that normalizes strings
def normalize_text_safe(x):
    if pd.isna(x):
        return x
    return unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8')

for col in text_cols:
    df[col] = df[col].apply(normalize_text_safe)

#Let us check
#print(df['address'].head(20))

PROPERTY TYPE

In [124]:
#We take a look at the different values 
print(df["property_type"].unique())

['vivienda' 'viviendas' 'locales' 'garajes' 'oficinas' 'habitacion'
 'edificios' 'terrenos' 'trasteros']


In [125]:
#We merge vivienda and viviendas
substitution = {'viviendas': 'vivienda'}
df['property_type'] = df['property_type'].replace(substitution)


# We verify the result
print("--- After Merging ---")
print(df['property_type'].unique())

--- After Merging ---
['vivienda' 'locales' 'garajes' 'oficinas' 'habitacion' 'edificios'
 'terrenos' 'trasteros']


TRANSACTION TYPE 

In [126]:
#We merge alquiler and rent
substitution_dict = {'rent': 'alquiler', 'buy': 'venta'}
df['transaction_type'] = df['transaction_type'].replace(substitution_dict)

# We verify the result
print("--- After Merging ---")
print(df['transaction_type'].unique())

--- After Merging ---
['alquiler' 'venta']


STANDARIZE AND MERGE CITY, DISTRICT AND QUARTER NAMES

In [127]:
#Target features
target = ['city', 'district', 'quarter']
for col in target:
    #Standarize the names and merge them
    df[col] = df[col].str.title().str.strip()
    #Print to check
    print("\n", df[col].value_counts().sort_index().to_string())


 city
Ajalvir                                            205
Alalpardo                                            1
Alcala De Henares                                  114
Alcobendas                                        6038
Alcorcon                                          2193
Aldea Del Fresno                                     9
Algete                                              26
Alpedrete                                           11
Ambite                                               2
Anchuelo                                             2
Aranjuez                                            50
Arganda Del Rey                                   1204
Arroyomolinos                                      713
Batres                                               5
Becerril De La Sierra                                6
Belmonte De Tajo                                     3
Boadilla Del Monte                                3582
Borox                                                1
Bra

In [128]:
object_overview(df)

Column Name                    | Unique Values   | Example Values
--------------------------------------------------------------------------------
ad_url                         | 98824           | https://www.idealista.com/inmueble/102834499/, https://www.idealista.com/inmueble/105893546/, https://www.idealista.com/inmueble/103962048/, https://www.idealista.com/inmueble/104707721/, https://www.idealista.com/inmueble/105865316/
property_type                  | 8               | vivienda, locales, garajes, oficinas, habitacion
ad_title                       | 46102           | Alquiler de Piso en calle del Amparo, Lavapies-Embajadores, Madrid, Piso en venta en Arapiles, Chamberi, Madrid,  Chalet pareado en venta en Coimbra - Guadarrama, Mostoles, Piso en venta en calle de Campo Real, Berruguete, Madrid, Piso en venta en avenida de la Reina Victoria, Vallehermoso, Madrid
ad_description                 | 90888           | UBK-493404(Disponible 1-11 meses) - Sin posibilidad a visitar!Homel

CORRECTING TYPOS IN "STATE" AND "PROVINCE"

In [129]:
cols_to_view = ['city', 'address', 'province', 'state', 'ad_title', 'ad_description']

checks = {
    "CASTILLA LA MANCHA (STATE)": df['state'] == 'Castilla-La Mancha',
    "AVILA (PROVINCE)": df['province'] == 'Avila',
    "GUIPUZCOA (PROVINCE)": df['province'] == 'Guipuzcoa',
    "NONE / MISSING (PROVINCE)": df['province'].isna()
}

for title, mask in checks.items():
    print(f"--- {title} ---")
    print(df.loc[mask, cols_to_view].to_string(index=False))
    print("-" * 80)

--- CASTILLA LA MANCHA (STATE) ---
 city                 address province              state                                              ad_title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

We can observe that, except Avila, all are typos.

In [130]:
fix_mask = (
    (df['state'] == 'Castilla-La Mancha') | 
    (df['province'].isin(['Toledo', 'Guipuzcoa'])) | 
    (df['province'].isna())
)

df.loc[fix_mask, 'state'] = 'Comunidad de Madrid'
df.loc[fix_mask, 'province'] = 'Madrid'
#Eliminate all elements that are located in Avila
df = df[df['province'] != 'Avila']



In [131]:
object_overview(df)

Column Name                    | Unique Values   | Example Values
--------------------------------------------------------------------------------
ad_url                         | 98823           | https://www.idealista.com/inmueble/102834499/, https://www.idealista.com/inmueble/105893546/, https://www.idealista.com/inmueble/103962048/, https://www.idealista.com/inmueble/104707721/, https://www.idealista.com/inmueble/105865316/
property_type                  | 8               | vivienda, locales, garajes, oficinas, habitacion
ad_title                       | 46101           | Alquiler de Piso en calle del Amparo, Lavapies-Embajadores, Madrid, Piso en venta en Arapiles, Chamberi, Madrid,  Chalet pareado en venta en Coimbra - Guadarrama, Mostoles, Piso en venta en calle de Campo Real, Berruguete, Madrid, Piso en venta en avenida de la Reina Victoria, Vallehermoso, Madrid
ad_description                 | 90887           | UBK-493404(Disponible 1-11 meses) - Sin posibilidad a visitar!Homel

We can now remove the columns "province" and "state".

In [132]:
cols_to_remove = ['province', 'state']
df = df.drop(columns=cols_to_remove, errors='ignore')

LET US CHECK A GENERAL OVERVIEW AGAIN

In [133]:
general_overview(df)

Column Name                              | Type       | Missing         | Unique     | Sample Values
--------------------------------------------------------------------------------------------------------------
ad_url                                   | object     | 0 (0.0%)        | 98823      | https://www.idealista.com/inmueble/102834499/, https://www.idealista.com/inmueble/105893546/, https://www.idealista.com/inmueble/103962048/
reference_id                             | int64      | 0 (0.0%)        | 98823      | 102834499, 105893546, 103962048
property_type                            | object     | 0 (0.0%)        | 8          | vivienda, locales, garajes
ad_title                                 | object     | 0 (0.0%)        | 46101      | Alquiler de Piso en calle del Amparo, Lavapies-Embajadores, Madrid, Piso en venta en Arapiles, Chamberi, Madrid,  Chalet pareado en venta en Coimbra - Guadarrama, Mostoles
ad_description                           | object     | 2549 (1.3%)  

WE ELIMINATE FROM THE DATASET SMALL CITIES OR DISTANT VILLAGES WITH NO ECONOMIC IMPACT

We do this in order to avoid error from the future model

In [134]:


# We define the list of municipalities to DROP.
# These are locations that are either:
# - Too far (>1 hour from economic centers).
# - Rural/Low Liquidity (difficult to resell/rent).
# - Outside the main investment focus (e.g., Borox is in Toledo).

excluded_cities = [
    # Sierra Norte / Far North (Tourism only, low liquidity)
    'Braojos', 'Buitrago Del Lozoya', 'Bustarviejo', 'Cabanillas De La Sierra',
    'Garganta De Los Montes', 'Gargantilla Del Lozoya Y Pinilla De Buitrago',
    'Gascones', 'La Cabrera', 'Lozoyuela-Navas-Sieteiglesias', 'Montejo De La Sierra',
    'Navalafuente', 'Patones', 'Pradena Del Rincon', 'Puentes Viejas', 'Rascafria',
    'Sieteiglesias', 'Torrelaguna', 'Torremocha De Jarama', 'Valdepielagos',
    'Venturada', 'Villavieja Del Lozoya', 'El Vellon', 'Redueña', 'Robregordo',
    
    # Far East / South-East (Low connection)
    'Ambite', 'Anchuelo', 'Belmonte De Tajo', 'Brea De Tajo', 'Carabana',
    'Estremera', 'Fresno De Torote', 'Fuentiduena De Tajo', 'Los Santos De La Humosa',
    'Orusco De Tajuna', 'Pezuela De Las Torres', 'Pozuelo Del Rey', 'Ribatejada',
    'Santorcaz', 'Tielmes', 'Titulcia', 'Valdeavero', 'Valdelaguna', 'Valdilecha',
    'Villar Del Olmo', 'Villarejo De Salvanes', 'Eurovillas-Las Villas', 
    'Colmenar De Oreja', 'Chinchon',
    
    # Far South / South-West (Border zones)
    'Aldea Del Fresno', 'Batres', 'Cadalso De Los Vidrios', 'Cenicientos',
    'Chapineria', 'Colmenar Del Arroyo', 'Fresnedillas De La Oliva',
    'Navalagamella', 'Navas Del Rey', 'Pelayos De La Presa', 'Quijorna',
    'Robledo De Chavela', 'Santa Maria De La Alameda', 'Valdemaqueda',
    'Villa Del Prado', 'Villamanta', 'Villamantilla', 'Villanueva De Perales',
    'Zarzalejo',
    
    # Other / Out of Region
    'Borox' # This is in Toledo province
]

print(f"Shape before geographic filtering: {df.shape}")

# Apply the Filter
# We keep rows where 'city' is NOT in the excluded list
df = df[~df['city'].isin(excluded_cities)]

print(f"Shape after geographic filtering: {df.shape}")
print(f"Removed locations: {len(excluded_cities)}")

Shape before geographic filtering: (199998, 35)
Shape after geographic filtering: (199725, 35)
Removed locations: 68


BEFORE WE SPLIT THE DATASET INTO A NUMERICAL DATASET AND A METADATA DATASET, WE NEED TO ASSESS THE "reference_id" PROBLEM

There are 99000 unique IDs but almost 200000 cases, which means that we need to clean the entire dataset before moving on. In a first approach, we are going to look at the ID, and the price of the property. If they do not coincide, we keep both results. If they coincide, we keep the case with the latest date.

REFORMATTING DATA TYPES

We first transform booleans into 0 and 1

In [135]:
bool_map = {'True': 1, 'False': 0, True: 1, False: 0, 'TRUE': 1, 'FALSE': 0}

for col in df.select_dtypes(include=['object', 'bool']):
    unique_vals = set(df[col].dropna().unique())
    
    if unique_vals.issubset(set(bool_map.keys())):
        df[col] = df[col].map(bool_map).fillna(0).astype(int)
print("--- Booleans reformatted correctly ---")

--- Booleans reformatted correctly ---


In [136]:
general_overview(df)

Column Name                              | Type       | Missing         | Unique     | Sample Values
--------------------------------------------------------------------------------------------------------------
ad_url                                   | object     | 0 (0.0%)        | 98559      | https://www.idealista.com/inmueble/102834499/, https://www.idealista.com/inmueble/105893546/, https://www.idealista.com/inmueble/103962048/
reference_id                             | int64      | 0 (0.0%)        | 98559      | 102834499, 105893546, 103962048
property_type                            | object     | 0 (0.0%)        | 8          | vivienda, locales, garajes
ad_title                                 | object     | 0 (0.0%)        | 45847      | Alquiler de Piso en calle del Amparo, Lavapies-Embajadores, Madrid, Piso en venta en Arapiles, Chamberi, Madrid,  Chalet pareado en venta en Coimbra - Guadarrama, Mostoles
ad_description                           | object     | 2537 (1.3%)  

We treat the date feature and change it to a single year and month.

In [137]:

df['viewed_at'] = pd.to_datetime(df['viewed_at'])
df['year'] = df['viewed_at'].dt.year
df['month'] = df['viewed_at'].dt.month

print(df[['viewed_at', 'year', 'month']].head())
print(df['viewed_at'].dtype)

   viewed_at  year  month
0 2024-08-01  2024      8
1 2024-12-04  2024     12
2 2024-03-28  2024      3
3 2024-05-02  2024      5
4 2024-09-06  2024      9
datetime64[ns]


We can now remove the feature "viewed_at".

In [138]:
df = df.drop(columns='viewed_at', errors='ignore')
print("--- Column removed succesfully ---")

--- Column removed succesfully ---


In [139]:
general_overview(df)

Column Name                              | Type       | Missing         | Unique     | Sample Values
--------------------------------------------------------------------------------------------------------------
ad_url                                   | object     | 0 (0.0%)        | 98559      | https://www.idealista.com/inmueble/102834499/, https://www.idealista.com/inmueble/105893546/, https://www.idealista.com/inmueble/103962048/
reference_id                             | int64      | 0 (0.0%)        | 98559      | 102834499, 105893546, 103962048
property_type                            | object     | 0 (0.0%)        | 8          | vivienda, locales, garajes
ad_title                                 | object     | 0 (0.0%)        | 45847      | Alquiler de Piso en calle del Amparo, Lavapies-Embajadores, Madrid, Piso en venta en Arapiles, Chamberi, Madrid,  Chalet pareado en venta en Coimbra - Guadarrama, Mostoles
ad_description                           | object     | 2537 (1.3%)  

ASSESS THE ID DUPLICATION PROBLEM

Let us view first how a duplicate ID case looks like.

In [140]:
#pd.set_option('display.max_columns', None)
def see_a_duplicate_id(df):
    duplicate_ids = df[df.duplicated(subset='reference_id', keep=False)]['reference_id'].unique()
    sample_ids = duplicate_ids[:10]

    inspection = df[df['reference_id'].isin(sample_ids)].sort_values(by=['reference_id', 'year', 'month'], ascending=[True, False, False])

    print(inspection.to_string())
see_a_duplicate_id(df)

                                               ad_url  reference_id property_type                                                           ad_title                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

There are 99000 unique IDs but almost 200000 cases, which means that we need to clean the entire dataset before moving on. In a first approach, we are going to look at the ID, and the price of the property. If they do not coincide, we keep both results. If they coincide, we keep the case with the latest date.

In [141]:
df = df.sort_values(by=['reference_id', 'year', 'month'], ascending=[True, False, False])

df = df.drop_duplicates(subset=['reference_id', 'price','ad_title'], keep='first')

print(f"Original unique IDs: {df['reference_id'].nunique()}")
print(f"Final dataset size: {len(df)}")

Original unique IDs: 98559
Final dataset size: 112269


WE ELIMINATE SOME ROWS THAT PRESENT UNWANTED DESCRIPTIONS

In [142]:


toxic_keywords = [
    'okupa', 'ocupado', 'ocupada', 'ocupante',      # Squatters
    'nuda propiedad', 'usufructo',                  # Ownership limitations (cannot live there)
    'indiviso', 'proindiviso',                      # Partial ownership
    'subasta', 'cesion de remate',                  # Legal/Auctions
    'no visitable', 'sin posesion',                 # No access
    'tapiado',                                      # Physical blockage
]

def is_toxic(row):
    # We combine title and description to be sure
    # using 'str()' handles potential NaNs safely
    text = str(row['ad_title']) + " " + str(row['ad_description'])
    text = text.lower() # Normalize to lowercase for matching
    
    for keyword in toxic_keywords:
        if keyword in text:
            return True
    return False


print(f"Shape before filtering toxic assets: {df.shape}")

# Create a mask (True if toxic, False if clean)
toxic_mask = df.apply(is_toxic, axis=1)

# Save the toxic ones in case we want to study them later
df_toxic = df[toxic_mask].copy()
df_toxic.to_parquet('../data/processed-data/toxic_assets.parquet', index=False)

# Keep only the Clean data
df = df[~toxic_mask] 

print(f"Shape after filtering: {df.shape}")
print(f"Removed {toxic_mask.sum()} toxic assets (Occupied/Bare Ownership/etc).")

Shape before filtering toxic assets: (112269, 36)
Shape after filtering: (108959, 36)
Removed 3310 toxic assets (Occupied/Bare Ownership/etc).


WE NEED TO SPLIT THE DATASET INTO A METADATASET AND A NUMERICAL DATASET

Since latitude and longitude is the easiest way to handle mathematically locations, we have decided to remove other spatial indications. Since we keep reference_id, we can always track the property in a metadata dataset. This is why we are going to split teh dataset. This way we will be avle to run statistics and algorythms on the numerical dataset.

In [143]:




meta_cols = [
    'reference_id', 
    'ad_url', 
    'ad_title', 
    'ad_description', 
    'address', 
    'city', 
    'postcode', 
    'district', 
    'quarter'
]


output_folder = '../data/processed-data'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

df_metadata = df[[c for c in meta_cols if c in df.columns]].copy()
df_metadata.to_parquet(f'{output_folder}/metadata.parquet', index=False)



In [144]:
meta_cols = [
    'reference_id', 
    'ad_url', 
    'ad_title', 
    'ad_description', 
    'address', 
    'city', 
    'postcode', 
    'district', 
    'quarter'
]
cols_to_remove = [c for c in meta_cols if c != 'reference_id']
df = df.drop(columns=cols_to_remove)


print(f"Current DataFrame shape: {df.shape}")
print(f"Current DataFrame columns: {list(df.columns)}")

Current DataFrame shape: (108959, 28)
Current DataFrame columns: ['reference_id', 'property_type', 'transaction_type', 'n_rooms', 'n_baths', 'area', 'floor', 'has_elevator', 'has_terrace', 'is_exterior', 'has_swimming_pool', 'has_parking', 'has_garden', 'energy_certificate_consumption', 'energy_certificate_emissions', 'energy_certificate_state', 'has_energy_certificate', 'has_floorplan', 'has_virtual_tour', 'n_videos', 'property_state', 'latitude', 'longitude', 'approximate_location', 'price_down_from', 'price', 'year', 'month']


Now we have to refine a little bit more our numerical dataset

ONE-HOT ENCODING

Implement one-hot in categorical variables.  It is true that both property state and energy certificate state are ordinal features but since we don't know the correlation between the trends we are looking for, we are going to treat them as categorical variables.

In [145]:
cols_to_encode = ['property_type', 'transaction_type']

df = pd.get_dummies(df, columns=cols_to_encode, dtype=int)

print(f"Total Columns: {len(df.columns)}")
print("--- New Binary Columns ---")
print([col for col in df.columns if 'property_type' in col or 'transaction_type' in col])

Total Columns: 36
--- New Binary Columns ---
['property_type_edificios', 'property_type_garajes', 'property_type_habitacion', 'property_type_locales', 'property_type_oficinas', 'property_type_terrenos', 'property_type_trasteros', 'property_type_vivienda', 'transaction_type_alquiler', 'transaction_type_venta']


ENERGY CERTIFICATES

We convert the seemingly numerical values into float-type values.

In [146]:
cols_to_float = ['energy_certificate_consumption', 'energy_certificate_emissions']

for col in cols_to_float:
    df[col] = pd.to_numeric(df[col], errors='coerce')

print(df[cols_to_float].dtypes)

energy_certificate_consumption    float64
energy_certificate_emissions      float64
dtype: object


ORDINAL CATEGORIES

There are only left "energy_certificate_state" and "property_state" to be turned numerical. Since in these cases order is important, we should not perform one-hot encoding.

In [147]:

maps = {
    'property_state': {
        'Obra nueva': 3,
        'Buen estado': 2,
        'A reformar': 1
    },
    'energy_certificate_state': {
        'A': 7, 'B': 6, 'C': 5, 'D': 4, 'E': 3, 'F': 2, 'G': 1
    }
}

df['property_state_score'] = df['property_state'].map(maps['property_state'])
df['energy_certificate_score'] = df['energy_certificate_state'].map(maps['energy_certificate_state'])

df = df.drop(columns=['property_state', 'energy_certificate_state'])

print(df[['property_state_score', 'energy_certificate_score']].head())


        property_state_score  energy_certificate_score
3305                     3.0                       NaN
164857                   2.0                       NaN
12616                    2.0                       NaN
114598                   2.0                       3.0
136047                   2.0                       3.0


SPLIT THE DATASET IN SALES AND RENTALS

Both datasets are going to be useful when studying prices.

In [148]:

#UNCOMMENT TO CREATE THE FYLE

df_rent = df[df['transaction_type_alquiler'] == 1].copy()
df_sale = df[df['transaction_type_venta'] == 1].copy()

cols_to_drop = ['transaction_type_alquiler', 'transaction_type_venta']
df_rent = df_rent.drop(columns=cols_to_drop)
df_sale = df_sale.drop(columns=cols_to_drop)

output_folder = '../data/processed-data'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

df_rent.to_parquet(f'{output_folder}/madrid_housing_rent_preprocessed.parquet', index=False)
df_sale.to_parquet(f'{output_folder}/madrid_housing_sale_preprocessed.parquet', index=False)

print(f"Files saved in: {output_folder}")
print(f"Rent shape: {df_rent.shape}")
print(f"Sale shape: {df_sale.shape}")


Files saved in: ../data/processed-data
Rent shape: (37161, 34)
Sale shape: (71798, 34)
