# 🏠 Toronto Housing Data - Feature Extraction

In this notebook, we:
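- Load the combined listings CSV (merged beforehand from the per-area files)
- Clean the price field into a numeric column
- Extract features from the details text: beds, baths, square footage, category (Townhouse/Condo/House/New construction)
- Extract the city/area from the address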

In [3]:
import pandas as pd
import os
import re
from pathlib import Path

# Location of the merged dataset (the area-wise CSV files combined into one).
csv_path = Path('../data/toronto_combined.csv')

# Read every listing into the working frame, then preview the first rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

Unnamed: 0,address,price,details
0,"218 Golden Trl, Vaughan, ON L6A 5A1","C$1,199,000",3 bds3 ba- Townhouse for sale
1,"24 Sicilia St, Vaughan, ON L4H 1G3","C$1,399,999",4 bds4 ba- House for sale
2,"81 Mahogany Forest Dr, Vaughan, ON L6A 0T1","C$1,258,800",4 bds4 ba- House for sale
3,"99 Abner Miles Dr, Vaughan, ON L6A 4X4","C$2,299,000",5 bds6 ba- House for sale
4,"26 Bruce St #E17, Vaughan, ON L4L 0H4","C$649,999",2 bds2 ba- Condo for sale


## 💰 Clean Price Column

In [9]:
# Turn price strings such as "C$1,199,000" into floats by deleting every
# character that is not a digit or a dot; anything that still cannot be parsed
# (e.g. an empty string) becomes NaN via errors='coerce'.
#
# The dtype guard keeps this cell idempotent: once the column is numeric, the
# .str accessor would raise AttributeError if the cell were run a second time.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = (
        df['price']
        .str.replace(r'[^\d.]', '', regex=True)
        .pipe(pd.to_numeric, errors='coerce')
    )
df.head()

Unnamed: 0,address,price,details
0,"218 Golden Trl, Vaughan, ON L6A 5A1",1199000.0,3 bds3 ba- Townhouse for sale
1,"24 Sicilia St, Vaughan, ON L4H 1G3",1399999.0,4 bds4 ba- House for sale
2,"81 Mahogany Forest Dr, Vaughan, ON L6A 0T1",1258800.0,4 bds4 ba- House for sale
3,"99 Abner Miles Dr, Vaughan, ON L6A 4X4",2299000.0,5 bds6 ba- House for sale
4,"26 Bruce St #E17, Vaughan, ON L4L 0H4",649999.0,2 bds2 ba- Condo for sale


In [11]:
# Collect the listings whose price is missing or not parseable as a number.
invalid_price_rows = df[pd.to_numeric(df['price'], errors='coerce').isna()]

print(invalid_price_rows)

                                   address  price  \
3534  Coming Soon Plan, Upper Joshua Creek    NaN   

                            details  
3534  -- bds-- ba- New construction  


## 🛏️ Extract Beds, Baths, Sqft, Category

In [13]:
def extract_details(text):
    """Parse a listing's free-text details into structured features.

    Parameters
    ----------
    text : str
        Details string such as ``"3 bds3 ba- Townhouse for sale"``. Any
        non-string value (for example the float NaN pandas uses for an empty
        cell) is treated as a listing with no details instead of raising.

    Returns
    -------
    pd.Series
        Indexed by ``beds``, ``baths``, ``category`` and ``sqft``. Numeric
        fields are ``int`` when found and ``None`` otherwise; ``category`` is
        the title-cased match or ``'Unknown'`` when nothing matches.
    """
    if not isinstance(text, str):
        # re.search raises TypeError on NaN/None; fall through to "nothing found".
        text = ''

    beds = re.search(r'(\d+)\s*bds?', text)
    baths = re.search(r'(\d+)\s*ba', text)
    # "Townhouse" is listed before "House" so the longer label wins at the same position.
    category = re.search(r'(Townhouse|Condo|House|New construction)', text, re.IGNORECASE)
    # Require a leading digit so a stray comma before "sqft" cannot match and
    # leave int() with an empty string.
    sqft = re.search(r'(\d[\d,]*)\s*sqft', text)

    return pd.Series({
        'beds': int(beds.group(1)) if beds else None,
        'baths': int(baths.group(1)) if baths else None,
        'category': category.group(1).title() if category else 'Unknown',
        'sqft': int(sqft.group(1).replace(',', '')) if sqft else None
    })

# Parse every details string, then attach the four parsed fields to the frame
# under the column names used in the rest of the notebook.
parsed_details = df['details'].apply(extract_details)
df[['bedrooms', 'bathrooms', 'category', 'area_sqft']] = parsed_details
df.head(20)

Unnamed: 0,address,price,details,bedrooms,bathrooms,category,area_sqft
0,"218 Golden Trl, Vaughan, ON L6A 5A1",1199000.0,3 bds3 ba- Townhouse for sale,3.0,3.0,Townhouse,
1,"24 Sicilia St, Vaughan, ON L4H 1G3",1399999.0,4 bds4 ba- House for sale,4.0,4.0,House,
2,"81 Mahogany Forest Dr, Vaughan, ON L6A 0T1",1258800.0,4 bds4 ba- House for sale,4.0,4.0,House,
3,"99 Abner Miles Dr, Vaughan, ON L6A 4X4",2299000.0,5 bds6 ba- House for sale,5.0,6.0,House,
4,"26 Bruce St #E17, Vaughan, ON L4L 0H4",649999.0,2 bds2 ba- Condo for sale,2.0,2.0,Condo,
5,"31 Ravineview Dr, Vaughan, ON L6A 3V2",979000.0,4 bds3 ba- House for sale,4.0,3.0,House,
6,"96 Agostino Cres, Vaughan, ON L4K 5L6",999000.0,4 bds4 ba- House for sale,4.0,4.0,House,
7,"16 Via Romano Blvd, Vaughan, ON L6A 4Y9",1498900.0,4 bds5 ba- Townhouse for sale,4.0,5.0,Townhouse,
8,"18 Alba Ave, Vaughan, ON L4H 2A7",1188000.0,2 bds3 ba- House for sale,2.0,3.0,House,
9,"40 National Pine Dr, Vaughan, ON L6A 3M3",898888.0,3 bds4 ba- House for sale,3.0,4.0,House,


## 🏙️ Extract City from Address

In [14]:
def get_city(address):
    """Return the city/area component of a comma-separated listing address.

    Parameters
    ----------
    address : str
        Address such as ``"218 Golden Trl, Vaughan, ON L6A 5A1"``. Non-string
        values (NaN/None from an empty cell) yield ``None`` instead of raising.

    Returns
    -------
    str or None
        The segment immediately before a trailing ``"<PROVINCE> <postal>"``
        segment when the address has one (so extra comma-separated street or
        unit pieces are not mistaken for the city); otherwise the second
        comma-separated segment. ``None`` when there is no comma or the
        selected segment is empty.
    """
    if not isinstance(address, str):
        return None

    parts = [part.strip() for part in address.split(',')]
    if len(parts) < 2:
        return None

    # A last segment starting with a two-letter uppercase code (e.g. "ON L6A 5A1")
    # marks the province; the city is whatever sits directly in front of it.
    if len(parts) >= 3 and re.match(r'[A-Z]{2}\b', parts[-1]):
        return parts[-2] or None

    # No recognisable province segment: keep the original "second segment" rule.
    return parts[1] or None

# Derive the city for each listing from its address and store it as a new column.
city_by_row = df['address'].apply(get_city)
df['city'] = city_by_row
df.head(10)

Unnamed: 0,address,price,details,bedrooms,bathrooms,category,area_sqft,city
0,"218 Golden Trl, Vaughan, ON L6A 5A1",1199000.0,3 bds3 ba- Townhouse for sale,3.0,3.0,Townhouse,,Vaughan
1,"24 Sicilia St, Vaughan, ON L4H 1G3",1399999.0,4 bds4 ba- House for sale,4.0,4.0,House,,Vaughan
2,"81 Mahogany Forest Dr, Vaughan, ON L6A 0T1",1258800.0,4 bds4 ba- House for sale,4.0,4.0,House,,Vaughan
3,"99 Abner Miles Dr, Vaughan, ON L6A 4X4",2299000.0,5 bds6 ba- House for sale,5.0,6.0,House,,Vaughan
4,"26 Bruce St #E17, Vaughan, ON L4L 0H4",649999.0,2 bds2 ba- Condo for sale,2.0,2.0,Condo,,Vaughan
5,"31 Ravineview Dr, Vaughan, ON L6A 3V2",979000.0,4 bds3 ba- House for sale,4.0,3.0,House,,Vaughan
6,"96 Agostino Cres, Vaughan, ON L4K 5L6",999000.0,4 bds4 ba- House for sale,4.0,4.0,House,,Vaughan
7,"16 Via Romano Blvd, Vaughan, ON L6A 4Y9",1498900.0,4 bds5 ba- Townhouse for sale,4.0,5.0,Townhouse,,Vaughan
8,"18 Alba Ave, Vaughan, ON L4H 2A7",1188000.0,2 bds3 ba- House for sale,2.0,3.0,House,,Vaughan
9,"40 National Pine Dr, Vaughan, ON L6A 3M3",898888.0,3 bds4 ba- House for sale,3.0,4.0,House,,Vaughan


In [15]:
# Share of listings with no square-footage value (0.0 = none missing, 1.0 = all missing).
df['area_sqft'].isna().mean()


np.float64(0.8726946721311475)

In [19]:
# area_sqft is missing for roughly 87% of listings (null fraction computed in
# the previous cell), so it is removed before the data is saved. The drop is
# written out explicitly so a fresh Restart & Run All produces the same frame;
# errors='ignore' keeps the cell safe to re-run once the column is gone.
df = df.drop(columns=['area_sqft'], errors='ignore')
df.head(10)

Unnamed: 0,address,price,details,bedrooms,bathrooms,category,city
0,"218 Golden Trl, Vaughan, ON L6A 5A1",1199000.0,3 bds3 ba- Townhouse for sale,3.0,3.0,Townhouse,Vaughan
1,"24 Sicilia St, Vaughan, ON L4H 1G3",1399999.0,4 bds4 ba- House for sale,4.0,4.0,House,Vaughan
2,"81 Mahogany Forest Dr, Vaughan, ON L6A 0T1",1258800.0,4 bds4 ba- House for sale,4.0,4.0,House,Vaughan
3,"99 Abner Miles Dr, Vaughan, ON L6A 4X4",2299000.0,5 bds6 ba- House for sale,5.0,6.0,House,Vaughan
4,"26 Bruce St #E17, Vaughan, ON L4L 0H4",649999.0,2 bds2 ba- Condo for sale,2.0,2.0,Condo,Vaughan
5,"31 Ravineview Dr, Vaughan, ON L6A 3V2",979000.0,4 bds3 ba- House for sale,4.0,3.0,House,Vaughan
6,"96 Agostino Cres, Vaughan, ON L4K 5L6",999000.0,4 bds4 ba- House for sale,4.0,4.0,House,Vaughan
7,"16 Via Romano Blvd, Vaughan, ON L6A 4Y9",1498900.0,4 bds5 ba- Townhouse for sale,4.0,5.0,Townhouse,Vaughan
8,"18 Alba Ave, Vaughan, ON L4H 2A7",1188000.0,2 bds3 ba- House for sale,2.0,3.0,House,Vaughan
9,"40 National Pine Dr, Vaughan, ON L6A 3M3",898888.0,3 bds4 ba- House for sale,3.0,4.0,House,Vaughan


In [20]:


# Save the updated dataset
output_path = Path("../data/processed/toronto_cleaned_housingdata.csv")

# to_csv does not create missing folders, so make sure the target directory exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Exclude area_sqft from the export so the message printed below is always
# true; errors='ignore' makes this a no-op when the column is already absent.
df.drop(columns=['area_sqft'], errors='ignore').to_csv(output_path, index=False)

print("✅ Saved cleaned data without area_sqft.")


✅ Saved cleaned data without area_sqft.
