In [1]:
import pandas as pd
import numpy as np
import os

# Input side: the enriched sales extract that this notebook reads.
INTERIM_DIR = "../data/interim/"
ENRICHED_FILE = os.path.join(INTERIM_DIR, "sales_data_enriched.csv")

# Output side: where the cleaned dataset is written at the end.
PROCESSED_DIR = "../data/processed/"
CLEAN_FILE = os.path.join(PROCESSED_DIR, "final_cleaned_data.csv")

In [2]:
# Columns pinned to text on load: the code/identifier columns stay exactly as
# written in the CSV, and sale_price is kept raw so it can be cleaned and
# parsed explicitly afterwards.
dtype_spec = {
    'sale_price': 'object',
    'class': 'str',
    'pin': 'str',
    'pin10': 'str'
}
# low_memory=False makes pandas read the file in one pass, so dtypes of the
# columns not listed above are inferred from all rows rather than chunk by
# chunk (chunked inference is what produces mixed-type DtypeWarnings).
df = pd.read_csv(ENRICHED_FILE, dtype=dtype_spec, low_memory=False)
df.shape

  df = pd.read_csv(ENRICHED_FILE, dtype=dtype_spec)


(918551, 26)

In [3]:
# Turn sale_price into a number: view it as text, strip the literal '$' and
# ',' characters, then parse. Anything that still fails to parse becomes NaN.
price_text = df['sale_price'].astype(str)
for symbol in ('$', ','):
    price_text = price_text.str.replace(symbol, '', regex=False)
df['sale_price'] = pd.to_numeric(price_text, errors='coerce')

In [4]:
# Discard any row lacking a sale price, a coordinate, or the distance feature;
# the printed shape shows how many rows survive.
critical_columns = ['sale_price', 'lon', 'lat', 'min_distance_meters']
df = df.dropna(subset=critical_columns)
print(f"Shape after critical null drops: {df.shape}")

Shape after critical null drops: (918551, 26)


In [5]:
# List every distinct class code as loaded, before any normalisation.
print(df['class'].unique())

# Class codes retained for the analysis (202 through 209).
RESIDENTIAL_CLASSES = ['202', '203', '204', '205', '206', '207', '208', '209']

# Reduce each code to its first three characters (so a suffixed code such as
# '663B' becomes '663'), then keep the rows whose code is in the list above.
df['class'] = df['class'].astype(str).str.slice(stop=3)
is_residential = df['class'].isin(RESIDENTIAL_CLASSES)
df_filtered = df.loc[is_residential].copy()
df_filtered.shape

['234' '203' '211' '241' '202' '295' '299' '210' '212' '205' '100' 'EX'
 '315' '206' '278' '204' '207' '290' '590' '593' '517' '597' '523' '201'
 '522' '529' '297' '599' '318' '208' '209' '592' '213' '391' '314' '591'
 '239' '218' '580' '583' '313' '528' '190' '717' '790' '817' '531' '200'
 '530' '390' '501' '914' '301' '497' '663' '527' '893' '996' '990' '219'
 '915' '224' '321' '663B' '670' '823' '890' '897' '533' '892' '918' '397'
 '532' '822' '589' '829' 'RR' '221' '535' '319' '300' '490' '889' '893A'
 '797' '288' '587' '991' '663A' '396' '880' '899' '417' '491' '893B' '499'
 '729A' '225' '516' '492' '883' '399' '220' '320' '550' 'OA3' '236' '435'
 '693' '294' '765' '791' '526' '670B' '679' '420' '680' '677' '997' '921'
 '581' '500' '913' '428' '833' '638' '792']


(155618, 26)

In [6]:
# Keep only rows where all three flag columns equal False. A missing (NaN)
# flag does not compare equal to False, so such rows are dropped as well.
# NOTE(review): the flags look like upstream exclusion markers (multi-parcel
# sale, price under 10k, deed type) — inferred from column names; confirm.
keep_mask = (
    df_filtered['is_multisale'].eq(False)
    & df_filtered['sale_filter_less_than_10k'].eq(False)
    & df_filtered['sale_filter_deed_type'].eq(False)
)
# .copy() makes the result an independent frame, matching the other filter
# steps in this notebook, so that assigning columns to df_filtered afterwards
# cannot trigger SettingWithCopyWarning.
df_filtered = df_filtered.loc[keep_mask].copy()
df_filtered.shape

(131395, 26)

In [7]:
# Earliest sale year kept in the dataset.
RECENT_YEAR = 2018

# Parse sale_date (unparseable values become NaT) and keep only sales dated in
# RECENT_YEAR or later; rows with NaT fail the comparison and are dropped.
df_filtered = df_filtered.assign(
    sale_date=pd.to_datetime(df_filtered['sale_date'], errors='coerce')
)
sold_recently = df_filtered['sale_date'].dt.year.ge(RECENT_YEAR)
df_filtered = df_filtered[sold_recently]
df_filtered.shape

(43092, 26)

In [8]:
# Trim price outliers: compute the 1st and 99th percentiles of sale_price and
# keep the rows whose price lies between them, bounds included.
sale_prices = df_filtered['sale_price']
lower_bound, upper_bound = sale_prices.quantile([0.01, 0.99])
df_final = df_filtered[sale_prices.between(lower_bound, upper_bound)].copy()
df_final.describe()

Unnamed: 0,year,township_code,nbhd,sale_date,sale_price,num_parcels_sale,row_id,lon,lat,min_distance_meters
count,42234.0,42234.0,42234.0,42234,42234.0,42234.0,42234.0,42234.0,42234.0,42234.0
mean,2022.93465,47.572146,47704.455273,2023-05-30 08:12:41.017189632,312701.6,1.0,21685470.0,-87.725775,41.782415,7483.400095
min,2018.0,10.0,10012.0,2018-01-01 00:00:00,26000.0,1.0,7087465.0,-88.027405,41.469928,41.235655
25%,2022.0,27.0,27030.0,2022-02-18 00:00:00,160000.0,1.0,7587752.0,-87.792542,41.683642,2020.975023
50%,2024.0,39.0,39080.0,2024-04-29 00:00:00,263000.0,1.0,7660740.0,-87.72353,41.776764,4573.243273
75%,2024.0,72.0,72030.0,2024-11-22 00:00:00,380000.0,1.0,7744809.0,-87.661064,41.908653,10387.098532
max,2025.0,77.0,77170.0,2025-09-29 00:00:00,1708000.0,1.0,98484720.0,-87.524919,42.065298,33760.975882
std,2.166531,23.938365,23985.086726,,237683.8,0.0,32649490.0,0.092275,0.14583,7461.21643


In [9]:
# Persist the cleaned dataset. Create the destination folder first so the
# write does not fail on a fresh checkout where it does not exist yet.
output_dir = os.path.dirname(CLEAN_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_final.to_csv(CLEAN_FILE, index=False)