In [1]:
import pandas as pd
import numpy as np
import os

# Input side: directory holding the enriched sales data and the file read from it.
INTERIM_DIR = "../data/interim/"
ENRICHED_FILE = os.path.join(INTERIM_DIR, "sales_data_enriched.csv")

# Output side: directory and file that receive the cleaned result.
PROCESSED_DIR = "../data/processed/"
CLEAN_FILE = os.path.join(PROCESSED_DIR, "final_cleaned_data.csv")

In [2]:
# Columns whose dtype is pinned instead of inferred by the parser.
# 'sale_price' is loaded as raw text and converted to numeric in a later cell;
# 'class', 'pin' and 'pin10' are read as strings
# (presumably so the codes keep their exact written form -- TODO confirm).
dtype_spec = {
    'sale_price': 'object',
    'class': 'str',
    'pin': 'str',
    'pin10': 'str'
}

# low_memory=False makes pandas infer each remaining column's dtype from the
# whole file rather than chunk by chunk. The saved output of this cell shows
# the tail of a pandas warning raised on this call (presumably a DtypeWarning
# about mixed-type columns); whole-file inference is the documented way to
# avoid that, at the cost of higher peak memory while parsing.
df = pd.read_csv(ENRICHED_FILE, dtype=dtype_spec, low_memory=False)
df.shape

  df = pd.read_csv(ENRICHED_FILE, dtype=dtype_spec)


(1626320, 26)

In [3]:
# Turn sale_price into a numeric column: cast to text, strip every '$' and ','
# in one pass, then parse. Values that still cannot be parsed become NaN.
price_text = df['sale_price'].astype(str).str.replace(r'[$,]', '', regex=True)
df['sale_price'] = pd.to_numeric(price_text, errors='coerce')

In [4]:
# Remove rows missing any of the fields listed below (price, coordinates,
# distance feature). Per the original note, few rows are expected to be
# affected after the enrichment step. The drop is applied to df in place.
required_columns = ['sale_price', 'lon', 'lat', 'min_distance_meters']
df.dropna(subset=required_columns, inplace=True)
print(f"Shape after critical null drops: {df.shape}")

Shape after critical null drops: (1626320, 26)


In [5]:
# Show the distinct class codes as loaded, before they are shortened below.
print(df['class'].unique())

# Class codes kept by this notebook: '202' through '209'.
RESIDENTIAL_CLASSES = [str(code) for code in range(202, 210)]

# Reduce every code to its first three characters (e.g. '663A' -> '663'),
# then keep an independent copy of the rows whose code is in the list above.
df['class'] = df['class'].astype(str).str.slice(0, 3)
in_residential_class = df['class'].isin(RESIDENTIAL_CLASSES)
df_filtered = df.loc[in_residential_class].copy()
df_filtered.shape

['278' '299' '203' '590' '100' '218' '204' '201' '211' '399' '207' '517'
 '315' '202' '234' '593' '390' '523' '295' '528' '212' '597' '321' '241'
 '239' '205' '210' '580' '592' '221' '591' '314' 'EX' '319' '206' '522'
 '220' '290' '190' '391' '318' '516' '294' '599' '530' '396' '490' '397'
 '297' '209' '533' '320' '663' '670' '535' '219' '288' '213' '208' '531'
 '880' '589' '529' 'RR' '236' '532' '224' '200' '435' '313' '797' '550'
 '693' '587' '893' '583' '914' '527' '991' '915' '492' '765' '501' '491'
 '417' '817' '997' '526' '790' '497' '897' '990' '225' '918' '890' '493'
 '500' '679' '301' '420' '680' '677' '921' '581' '913' '891' '730' '791'
 '499' '421' '887' '919' '717' '823' '883' '996' '428' '689' '899' '827'
 '668' '638' '833' '828' '889' '483' '729' '892' '801' '792' '654' '830'
 '673' '663A' '767' '300' '663B' '670B' 'OA3' 'OA1' '817A' '817B' '717A'
 '822' '829' '893A' '893B' '729A']


(474394, 26)

In [6]:
# Keep only rows where all three flag columns are exactly False; a row with
# any other value in one of them (including a missing value) is removed.
multisale_clear = df_filtered['is_multisale'].eq(False)
under_10k_clear = df_filtered['sale_filter_less_than_10k'].eq(False)
deed_type_clear = df_filtered['sale_filter_deed_type'].eq(False)

df_filtered = df_filtered.loc[multisale_clear & under_10k_clear & deed_type_clear]
df_filtered.shape

(393654, 26)

In [7]:
# Earliest sale year (inclusive) that is kept.
RECENT_YEAR = 2018

# Parse sale_date; unparseable values become NaT and therefore fail the
# year comparison below, so those rows are dropped as well.
df_filtered['sale_date'] = pd.to_datetime(df_filtered['sale_date'], errors='coerce')
sale_year = df_filtered['sale_date'].dt.year
df_filtered = df_filtered.loc[sale_year.ge(RECENT_YEAR)]
df_filtered.shape

(119438, 26)

In [8]:
# Trim price outliers: keep sales whose price lies between the 1st and 99th
# percentile of the filtered data, both ends included.
sale_prices = df_filtered['sale_price']
lower_bound = sale_prices.quantile(0.01)
upper_bound = sale_prices.quantile(0.99)

within_bounds = sale_prices.between(lower_bound, upper_bound)
df_final = df_filtered.loc[within_bounds].copy()
df_final.describe()

Unnamed: 0,year,township_code,nbhd,sale_date,sale_price,num_parcels_sale,row_id,lon,lat,min_distance_meters
count,117189.0,117189.0,117189.0,117189,117189.0,117189.0,117189.0,117189.0,117189.0,117189.0
mean,2021.539522,47.37245,47506.377254,2022-01-12 19:38:20.560632576,288888.1,1.0,33628440.0,-87.728987,41.784007,7461.251549
min,2018.0,10.0,10011.0,2018-01-01 00:00:00,25000.0,1.0,7087464.0,-88.027405,41.469928,29.502009
25%,2020.0,27.0,27010.0,2020-05-20 00:00:00,155000.0,1.0,7340985.0,-87.79472,41.687188,2083.859896
50%,2021.0,39.0,39080.0,2021-12-23 00:00:00,248500.0,1.0,7601532.0,-87.728999,41.77881,4650.102599
75%,2023.0,72.0,72030.0,2023-10-31 00:00:00,352500.0,1.0,96445700.0,-87.664719,41.908366,10296.48389
max,2025.0,77.0,77170.0,2025-09-29 00:00:00,1545000.0,1.0,98484840.0,-87.524891,42.065343,34227.066903
std,2.104329,23.871272,23922.086965,,210193.7,0.0,40840510.0,0.092723,0.144422,7351.721666


In [9]:
# Create the output directory if it is missing (e.g. on a fresh checkout);
# otherwise to_csv raises OSError because it does not create parent folders.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Write the cleaned data without the DataFrame index column.
df_final.to_csv(CLEAN_FILE, index=False)