In [3]:
import pandas as pd
import numpy as np
import os

# Input side: directory and file read by this notebook.
INTERIM_DIR = '../data/interim/'
ENRICHED_FILE = os.path.join(INTERIM_DIR, 'sales_data_enriched.csv')

# Output side: directory and file written at the end of this notebook.
PROCESSED_DIR = '../data/processed/'
CLEAN_FILE = os.path.join(PROCESSED_DIR, 'final_cleaned_data.csv')

In [4]:
# Columns that must be read as text instead of letting pandas guess a dtype:
# - sale_price stays raw text here and is converted to a number in a later cell.
# - class holds codes that are not all numeric (e.g. 'RR', 'EX', '663A').
# - pin / pin10 are read as strings (presumably parcel identifiers -- TODO confirm).
dtype_spec = {
    'sale_price': 'object',
    'class': 'str',
    'pin': 'str',
    'pin10': 'str'
}
# low_memory=False makes pandas infer the dtype of every remaining column from the
# whole file instead of chunk by chunk. Without it this read emitted a warning
# pointing at this line (presumably the mixed-type DtypeWarning).
df = pd.read_csv(ENRICHED_FILE, dtype=dtype_spec, low_memory=False)
df.shape

  df = pd.read_csv(ENRICHED_FILE, dtype=dtype_spec)


(1782572, 26)

In [5]:
# Turn sale_price into a numeric column: strip the '$' and ',' formatting
# characters from the text, then parse it. Values that still cannot be parsed
# become NaN (errors='coerce').
price_text = df['sale_price'].astype(str)
for symbol in ('$', ','):
    price_text = price_text.str.replace(symbol, '', regex=False)
df['sale_price'] = pd.to_numeric(price_text, errors='coerce')

In [6]:
# Remove every row that is missing a value in any of the critical columns
# (few such rows are expected after the initial enrichment).
critical_columns = ['sale_price', 'lon', 'lat', 'min_distance_meters']
df.dropna(subset=critical_columns, inplace=True)
print(f"Shape after critical null drops: {df.shape}")

Shape after critical null drops: (1782572, 26)


In [7]:
# Show every distinct raw class code before any normalisation.
print(df['class'].unique())

# Class codes this notebook keeps as residential.
RESIDENTIAL_CLASSES = [
    '202', '203', '204', '205',
    '206', '207', '208', '209',
]

# Reduce each code to its first three characters (e.g. '663A' -> '663'),
# then keep an independent copy of the rows whose code is in the list above.
df['class'] = df['class'].astype(str).str.slice(stop=3)
is_residential = df['class'].isin(RESIDENTIAL_CLASSES)
df_filtered = df.loc[is_residential].copy()
df_filtered.shape

['278' '299' '203' '100' '205' '590' '218' '204' '592' '201' '211' '399'
 '207' '517' '315' '202' '580' '234' '593' '390' 'RR' '523' '295' '528'
 'EX' '212' '597' '321' '241' '314' '239' '210' '397' '221' '591' '319'
 '206' '522' '220' '290' '190' '527' '391' '318' '516' '530' '294' '599'
 '396' '490' '663' '297' '529' '209' '533' '320' '670' '535' '208' '219'
 '288' '213' '531' '897' '880' '589' '831' '581' '236' '532' '817' '224'
 '638' '893' '200' '435' '313' '583' '797' '550' '654' '693' '587' '301'
 '915' 'OA3' '526' '914' '991' '417' '492' '765' '501' '491' '997' '790'
 '497' '679' '225' '500' '921' '677' '990' '913' '890' '689' '918' '493'
 '420' '680' '891' '730' '791' '499' '421' '887' '919' '717' '823' '883'
 '996' '428' '899' '827' '668' '833' '828' '889' '483' '729' '892' '801'
 '792' '830' '673' '663A' '767' '300' '663B' '670B' 'OA1' '817A' '817B'
 '717A' '822' '829' '893A' '893B' '729A']


(541888, 26)

In [8]:
# Keep only rows where all three flag columns compare equal to False.
# The explicit comparison (rather than negating with ~) means a missing or
# non-boolean flag value never equals False, so such rows are dropped.
flag_columns = ['is_multisale', 'sale_filter_less_than_10k', 'sale_filter_deed_type']
keep_row = df_filtered[flag_columns[0]].eq(False)
for flag in flag_columns[1:]:
    keep_row = keep_row & df_filtered[flag].eq(False)
df_filtered = df_filtered[keep_row]
df_filtered.shape

(451231, 26)

In [9]:
# Earliest sale year to keep (inclusive).
RECENT_YEAR = 2018

# Parse sale_date. Values that cannot be parsed become NaT (errors='coerce');
# NaT has no year, so those rows fail the comparison below and are dropped.
df_filtered['sale_date'] = pd.to_datetime(df_filtered['sale_date'], errors='coerce')
sale_year = df_filtered['sale_date'].dt.year
df_filtered = df_filtered.loc[sale_year.ge(RECENT_YEAR)]
df_filtered.shape

(126228, 26)

In [10]:
# Trim price outliers: keep sales whose price lies between the 1st and 99th
# percentiles of the already-filtered data, with both bounds inclusive.
lower_bound, upper_bound = (
    df_filtered['sale_price'].quantile(q) for q in (0.01, 0.99)
)
within_bounds = df_filtered['sale_price'].between(lower_bound, upper_bound)
df_final = df_filtered.loc[within_bounds].copy()
df_final.describe()

Unnamed: 0,year,township_code,nbhd,sale_date,sale_price,num_parcels_sale,row_id,lon,lat,min_distance_meters
count,123817.0,123817.0,123817.0,123817,123817.0,123817.0,123817.0,123817.0,123817.0,123817.0
mean,2021.478149,47.388129,47521.803476,2021-12-21 15:17:28.363310080,288092.8,1.0,34396490.0,-87.728588,41.783528,7467.391757
min,2018.0,10.0,10011.0,2018-01-01 00:00:00,25000.0,1.0,7087464.0,-88.027405,41.469928,29.502009
25%,2020.0,27.0,27020.0,2020-04-24 00:00:00,155000.0,1.0,7338061.0,-87.794372,41.686498,2079.619591
50%,2021.0,39.0,39080.0,2021-12-02 00:00:00,246000.0,1.0,7600373.0,-87.728322,41.77839,4644.494408
75%,2023.0,72.0,72030.0,2023-09-13 00:00:00,350000.0,1.0,96517380.0,-87.664209,41.907778,10303.747153
max,2025.0,77.0,77170.0,2025-09-29 00:00:00,1549999.0,1.0,98484840.0,-87.524891,42.065343,34227.066903
std,2.09922,23.874686,23925.131343,,210395.9,0.0,41187290.0,0.092667,0.144474,7363.38893


In [11]:
# Make sure the output directory exists before writing: to_csv does not create
# missing parent directories and fails if the target directory is absent.
output_dir = os.path.dirname(CLEAN_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
# index=False: do not write the DataFrame index as an extra column.
df_final.to_csv(CLEAN_FILE, index=False)