# Cleaning the Dataset

First, let's keep only the features that we want to work with.

In [22]:
import json
import os
import time

import googlemaps
import numpy as np
import pandas as pd
from geopy.distance import geodesic

# Load the raw JSON export (iterated below as a list of per-listing dicts).
# The encoding is pinned so the read does not depend on the platform's default codec.
with open("Zillow_Data_2025/Zillow-March2025-dataset_part0.json", encoding="utf-8") as f:
    raw_data = json.load(f)


def _full_address(entry):
    """Build a display address from the entry's ``formattedChip.location`` parts.

    Joins every non-empty ``fullValue`` with ", ". Returns "" when
    ``formattedChip`` or its ``location`` list is absent or null, rather than
    raising on a null value.
    """
    chip = entry.get("formattedChip") or {}
    parts = chip.get("location") or []
    return ", ".join(part["fullValue"] for part in parts if part.get("fullValue"))


# Cleaning our data, selecting only fields that seem useful
cleaned_data = []
for entry in raw_data:
    cleaned_entry = {
        "zpid": entry.get("zpid"),
        "bedrooms": entry.get("bedrooms"),
        "bathrooms": entry.get("bathrooms"),
        "price": entry.get("price"),
        "year_built": entry.get("yearBuilt"),
        "living_area_sqft": entry.get("livingArea"),
        "latitude": entry.get("latitude"),
        "longitude": entry.get("longitude"),
        "home_type": entry.get("homeType"),
        "lot_size_sqft": entry.get("lotSize"),
        "property_tax_rate": entry.get("propertyTaxRate"),
        "rent_zestimate": entry.get("rentZestimate"),
        "zestimate": entry.get("zestimate"),
        "home_status": entry.get("homeStatus"),
        "is_rental_property": entry.get("isNonOwnerOccupied"),
        "last_sold_price": entry.get("lastSoldPrice"),
        "city": entry.get("city"),
        "state": entry.get("state"),
        "zipcode": entry.get("zipcode"),
        "description": entry.get("description"),
        "full_address": _full_address(entry),
    }
    cleaned_data.append(cleaned_entry)


Next, let's clean the dataset by removing any entries with an unrealistic price (less than or equal to zero), as well as any entries with a missing latitude or longitude.

In [23]:
# Step 2: Build the frame, drop rows missing a coordinate or a price, then
# keep only the rows whose price is strictly positive
df = (
    pd.DataFrame(cleaned_data)
    .dropna(subset=["latitude", "longitude", "price"])
    .loc[lambda frame: frame["price"] > 0]
)

Second, let's create a new feature for each entry: a score from 0 to 5 based on the distance from each property to the quad in front of Wilson Library.

In [24]:
# Step 3: Add distance to UNC (North Campus reference point)
unc_coords = (35.909895, -79.050053)
df["distance_to_unc_km"] = df.apply(
    lambda row: geodesic((row["latitude"], row["longitude"]), unc_coords).km,
    axis=1
)

# Step 4: Normalize distance into proximity score (0 = far, 5 = close) using logarithmic scaling
max_dist = df["distance_to_unc_km"].max()


def _proximity_score(dist_km, farthest_km):
    """Map a distance to a 0-5 score on a log scale (5 = closest, 0 = farthest).

    Parameters
    ----------
    dist_km : distance of one property from the reference point, in km.
    farthest_km : largest distance in the column; it maps to a score of 0.

    Returns
    -------
    NaN for a missing distance (so an unknown location is never scored as the
    best one), 5.0 when the distance is zero or every distance is zero,
    otherwise the log-scaled score rounded to two decimals.
    """
    if np.isnan(dist_km):
        return np.nan
    # log(0 + 1) == 0, so a zero farthest distance would divide by zero below
    if farthest_km == 0 or dist_km <= 0:
        return 5.0
    return round(5 * (1 - np.log(dist_km + 1) / np.log(farthest_km + 1)), 2)


df["proximity_score"] = df["distance_to_unc_km"].apply(
    lambda d: _proximity_score(d, max_dist)
)

Let's also create a new feature: a score from 0 to 5 based on how new a building is relative to the other entries.

In [None]:
# Step 5: Add normalized year_built score (0 = oldest, 5 = newest)
valid_years = df["year_built"].dropna()
min_year = valid_years.min()
max_year = valid_years.max()


def _year_built_score(year):
    """Scale a build year linearly onto 0-5 between the oldest and newest year.

    A missing year stays NaN in every case, including when all known years
    are identical (known years then score 5.0). pd.isna is used so that a
    None value is treated as missing instead of raising.
    """
    if pd.isna(year):
        return np.nan
    if max_year == min_year:
        # No spread to normalize over; avoids a division by zero
        return 5.0
    return round(5 * (year - min_year) / (max_year - min_year), 2)


df["year_built_score"] = df["year_built"].apply(_year_built_score)


Now, let's use Google's Places API to obtain the grocery store locations within a 7,000-meter (about 7 km) radius of the center of Chapel Hill.

In [26]:

# Initialize Google Maps client.
# The key is read from the environment so it is never stored in the notebook
# or in version control. The key that was previously hard-coded in this cell
# should be treated as exposed and revoked.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before running this cell."
    )
gmaps = googlemaps.Client(key=API_KEY)

# Center of Chapel Hill (Can Just Use Franklin Street)
chapel_hill_center = (35.9132, -79.0558)

# NOTE(review): only this single response is read; if the API paginates its
# results, stores beyond the first page are not included - confirm.
grocery_places = gmaps.places_nearby(
    location=chapel_hill_center,
    keyword="grocery store",
    type="supermarket",
    radius=7000  # ~7 km should cover most of Chapel Hill and areas in Carrboro
)

# Collect one (lat, lng) tuple per returned place; a response without a
# "results" key yields an empty list
grocery_coords = [
    (place["geometry"]["location"]["lat"], place["geometry"]["location"]["lng"])
    for place in grocery_places.get("results", [])
]




Then calculate the distance from each property to its closest grocery store.

In [27]:
def min_distance_to_grocers(home_coords, store_coords):
    """Return the geodesic distance in km from a home to its nearest store.

    Parameters
    ----------
    home_coords : coordinate pair of the property, passed to ``geodesic``.
    store_coords : iterable of store coordinate pairs.

    Returns
    -------
    The smallest distance in km, or NaN when ``store_coords`` is empty
    (previously an empty store list raised ValueError from ``min``).
    """
    return min(
        (geodesic(home_coords, store).km for store in store_coords),
        default=float("nan"),
    )

def _nearest_grocer_km(row):
    """Distance in km from one listing row to its closest grocery store."""
    home = (row["latitude"], row["longitude"])
    return min_distance_to_grocers(home, grocery_coords)


# Evaluate the helper once per listing (row-wise apply)
df["distance_to_grocer_km"] = df.apply(_nearest_grocer_km, axis=1)

Then we normalize these distances into a score from 0 to 5 based on each property's distance to its nearest grocery store.

In [28]:
# Normalize grocery distance into a 0-5 score (0 = farthest, 5 = closest)
# using the same logarithmic scaling as the proximity score
max_grocer_dist = df["distance_to_grocer_km"].max()


def _grocer_score(dist_km, farthest_km):
    """Map a grocery distance to a 0-5 score on a log scale.

    Returns NaN for a missing distance, 5.0 when the farthest distance is
    zero (every property sits on a store, and log(0 + 1) == 0 would otherwise
    divide by zero), and the log-scaled score rounded to two decimals
    otherwise.
    """
    if np.isnan(dist_km):
        return np.nan
    if farthest_km == 0:
        return 5.0
    return round(5 * (1 - np.log(dist_km + 1) / np.log(farthest_km + 1)), 2)


df["grocer_score"] = df["distance_to_grocer_km"].apply(
    lambda d: _grocer_score(d, max_grocer_dist)
)


Now we can save our data to a CSV file! Let's also view a sample of the data.

In [30]:
# Save your current DataFrame
df.to_csv("Zillow_March2025_cleaned_scored.csv", index=False)

# Display a sample of up to 10 rows. The sample size is capped at the row
# count because sampling without replacement raises when n exceeds it.
sample_df = df.sample(n=min(10, len(df)), random_state=42)
print(sample_df[["full_address", "price", "distance_to_unc_km", "year_built_score","proximity_score", "distance_to_grocer_km", "grocer_score"]])

                                         full_address    price  \
421        1212 Hillview Rd #A, Chapel Hill, NC 27514   285200   
461         1005 S Columbia St, Chapel Hill, NC 27514   724000   
122  4200 Old Greensboro Rd #B, Chapel Hill, NC 27516   727300   
30               103 Dorset Pt, Chapel Hill, NC 27516   755200   
430      1513 E Franklin St #F, Chapel Hill, NC 27514   274800   
164  1515 E Franklin St APT 33, Chapel Hill, NC 27514   313800   
336             100 Orchard Ln #1, Carrboro, NC 27510   744900   
464           201 Chimeneas Pl, Chapel Hill, NC 27514  1843100   
277                  307 Kinsale Dr, Durham, NC 27707   797300   
308         320 Scarlett Dr #B, Chapel Hill, NC 27517   522900   

     distance_to_unc_km  year_built_score  proximity_score  \
421            2.195459               NaN             4.04   
461            1.649515              2.03             4.20   
122           12.219600              4.25             2.88   
30             2.205069  