In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from shapely.geometry import Point
from sklearn.neighbors import BallTree

import os

def add_public_school_dataset(file_path):
    """
    This function takes a TSV file input, joins it with a public school dataset,
    and adds 3 new features to the initial input.
    """
    # Load haunted places dataset from TSV
    haunted_df = pd.read_csv(file_path, sep="\t", on_bad_lines="skip")

    # Load the Public Schools GeoJSON File
    schools_gdf = gpd.read_file("../data/Public_Schools_-5088709809754466635.geojson")

    # Extract latitude and longitude from the geometry
    schools_gdf["latitude"] = schools_gdf.geometry.y
    schools_gdf["longitude"] = schools_gdf.geometry.x

    # Convert to DataFrame (removing geometry)
    schools_df = schools_gdf.drop(columns=["geometry"])

    

    # Convert Haunted Places to GeoDataFrame
    haunted_gdf = gpd.GeoDataFrame(
        haunted_df,
        geometry=gpd.points_from_xy(haunted_df["longitude"], haunted_df["latitude"]),
        crs="EPSG:4326"
    )

    # Convert Schools CSV to GeoDataFrame
    schools_gdf = gpd.GeoDataFrame(
        schools_df,
        geometry=gpd.points_from_xy(schools_df["longitude"], schools_df["latitude"]),
        crs="EPSG:4326"
    )

    # Convert both datasets to EPSG:3857 (meters-based projection)
    schools_gdf = schools_gdf.to_crs(epsg=3857)
    haunted_gdf = haunted_gdf.to_crs(epsg=3857)

    # Create a 10-mile buffer around each haunted place
    haunted_gdf["buffer_10_miles"] = haunted_gdf.geometry.buffer(10 * 1609.34)

    # Count schools within the buffer of each haunted place
    haunted_gdf["schools_within_10_miles"] = haunted_gdf["buffer_10_miles"].apply(
        lambda buffer: schools_gdf[schools_gdf.geometry.within(buffer)].shape[0]
    )

    # Compute nearest school distances using BallTree
    def compute_nearest_distances(haunted_gdf, schools_gdf):
        """
        Computes the nearest school distance for each haunted location using BallTree.
        """
        # Drop rows with NaN values in latitude/longitude
        haunted_gdf = haunted_gdf.dropna(subset=["latitude", "longitude"]).copy()
        schools_gdf = schools_gdf.dropna(subset=["latitude", "longitude"]).copy()

        # Convert degrees to radians for BallTree
        haunted_coords = np.radians(haunted_gdf[["latitude", "longitude"]].to_numpy())
        school_coords = np.radians(schools_gdf[["latitude", "longitude"]].to_numpy())

        if school_coords.shape[0] == 0:
            print("⚠ No valid school coordinates found. Setting distance to NaN.")
            haunted_gdf["distance_to_nearest_school_km"] = np.nan
            return haunted_gdf

        # Build BallTree with school locations
        tree = BallTree(school_coords, metric="haversine")

        # Query the nearest school for each haunted location
        distances, _ = tree.query(haunted_coords, k=1)

        # Convert from radians to kilometers (Earth's radius = 6371 km)
        haunted_gdf.loc[:, "distance_to_nearest_school_km"] = distances[:, 0] * 6371

        return haunted_gdf

    # Compute distances using BallTree
    haunted_gdf = compute_nearest_distances(haunted_gdf, schools_gdf)

    # Add feature 3: Check if haunted place is a school
    haunted_gdf.loc[:, "is_haunted_place_a_school"] = haunted_gdf["location"].str.contains(
        "school", case=False, na=False
    ).map({True: "Yes", False: "No"})


    

    # Save the updated dataset
    haunted_gdf.to_csv("../data/haunted_places_with_new_features.csv", index=False)

    return haunted_gdf

# Example usage
new1234 = add_public_school_dataset("../data/haunted_places.tsv")

In [2]:
new1234

Unnamed: 0,city,country,description,location,state,state_abbrev,longitude,latitude,city_longitude,city_latitude,date_occured,geometry,buffer_10_miles,schools_within_10_miles,distance_to_nearest_school_km,is_haunted_place_a_school?
0,Ada,United States,Ada witch - Sometimes you can see a misty blue...,Ada Cemetery,Michigan,MI,-85.504893,42.962106,-85.495480,42.960727,2025-02-20,POINT (-9518361.16 5306205.786),"POLYGON ((-9502267.76 5306205.786, -9502345.25...",60,1.326176,No
1,Addison,United States,A little girl was killed suddenly while waitin...,North Adams Rd.,Michigan,MI,-84.381843,41.971425,-84.347168,41.986434,2025-01-01,POINT (-9393343.839 5156699.978),"POLYGON ((-9377250.439 5156699.978, -9377327.9...",5,3.642689,No
2,Adrian,United States,If you take Gorman Rd. west towards Sand Creek...,Ghost Trestle,Michigan,MI,-84.035656,41.904538,-84.037166,41.897547,2025-01-01,POINT (-9354806.457 5146690.408),"POLYGON ((-9338713.057 5146690.408, -9338790.5...",18,0.596638,No
3,Adrian,United States,"In the 1970's, one room, room 211, in the old ...",Siena Heights University,Michigan,MI,-84.017565,41.905712,-84.037166,41.897547,1970-02-23,POINT (-9352792.587 5146866.066),"POLYGON ((-9336699.187 5146866.066, -9336776.6...",18,1.104957,No
4,Albion,United States,Kappa Delta Sorority - The Kappa Delta Sororit...,Albion College,Michigan,MI,-84.745177,42.244006,-84.753030,42.243097,2025-01-01,POINT (-9433790.006 5197600.789),"POLYGON ((-9417696.606 5197600.789, -9417774.1...",5,0.822609,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10987,Westminster,United States,at 12 midnight you can see a lady with two lit...,city hall,Colorado,CO,-105.048936,39.862610,-105.037205,39.836653,2025-01-01,POINT (-11693994.064 4845997.164),"POLYGON ((-11677900.664 4845997.164, -11677978...",182,1.216969,No
10988,Westminster,United States,Is haunted by the victims of a murder that hap...,Pillar of Fire,Colorado,CO,-105.032091,39.847237,-105.037205,39.836653,2025-01-01,POINT (-11692118.932 4843767.989),"POLYGON ((-11676025.532 4843767.989, -11676103...",219,0.602155,No
10989,Wheat Ridge,United States,The institution was for kids 18 years old and ...,Ridge Mental Institution,Colorado,CO,-105.063974,39.769726,-105.077206,39.766098,2025-02-18,POINT (-11695668.031 4832535.669),"POLYGON ((-11679574.631 4832535.669, -11679652...",275,1.031467,No
10990,Wheat Ridge,United States,Gymnasium - their have been reports of a litt...,Wheat Ridge Middle School,Colorado,CO,-105.103613,39.764055,-105.077206,39.766098,2025-01-01,POINT (-11700080.635 4831714.386),"POLYGON ((-11683987.235 4831714.386, -11684064...",239,0.031699,Yes
