# In [2]:

import pandas as pd
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
import numpy as np
from shapely.geometry import Point
import seaborn as sns
from sklearn.neighbors import BallTree






# Constants used to convert between haversine angles (radians) and distances.
_EARTH_RADIUS_KM = 6371.0
_KM_PER_MILE = 1.60934
_SCHOOL_SEARCH_RADIUS_MILES = 10


def _school_proximity(haunted_latlon_deg: np.ndarray, school_latlon_deg: np.ndarray, radius_km: float):
    """Return per-place school counts within ``radius_km`` and nearest-school distance in km.

    Both inputs are ``(n, 2)`` arrays of ``[latitude, longitude]`` in degrees.
    Distances are great-circle (haversine) distances on a sphere of radius
    6371 km. When either input is empty the counts are 0 and the distances NaN.
    """
    n_places = len(haunted_latlon_deg)
    counts = np.zeros(n_places, dtype=int)
    nearest_km = np.full(n_places, np.nan)

    if len(school_latlon_deg) == 0:
        print("⚠ No valid school coordinates found. Setting distance to NaN.")
        return counts, nearest_km
    if n_places == 0:
        # BallTree refuses to query an empty array, so short-circuit here.
        return counts, nearest_km

    # The haversine metric expects [lat, lon] in radians and returns angles in radians.
    tree = BallTree(np.radians(school_latlon_deg), metric="haversine")
    places_rad = np.radians(haunted_latlon_deg)

    counts = tree.query_radius(places_rad, r=radius_km / _EARTH_RADIUS_KM, count_only=True)
    distances, _ = tree.query(places_rad, k=1)
    return counts, distances[:, 0] * _EARTH_RADIUS_KM


def add_public_school_dataset(
    file_path,
    schools_path="data/Public_Schools_-5088709809754466635.geojson",
    output_path="haunted_places_with_new_features.csv",
    show_plot=True,
):
    """
    Join a haunted-places TSV with the public-schools dataset and add three features.

    Added columns:
        schools_within_10_miles        -- number of schools within a 10-mile
                                          great-circle radius of the place.
        distance_to_nearest_school_km  -- great-circle distance to the closest school.
        is_haunted_place_a_school?     -- "Yes"/"No", whether ``location`` contains
                                          the word "school" (case-insensitive).

    Parameters:
        file_path (str): Tab-separated input file. Must contain the columns
            ``latitude``, ``longitude`` and ``location``.
        schools_path (str): GeoJSON file with the school locations.
            NOTE(review): assumed to contain point geometries -- confirm.
        output_path (str | None): Where the enriched table is written as CSV.
            Pass ``None`` to skip writing.
        show_plot (bool): Show a quick coverage plot of the school geometries.

    Returns:
        GeoDataFrame: the input rows that have coordinates (index reset), plus a
        point ``geometry`` column in EPSG:3857 and the three feature columns.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        KeyError: if a required column is missing from the input file.

    Both distance features use haversine distances on latitude/longitude.
    Buffering in EPSG:3857 (Web Mercator) was dropped because that projection
    stretches distances by roughly 1/cos(latitude), so a nominal 10-mile buffer
    covers noticeably less than 10 miles on the ground away from the equator.
    The intermediate ``buffer_10_miles`` polygon column is therefore no longer
    part of the result.
    """
    # Load the haunted places dataset; malformed lines are skipped.
    df = pd.read_csv(file_path, sep="\t", on_bad_lines="skip")

    # Fail fast with a clear message instead of a late, opaque KeyError.
    required = ("latitude", "longitude", "location")
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"{file_path} is missing required column(s): {missing}")

    # Load the Public Schools GeoJSON file.
    schools_gdf = gpd.read_file(schools_path)

    if show_plot:
        # Basic plot of the geometries to make sure the dataset has high coverage.
        schools_gdf.plot(figsize=(10, 6), edgecolor="black", cmap="viridis")
        plt.title("GeoJSON Spatial Data")
        plt.show()

    # Places without coordinates cannot be located; drop them (as before).
    df = df.dropna(subset=["latitude", "longitude"]).reset_index(drop=True)

    haunted_gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["longitude"], df["latitude"]),
        crs="EPSG:4326",
    )

    # Take the school coordinates from the geometry itself so the result does
    # not depend on how (or whether) the GeoJSON names its lat/lon attributes.
    if schools_gdf.crs is None:
        schools_gdf = schools_gdf.set_crs(epsg=4326)
    school_points = schools_gdf.to_crs(epsg=4326).geometry
    school_points = school_points[school_points.notna() & ~school_points.is_empty]
    school_latlon = np.column_stack(
        [school_points.y.to_numpy(dtype=float), school_points.x.to_numpy(dtype=float)]
    )
    haunted_latlon = df[["latitude", "longitude"]].to_numpy(dtype=float)

    # Features 1 and 2: schools within 10 miles and distance to the nearest school.
    counts, nearest_km = _school_proximity(
        haunted_latlon,
        school_latlon,
        radius_km=_SCHOOL_SEARCH_RADIUS_MILES * _KM_PER_MILE,
    )
    haunted_gdf["schools_within_10_miles"] = counts
    haunted_gdf["distance_to_nearest_school_km"] = nearest_km

    # Feature 3: does the location description mention a school?
    mentions_school = haunted_gdf["location"].str.contains("school", case=False, na=False)
    haunted_gdf["is_haunted_place_a_school?"] = mentions_school.map({True: "Yes", False: "No"})

    # Keep the returned geometry in EPSG:3857, as earlier versions of this function did.
    haunted_gdf = haunted_gdf.to_crs(epsg=3857)

    # Save the updated dataset.
    if output_path is not None:
        haunted_gdf.to_csv(output_path, index=False)

    return haunted_gdf

# Enrich the haunted-places table with the school-based features.
# The relative path is resolved against the current working directory.
new1234 = add_public_school_dataset(file_path="data/haunted_places.tsv")

# Captured output of the call above (input file not present in the working directory):
# FileNotFoundError: [Errno 2] No such file or directory: 'data/haunted_places.tsv'