KNN on LONG and LAT to predict whether a school would have a sport or not, based on geography

In [96]:
import pandas as pd
import pickle
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import KNeighborsClassifier

def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards.

    NOTE(security): ``pickle.load`` can execute arbitrary code while loading;
    only point this at mapping files produced by this project.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Column-name mappings. ``cs_dict`` and ``sp_dict`` are dicts whose values are
# iterables of column names (they are flattened via ``.values()`` below).
a_cols = _load_pickle('../data/mappings/Accolades_columns.pkl')
c_cols = _load_pickle('../data/mappings/Census_columns.pkl')
cs_dict = _load_pickle('../data/mappings/CollegeScorecard_columns.pkl')
cs_cols = [col for cols in cs_dict.values() for col in cols]
sp_dict = _load_pickle('../data/mappings/SportParticipation_columns.pkl')
sp_cols = [col for cols in sp_dict.values() for col in cols]

# Keep only the key columns, the coordinates and the men's/women's
# participation columns.
keep_cols = (
    cs_dict['keys']
    + ['LONGITUDE', 'LATITUDE']
    + sp_dict['mens_part']
    + sp_dict['womens_part']
)

# Both coordinates feed the haversine KNN further down, so a row missing
# either one is unusable; a NaN latitude would otherwise make the fit fail.
df = (
    pd.read_pickle('../data/merged/full_df.pkl')
    .dropna(subset=['LONGITUDE', 'LATITUDE'])[keep_cols]
)

In [122]:
# Reload the sport-participation mapping so this cell does not rely on a
# possibly stale `sp_dict` left in the kernel. The context manager closes the
# file handle, which the bare `pickle.load(open(...))` form never did.
with open('../data/mappings/SportParticipation_columns.pkl', 'rb') as fh:
    sp_dict = pickle.load(fh)

# Candidate targets are the men's and women's participation columns, limited
# to those actually present in df (order preserved).
available_cols = set(df.columns)
part_cols = [
    c for c in sp_dict['mens_part'] + sp_dict['womens_part']
    if c in available_cols
]
print("Using", len(part_cols), "participation columns")

# Country polygons (the file name suggests the Natural Earth 1:110m set --
# TODO confirm), reprojected to EPSG:4326 so coordinates are lon/lat degrees.
shp_path = "../data/raw/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp"
world = gpd.read_file(shp_path).to_crs("EPSG:4326")
usa = world.loc[world["NAME"] == "United States of America"]

# Regular lattice over the USA bounding box, `step` degrees between nodes.
step = 0.25
minx, miny, maxx, maxy = usa.total_bounds
lons = np.arange(minx, maxx + step, step)
lats = np.arange(miny, maxy + step, step)
lon_grid, lat_grid = np.meshgrid(lons, lats)

# One row per lattice node, columns ordered (latitude, longitude).
grid_deg = np.stack([lat_grid.ravel(), lon_grid.ravel()], axis=1)
grid_df = pd.DataFrame(grid_deg, columns=["LATITUDE", "LONGITUDE"])

# shapely points take (x, y) == (lon, lat), the reverse of the column order.
grid_points = [Point(lon, lat) for lon, lat in zip(grid_deg[:, 1], grid_deg[:, 0])]
grid_gdf = gpd.GeoDataFrame(grid_df, geometry=grid_points, crs="EPSG:4326")

# Keep only the nodes that fall within the USA geometry.
land_grid_gdf = gpd.sjoin(grid_gdf, usa[["geometry"]], how="inner", predicate="within")
land_grid_df = land_grid_gdf[["LATITUDE", "LONGITUDE"]].reset_index(drop=True)

# (lat, lon) arrays in degrees and radians for the grid and for the schools;
# the radian versions are what the haversine-metric KNN consumes.
grid_coords_deg = land_grid_df[["LATITUDE", "LONGITUDE"]].to_numpy()
grid_coords_rad = np.deg2rad(grid_coords_deg)

data_coords_deg = df[["LATITUDE", "LONGITUDE"]].to_numpy()
data_coords_rad = np.deg2rad(data_coords_deg)

# Years to model. The loop used to skip every year except 2023 via a check
# buried in its body; that choice is now an explicit setting. Use None to
# model every year present in df.
TARGET_YEARS = {2023}


def knn_presence_on_grid(train_coords_rad, y_binary, query_coords_rad, k):
    """Predict 0/1 sport presence at every query point with a haversine KNN.

    All targets share the same neighbours, so the classifier is fitted once
    on a multi-output label matrix instead of once per target column.

    Parameters
    ----------
    train_coords_rad : ndarray, shape (n_train, 2)
        (latitude, longitude) of the training rows, in radians.
    y_binary : ndarray, shape (n_train, n_targets)
        0/1 labels, one column per target.
    query_coords_rad : ndarray, shape (n_query, 2)
        (latitude, longitude) of the points to classify, in radians.
    k : int
        Requested number of neighbours; capped at ``n_train``.

    Returns
    -------
    ndarray, shape (n_query, n_targets)
        Predicted label for each query point and target. A target whose
        training labels are all identical is predicted as that constant.
    """
    n_query = len(query_coords_rad)
    n_targets = y_binary.shape[1]
    if n_targets == 0:
        return np.empty((n_query, 0), dtype=int)

    clf = KNeighborsClassifier(
        n_neighbors=min(k, len(train_coords_rad)),
        metric="haversine",
        weights="distance",
    )
    # A single target is passed as 1-D to avoid sklearn's column-vector warning.
    y_fit = y_binary[:, 0] if n_targets == 1 else y_binary
    clf.fit(train_coords_rad, y_fit)
    return np.asarray(clf.predict(query_coords_rad)).reshape(n_query, n_targets)


years = sorted(df["Year"].dropna().unique())
run_years = [yr for yr in years if TARGET_YEARS is None or yr in TARGET_YEARS]
k_list = [5, 10, 20, 50]

# Grid coordinates are identical for every run; build them once.
grid_base = land_grid_df[["LATITUDE", "LONGITUDE"]].reset_index(drop=True)

all_runs = []

for yr in run_years:
    year_mask = (df["Year"] == yr).to_numpy()
    X_year_rad = data_coords_rad[year_mask]

    # A school "has" a sport when its participation count is positive;
    # a missing count is treated as zero.
    y_year = (df.loc[year_mask, part_cols].fillna(0) > 0).astype(int).to_numpy()

    for k in k_list:
        pred = knn_presence_on_grid(X_year_rad, y_year, grid_coords_rad, k)

        # Assemble the whole frame at once rather than inserting one
        # prediction column at a time.
        run_pred = pd.concat(
            [
                pd.DataFrame({"Year": yr, "k": k}, index=grid_base.index),
                grid_base,
                pd.DataFrame(pred, columns=part_cols, index=grid_base.index),
            ],
            axis=1,
        )
        all_runs.append(run_pred)

if not all_runs:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise ValueError(
        f"No rows in df for the requested years {TARGET_YEARS}; nothing to predict."
    )

pred_df = pd.concat(all_runs, ignore_index=True)

print(pred_df.shape)
pred_df.head()

Using 78 participation columns
(71896, 82)


Unnamed: 0,Year,k,LATITUDE,LONGITUDE,Archery Men's Team Participation,Badminton Men's Team Participation,Baseball Men's Team Participation,Basketball Men's Team Participation,Beach Volleyball Men's Team Participation,Bowling Men's Team Participation,...,Tennis Women's Team Participation,Track and Field Indoor Women's Team Participation,Track and Field Outdoor Women's Team Participation,Track and Field X Country Women's Team Participation,Volleyball Women's Team Participation,Water Polo Women's Team Participation,Weight Lifting Women's Team Participation,Wrestling Women's Team Participation,Other Sports Women's Team Participation,Total Women's Team Participation
0,2023,5,19.16619,-155.791111,0,0,1,1,0,0,...,1,0,0,1,1,0,0,0,0,1
1,2023,5,19.16619,-155.541111,0,0,1,1,0,0,...,1,0,0,1,1,0,0,0,0,1
2,2023,5,19.41619,-155.791111,0,0,1,1,0,0,...,1,0,0,1,1,0,0,0,0,1
3,2023,5,19.41619,-155.541111,0,0,1,1,0,0,...,1,0,0,1,1,0,0,0,0,1
4,2023,5,19.41619,-155.291111,0,0,1,1,0,0,...,1,0,0,1,1,0,0,0,0,1


In [126]:
# Drop every column that is zero in all rows, then write the remaining
# columns to the CSV under the tableau data folder.
has_nonzero = pred_df.ne(0).any(axis=0)
pred_df = pred_df.loc[:, has_nonzero]
pred_df.to_csv("../data/tableau/knn_participation_land_grid_by_year.csv", index=False)