# Calculate Distances Between Schools and Tournament Sites

## Import data and packages

In [1]:
import pandas as pd
import numpy as np

# Read the geocoded results table; it holds both the school rows and the
# tournament-site rows in a single frame.
allResults = pd.read_csv(filepath_or_buffer='../data/cleaned/geocoded_results.csv')
allResults

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina,-78.944230,36.000156
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington,-117.403044,47.666739
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...,-79.047753,35.905035
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.505500,38.041058
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan,-84.477916,42.718568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,,,"Albuquerque, NM",,1985555,,,,,,,"Albuquerque, NM",-106.650985,35.084103
1116,,,"Hartford, CT",,1985556,,,,,,,"Hartford, CT",-72.690855,41.764582
1117,,,"Tulsa, OK",,1985557,,,,,,,"Tulsa, OK",-95.992911,36.155681
1118,,,"Dayton, OH",,1985558,,,,,,,"Dayton, OH",-84.191607,39.758948


## Separate school and site data

The school and tournament site locations will be matched through their shared id, but it will be easier to iterate through the two simultaneously if the dataframes are separated. Furthermore, the dataframes will be saved separately.

In [3]:
# Site rows are the ones with no seed value. `.copy()` makes the result an
# independent frame, so adding columns to it later neither touches allResults
# nor triggers pandas' chained-assignment warning.
sites = allResults[allResults['seed'].isna()].copy()

# School rows are the ones that do carry a seed value.
schools = allResults[allResults['seed'].notna()].copy()

## Calculate distance

The haversine formula gives the great-circle distance, i.e. the shortest distance over the surface of a sphere, between two points. Each school is matched to its tournament site through their shared id, the distance between the two is calculated in kilometers, and the result is then converted to miles.

In [10]:
# Turn off pandas' chained-assignment (copy-of-a-slice) warning for this
# session; the library default for this option is 'warn'.
pd.set_option('mode.chained_assignment', None)

# haversine formula function - calculates shortest possible distance between two points
def haversine_distance(lat1, lon1, lat2, lon2, radius=6371):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371 (mean Earth radius in kilometers).

    Returns
    -------
    float or numpy.ndarray
        Distance rounded to 2 decimal places. Inputs are combined with numpy
        broadcasting, so array inputs give element-wise distances.
    """
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(np.subtract(lat2, lat1))
    delta_lambda = np.radians(np.subtract(lon2, lon1))

    a = (
        np.sin(delta_phi / 2) ** 2
        + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)

    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return np.round(radius * central_angle, 2)

# calculate distances for all schools and corresponding sites

# factor applied to haversine_distance's default (kilometer) output
KM_TO_MILES = 0.621371

# Site coordinates keyed by the id each site shares with its school. If an id
# occurs on several site rows the first one is used.
site_coords = sites.drop_duplicates(subset='id').set_index('id')[['lat', 'lng']]

# Fail loudly if any school has no site to measure against.
missing_ids = schools.loc[~schools['id'].isin(site_coords.index), 'id']
if not missing_ids.empty:
    raise KeyError(f'no tournament site found for school ids: {missing_ids.tolist()}')

# Site coordinates arranged in school row order, so a single vectorized call
# replaces the per-row boolean-mask scans and does not depend on the schools
# index being 0..n-1.
matched_sites = site_coords.loc[schools['id']]
distance = list(
    haversine_distance(
        schools['lat'].to_numpy(),
        schools['lng'].to_numpy(),
        matched_sites['lat'].to_numpy(),
        matched_sites['lng'].to_numpy(),
    ) * KM_TO_MILES  # convert end results to miles
)

## Apply distance list to school and site dataframes

In [11]:
# set distance as new column in schools dataframe
# (`distance` holds one value per school, in schools row order)
schools['distance'] = distance

# Give each site the distance of the school that shares its id, rather than
# relying on the site rows being in the same order as the school rows.
# NOTE(review): assumes ids are unique within schools - confirm.
distance_by_id = dict(zip(schools['id'], distance))
sites['distance'] = sites['id'].map(distance_by_id)

# check
schools.head()

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat,distance
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina,-78.94423,36.000156,181.862864
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington,-117.403044,47.666739,549.55294
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...,-79.047753,35.905035,353.448252
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.5055,38.041058,312.854085
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan,-84.477916,42.718568,473.633831


## Create duplicate dataframe with site coordinates/geometries

Each school will be plotted with all of its tournament sites. The `schools` dataframe can be duplicated and its geometries replaced with `sites` geometries.

In [12]:
# copy schools dataframe and drop columns we want to copy from sites dataframe
sitesAgg = schools.drop(columns=['address', 'lng', 'lat'])

# Site location columns keyed by the id each site shares with its school. If
# an id occurs on several site rows the first one is used.
site_columns = sites.drop_duplicates(subset='id').set_index('id')[['address', 'lng', 'lat']]

# Replace the old columns with the matching site's values, aligned on id
# instead of on row position.
sitesAgg['address'] = sitesAgg['id'].map(site_columns['address'])
sitesAgg['lng'] = sitesAgg['id'].map(site_columns['lng'])
sitesAgg['lat'] = sitesAgg['id'].map(site_columns['lat'])
sitesAgg.head()

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,distance,address,lng,lat
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,181.862864,"Columbia, SC",-81.034331,34.000749
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,549.55294,"Salt Lake City, UT",-111.886798,40.75962
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,353.448252,"Columbus, OH",-83.000706,39.96226
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,312.854085,"Columbia, SC",-81.034331,34.000749
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,473.633831,"Des Moines, IA",-93.603715,41.591064


## Write to CSV

In [8]:
# School rows (with their distance column), written without the pandas index.
schools.to_csv(path_or_buf='../data/edits/distances-schools-GCD.csv', index=False)

# Site-geometry version of the same rows; this csv will be pulled into QGIS
# to convert to geojson for final web mapping.
sitesAgg.to_csv(path_or_buf='../data/edits/distances-sites-GCD.csv', index=False)