# Geocode School and Tournament Locations

## Import packages and data

In [2]:
import geocoder as gc
import pandas as pd

In [3]:
# load the merged tournament data set (school rows and site rows) from disk
tourney = pd.read_csv(filepath_or_buffer='../data/cleaned/tourney-metadata.csv')
tourney

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference
...,...,...,...,...,...,...,...,...,...,...,...
1115,,,"Albuquerque, NM",,1985555,,,,,,
1116,,,"Hartford, CT",,1985556,,,,,,
1117,,,"Tulsa, OK",,1985557,,,,,,
1118,,,"Dayton, OH",,1985558,,,,,,


## Create address column for schools from concatenated full names, cities, and states

In [4]:
# build the geocoding address as "<school full name> <city> <state>";
# a row missing any of the three parts ends up with a missing address
tourney['address'] = tourney['school_full_name'].str.cat(
    [tourney['city'], tourney['state']],
    sep=' ',
)
tourney

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan
...,...,...,...,...,...,...,...,...,...,...,...,...
1115,,,"Albuquerque, NM",,1985555,,,,,,,
1116,,,"Hartford, CT",,1985556,,,,,,,
1117,,,"Tulsa, OK",,1985557,,,,,,,
1118,,,"Dayton, OH",,1985558,,,,,,,


## Use site column as address for sites

In [5]:
# rows with no seed are the (mostly empty) site rows: use their site as the address
is_site_row = tourney['seed'].isna()
tourney.loc[is_site_row, 'address'] = tourney.loc[is_site_row, 'site']
tourney

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan
...,...,...,...,...,...,...,...,...,...,...,...,...
1115,,,"Albuquerque, NM",,1985555,,,,,,,"Albuquerque, NM"
1116,,,"Hartford, CT",,1985556,,,,,,,"Hartford, CT"
1117,,,"Tulsa, OK",,1985557,,,,,,,"Tulsa, OK"
1118,,,"Dayton, OH",,1985558,,,,,,,"Dayton, OH"


## Find unique addresses for school and tournament site locations

Though there are over a thousand data points for the full tournament dataset, there are many repeated locations. To save on time and requests, the unique addresses can be pulled from the dataset, geocoded, then merged back into the full dataset.

In [6]:
# one row per distinct address (schools and sites), in order of first appearance
addresses = (
    tourney['address']
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
)
addresses

Unnamed: 0,address
0,Duke University Durham North Carolina
1,Gonzaga University Spokane Washington
2,University of North Carolina at Chapel Hill Ch...
3,University of Virginia Charlottesville Virginia
4,Michigan State University East Lansing Michigan
...,...
162,"Los Angeles, CA"
163,"Chapel Hill, NC"
164,"Lincoln, NE"
165,"Baton Rouge, LA"


## Test geocoding locations

Using the `geocoder` package with its OpenStreetMap (`osm`) provider, latitude and longitude coordinates can be found for the school and site addresses.

In [7]:
# geocode every unique address with OSM, keeping results in address order
geocodeTest = list(map(gc.osm, addresses['address']))

## Find and fix geocoding failures

Because the addresses are simple concatenations of the school name (if applicable), city, and state, there is plenty of room for error. A failed lookup leaves the `.osm` attribute of its result set to `None`, so the positions of the failed results in the `geocodeTest` list can be collected and then used to look up the corresponding rows in the `addresses` data frame.

In [8]:
# positions in the geocodeTest list whose `.osm` attribute is None (failed lookups)
failIndex = [pos for pos, result in enumerate(geocodeTest) if result.osm is None]

# pull the addresses at those positions out of the addresses dataframe
fails = addresses.loc[failIndex]
fails

Unnamed: 0,address
25,"University of California, Los Angeles Los Ange..."
28,West Virginia University Morgantown West Virginia
31,Texas A&M University College Station Texas
76,University of Minnesota Minneapolis–Saint Paul...
80,"University of Nevada, Las Vegas Paradise Nevada"


Luckily there were only 5 geocoding failures:

* `University of California, Los Angeles Los Angeles California`
* `West Virginia University Morgantown West Virginia`
* `Texas A&M University College Station Texas`
* `University of Minnesota Minneapolis–Saint Paul Minnesota`
* `University of Nevada, Las Vegas Paradise Nevada`

Additionally, after running all notebooks and pulling the final data into a web map, visual inspection of the data revealed that the `Long Beach, CA` site is misidentified as a location in Northern Canada. This will be fixed as well.

Through simple trial and error with the OpenStreetMap geocoder (e.g. `gc.osm('University of California Los Angeles California')`), successful addresses can replace the failures.

These addresses also need to be changed in the original `tourney` data frame so that the geocoded results can be merged back in.

In [9]:
# Corrected address strings, keyed by the original string that failed to geocode
# (or, for Long Beach, geocoded to the wrong place). Replacements were found by
# trial and error against the OSM geocoder, e.g.
# gc.osm('University of California Los Angeles California').
ADDRESS_FIXES = {
    'University of California, Los Angeles Los Angeles California': 'University of California Los Angeles California',
    'West Virginia University Morgantown West Virginia': 'WVU Morgantown West Virginia',
    'Texas A&M University College Station Texas': 'Texas A & M, College Station Texas',
    'University of Minnesota Minneapolis–Saint Paul Minnesota': 'University of Minnesota St. Paul Minnesota',
    'University of Nevada, Las Vegas Paradise Nevada': 'UNLV Paradise Nevada',
    'Long Beach, CA': 'Long Beach, California',
}

# Apply the same mapping to the `address` column of both frames so they stay in
# sync for the later merge. Only `address` is touched:
#   * a frame-wide replace would also rewrite matching values in other columns
#     (e.g. 'Long Beach, CA' in `tourney.site`);
#   * a boolean-mask row assignment would overwrite every column of the matched
#     rows, which breaks if this cell is re-run once `lng`/`lat` exist.
# Series.replace with a dict matches whole values exactly; other values,
# including missing ones, are left unchanged.
addresses['address'] = addresses['address'].replace(ADDRESS_FIXES)
tourney['address'] = tourney['address'].replace(ADDRESS_FIXES)

## Geocode corrected addresses

Geocode the addresses data frame again with the corrected addresses.

In [10]:
# geocode the addresses dataframe again with OSM, keeping results in address order
geocoded = list(map(gc.osm, addresses['address']))

## Check geocoding failures again

No failures!

In [11]:
# positions in the geocoded list whose `.osm` attribute is None (failed lookups)
failIndex = [pos for pos, result in enumerate(geocoded) if result.osm is None]

# pull the addresses at those positions out of the addresses dataframe
fails = addresses.loc[failIndex]
fails

Unnamed: 0,address


## Create new `lat` and `lng` columns from geocoded results

Geocoding results are nested in objects, so they need to be separated out and added to the unique addresses data frame as separate columns to be converted to Point geometry in a GeoDataFrame.

In [12]:
# Read the `x` and `y` entries of each result's `.osm` mapping into the
# new `lng` and `lat` columns, in the same order as the addresses
addresses['lng'] = [result.osm['x'] for result in geocoded]
addresses['lat'] = [result.osm['y'] for result in geocoded]

## Merge unique geocoded addresses back into tournament data set

Since geocoded results were only found for each unique address, those unique results need to be merged back into the overall tournament data set.

In [13]:
# left join: keep every tournament row and attach the coordinates of its address
allResults = tourney.merge(addresses, how='left', on='address')
allResults

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina,-78.944230,36.000156
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington,-117.403044,47.666739
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...,-79.047753,35.905035
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.505500,38.041058
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan,-84.477916,42.718568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,,,"Albuquerque, NM",,1985555,,,,,,,"Albuquerque, NM",-106.650985,35.084103
1116,,,"Hartford, CT",,1985556,,,,,,,"Hartford, CT",-72.690855,41.764582
1117,,,"Tulsa, OK",,1985557,,,,,,,"Tulsa, OK",-95.992911,36.155681
1118,,,"Dayton, OH",,1985558,,,,,,,"Dayton, OH",-84.191607,39.758948


## Write to CSV

In [14]:
# write the geocoded tournament data to disk without the row index
allResults.to_csv(path_or_buf='../data/cleaned/geocoded_results.csv', index=False)