# Analyze Tournament Site Location Data

## Import data and packages

In [1]:
# import python packages
import pandas as pd

# import school distances dataset
# (the commented-out line is the alternate input file under ../data/cleaned)
# data = pd.read_csv('../data/cleaned/distances-schools.csv')
data = pd.read_csv('../data/edits/distances-schools-GCD.csv')

# preview the first five rows
data.head()

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat,geometry,distance
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina,-78.94423,36.000156,POINT (-78.94422972195878 36.00015569999999),181.862864
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington,-117.403044,47.666739,POINT (-117.4030438539681 47.66673855000001),549.55294
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...,-79.047753,35.905035,POINT (-79.04775326525106 35.90503535),353.448252
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.5055,38.041058,POINT (-78.50549960183569 38.0410576),312.854085
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan,-84.477916,42.718568,POINT (-84.47791570930522 42.71856800000001),473.633831


## Find weighted distance

In theory, higher-seeded teams should play at closer sites, and the further down the list of the top 16 teams a school falls, the less geographic preference it receives. To compare higher and lower seeds on the same scale, each distance is multiplied by a simple seed-based weight: 1 seeds are weighted 1, 2 seeds are weighted 0.75, 3 seeds are weighted 0.5, and 4 seeds are weighted 0.25.

In [2]:
# dictionary of weights - seeds are keys, weights are values
weights = {1: 1, 2: 0.75, 3: 0.5, 4: 0.25}

# Look up each row's weight from its seed and scale the distance by it.
# `Series.map` and the multiplication both pair values by index label, so the
# result stays correct even if `data` has been filtered or re-sorted. The
# earlier enumerate()-based loop used a positional counter as a label in
# `data.seed[i]`, which is only right for an untouched 0..n-1 index.
# A seed missing from `weights` yields NaN here instead of raising KeyError.
weightedDistance = data.distance * data.seed.map(weights)

# add weighted distance column to dataframe
data['weightedDist'] = weightedDistance
data.tail()

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat,geometry,distance,weightedDist
555,3.0,NC State,"Albuquerque, NM",1985.0,1985555,North Carolina State University,Wolfpack,Raleigh,North Carolina,State,Atlantic Coast Conference,North Carolina State University Raleigh North ...,-78.674087,35.77185,POINT (-78.67408695452633 35.77184965),1570.465493,785.232746
556,4.0,Loyola–Chicago,"Hartford, CT",1985.0,1985556,Loyola University Chicago,Ramblers,Chicago,Illinois,Private/Catholic,Missouri Valley Conference,Loyola University Chicago Chicago Illinois,-87.668422,41.944842,POINT (-87.66842176669064 41.94484179999999),769.915951,192.478988
557,4.0,Ohio State,"Tulsa, OK",1985.0,1985557,The Ohio State University,Buckeyes,Columbus,Ohio,State,Big Ten Conference,The Ohio State University Columbus Ohio,-83.028663,40.005709,POINT (-83.02866259769122 40.00570905),752.697761,188.17444
558,4.0,LSU,"Dayton, OH",1985.0,1985558,Louisiana State University,Tigers,Baton Rouge,Louisiana,State,Southeastern Conference,Louisiana State University Baton Rouge Louisiana,-91.185968,30.405709,POINT (-91.18596767189877 30.40570885),757.03493,189.258733
559,4.0,UNLV,"Salt Lake City, UT",1985.0,1985559,"University of Nevada, Las Vegas",Rebels,Paradise,Nevada,State,Mountain West Conference,UNLV Paradise Nevada,-115.141832,36.107155,POINT (-115.1418318610852 36.1071554),366.49083,91.622707


## Aggregate distances by school

While the distance dataframe is useful as is, it is most interesting to aggregate the data at different levels. Of most interest is aggregating at the school level. By using the Pandas `describe` method, the mean, standard deviation, minimum, maximum, and quantiles are quickly calculated for each school.

In [4]:
# summary statistics (count, mean, std, min, quartiles, max) for every school
schoolsWeighted = data.groupby(by='school_common_name').describe()

# keep only the statistics computed on the weighted distance
schoolsWtDistAgg = schoolsWeighted['weightedDist']
schoolsWtDistAgg.head()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
school_common_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,3.0,106.176252,113.012814,35.688443,41.000389,46.312334,141.420156,236.527978
Arizona,20.0,452.769203,248.679502,0.422532,318.696526,441.213022,627.777335,836.290801
Arkansas,7.0,266.569269,156.967603,112.569124,175.986248,194.322906,315.150827,576.818699
Auburn,2.0,478.002069,27.162203,458.795492,468.39878,478.002069,487.605358,497.208647
BYU,2.0,283.9681,150.936519,177.239864,230.603982,283.9681,337.332219,390.696337


In [5]:
# keep only the statistics computed on the unweighted distance
schoolsDistAgg = schoolsWeighted['distance']
schoolsDistAgg.head()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
school_common_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,3.0,182.734855,133.91073,47.584591,116.416964,185.249336,250.309987,315.370637
Arizona,20.0,717.907509,409.012004,0.422532,436.882843,663.978409,885.605911,1540.123947
Arkansas,7.0,524.660818,185.149712,191.705381,450.276495,576.818699,613.128514,777.291625
Auburn,2.0,1166.195307,946.090007,497.208647,831.701977,1166.195307,1500.688636,1835.181966
BYU,2.0,958.632538,854.401097,354.479728,656.556133,958.632538,1260.708942,1562.785347


## Add tags to column names to differentiate between weighted and unweighted statistics

Since both the weighted and unweighted distances were aggregated and have the same `describe()` column names, tags can be concatenated to the columns to differentiate the two.

In [6]:
# Tag every weighted-distance statistic column with `_wtDist`.
# The frame is rebuilt from `schoolsWeighted` instead of renaming columns in
# place, so re-running this cell cannot stack the tag twice
# (e.g. `mean_wtDist_wtDist`).
schoolsWtDistAgg = schoolsWeighted['weightedDist'].add_suffix('_wtDist')
display(schoolsWtDistAgg.head())

# Same treatment for the UNweighted distance statistics, tagged with `_dist`.
schoolsDistAgg = schoolsWeighted['distance'].add_suffix('_dist')
schoolsDistAgg.head()

Unnamed: 0_level_0,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
school_common_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,3.0,106.176252,113.012814,35.688443,41.000389,46.312334,141.420156,236.527978
Arizona,20.0,452.769203,248.679502,0.422532,318.696526,441.213022,627.777335,836.290801
Arkansas,7.0,266.569269,156.967603,112.569124,175.986248,194.322906,315.150827,576.818699
Auburn,2.0,478.002069,27.162203,458.795492,468.39878,478.002069,487.605358,497.208647
BYU,2.0,283.9681,150.936519,177.239864,230.603982,283.9681,337.332219,390.696337


Unnamed: 0_level_0,count_dist,mean_dist,std_dist,min_dist,25%_dist,50%_dist,75%_dist,max_dist
school_common_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alabama,3.0,182.734855,133.91073,47.584591,116.416964,185.249336,250.309987,315.370637
Arizona,20.0,717.907509,409.012004,0.422532,436.882843,663.978409,885.605911,1540.123947
Arkansas,7.0,524.660818,185.149712,191.705381,450.276495,576.818699,613.128514,777.291625
Auburn,2.0,1166.195307,946.090007,497.208647,831.701977,1166.195307,1500.688636,1835.181966
BYU,2.0,958.632538,854.401097,354.479728,656.556133,958.632538,1260.708942,1562.785347


## Append distance and weighted distance dataframes

Because both aggregations will later be merged with the school-level data, they should first be combined into one dataframe. They can be merged by their matching index keys (school names).

In [7]:
# combine the unweighted and weighted statistics side by side, matching rows on school name;
# because the join keys are passed as index values rather than column names,
# the school name ends up in a column called `key_0`
aggs = schoolsDistAgg.merge(
    schoolsWtDistAgg,
    left_on=schoolsDistAgg.index.get_level_values('school_common_name'),
    right_on=schoolsWtDistAgg.index.get_level_values('school_common_name'),
)
aggs.head()

Unnamed: 0,key_0,count_dist,mean_dist,std_dist,min_dist,25%_dist,50%_dist,75%_dist,max_dist,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
0,Alabama,3.0,182.734855,133.91073,47.584591,116.416964,185.249336,250.309987,315.370637,3.0,106.176252,113.012814,35.688443,41.000389,46.312334,141.420156,236.527978
1,Arizona,20.0,717.907509,409.012004,0.422532,436.882843,663.978409,885.605911,1540.123947,20.0,452.769203,248.679502,0.422532,318.696526,441.213022,627.777335,836.290801
2,Arkansas,7.0,524.660818,185.149712,191.705381,450.276495,576.818699,613.128514,777.291625,7.0,266.569269,156.967603,112.569124,175.986248,194.322906,315.150827,576.818699
3,Auburn,2.0,1166.195307,946.090007,497.208647,831.701977,1166.195307,1500.688636,1835.181966,2.0,478.002069,27.162203,458.795492,468.39878,478.002069,487.605358,497.208647
4,BYU,2.0,958.632538,854.401097,354.479728,656.556133,958.632538,1260.708942,1562.785347,2.0,283.9681,150.936519,177.239864,230.603982,283.9681,337.332219,390.696337


## Prep original `data` dataframe for merging

The weighted and unweighted distance aggregations need to be merged with the main school dataframe. Because the aggregations are at the school level, we only need the generic school information columns - the `year`, `seed`, `site`, `id`, `geometry`, `distance`, and `weightedDist` columns can be dropped from `data`. Furthermore, all duplicate entries can be dropped so that each school appears once.



In [8]:
# one row per school: keep the first row for each school name, then remove the
# columns whose values change from one tournament appearance to the next
schoolsTrimmed = (
    data
    .drop_duplicates(subset=['school_common_name'])
    .drop(columns=['year', 'seed', 'site', 'id', 'geometry', 'distance', 'weightedDist'])
)
schoolsTrimmed.head()

Unnamed: 0,school_common_name,school_full_name,team,city,state,type,conference,address,lng,lat
0,Duke,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina,-78.94423,36.000156
1,Gonzaga,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington,-117.403044,47.666739
2,North Carolina,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...,-79.047753,35.905035
3,Virginia,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.5055,38.041058
4,Michigan State,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan,-84.477916,42.718568


In [9]:
# attach the per-school distance statistics to the school attributes;
# the left join keeps every row of `schoolsTrimmed`
schools = schoolsTrimmed.merge(aggs, how='left', left_on='school_common_name', right_on='key_0')
display(schools.head())

# order by mean weighted distance, largest first, so that higher means
# (and thus bigger proportional circles) are plotted first
schoolsSorted = schools.sort_values(by='mean_wtDist', ascending=False)
schoolsSorted.head()

Unnamed: 0,school_common_name,school_full_name,team,city,state,type,conference,address,lng,lat,...,75%_dist,max_dist,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
0,Duke,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina,-78.94423,36.000156,...,343.770399,1803.361557,31.0,199.620996,213.558151,6.496434,47.628087,119.284591,245.259017,901.680779
1,Gonzaga,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington,-117.403044,47.666739,...,549.55294,1119.082957,9.0,330.135966,218.888659,71.446791,172.048309,274.77647,549.55294,559.541479
2,North Carolina,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...,-79.047753,35.905035,...,508.798769,2337.597702,26.0,307.16137,353.877026,24.457163,72.01418,152.316673,403.59095,1350.195687
3,Virginia,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.5055,38.041058,...,293.052544,327.90369,6.0,186.074983,78.810193,81.975923,156.368012,165.801977,219.044928,312.854085
4,Michigan State,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan,-84.477916,42.718568,...,603.284444,1627.091032,12.0,277.847078,177.997907,31.596715,162.006954,257.218856,411.150705,602.375689


Unnamed: 0,school_common_name,school_full_name,team,city,state,type,conference,address,lng,lat,...,75%_dist,max_dist,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
73,St. John's,St. John's University,Red Storm,Jamaica,New York,Private/Catholic,Big East Conference,St. John's University Jamaica New York,-73.990073,40.729944,...,2119.943868,2451.644135,5.0,1323.0309,987.277121,135.466645,470.073375,1589.957901,1968.012445,2451.644135
85,VCU,Virginia Commonwealth University,Rams,Richmond,Virginia,State,Atlantic 10 Conference,Virginia Commonwealth University Richmond Virg...,-77.453064,37.548215,...,1627.855318,1627.855318,1.0,1220.891489,,1220.891489,1220.891489,1220.891489,1220.891489,1220.891489
30,Miami (FL),University of Miami,Hurricanes,Coral Gables,Florida,Private/Non-Sectarian,Atlantic Coast Conference,University of Miami Coral Gables Florida,-80.278692,25.717279,...,1242.757534,1263.390158,3.0,796.943001,170.992055,611.062455,721.643192,832.223928,889.883274,947.542619
55,Stanford,Stanford University,Cardinal,Palo Alto,California,Private/Non-Sectarian,Pac-12 Conference,Stanford University Palo Alto California,-122.169365,37.431314,...,1886.870713,2327.966451,8.0,763.259514,604.600269,172.380743,370.195366,615.038453,984.872258,1996.471237
79,Seton Hall,Seton Hall University,Pirates,South Orange,New Jersey,Private/Catholic,Big East Conference,Seton Hall University South Orange New Jersey,-74.246858,40.743372,...,1992.668446,2106.603033,4.0,710.434824,427.877504,110.144223,553.247379,839.146778,996.334223,1053.301516


In [12]:
# overall means of the per-school averages, used as reference values for mapping:
# the weighted mean is printed, the unweighted mean is the cell's output
print(schoolsSorted.mean_wtDist.mean())
schoolsSorted.mean_dist.mean()

322.2826147535282


668.5360821712304

## Write to CSV

In [11]:
# write the sorted per-school table to disk without the dataframe index
# (the commented-out line is the alternate output location under ../data/cleaned)
# schoolsSorted.to_csv('../data/cleaned/schools-wtAvg.csv', index=False)
schoolsSorted.to_csv('../data/edits/schools-wtAvg.csv', index=False)

## Group data by school and seed

To provide a view for each school's seeding data, the data should be grouped by both the school name and seed. The `describe` method calculates various statistics for the datasets. The weighted and unweighted distances should then be joined together in one dataset.

In [13]:
# summary statistics for every (school, seed) combination
seeds = data.groupby(by=['school_common_name', 'seed']).describe()

# split the result into its unweighted and weighted distance statistics
seedsDist = seeds['distance']
seedsWtDist = seeds['weightedDist']

display(seedsDist.head(), seedsWtDist.head())

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
school_common_name,seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,2.0,2.0,181.477614,189.353329,47.584591,114.531103,181.477614,248.424126,315.370637
Alabama,4.0,1.0,185.249336,,185.249336,185.249336,185.249336,185.249336,185.249336
Arizona,1.0,6.0,494.251956,299.834034,0.422532,381.259265,515.160055,700.179484,836.290801
Arizona,2.0,7.0,726.654327,252.877572,431.299825,591.576261,591.576261,884.965899,1110.619884
Arizona,3.0,4.0,698.422557,640.554735,99.817037,263.048092,576.874623,1012.249088,1540.123947


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
school_common_name,seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,2.0,2.0,136.108211,142.014997,35.688443,85.898327,136.108211,186.318094,236.527978
Alabama,4.0,1.0,46.312334,,46.312334,46.312334,46.312334,46.312334,46.312334
Arizona,1.0,6.0,494.251956,299.834034,0.422532,381.259265,515.160055,700.179484,836.290801
Arizona,2.0,7.0,544.990745,189.658179,323.474869,443.682195,443.682195,663.724424,832.964913
Arizona,3.0,4.0,349.211279,320.277368,49.908519,131.524046,288.437311,506.124544,770.061973


### Join weighted and unweighted dataframes together

The weighted and unweighted distance dataframes should be joined together. Because the dataframes need to be joined by two matching columns (`school_common_name` and `seed`), it's easier to simply create new columns in the `seedsDist` dataframe with the columns from the `seedsWtDist` dataframe.

In [14]:
# Put the weighted statistics next to the unweighted ones, one row per
# (school, seed). Both frames come from `seeds`, so they share the same index
# and `join` lines the rows up on it.
# - `count` is dropped from the weighted side because it duplicates the
#   unweighted `count`.
# - The remaining weighted columns get a `_wtDist` tag.
# Building a new frame (instead of assigning columns onto a slice one at a
# time) means the pandas chained-assignment warning no longer has to be
# switched off globally, and the cell can be re-run safely.
seedsDist = seeds['distance'].join(
    seeds['weightedDist'].drop(columns='count').add_suffix('_wtDist')
)
seedsDist

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
school_common_name,seed,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Alabama,2.0,2.0,181.477614,189.353329,47.584591,114.531103,181.477614,248.424126,315.370637,136.108211,142.014997,35.688443,85.898327,136.108211,186.318094,236.527978
Alabama,4.0,1.0,185.249336,,185.249336,185.249336,185.249336,185.249336,185.249336,46.312334,,46.312334,46.312334,46.312334,46.312334,46.312334
Arizona,1.0,6.0,494.251956,299.834034,0.422532,381.259265,515.160055,700.179484,836.290801,494.251956,299.834034,0.422532,381.259265,515.160055,700.179484,836.290801
Arizona,2.0,7.0,726.654327,252.877572,431.299825,591.576261,591.576261,884.965899,1110.619884,544.990745,189.658179,323.474869,443.682195,443.682195,663.724424,832.964913
Arizona,3.0,4.0,698.422557,640.554735,99.817037,263.048092,576.874623,1012.249088,1540.123947,349.211279,320.277368,49.908519,131.524046,288.437311,506.124544,770.061973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wisconsin,4.0,3.0,1147.879361,216.360561,978.180869,1026.060612,1073.940354,1232.728606,1391.516859,286.969840,54.090140,244.545217,256.515153,268.485089,308.182152,347.879215
Xavier,1.0,1.0,241.905944,,241.905944,241.905944,241.905944,241.905944,241.905944,241.905944,,241.905944,241.905944,241.905944,241.905944,241.905944
Xavier,2.0,1.0,312.114653,,312.114653,312.114653,312.114653,312.114653,312.114653,234.085990,,234.085990,234.085990,234.085990,234.085990,234.085990
Xavier,3.0,2.0,320.621222,111.320214,241.905944,281.263583,320.621222,359.978861,399.336501,160.310611,55.660107,120.952972,140.631792,160.310611,179.989431,199.668250


## Calculate overall seed averages

As a point of reference, the weighted and unweighted statistics for the overall seeds (independent of schools) should be calculated.

In [15]:
# summary statistics for each seed line, across all schools
seedsAll = data.groupby(by='seed').describe()

# pair the unweighted and weighted statistics on the seed value;
# the suffixes keep the two sets of identically named columns apart
seedsMerged = seedsAll['distance'].merge(
    seedsAll['weightedDist'],
    left_on=seedsAll['distance'].index.get_level_values('seed'),
    right_on=seedsAll['weightedDist'].index.get_level_values('seed'),
    suffixes=('_dist', '_wtDist'),
)

# copy the merge key into a clearly named `seed` column
# (the original `key_0` column is left in place)
seedsMerged['seed'] = seedsMerged['key_0']
seedsMerged

Unnamed: 0,key_0,count_dist,mean_dist,std_dist,min_dist,25%_dist,50%_dist,75%_dist,max_dist,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist,seed
0,1.0,140.0,353.10024,397.134861,0.422532,117.437566,236.95051,441.207585,2451.644135,140.0,353.10024,397.134861,0.422532,117.437566,236.95051,441.207585,2451.644135,1.0
1,2.0,140.0,482.063794,472.943037,0.981766,159.721862,313.950805,596.242757,2119.943868,140.0,361.547845,354.707278,0.736325,119.791397,235.463103,447.182068,1589.957901,2.0
2,3.0,140.0,689.270606,544.591218,1.466436,335.383444,489.935499,875.712131,2337.597702,140.0,344.635303,272.295609,0.733218,167.691722,244.96775,437.856066,1168.798851,3.0
3,4.0,140.0,828.830488,649.744606,36.418554,311.176383,599.43039,1322.723322,2416.716871,140.0,207.207622,162.436151,9.104639,77.794096,149.857597,330.68083,604.179218,4.0


## Write to CSV

In [15]:
# leave in indexes to preserve school name and seed
# this is pulled into QGIS and saved as geojson and converted to .json for proper formatting
# (the two commented-out lines are the alternate output locations under ../data/cleaned)
# seedsDist.to_csv('../data/cleaned/seeds-by-school.csv')
# seedsMerged.to_csv('../data/cleaned/seeds-overall.csv', index=False)

# per-school/seed table: index is written; overall seed table: index is omitted
seedsDist.to_csv('../data/edits/seeds-by-school.csv')
seedsMerged.to_csv('../data/edits/seeds-overall.csv', index=False)

# Aggregate by Conference

In [16]:
# summary statistics for each conference
conf = data.groupby(by='conference').describe()

# place the unweighted and weighted statistics in one dataframe, matched on
# conference name; suffixes distinguish the two sets of columns
confAll = conf['distance'].merge(
    conf['weightedDist'],
    left_on=conf['distance'].index.get_level_values('conference'),
    right_on=conf['weightedDist'].index.get_level_values('conference'),
    suffixes=('_dist', '_wtDist'),
)

confAll.head()

Unnamed: 0,key_0,count_dist,mean_dist,std_dist,min_dist,25%_dist,50%_dist,75%_dist,max_dist,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
0,American Athletic Conference,35.0,494.079245,492.232987,22.735965,198.950567,322.423198,594.844672,2040.588578,35.0,349.534236,393.651249,11.663134,163.338241,241.817399,376.704615,1708.173734
1,Atlantic 10 Conference,11.0,831.663282,813.620612,38.152179,122.335522,274.465784,1672.842579,1915.382321,11.0,374.499737,407.087137,19.07609,58.985972,274.465784,456.041265,1220.891489
2,Atlantic Coast Conference,139.0,551.82604,609.414154,0.981766,119.284591,329.693239,650.215042,2416.716871,139.0,282.279692,280.564175,0.733218,82.21515,188.853288,373.875824,1350.195687
3,Big 12 Conference,78.0,505.828178,356.667691,19.554545,234.912413,425.126504,728.527982,1590.504708,78.0,280.214761,197.517561,9.314351,146.621031,212.508882,395.44866,858.601127
4,Big East Conference,44.0,629.232049,684.677728,10.967198,241.905944,354.607109,719.271108,2451.644135,44.0,377.75678,507.145521,5.483599,118.250785,200.694289,373.523196,2451.644135


## Write to CSV

In [17]:
# write the conference-level statistics to disk
# (the commented-out line is the alternate output location under ../data/cleaned)
# NOTE(review): unlike the seeds-overall export, no `index=False` is passed here,
# so the dataframe index is written as a column too - confirm this is intended
# confAll.to_csv('../data/cleaned/conference-agg.csv')
confAll.to_csv('../data/edits/conference-agg.csv')

# Playground - Work In Progress Below

In [18]:
# scratch: conference-level statistics, split into unweighted and weighted views
tmp = data.groupby(by='conference').describe()
dist, wtDist = tmp['distance'], tmp['weightedDist']
wtDist

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
conference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
American Athletic Conference,35.0,349.534236,393.651249,11.663134,163.338241,241.817399,376.704615,1708.173734
Atlantic 10 Conference,11.0,374.499737,407.087137,19.07609,58.985972,274.465784,456.041265,1220.891489
Atlantic Coast Conference,139.0,282.279692,280.564175,0.733218,82.21515,188.853288,373.875824,1350.195687
Big 12 Conference,78.0,280.214761,197.517561,9.314351,146.621031,212.508882,395.44866,858.601127
Big East Conference,44.0,377.75678,507.145521,5.483599,118.250785,200.694289,373.523196,2451.644135
Big Ten Conference,94.0,300.924687,301.389361,8.289089,95.18161,207.547235,393.013274,1643.370952
Missouri Valley Conference,2.0,142.286969,70.982234,92.094949,117.190959,142.286969,167.382978,192.478988
Mountain West Conference,13.0,283.630768,187.559343,56.381651,130.467715,268.050129,366.49083,747.310474
Pac-12 Conference,59.0,442.068632,385.240519,0.422532,181.910244,364.657785,568.80845,1996.471237
Southeastern Conference,74.0,288.658318,242.811537,23.860646,118.355641,213.089864,413.320067,1468.175399


In [27]:
# distance statistics for the rows where year is 2012 and seed is 4
# (a bare `data.describe()` used to precede this line; only a cell's last
# expression is rendered, so its result was computed and thrown away)
data.loc[(data.year == 2012) & (data.seed == 4), 'distance'].describe()

count       4.000000
mean     1338.858773
std       710.788853
min       454.228415
25%       919.012369
50%      1477.284698
75%      1897.131102
max      1946.637283
Name: distance, dtype: float64

In [19]:
# scratch: unweighted and weighted conference statistics in one frame,
# matched on conference name
m = tmp['distance'].merge(
    tmp['weightedDist'],
    left_on=tmp['distance'].index.get_level_values('conference'),
    right_on=tmp['weightedDist'].index.get_level_values('conference'),
    suffixes=('_dist', '_wtDist'),
)
m

Unnamed: 0,key_0,count_dist,mean_dist,std_dist,min_dist,25%_dist,50%_dist,75%_dist,max_dist,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
0,American Athletic Conference,35.0,494.079245,492.232987,22.735965,198.950567,322.423198,594.844672,2040.588578,35.0,349.534236,393.651249,11.663134,163.338241,241.817399,376.704615,1708.173734
1,Atlantic 10 Conference,11.0,831.663282,813.620612,38.152179,122.335522,274.465784,1672.842579,1915.382321,11.0,374.499737,407.087137,19.07609,58.985972,274.465784,456.041265,1220.891489
2,Atlantic Coast Conference,139.0,551.82604,609.414154,0.981766,119.284591,329.693239,650.215042,2416.716871,139.0,282.279692,280.564175,0.733218,82.21515,188.853288,373.875824,1350.195687
3,Big 12 Conference,78.0,505.828178,356.667691,19.554545,234.912413,425.126504,728.527982,1590.504708,78.0,280.214761,197.517561,9.314351,146.621031,212.508882,395.44866,858.601127
4,Big East Conference,44.0,629.232049,684.677728,10.967198,241.905944,354.607109,719.271108,2451.644135,44.0,377.75678,507.145521,5.483599,118.250785,200.694289,373.523196,2451.644135
5,Big Ten Conference,94.0,623.751314,598.330747,8.289089,138.51447,443.413452,908.402459,2374.855107,94.0,300.924687,301.389361,8.289089,95.18161,207.547235,393.013274,1643.370952
6,Missouri Valley Conference,2.0,569.147874,283.928937,368.379797,468.763836,569.147874,669.531913,769.915951,2.0,142.286969,70.982234,92.094949,117.190959,142.286969,167.382978,192.478988
7,Mountain West Conference,13.0,608.781068,411.96699,225.526604,358.773402,366.49083,864.345702,1494.620949,13.0,283.630768,187.559343,56.381651,130.467715,268.050129,366.49083,747.310474
8,Pac-12 Conference,59.0,773.020595,604.191722,0.422532,368.625239,591.576261,916.38863,2341.325928,59.0,442.068632,385.240519,0.422532,181.910244,364.657785,568.80845,1996.471237
9,Southeastern Conference,74.0,535.245621,426.811327,47.584591,232.094496,463.399851,642.398195,1933.308875,74.0,288.658318,242.811537,23.860646,118.355641,213.089864,413.320067,1468.175399


In [20]:
# summary statistics of the weighted distance over all rows
display(data['weightedDist'].describe())
# display(data.loc[data.weightedDist > 2700])

count     560.000000
mean      316.622753
std       315.490350
min         0.422532
25%       112.196301
50%       214.944656
75%       417.406746
max      2451.644135
Name: weightedDist, dtype: float64

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat,geometry,distance,weightedDist


In [42]:
# summary statistics of the unweighted distance, followed by the extreme rows:
# distances below 1 and distances above 2400
display(data['distance'].describe())
display(data[data['distance'] < 1])
display(data[data['distance'] > 2400])

# display(data.loc[(data.distance > 370) & (data.distance < 380) & (data.seed == 1.0)])

count     560.000000
mean      588.316282
std       554.330822
min         0.422532
25%       198.950567
50%       394.244365
75%       754.276043
max      2451.644135
Name: distance, dtype: float64

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat,geometry,distance,weightedDist
307,1.0,Arizona,"Tucson, AZ",2000.0,2000307,University of Arizona,Wildcats,Tucson,Arizona,State,Pac-12 Conference,University of Arizona Tucson Arizona,-110.976884,32.228775,POINT (-110.9768841153795 32.22877495),0.422532,0.422532
516,2.0,Syracuse,"Syracuse, NY",1987.0,1987516,Syracuse University,Orange,Syracuse,New York,Private/Methodist,Atlantic Coast Conference,Syracuse University Syracuse New York,-76.133309,43.038306,POINT (-76.13330882751831 43.03830645),0.981766,0.736325
532,2.0,Syracuse,"Syracuse, NY",1986.0,1986532,Syracuse University,Orange,Syracuse,New York,Private/Methodist,Atlantic Coast Conference,Syracuse University Syracuse New York,-76.133309,43.038306,POINT (-76.13330882751831 43.03830645),0.981766,0.736325


Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat,geometry,distance,weightedDist
108,4.0,Syracuse,"San Jose, CA",2013.0,2013108,Syracuse University,Orange,Syracuse,New York,Private/Methodist,Atlantic Coast Conference,Syracuse University Syracuse New York,-76.133309,43.038306,POINT (-76.13330882751831 43.03830645),2416.716871,604.179218
531,1.0,St. John's,"Long Beach, California",1986.0,1986531,St. John's University,Red Storm,Jamaica,New York,Private/Catholic,Big East Conference,St. John's University Jamaica New York,-73.990073,40.729944,POINT (-73.99007259999999 40.72994420000001),2451.644135,2451.644135


In [36]:
# display(data.loc[(data.distance > 500) & (data.distance < 530) & (data.seed == 2.0)])

In [37]:
# display(data.loc[(data.distance > 730) & (data.distance < 745) & (data.seed == 3.0)])

In [56]:
# display(data.loc[(data.distance > 900) & (data.distance < 930) & (data.seed == 4.0)])
# show the full dataframe, including the added `weightedDist` column
data

Unnamed: 0,seed,school_common_name,site,year,id,school_full_name,team,city,state,type,conference,address,lng,lat,geometry,distance,weightedDist
0,1.0,Duke,"Columbia, SC",2019.0,20190,Duke University,Blue Devils,Durham,North Carolina,Private/Non-sectarian,Atlantic Coast Conference,Duke University Durham North Carolina,-78.944230,36.000156,POINT (-78.94422972195878 36.00015569999999),181.862864,181.862864
1,1.0,Gonzaga,"Salt Lake City, UT",2019.0,20191,Gonzaga University,Bulldogs,Spokane,Washington,Private/Catholic,West Coast Conference,Gonzaga University Spokane Washington,-117.403044,47.666739,POINT (-117.4030438539681 47.66673855000001),549.552940,549.552940
2,1.0,North Carolina,"Columbus, OH",2019.0,20192,University of North Carolina at Chapel Hill,Tar Heels,Chapel Hill,North Carolina,State,Atlantic Coast Conference,University of North Carolina at Chapel Hill Ch...,-79.047753,35.905035,POINT (-79.04775326525106 35.90503535),353.448252,353.448252
3,1.0,Virginia,"Columbia, SC",2019.0,20193,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.505500,38.041058,POINT (-78.50549960183569 38.0410576),312.854085,312.854085
4,2.0,Michigan State,"Des Moines, IA",2019.0,20194,Michigan State University,Spartans,East Lansing,Michigan,State,Big Ten Conference,Michigan State University East Lansing Michigan,-84.477916,42.718568,POINT (-84.47791570930522 42.71856800000001),473.633831,355.225373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555,3.0,NC State,"Albuquerque, NM",1985.0,1985555,North Carolina State University,Wolfpack,Raleigh,North Carolina,State,Atlantic Coast Conference,North Carolina State University Raleigh North ...,-78.674087,35.771850,POINT (-78.67408695452633 35.77184965),1570.465493,785.232746
556,4.0,Loyola–Chicago,"Hartford, CT",1985.0,1985556,Loyola University Chicago,Ramblers,Chicago,Illinois,Private/Catholic,Missouri Valley Conference,Loyola University Chicago Chicago Illinois,-87.668422,41.944842,POINT (-87.66842176669064 41.94484179999999),769.915951,192.478988
557,4.0,Ohio State,"Tulsa, OK",1985.0,1985557,The Ohio State University,Buckeyes,Columbus,Ohio,State,Big Ten Conference,The Ohio State University Columbus Ohio,-83.028663,40.005709,POINT (-83.02866259769122 40.00570905),752.697761,188.174440
558,4.0,LSU,"Dayton, OH",1985.0,1985558,Louisiana State University,Tigers,Baton Rouge,Louisiana,State,Southeastern Conference,Louisiana State University Baton Rouge Louisiana,-91.185968,30.405709,POINT (-91.18596767189877 30.40570885),757.034930,189.258733


In [43]:
# data.distance.plot()
# summary statistics of the weighted distance over all rows
data['weightedDist'].describe()

count     560.000000
mean      316.622753
std       315.490350
min         0.422532
25%       112.196301
50%       214.944656
75%       417.406746
max      2451.644135
Name: weightedDist, dtype: float64

In [81]:
# df.sort_values(by=['col1'])
# show every column instead of eliding the middle ones
pd.set_option('display.max_columns', None)
# schoolsSorted.sort_values(by=['count_dist']).tail(10)
# schoolsSorted.sort_values(by=['mean_wtDist']).tail(10)
# schoolsSorted.loc[schoolsSorted.count_dist == 1].describe()
# list(schoolsSorted.loc[schoolsSorted.count_dist >= 10].school_common_name)

# schoolsSorted.loc[schoolsSorted.count_dist >= 10].describe()
# summary statistics across four selected schools
schoolsSorted[schoolsSorted['school_common_name'].isin(['Duke', 'Kentucky', 'Kansas', 'North Carolina'])].describe()


Unnamed: 0,lng,lat,count_dist,mean_dist,std_dist,min_dist,25%_dist,50%_dist,75%_dist,max_dist,count_wtDist,mean_wtDist,std_wtDist,min_wtDist,25%_wtDist,50%_wtDist,75%_wtDist,max_wtDist
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,-84.436138,37.222571,27.0,397.864461,393.599334,35.08416,121.628713,254.863083,521.954358,1653.353277,27.0,277.124542,268.374715,23.986086,96.46445,168.915046,345.856069,1118.315176
std,7.661406,1.515396,3.91578,70.719146,150.829801,26.02644,70.102595,94.979415,130.965024,561.629146,3.91578,55.905589,82.759007,22.551019,45.18325,40.561229,75.522383,344.675619
min,-95.24775,35.905035,22.0,299.229824,246.686265,8.661912,47.628087,119.284591,343.770399,1004.278451,22.0,199.620996,183.171797,6.496434,47.628087,119.284591,245.259017,753.208838
25%,-87.190552,35.976376,25.0,371.245133,315.244709,20.50835,71.132222,233.233158,467.541677,1352.201162,25.0,256.129981,205.961562,8.609872,65.917657,144.058653,309.023915,864.562794
50%,-81.776287,37.013596,27.5,417.652024,362.717739,30.857284,120.505585,280.064337,550.82442,1635.768478,27.5,291.06384,268.225018,16.885757,95.008597,176.481791,366.934916,1125.938233
75%,-79.021872,38.259791,29.5,444.271352,441.072364,45.433094,171.002076,301.694262,605.237102,1936.920593,29.5,312.058401,330.63817,32.261971,125.555389,201.338185,403.76707,1379.690615
max,-78.94423,38.958058,31.0,456.92397,602.275592,69.960161,197.875595,340.039066,642.398195,2337.597702,31.0,326.749494,353.877026,55.676395,148.212518,203.412011,404.29543,1468.175399


In [27]:
import geocoder as g

In [28]:
# Ad-hoc lookup: geocode a free-text place name through geocoder's OSM provider.
# NOTE(review): the result recorded below is an address in Patchogue, Suffolk County, NY —
# confirm that this is the intended match for the query string.
g.osm('west point highlands ny')

<[OK] Osm - Geocode [Sunrise Highway, Patchogue, Suffolk County, New York, 11772, United States of America]>

In [29]:
import matplotlib
%matplotlib inline

# Build the describe() summary table for the data grouped by seed, then look at
# the block of statistics that belongs to the distance column.
seed = data.groupby('seed').describe()
seed['distance']

# Displayed result of the cell: the median distance for each seed.
data.groupby('seed')['distance'].median()

seed
1.0    236.950510
2.0    313.950805
3.0    489.935499
4.0    599.430390
Name: distance, dtype: float64

In [30]:
# describe() summary table for the data grouped by school; peek at the distance statistics.
school = data.groupby('school_common_name').describe()
school['distance'].head()

# Keep the first row seen for each school and strip the listed per-appearance columns.
d = (
    data
    .drop_duplicates(subset=['school_common_name'])
    .drop(columns=['distance', 'seed', 'site', 'id', 'geometry'])
)
d

# Attach each school's distance statistics (count, mean, std, quantiles, ...) to its
# row, matching on school name; every row of `d` is kept (left join).
m = d.merge(
    school['distance'],
    how='left',
    left_on='school_common_name',
    right_on=school['distance'].index.get_level_values('school_common_name'),
)
m

# Displayed result of the cell: schools ordered by mean distance, smallest first.
m.sort_values(by='mean')

Unnamed: 0,school_common_name,year,school_full_name,team,city,state,type,conference,address,lng,lat,weightedDist,count,mean,std,min,25%,50%,75%,max
83,DePaul,1987.0,DePaul University,Blue Demons,Chicago,Illinois,Private/Catholic,Big East Conference,DePaul University Chicago Illinois,-87.654726,41.924020,6.170214,1.0,12.340428,,12.340428,12.340428,12.340428,12.340428,12.340428
82,La Salle,1990.0,La Salle University,Explorers,Philadelphia,Pennsylvania,Private/Catholic,Atlantic 10 Conference,La Salle University Philadelphia Pennsylvania,-75.154018,40.037470,43.862579,1.0,175.450316,,175.450316,175.450316,175.450316,175.450316,175.450316
68,Alabama,2002.0,University of Alabama,Crimson Tide,Tuscaloosa,Alabama,State,Southeastern Conference,University of Alabama Tuscaloosa Alabama,-87.539674,33.212082,236.527978,3.0,182.734855,133.910730,47.584591,116.416964,185.249336,250.309987,315.370637
3,Virginia,2019.0,University of Virginia,Cavaliers,Charlottesville,Virginia,State,Atlantic Coast Conference,University of Virginia Charlottesville Virginia,-78.505500,38.041058,312.854085,6.0,236.798274,73.542623,156.368012,175.687990,233.647923,293.052544,327.903690
27,Butler,2017.0,Butler University,Bulldogs,Indianapolis,Indiana,Private/Non-Sectarian,Big East Conference,Butler University Indianapolis Indiana,-86.173749,39.840719,59.637635,1.0,238.550541,,238.550541,238.550541,238.550541,238.550541,238.550541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,St. John's,2000.0,St. John's University,Red Storm,Jamaica,New York,Private/Catholic,Big East Conference,St. John's University Jamaica New York,-73.990073,40.729944,1589.957901,5.0,1604.322756,819.455876,541.866581,940.146750,1968.012445,2119.943868,2451.644135
85,VCU,1985.0,Virginia Commonwealth University,Rams,Richmond,Virginia,State,Atlantic 10 Conference,Virginia Commonwealth University Richmond Virg...,-77.453064,37.548215,1220.891489,1.0,1627.855318,,1627.855318,1627.855318,1627.855318,1627.855318,1627.855318
46,Saint Louis,2013.0,Saint Louis University,Billikens,St. Louis,Missouri,Private/Catholic,Atlantic 10 Conference,Saint Louis University St. Louis Missouri,-90.231677,38.635284,429.457460,1.0,1717.829839,,1717.829839,1717.829839,1717.829839,1717.829839,1717.829839
67,Dayton,2003.0,University of Dayton,Flyers,Dayton,Ohio,Private/Catholic,Atlantic 10 Conference,University of Dayton Dayton Ohio,-84.179195,39.738460,433.236949,1.0,1732.947796,,1732.947796,1732.947796,1732.947796,1732.947796,1732.947796


In [31]:
# m.to_json('../data/cleaned/schools-default.json')
# m.to_csv('../data/cleaned/schools-default.csv', index=False)