In [None]:
# Analysis / plotting stack used by every cell below.
import pandas as pd
# Importing seaborn and applying its global "white" style happen on the same line.
import seaborn as sns; sns.set(style="white", color_codes=True)
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences *every* warning for the whole session, including
# deprecation notices from pandas/seaborn that signal calls about to break.
# Consider narrowing the filter to a specific category or module.
warnings.filterwarnings("ignore")

In [None]:
# Path of the CSV to analyse; read relative to the notebook's working directory.
filename = 'NCAA_data.csv'

# Parse the CSV into the DataFrame that every later cell reads from.
data = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts) as a quick sanity check of the load.
data.info()

# Glossary
- **Rk** -- Rank
 - **School** -- * = NCAA Tournament appearance
 - **Conf** -- Conference
 - **W** -- Wins
 - **L** -- Losses
 - **Pts** -- Points Per Game
 - **Opp** -- Opponent Points Per Game
 - **MOV** -- Margin of Victory = Pts - Opp
 - **SOS** -- Strength of Schedule 
   - A rating of strength of schedule. The rating is denominated in points above/below average, where zero is average. Non-Division I games are excluded from the ratings.
 - **SRS**
   - **OSRS** -- The offensive component of the Simple Rating System (SRS) 
     - A rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average. Non-Division I games are excluded from the ratings.
   - **DSRS** -- The defensive component of the Simple Rating System (SRS) 
     - A rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average. Non-Division I games are excluded from the ratings.
   - **SRS** -- Simple Rating System
     - A rating that takes into account average point differential and strength of schedule. The rating is denominated in points above/below average, where zero is average. Non-Division I games are excluded from the ratings.
 - **Adjusted**
   - **ORtg** -- Offensive Rating = 100 * (PTS / Poss)
     - An estimate of points scored (for teams) or points produced (for players) per 100 possessions.
   - **DRtg** -- Defensive Rating = 100 * (Opp PTS / Poss)
     - An estimate of points allowed per 100 possessions.
   - **NRtg** -- Net Rating = 100 * ((PTS - Opp PTS) / Poss)
     - An estimate of point differential per 100 possessions.

In [None]:
# Preview the first rows. Left as the cell's last expression (not wrapped in
# print) so Jupyter renders the DataFrame's rich table instead of plain text.
data.head()

In [None]:
# Preview the last rows. Left as the cell's last expression (not wrapped in
# print) so Jupyter renders the DataFrame's rich table instead of plain text.
data.tail()

### We hypothesize that Rank is the strongest predictor of the winner of March Madness.
### So which metric determines a team's rank? We hypothesize it is one of Wins, Average Points, MOV, SRS, or NRtg.

# Correlation

In [None]:
# Pairwise correlation heatmap of the numeric metrics.
# Only numeric columns are kept: calling .corr() on a frame that still holds
# string columns (e.g. School, Conf) raises on pandas >= 2.0.
numeric_data = data.select_dtypes(include="number")
correlations = numeric_data.corr()

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(correlations, annot=True, linewidths=.5, fmt='.2f', ax=ax, cmap="RdBu")
plt.show()

# Those who Win

In [None]:
# Summarise the distribution of wins. The rank that owns the maximum is looked
# up from the data instead of being hard-coded in the message, so the sentence
# stays correct if the CSV changes.
wins = data['W']
most_wins_rank = data.loc[wins.idxmax(), 'Rk']  # first such row if several teams tie

print('**All numbers rounded**')
print(
    "The average team wins {:.0f} games,\n"
    "99% of teams win {:.0f} games or less,\n"
    "while the most games won is {:.0f}, which belongs to rank {:.0f}.".format(
        wins.mean(), wins.quantile(0.99), wins.max(), most_wins_rank
    )
)

In [None]:
# Rank vs. wins with a regression fit and marginal distributions.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# xlim is given high-to-low so that rank 1 sits at the right-hand edge.
sns.jointplot(
    data=data, x="Rk", y="W",
    kind="reg", xlim=(70, 0), ylim=(0, 40), color="b", height=8,
)

While wins are negatively correlated with Rank, the third-best team has the most wins. Thus, wins alone could not directly predict Rank 1.

# Those who Score

In [None]:
# Summarise points per game. The rank that owns the maximum is looked up from
# the data instead of being hard-coded in the message.
points = data['Pts']
top_scoring_rank = data.loc[points.idxmax(), 'Rk']  # first such row if several teams tie

print('**All numbers rounded**')
print(
    "The average team scores {:.0f} points/game,\n"
    "99% of teams score {:.0f} points/game or less,\n"
    "while the highest points/game average is {:.0f}, which belongs to rank {:.0f}.".format(
        points.mean(), points.quantile(0.99), points.max(), top_scoring_rank
    )
)

In [None]:
# Rank vs. points per game with a regression fit and marginal distributions.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# xlim is given high-to-low so that rank 1 sits at the right-hand edge.
sns.jointplot(
    data=data, x="Rk", y="Pts",
    kind="reg", xlim=(70, 0), ylim=(65, 90), color="b", height=8,
)

While Rank 1 does have the highest average points, the correlation with Rank is pretty weak.
Perhaps this is because some teams have easier schedules than others.
So, let's take the schedule into consideration:

# Those who blow out their opponents

In [None]:
# Summarise margin of victory (MOV). The rank that owns the maximum is looked
# up from the data instead of being hard-coded; message typos
# ("oppenents", "the the") are corrected.
margin = data['MOV']
top_margin_rank = data.loc[margin.idxmax(), 'Rk']  # first such row if several teams tie

print('**All numbers rounded**')
print(
    "The average team beats their opponents by {:.0f} points/game,\n"
    "99% of teams beat their opponents by {:.0f} points/game or less,\n"
    "while the team that blows out all their opponents typically wins by {:.0f} points/game, "
    "which belongs to rank {:.0f}.".format(
        margin.mean(), margin.quantile(0.99), margin.max(), top_margin_rank
    )
)

In [None]:
# Rank vs. margin of victory with a regression fit and marginal distributions.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# xlim is given high-to-low so that rank 1 sits at the right-hand edge.
sns.jointplot(
    data=data, x="Rk", y="MOV",
    kind="reg", xlim=(70, 0), ylim=(-5, 30), color="b", height=8,
)

Rank 1 beats its opponents by a wider margin than any other team, but there are still high MOVs among the lowest-ranked teams.

# Those who blow out better opponents

In [None]:
# Summarise the Simple Rating System (SRS) column. The rank that owns the
# maximum is looked up from the data instead of being hard-coded; message
# typos ("oppenents", "the the") are corrected.
srs = data['SRS']
top_srs_rank = data.loc[srs.idxmax(), 'Rk']  # first such row if several teams tie

print('**All numbers rounded**')
print(
    "The average team beats good opponents by {:.0f} points/game,\n"
    "99% of teams beat good opponents by {:.0f} points/game or less,\n"
    "while the team that blows out all their opponents typically wins by {:.0f} points/game, "
    "which belongs to rank {:.0f}.".format(
        srs.mean(), srs.quantile(0.99), srs.max(), top_srs_rank
    )
)

In [None]:
# Rank vs. Simple Rating System with a regression fit and marginal distributions.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# xlim is given high-to-low so that rank 1 sits at the right-hand edge.
sns.jointplot(
    data=data, x="Rk", y="SRS",
    kind="reg", xlim=(70, 0), ylim=(0, 30), color="b", height=8,
)

Simple Rating System (SRS) clearly produces a much tighter trend line, and appears to spike towards the better ranked teams. Because SRS is calculated using Strength of Schedule, I assume that explains the spike.

# Those who blow out opponents per 100 possessions

In [None]:
# Summarise net rating (NRtg). The rank that owns the maximum is looked up
# from the data instead of being hard-coded; the typo "oppenents" and a stray
# "per" in the message are corrected.
net_rating = data['NRtg']
top_net_rating_rank = data.loc[net_rating.idxmax(), 'Rk']  # first such row if several teams tie

print('**All numbers rounded**')
print(
    "The average team beats opponents by {:.0f} points/game per 100 possessions,\n"
    "99% of teams beat opponents by {:.0f} points/game or less per 100 possessions,\n"
    "while the team that blows out all their opponents typically wins by {:.0f} points/game "
    "per 100 possessions, \n"
    "which belongs to rank {:.0f}.".format(
        net_rating.mean(), net_rating.quantile(0.99), net_rating.max(), top_net_rating_rank
    )
)

In [None]:
# Rank vs. net rating with a regression fit and marginal distributions.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
# xlim is given high-to-low so that rank 1 sits at the right-hand edge.
sns.jointplot(
    data=data, x="Rk", y="NRtg",
    kind="reg", xlim=(70, 0), ylim=(10, 40), color="b", height=8,
)

Net Rating, which is the margin of victory per 100 possessions, displays the highest correlation with Rank. Furthermore, this metric removes the outliers in the lower ranks. Again, we can see a spike towards the better-ranked teams. However, Rank 3 actually has the highest Net Rating.

In [None]:
# Show the whole frame as the cell's output for reference.
# NOTE(review): consider data.head() / data.sample(5) if the table grows — confirm the intended size.
data