In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the labeled, scraped team-season table that every later cell reads
# through the module-level name `data`, then preview its first three rows.
data = pd.read_csv(filepath_or_buffer='../data/scraped_for_modeling_labeled.csv')
data.head(n=3)

Unnamed: 0,Team,offense_points_per_game,season,games_played,offense_downs_Third Downs_PCT,offense_downs_Fourth Downs_PCT,offense_passing_CMP%,offense_passing_AVG,offense_passing_YDS/G,offense_passing_RTG,...,offense_receiving_FUM_per_game,offense_rushing_FUM_per_game,defense_receiving_FUM_per_game,defense_rushing_FUM_per_game,offense_receiving_LST_FUM_ratio,offense_rushing_LST_FUM_ratio,defense_receiving_LST_FUM_ratio,defense_rushing_LST_FUM_ratio,winner,played
0,Kansas City Chiefs,30.2,2004,16,47.2,28.6,66.0,8.3,275.4,94.9,...,0.125,0.4375,0.4375,0.5625,1.0,0.571429,0.428571,0.444444,0,0
1,Indianapolis Colts,32.6,2004,16,42.7,57.1,67.0,9.0,288.9,119.7,...,0.25,0.5625,0.375,0.625,0.75,0.444444,0.666667,0.4,0,0
2,Green Bay Packers,26.5,2004,16,47.3,57.1,63.9,7.6,278.1,93.9,...,0.3125,0.5625,0.1875,0.3125,0.6,0.666667,0.666667,0.4,0,0


### Time analysis

In [3]:
def plot_data_over_time(y):
    """Display a box plot of one column of the module-level ``data``, grouped by season.

    Parameters
    ----------
    y : str
        Name of the column in ``data`` to draw on the y-axis. It is also
        concatenated into the figure title.

    Returns
    -------
    None
        The figure is shown immediately and not returned.
    """
    box_options = dict(
        x='season',
        y=y,
        hover_data=['Team'],
        title=y + ' over time',
        labels={'season': 'Season'},
        width=850,
        height=450,
    )
    season_boxes = px.box(data, **box_options)

    season_boxes.update_layout(showlegend=False)
    # Keep the y-axis and its tick labels visible; the axis title is passed as None.
    season_boxes.update_yaxes(visible=True, showticklabels=True, title=None)
    season_boxes.show()
    
def plot_co_relationship_between_feats(x,y):
    """Display a scatter plot of two columns of the module-level ``data`` with an OLS trendline.

    Parameters
    ----------
    x : str
        Column of ``data`` for the horizontal axis.
    y : str
        Column of ``data`` for the vertical axis.

    Both names are concatenated into the figure title. The figure is shown
    immediately and nothing is returned.
    """
    figure_title = 'Relationship between ' + x + ' and ' + y
    scatter_options = dict(
        x=x,
        y=y,
        hover_data=['Team', 'season'],
        title=figure_title,
        labels={'season': 'Season'},
        trendline='ols',
        width=850,
        height=450,
    )

    pair_scatter = px.scatter(data, **scatter_options)
    pair_scatter.update_layout(showlegend=False)
    pair_scatter.show()
    
    
def plot_effect_on_SB_win(x):
    """Display horizontal box plots of one column of ``data``, split by the ``winner`` column.

    Parameters
    ----------
    x : str
        Column of the module-level ``data`` whose distribution is drawn along
        the horizontal axis. It is also concatenated into the figure title.

    The figure is shown immediately and nothing is returned.
    """
    box_options = dict(
        y='winner',
        x=x,
        orientation='h',
        title='Effect of ' + x + ' on Super Bowl victories',
        hover_data=['Team', 'season'],
        labels={'winner': 'Winner'},
        width=700,
        height=400,
        # Requested ordering of the `winner` values: 1 first, then 0.
        category_orders={'winner': [1, 0]},
    )
    winner_boxes = px.box(data, **box_options)

    winner_boxes.update_layout(showlegend=False)
    # Both axes get identical settings: visible, tick labels on, title passed as None.
    shared_axis_settings = dict(visible=True, showticklabels=True, title=None)
    winner_boxes.update_xaxes(**shared_axis_settings)
    winner_boxes.update_yaxes(**shared_axis_settings)
    winner_boxes.show()
    
def plot_effect_on_SB_play(x):
    """Display horizontal box plots of one column of ``data``, split by the ``played`` column.

    Parameters
    ----------
    x : str
        Column of the module-level ``data`` whose distribution is drawn along
        the horizontal axis. It is also concatenated into the figure title.

    Returns
    -------
    None
        The figure is shown immediately and not returned.
    """
    fig = px.box(data, y = 'played', x = x, orientation = 'h',
                  title = 'Effect of ' + x + ' on playing in the Super Bowl', hover_data = ['Team', 'season'],
                  labels = {'played' : 'Played in SB'},
                  width = 700, height = 400,
                  # The ordering must be keyed on the column actually plotted
                  # ('played'); it was previously keyed on 'winner', which this
                  # figure does not use, so the [1, 0] order was never applied.
                  category_orders={'played' : [1,0]})
    fig.update_layout(showlegend=False)
    fig.update_xaxes(visible = True, showticklabels = True, title = None)
    fig.update_yaxes(visible = True, showticklabels = True, title = None)
    fig.show()

In [6]:
# Columns to inspect season by season with plot_data_over_time (one figure each).
interesting_cols = [
    'special_kicking_Extra Points_XP%',
    'special_returning_Kickoffs_att_per_game',
    'offense_downs_Fourth Downs_ATT_per_game',
]
for y in interesting_cols:
    plot_data_over_time(y)

### Co-distribution analysis

In [7]:
# Candidate columns for the pairwise-correlation scan: everything except the
# team identifier and the season / games-played bookkeeping columns.
# `columns=` is used instead of a positional axis argument: passing `1`
# positionally was deprecated in pandas 1.3 and raises TypeError in pandas 2.0+.
# NOTE(review): 'Unnamed: 0' (looks like a saved row index) and the
# 'winner' / 'played' labels remain in this list - confirm that is intended.
cols = data.drop(columns=['Team', 'season', 'games_played']).columns.tolist()

In [8]:
# Minimum absolute Pearson correlation for a pair of columns to be reported.
THRESHOLD = .8

corr_matrix = data[cols].corr()

# The correlation matrix is symmetric and its diagonal only holds each column
# against itself, so the strictly-lower triangle contains every unordered pair
# of distinct columns exactly once. Selecting pairs by position (rather than
# testing `value != 1` and de-duplicating on a '-'-joined name key) means:
#   * distinct columns that are perfectly correlated (|r| == 1) are reported
#     instead of being silently discarded along with the self-pairs;
#   * column names that contain '-' cannot collide in a de-duplication key.
below_diagonal = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)

# Long format: one row per pair, columns `level_0`, `level_1` and `0` (the
# correlation). Masked-out cells are NaN and fail the threshold test below.
correls = corr_matrix.where(below_diagonal).unstack().reset_index()
correls = correls[correls[0].abs() > THRESHOLD].reset_index(drop=True)

In [9]:
# One scatter plot per strongly correlated column pair found in `correls`.
for x, y in correls[['level_0', 'level_1']].itertuples(index=False, name=None):
    plot_co_relationship_between_feats(x, y)

In [10]:
# Scatter the offense penalty-yards column against its defense counterpart.
plot_co_relationship_between_feats(
    x='offense_downs_Penalties_YDS_per_game',
    y='defense_downs_Penalties_YDS_per_game',
)

In [11]:
# Scatter the offense points-per-game column against its defense counterpart.
plot_co_relationship_between_feats(
    x='offense_points_per_game',
    y='defense_points_per_game',
)

### Relationship with target

In [12]:
# Columns whose distribution is compared across the values of `played`.
# Note: this rebinds the module-level `cols` used by the correlation cells above.
cols = [
    'offense_passing_AVG',
    'offense_passing_YDS/G',
    'offense_points_per_game',
    'defense_points_per_game',
]

for col in cols:
    plot_effect_on_SB_play(col)

In [None]:
# cols = data.drop(['Team', 'season', 'games_played'], 1).columns
# for i in cols:
    
#     plt.figure(figsize = (7, 4))
#     sns.distplot(data[i], kde = False, bins = 25, 
#                  hist_kws = {'edgecolor' : 'black', 'linewidth' : 1})

#     plt.title('Distribution of ' + i)
#     plt.xlabel(None)
#     plt.ylabel(None)
#     plt.show()

In [None]:
# fig, ax = plt.subplots(figsize = (30, 20))
# sns.heatmap(data.corr(), 
#             ax = ax, cmap = 'coolwarm', center = 0, annot = True, fmt = '.2g')
# plt.title('Correlations\n')
# plt.show()