In [2]:
## import pandas and numpy libraries to work with dataframes and perform calculations
import pandas as pd
import numpy as np

## import datetime and warnings libraries for ease
import datetime
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=DeprecationWarning)

In [3]:
## import sklearn and relevant libaries for machine learning
import sklearn
from sklearn import model_selection
from sklearn.feature_selection import RFE
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV as CCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

## import urllib, requests and BeautifulSoup for webscraping
import urllib.request as urllib
import requests
from bs4 import BeautifulSoup

In [4]:
## import most updated data from the below links:
# spreadspoke_scores: https://www.kaggle.com/tobycrabtree/nfl-scores-and-betting-data
# nfl_teams: https://www.kaggle.com/tobycrabtree/nfl-scores-and-betting-data
# nfl_elo: https://projects.fivethirtyeight.com/2020-nfl-predictions/
df = pd.read_csv('spreadspoke_scores.csv')
elo = pd.read_csv('nfl_elo.csv')
teams = pd.read_csv('nfl_teams.csv')

## ----------------The following cell of code is NOT entirely mine----------------
### The code is inspired by Ty Walters' at https://github.com/TyWalters/NFL-Prediction-Model
### He did a great job cleaning the data and prepping it for use. Check out his project on github.

In [109]:
## replace empty string with NaN
df = df.replace(r'^\s*$', np.nan, regex=True)

# stadium neutral and schedule playoff as boolean
df['stadium_neutral'] = df.stadium_neutral.astype(int)
df['schedule_playoff'] = df.schedule_playoff.astype(int)


## eliminates rows containing specific null values, all games prior to 2003 and Week 1 games
df = df[(df.score_home.isnull() == False) & (df.team_favorite_id.isnull() == False) & (df.over_under_line.isnull() == False) &
        (df.schedule_season >= 2003)]

## resets index and changes data type of over under line variable
df.reset_index(drop=True, inplace=True)
df['over_under_line'] = df.over_under_line.astype(float)

## map team abbreviations onto team names
df['team_home'] = df.team_home.map(teams.set_index('team_name')['team_id'].to_dict())
df['team_away'] = df.team_away.map(teams.set_index('team_name')['team_id'].to_dict())

# creating home favorite and away favorite columns (fill na with 0's)
df.loc[df.team_favorite_id == df.team_home, 'home_favorite'] = 1
df.loc[df.team_favorite_id == df.team_away, 'away_favorite'] = 1
df.home_favorite.fillna(0, inplace=True)
df.away_favorite.fillna(0, inplace=True)

# creating over / under column (fill na with 0's)
df.loc[((df.score_home + df.score_away) > df.over_under_line), 'over'] = 1
df.over.fillna(0, inplace=True)

# change data type of date columns
df['schedule_date'] = pd.to_datetime(df['schedule_date'])
elo['date'] = pd.to_datetime(elo['date'])


# fixing some schedule_week column errors and converting column to integer data type
df.loc[(df.schedule_week == '18'), 'schedule_week'] = '17'
df.loc[(df.schedule_week == 'Wildcard') | (df.schedule_week == 'WildCard'), 'schedule_week'] = '18'
df.loc[(df.schedule_week == 'Division'), 'schedule_week'] = '19'
df.loc[(df.schedule_week == 'Conference'), 'schedule_week'] = '20'
df.loc[(df.schedule_week == 'Superbowl') | (df.schedule_week == 'SuperBowl'), 'schedule_week'] = '21'
df['schedule_week'] = df.schedule_week.astype(int)


# Cleaning games_elo and df to merge correctly
wsh_map = {'WSH' : 'WAS'}
elo.loc[elo.team1 == 'WSH', 'team1'] = 'WAS' 
elo.loc[elo.team2 == 'WSH', 'team2'] = 'WAS'

# Cleaning games_elo and df to merge correctly
oak_map = {'OAK' : 'LVR'}
elo.loc[elo.team1 == 'OAK', 'team1'] = 'LVR' 
elo.loc[elo.team2 == 'OAK', 'team2'] = 'LVR'

# fix dates
df.loc[(df.schedule_date == '2016-09-19') & (df.team_home == 'MIN'), 'schedule_date'] = datetime.datetime(2016, 9, 18)
df.loc[(df.schedule_date == '2017-01-22') & (df.schedule_week == 21), 'schedule_date'] = datetime.datetime(2017, 2, 5)
df.loc[(df.schedule_date == '1990-01-27') & (df.schedule_week == 21), 'schedule_date'] = datetime.datetime(1990, 1, 28)
df.loc[(df.schedule_date == '1990-01-13'), 'schedule_date'] = datetime.datetime(1990, 1, 14)
elo.loc[(elo.date == '2016-01-09'), 'date'] = datetime.datetime(2016, 1, 10)
elo.loc[(elo.date == '2016-01-08'), 'date'] = datetime.datetime(2016, 1, 9)
elo.loc[(elo.date == '2016-01-16'), 'date'] = datetime.datetime(2016, 1, 17)
elo.loc[(elo.date == '2016-01-15'), 'date'] = datetime.datetime(2016, 1, 16)

# merge elo with df
df = df.merge(elo, left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')

# merge to fix neutral games where team_home and team_away are switched
elo2 = elo.rename(columns={'team1' : 'team2', 'team2' : 'team1', 'elo1_pre' : 'elo2_pre', 'elo2_pre' : 'elo1_pre'})
elo2['qbelo_prob1'] = 1 - elo2.qbelo_prob1
df = df.merge(elo2, left_on=['schedule_date', 'team_home', 'team_away'], right_on=['date', 'team1', 'team2'], how='left')

In [110]:
# removing weeks 1 and 17 from data
# week 1 is difficult to predict due to lack of information
# week 17 can be tricky to predict with teams resting players
df = df[(df.schedule_week != 1) & (df.schedule_week != 17)]

In [111]:
# separating merged columns into x and y cols
x_cols = ['date_x', 'season_x', 'neutral_x', 'playoff_x', 'team1_x',
           'team2_x', 'elo1_pre_x', 'elo2_pre_x', 'elo_prob1_x', 'elo_prob2_x',
           'elo1_post_x', 'elo2_post_x', 'qbelo1_pre_x', 'qbelo2_pre_x', 'qb1_x',
           'qb2_x', 'qb1_value_pre_x', 'qb2_value_pre_x', 'qb1_adj_x', 'qb2_adj_x',
           'qbelo_prob1_x', 'qbelo_prob2_x', 'qb1_game_value_x',
           'qb2_game_value_x', 'qb1_value_post_x', 'qb2_value_post_x',
           'qbelo1_post_x', 'qbelo2_post_x', 'score1_x', 'score2_x']
y_cols = ['date_y', 'season_y', 'neutral_y', 'playoff_y', 'team1_y', 'team2_y', 'elo1_pre_y', 'elo2_pre_y',
          'qbelo_prob1_y', 'score1_y', 'score2_y','elo_prob1_y','elo_prob2_y', 'elo1_post_y', 'elo2_post_y', 'qbelo1_pre_y',
           'qbelo2_pre_y', 'qb1_y', 'qb2_y', 'qb1_value_pre_y', 'qb2_value_pre_y',
           'qb1_adj_y', 'qb2_adj_y', 'qbelo_prob2_y', 'qb1_game_value_y',
           'qb2_game_value_y', 'qb1_value_post_y', 'qb2_value_post_y',
           'qbelo1_post_y', 'qbelo2_post_y']

# filling null values for games_elo merged cols
for x, y in zip(x_cols, y_cols):
    df[x] = df[x].fillna(df[y]) 
    
# dropping duplicate y columns
df = df.drop(columns=y_cols)

# dropping indicies of rows that do not merge properly (bad habit, but works as short-term solution)
df = df.drop([115,3077,2518,1703,580,2993,1445,615,3045,
              3737,800,1983,766,2669,2236,587,2402,1334,2076,
              1174,1868,2782,1601,2135,2727])

# remove _x ending from column names
df.columns = df.columns.str.replace('_x', '')
df = df[['schedule_date', 'schedule_season', 'schedule_week', 'schedule_playoff',
       'team_home', 'score_home', 'score_away', 'team_away',
       'team_favorite_id', 'spread_favorite', 'over_under_line', 'stadium',
       'stadium_neutral', 'home_favorite', 'away_favorite',
       'over',
       'elo1_pre', 'elo2_pre', 'elo_prob1', 'elo_prob2', 'qb1', 'qb2', 'qb1_value_pre',
       'qb2_value_pre', 'qb1_adj', 'qb2_adj', 'qbelo_prob1', 'qbelo_prob2']]

# removing any rows with null values
df = df.dropna()

In [112]:
## Data Exploration --> also inspired by Ty Walters

# creating new columns that provide additonal insight
df['result'] = (df.score_home > df.score_away).astype(int)
df['home_cover'] = ((df.home_favorite == 1) & ((df.score_away - df.score_home) < df.spread_favorite)).astype(int)
df['away_cover'] = ((df.away_favorite == 1) & ((df.score_home - df.score_away) < df.spread_favorite)).astype(int)
df['favorite_cover'] = (((df.home_favorite == 1) & ((df.score_away - df.score_home) < df.spread_favorite)) | ((df.away_favorite == 1) & ((df.score_home - df.score_away) < df.spread_favorite))).astype(int)
df['elo_pre_diff'] = (df.elo1_pre - df.elo2_pre)
df['elo_prob_diff'] = (df.elo_prob1 - df.elo_prob2)
df['qb_value_pre_diff'] = (df.qb1_value_pre.astype(float) - df.qb2_value_pre.astype(float))
df['qb_adj_diff'] = (df.qb1_adj.astype(float) - df.qb2_adj.astype(float))
df['qbelo_prob_diff'] = (df.qbelo_prob1 - df.qbelo_prob2)

# creating new variables
home_win = "{:.2f}".format((sum((df.result == 1) & (df.stadium_neutral == 0)) / len(df)) * 100)
away_win = "{:.2f}".format((sum((df.result == 0) & (df.stadium_neutral == 0)) / len(df)) * 100)
under_line = "{:.2f}".format((sum((df.score_home + df.score_away) < df.over_under_line) / len(df)) * 100)
over_line = "{:.2f}".format((sum((df.score_home + df.score_away) > df.over_under_line) / len(df)) * 100)

favored = "{:.2f}".format((sum(((df.home_favorite == 1) & (df.result == 1)) | ((df.away_favorite == 1) & (df.result == 0)))
                           / len(df)) * 100)

cover = "{:.2f}".format((sum(((df.home_favorite == 1) & ((df.score_away - df.score_home) < df.spread_favorite)) | 
                             ((df.away_favorite == 1) & ((df.score_home - df.score_away) < df.spread_favorite))) 
                         / len(df)) * 100)

ats = "{:.2f}".format((sum(((df.home_favorite == 1) & ((df.score_away - df.score_home) > df.spread_favorite)) | 
                           ((df.away_favorite == 1) & ((df.score_home - df.score_away) > df.spread_favorite))) 
                       / len(df)) * 100)

# displaying general information of the dataset, obtained from the new variables
print("Number of Games: " + str(len(df)))
print("Home Straight Up Win Percentage: " + home_win + "%")
print("Away Straight Up Win Percentage: " + away_win + "%")
print("Under Percentage: " + under_line + "%")
print("Over Percentage: " + over_line + "%")
print("Favored Win Percentage: " + favored + "%")
print("Cover The Spread Percentage: " + cover + "%")
print("Against The Spread Percentage: " + ats + "%")

Number of Games: 4070
Home Straight Up Win Percentage: 56.46%
Away Straight Up Win Percentage: 42.68%
Under Percentage: 49.31%
Over Percentage: 48.77%
Favored Win Percentage: 65.72%
Cover The Spread Percentage: 46.98%
Against The Spread Percentage: 49.36%


In [113]:
# checking what columns are in the dataset
df.columns

Index(['schedule_date', 'schedule_season', 'schedule_week', 'schedule_playoff',
       'team_home', 'score_home', 'score_away', 'team_away',
       'team_favorite_id', 'spread_favorite', 'over_under_line', 'stadium',
       'stadium_neutral', 'home_favorite', 'away_favorite', 'over', 'elo1_pre',
       'elo2_pre', 'elo_prob1', 'elo_prob2', 'qb1', 'qb2', 'qb1_value_pre',
       'qb2_value_pre', 'qb1_adj', 'qb2_adj', 'qbelo_prob1', 'qbelo_prob2',
       'result', 'home_cover', 'away_cover', 'favorite_cover', 'elo_pre_diff',
       'elo_prob_diff', 'qb_value_pre_diff', 'qb_adj_diff', 'qbelo_prob_diff'],
      dtype='object')

In [114]:
# checking to see dataset has correct looking data
df

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,qbelo_prob2,result,home_cover,away_cover,favorite_cover,elo_pre_diff,elo_prob_diff,qb_value_pre_diff,qb_adj_diff,qbelo_prob_diff
16,2003-09-14,2003,2,0,ARI,0.0,38.0,SEA,SEA,-5.0,...,0.618470,0,0,1,1,-158.199000,-0.261994,-30.107628,3.410046,-0.236941
17,2003-09-14,2003,2,0,ATL,31.0,33.0,WAS,ATL,-3.0,...,0.332021,0,0,0,0,61.661000,0.349224,-41.785353,-25.089353,0.335958
18,2003-09-14,2003,2,0,BAL,33.0,13.0,CLE,BAL,-2.5,...,0.464146,1,1,0,1,-4.551000,0.172252,25.580934,7.396353,0.071708
19,2003-09-14,2003,2,0,GB,31.0,6.0,DET,GB,-7.0,...,0.288526,1,1,0,1,130.243000,0.509426,-3.278527,-23.689201,0.422948
20,2003-09-14,2003,2,0,IND,33.0,7.0,TEN,IND,-2.5,...,0.512823,1,1,0,1,-71.655000,-0.019152,-62.287513,-19.240996,-0.025647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4653,2020-11-01,2020,8,0,KC,35.0,9.0,NYJ,KC,-19.5,...,0.075688,1,1,0,1,404.020024,0.874041,166.116847,13.067144,0.848625
4654,2020-11-01,2020,8,0,MIA,28.0,17.0,LAR,LAR,-3.5,...,0.656094,1,0,0,0,-101.957301,-0.105972,-84.994062,-83.134516,-0.312188
4655,2020-11-01,2020,8,0,PHI,23.0,9.0,DAL,PHI,-11.5,...,0.175320,1,1,0,1,66.650176,0.361769,140.112180,189.683751,0.649360
4656,2020-11-01,2020,8,0,SEA,37.0,27.0,SF,SEA,-3.0,...,0.414053,1,1,0,1,18.726553,0.236425,118.819860,28.513466,0.171893


In [115]:
# creating entirely new dataframe where teams' data is split by the favorite team and underdog team,
# rather than the home and away team. This way it will be easier to establish trends between all favored teams
# and all underdog teams, which is more telling than home and away teams.

fav = pd.DataFrame()
fav['schedule_date'] = df['schedule_date']
fav['schedule_season'] = df['schedule_season']
fav['schedule_week'] = df['schedule_week']
fav['schedule_playoff'] = df['schedule_playoff']
fav['favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['team_away'], df['team_home'])
fav['underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['team_home'], df['team_away'])
fav['score_favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['score_away'], df['score_home'])
fav['score_underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['score_home'], df['score_away'])
fav['spread_favorite'] = abs(df['spread_favorite'])
fav['over_under_line'] = df['over_under_line']
fav['stadium'] = df['stadium']
fav['stadium_neutral'] = df['stadium_neutral']
fav['home_favorite'] = df['home_favorite']
fav['away_favorite'] = df['away_favorite']
fav['over'] = df['over']
fav['elo_pre_favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['elo2_pre'], df['elo1_pre'])
fav['elo_pre_underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['elo1_pre'], df['elo2_pre'])
fav['elo_prob_favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['elo_prob2'], df['elo_prob1'])
fav['elo_prob_underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['elo_prob1'], df['elo_prob2'])
fav['qb_favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['qb2'], df['qb1'])
fav['qb_underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['qb1'], df['qb2'])
fav['qb_value_pre_favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['qb2_value_pre'], df['qb1_value_pre'])
fav['qb_value_pre_underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['qb1_value_pre'], df['qb2_value_pre'])
fav['qb_adj_favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['qb2_adj'], df['qb1_adj'])
fav['qb_adj_underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['qb1_adj'], df['qb2_adj'])
fav['qbelo_prob_favorite'] = np.where(df['team_favorite_id']==df['team_away'], df['qbelo_prob2'], df['qbelo_prob1'])
fav['qbelo_prob_underdog'] = np.where(df['team_favorite_id']==df['team_away'], df['qbelo_prob1'], df['qbelo_prob2'])
fav['result'] = (fav['score_favorite'] > fav['score_underdog']).astype(int)
fav['favorite_cover'] = ((fav.score_favorite - fav.score_underdog) > fav.spread_favorite).astype(int)
fav['elo_pre_diff'] = (fav.elo_pre_favorite - fav.elo_pre_underdog)
fav['elo_prob_diff'] = (fav.elo_prob_favorite - fav.elo_prob_underdog)
fav['qb_value_pre_diff'] = (fav.qb_value_pre_favorite.astype(float) - fav.qb_value_pre_underdog.astype(float))
fav['qb_adj_diff'] = (fav.qb_adj_favorite.astype(float) - fav.qb_adj_underdog.astype(float))
fav['qbelo_prob_diff'] = (fav.qbelo_prob_favorite - fav.qbelo_prob_underdog)
# adding margin of victory column
fav['margin'] = fav['score_favorite'] - fav['score_underdog']

In [116]:
# confirming no null value rows are in the dataframe
fav = fav.dropna()

In [117]:
# creating nfl abbreviation dictionary so it is possible to go back and forth between team names and abbreviations
nfl_abbr = {
    "KC": "Kansas City",
    "NO": "New Orleans",
    "BAL": "Baltimore",
    "PIT": "Pittsburgh",
    "TB": "Tampa Bay",
    "GB": "Green Bay",
    "LAR": "LA Rams",
    "SEA": "Seattle",
    "IND": "Indianapolis",
    "SF": "San Francisco",
    "NE": "New England",
    "TEN": "Tennessee",
    "MIA": "Miami",
    "BUF": "Buffalo",
    "ARI": "Arizona",
    "ATL": "Atlanta",
    "CAR": "Carolina",
    "MIN": "Minnesota",
    "LVR": "Las Vegas",
    "CLE": "Cleveland",
    "HOU": "Houston",
    "CHI": "Chicago",
    "PHI": "Philadelphia",
    "WAS": "Washington",
    "NYG": "NY Giants",
    "LAC": "LA Chargers",
    "DEN": "Denver",
    "DET": "Detroit",
    "CIN": "Cincinnati",
    "DAL": "Dallas",
    "JAX": "Jacksonville",
    "NYJ": "NY Jets",
}

In [118]:
# change data type of date columns
fav['schedule_date'] = df['schedule_date'].astype(str)

In [15]:
# creating dictionary of beautiful soup objects for the predictive ratings page of each unique date in dataset

# base link of website
link = 'https://www.teamrankings.com/nfl/ranking/predictive-by-other?date='

# initialize dictionary
soup_dict = {}

# for each date, attach date to end of base link, and create a soup object for that url
for x in fav["schedule_date"].unique():
    date = x
    year = x.split('-')[0]
    month = x.split('-')[1]
    day = str(int(x.split('-')[2]) - 1)
    site = link + year + '-' + month + '-' + day
    
    html = urllib.urlopen(site)
    soup = BeautifulSoup(html.read())
    
    soup_dict[date] = soup

In [16]:
## for each row in dataset, use appropriate soup object to scrape predictive ratings and ranks

# initalize appropriate lists
favorite_predictive_rating=[]
underdog_predictive_rating=[]
favorite_predictive_rank=[]
underdog_predictive_rank=[]
counter = 0;


for x in fav.iterrows():
    
    # translate team abbreviations to team names
    team1 = nfl_abbr[x[1][4]]
    team2 = nfl_abbr[x[1][5]]
    
    # pull date
    date = x[1][0]
    
    # search main datatable of website
    table = soup_dict[date].find("table",{"class": "datatable"})
    # make a list of all the td tags
    td_tags = table.find_all("td")
    # for each tag in the list, check to see if the html text is the desired team name
    # if it is, scrape the data in the tag preceeding the the team name (rank)
    # and the data succeeding the team name (rating)
    for i in range(len(td_tags)):
        if td_tags[i].find("a") is not None:
            if td_tags[i].find("a").get_text() == team1:
                rating = td_tags[i+1].get_text()
                if rating == '--':
                    rating = 0;
                favorite_predictive_rating.append(rating)
                favorite_predictive_rank.append(td_tags[i-1].get_text())
            if td_tags[i].find("a").get_text() == team2:
                rating2 = td_tags[i+1].get_text()
                if rating2 == '--':
                    rating2 = 0;
                underdog_predictive_rating.append(rating2)
                underdog_predictive_rank.append(td_tags[i-1].get_text())
    # these counters add NaN to keep list appropriate length if team name is not found for some reason
    if len(favorite_predictive_rating) == counter:
        favorite_predictive_rating.append("NaN")
    if len(underdog_predictive_rating) == counter:
        underdog_predictive_rating.append("NaN")
    counter+=1

In [17]:
## creating dictionary of beautiful soup objects for the predictive ratings page of each unique date in dataset


link = 'https://www.teamrankings.com/nfl/ranking/home-by-other?date='
soup_dict_home = {}
for x in fav["schedule_date"].unique():
    date = x
    year = x.split('-')[0]
    month = x.split('-')[1]
    day = str(int(x.split('-')[2]) - 1)
    site = link + year + '-' + month + '-' + day
    
    html = urllib.urlopen(site)
    soup = BeautifulSoup(html.read())
    
    soup_dict_home[date] = soup

In [18]:
favorite_home_rating=[]
underdog_home_rating=[]
favorite_home_rank=[]
underdog_home_rank=[]
counter = 0;
for x in fav.iterrows():
    
    team1 = nfl_abbr[x[1][4]]
    team2 = nfl_abbr[x[1][5]]
    
    date = x[1][0]
    
    table = soup_dict_home[date].find("table",{"class": "datatable"})
    td_tags = table.find_all("td")
    for i in range(len(td_tags)):
        if td_tags[i].find("a") is not None:
            if td_tags[i].find("a").get_text() == team1:
                rating = td_tags[i+1].get_text()
                if rating == '--':
                    rating = 0;
                favorite_home_rating.append(rating)
                favorite_home_rank.append(td_tags[i-1].get_text())
            if td_tags[i].find("a").get_text() == team2:
                rating2 = td_tags[i+1].get_text()
                if rating2 == '--':
                    rating2 = 0;
                underdog_home_rating.append(rating2)
                underdog_home_rank.append(td_tags[i-1].get_text())
    if len(favorite_home_rating) == counter:
        favorite_home_rating.append("NaN")
    if len(underdog_home_rating) == counter:
        underdog_home_rating.append("NaN")
    counter+=1

In [19]:
link = 'https://www.teamrankings.com/nfl/ranking/away-by-other?date='
soup_dict_away = {}
for x in fav["schedule_date"].unique():
    date = x
    year = x.split('-')[0]
    month = x.split('-')[1]
    day = str(int(x.split('-')[2]) - 1)
    site = link + year + '-' + month + '-' + day
    
    html = urllib.urlopen(site)
    soup = BeautifulSoup(html.read())
    
    soup_dict_away[date] = soup

In [20]:
favorite_away_rating=[]
underdog_away_rating=[]
favorite_away_rank=[]
underdog_away_rank=[]
counter = 0;
for x in fav.iterrows():
    team1 = nfl_abbr[x[1][4]]
    team2 = nfl_abbr[x[1][5]]
    
    date = x[1][0]
    
    table = soup_dict_away[date].find("table",{"class": "datatable"})
    td_tags = table.find_all("td")
    for i in range(len(td_tags)):
        if td_tags[i].find("a") is not None:
            if td_tags[i].find("a").get_text() == team1:
                rating = td_tags[i+1].get_text()
                if rating == '--':
                    rating = 0;
                favorite_away_rating.append(rating)
                favorite_away_rank.append(td_tags[i-1].get_text())
            if td_tags[i].find("a").get_text() == team2:
                rating2 = td_tags[i+1].get_text()
                if rating2 == '--':
                    rating2 = 0;
                underdog_away_rating.append(rating2)
                underdog_away_rank.append(td_tags[i-1].get_text())
    if len(favorite_away_rating) == counter:
        favorite_away_rating.append("NaN")
    if len(underdog_away_rating) == counter:
        underdog_away_rating.append("NaN")
    counter+=1

In [119]:
fav['favorite_predictive_rating'] = favorite_predictive_rating
fav['favorite_predictive_rating'] = fav['favorite_predictive_rating'].astype(float)
fav['underdog_predictive_rating'] = underdog_predictive_rating
fav['underdog_predictive_rating'] = fav['underdog_predictive_rating'].astype(float)
fav['predictive_rating_difference'] = fav['favorite_predictive_rating'] - fav['underdog_predictive_rating']
fav['favorite_predictive_rank'] = favorite_predictive_rank
fav['favorite_predictive_rank'] = fav['favorite_predictive_rank'].astype(float)
fav['underdog_predictive_rank'] = underdog_predictive_rank
fav['underdog_predictive_rank'] = fav['underdog_predictive_rank'].astype(float)
fav['predictive_rank_difference'] = fav['favorite_predictive_rank'] - fav['underdog_predictive_rank']

fav['favorite_home_rating'] = favorite_home_rating
fav['favorite_home_rating'] = fav['favorite_home_rating'].astype(float)
fav['underdog_home_rating'] = underdog_home_rating
fav['underdog_home_rating'] = fav['underdog_home_rating'].astype(float)
fav['home_rating_difference'] = fav['favorite_home_rating'] - fav['underdog_home_rating']
fav['favorite_home_rank'] = favorite_home_rank
fav['favorite_home_rank'] = fav['favorite_home_rank'].astype(float)
fav['underdog_home_rank'] = underdog_home_rank
fav['underdog_home_rank'] = fav['underdog_home_rank'].astype(float)
fav['home_rank_difference'] = fav['favorite_home_rank'] - fav['underdog_home_rank']

fav['favorite_away_rating'] = favorite_away_rating
fav['favorite_away_rating'] = fav['favorite_away_rating'].astype(float)
fav['underdog_away_rating'] = underdog_away_rating
fav['underdog_away_rating'] = fav['underdog_away_rating'].astype(float)
fav['away_rating_difference'] = fav['favorite_away_rating'] - fav['underdog_away_rating']
fav['favorite_away_rank'] = favorite_away_rank
fav['favorite_away_rank'] = fav['favorite_away_rank'].astype(float)
fav['underdog_away_rank'] = underdog_away_rank
fav['underdog_away_rank'] = fav['underdog_away_rank'].astype(float)
fav['away_rank_difference'] = fav['favorite_away_rank'] - fav['underdog_away_rank']

fav['favorite_location_rating'] = np.where(fav['home_favorite']==1, fav['favorite_home_rating'], fav['favorite_away_rating'])
fav['underdog_location_rating'] = np.where(fav['away_favorite']==1, fav['underdog_home_rating'], fav['underdog_away_rating'])
fav['location_difference_rating'] = fav['favorite_location_rating'] - fav['underdog_location_rating']

fav['favorite_location_rank'] = np.where(fav['home_favorite']==1, fav['favorite_home_rank'], fav['favorite_away_rank'])
fav['underdog_location_rank'] = np.where(fav['away_favorite']==1, fav['underdog_home_rank'], fav['underdog_away_rank'])
fav['location_difference_rank'] = fav['favorite_location_rank'] - fav['underdog_location_rank']

In [120]:
fav.to_csv('favorite_underdog_data.csv')

In [123]:
fav = pd.read_csv('favorite_underdog_data.csv')
fav = fav.loc[:, 'schedule_date'::]
# change data type of date columns
fav['schedule_date'] = fav['schedule_date'].astype(str)

In [124]:
link = 'https://www.teamrankings.com/nfl/stat/points-per-game?date='
soup_dict_ppg = {}
for x in fav["schedule_date"].unique():
    date = x
    year = x.split('-')[0]
    month = x.split('-')[1]
    day = str(int(x.split('-')[2]) - 1)
    site = link + year + '-' + month + '-' + day
    
    html = urllib.urlopen(site)
    soup = BeautifulSoup(html.read())
    
    soup_dict_ppg[date] = soup

In [125]:
favorite_ppg=[]
underdog_ppg=[]
favorite_ppg_recent=[]
underdog_ppg_recent=[]
favorite_ppg_rank=[]
underdog_ppg_rank=[]
counter = 0;
for x in fav.iterrows():
    
    team1 = nfl_abbr[x[1][4]]
    team2 = nfl_abbr[x[1][5]]
    
    date = x[1][0]
    
   
    table = soup_dict_ppg[date].find("table",{"class": "datatable"})
    td_tags = table.find_all("td")
    for i in range(len(td_tags)):
        if td_tags[i].find("a") is not None:
            if td_tags[i].find("a").get_text() == team1:
                
                rating = td_tags[i+1].get_text()
                if rating == '--':
                    rating = 21;
                    
                favorite_ppg.append(rating)
                favorite_ppg_rank.append(td_tags[i-1].get_text())
                
                recent = td_tags[i+2].get_text()
                if recent == '--':
                    recent = 21;
                favorite_ppg_recent.append(recent)
                
            if td_tags[i].find("a").get_text() == team2:
                
                rating2 = td_tags[i+1].get_text()
                if rating2 == '--':
                    rating2 = 0;
                    
                underdog_ppg.append(rating2)
                underdog_ppg_rank.append(td_tags[i-1].get_text())
                
                recent2 = td_tags[i+2].get_text()
                if recent2 == '--':
                    recent2 = 21;
                underdog_ppg_recent.append(recent2)
                
    if len(favorite_ppg) == counter:
        favorite_ppg.append("NaN")
    if len(underdog_ppg) == counter:
        underdog_ppg.append("NaN")
    counter+=1

In [126]:
link = 'https://www.teamrankings.com/nfl/stat/average-scoring-margin?date='
soup_dict_margin = {}
for x in fav["schedule_date"].unique():
    date = x
    year = x.split('-')[0]
    month = x.split('-')[1]
    day = str(int(x.split('-')[2]) - 1)
    site = link + year + '-' + month + '-' + day
    
    html = urllib.urlopen(site)
    soup = BeautifulSoup(html.read())
    
    soup_dict_margin[date] = soup

In [127]:
favorite_margin=[]
underdog_margin=[]
favorite_margin_recent=[]
underdog_margin_recent=[]
favorite_margin_rank=[]
underdog_margin_rank=[]
counter = 0;
for x in fav.iterrows():
    
    team1 = nfl_abbr[x[1][4]]
    team2 = nfl_abbr[x[1][5]]
    
    date = x[1][0]
    
   
    table = soup_dict_margin[date].find("table",{"class": "datatable"})
    td_tags = table.find_all("td")
    for i in range(len(td_tags)):
        if td_tags[i].find("a") is not None:
            if td_tags[i].find("a").get_text() == team1:
                
                rating = td_tags[i+1].get_text()
                if rating == '--':
                    rating = 0;
                    
                favorite_margin.append(rating)
                favorite_margin_rank.append(td_tags[i-1].get_text())
                
                recent = td_tags[i+2].get_text()
                if recent == '--':
                    recent = 0;
                favorite_margin_recent.append(recent)
                
            if td_tags[i].find("a").get_text() == team2:
                
                rating2 = td_tags[i+1].get_text()
                if rating2 == '--':
                    rating2 = 0;
                    
                underdog_margin.append(rating2)
                underdog_margin_rank.append(td_tags[i-1].get_text())
                
                recent2 = td_tags[i+2].get_text()
                if recent2 == '--':
                    recent2 = 0;
                underdog_margin_recent.append(recent2)
                
    if len(favorite_margin) == counter:
        favorite_margin.append("NaN")
    if len(underdog_margin) == counter:
        underdog_margin.append("NaN")
    counter+=1

In [128]:
link = 'https://www.teamrankings.com/nfl/stat/red-zone-scoring-pct?date='
soup_dict_redzone = {}
for x in fav["schedule_date"].unique():
    date = x
    year = x.split('-')[0]
    month = x.split('-')[1]
    day = str(int(x.split('-')[2]) - 1)
    site = link + year + '-' + month + '-' + day
    
    html = urllib.urlopen(site)
    soup = BeautifulSoup(html.read())
    
    soup_dict_redzone[date] = soup

In [129]:
favorite_redzone=[]
underdog_redzone=[]
favorite_redzone_recent=[]
underdog_redzone_recent=[]
favorite_redzone_rank=[]
underdog_redzone_rank=[]
counter = 0;
for x in fav.iterrows():
    
    team1 = nfl_abbr[x[1][4]]
    team2 = nfl_abbr[x[1][5]]
    
    date = x[1][0]
    
   
    table = soup_dict_redzone[date].find("table",{"class": "datatable"})
    td_tags = table.find_all("td")
    for i in range(len(td_tags)):
        if td_tags[i].find("a") is not None:
            if td_tags[i].find("a").get_text() == team1:
                
                rating = td_tags[i+1].get_text().replace("%","")
                if rating == '--':
                    rating = 55;
                    
                favorite_redzone.append(rating)
                favorite_redzone_rank.append(td_tags[i-1].get_text())
                
                recent = td_tags[i+2].get_text().replace("%","")
                if recent == '--':
                    recent = 55;
                favorite_redzone_recent.append(recent)
                
            if td_tags[i].find("a").get_text() == team2:
                
                rating2 = td_tags[i+1].get_text().replace("%","")
                if rating2 == '--':
                    rating2 = 55;
                    
                underdog_redzone.append(rating2)
                underdog_redzone_rank.append(td_tags[i-1].get_text())
                
                recent2 = td_tags[i+2].get_text().replace("%","")
                if recent2 == '--':
                    recent2 = 55;
                underdog_redzone_recent.append(recent2)
                
    if len(favorite_redzone) == counter:
        favorite_redzone.append("NaN")
    if len(underdog_redzone) == counter:
        underdog_redzone.append("NaN")
    counter+=1

In [130]:
link = 'https://www.teamrankings.com/nfl/stat/yards-per-play?date='
soup_dict_ypp = {}
for x in fav["schedule_date"].unique():
    date = x
    year = x.split('-')[0]
    month = x.split('-')[1]
    day = str(int(x.split('-')[2]) - 1)
    site = link + year + '-' + month + '-' + day
    
    html = urllib.urlopen(site)
    soup = BeautifulSoup(html.read())
    
    soup_dict_ypp[date] = soup

In [131]:
favorite_ypp=[]
underdog_ypp=[]
favorite_ypp_recent=[]
underdog_ypp_recent=[]
favorite_ypp_rank=[]
underdog_ypp_rank=[]
counter = 0;
for x in fav.iterrows():
    
    team1 = nfl_abbr[x[1][4]]
    team2 = nfl_abbr[x[1][5]]
    
    date = x[1][0]
    
    table = soup_dict_ypp[date].find("table",{"class": "datatable"})
    td_tags = table.find_all("td")
    for i in range(len(td_tags)):
        if td_tags[i].find("a") is not None:
            if td_tags[i].find("a").get_text() == team1:
                
                rating = td_tags[i+1].get_text()
                if rating == '--':
                    rating = 5.2;
                    
                favorite_ypp.append(rating)
                favorite_ypp_rank.append(td_tags[i-1].get_text())
                
                recent = td_tags[i+2].get_text()
                if recent == '--':
                    recent = 5.2;
                favorite_ypp_recent.append(recent)
                
            if td_tags[i].find("a").get_text() == team2:
                
                rating2 = td_tags[i+1].get_text()
                if rating2 == '--':
                    rating2 = 5.2;
                    
                underdog_ypp.append(rating2)
                underdog_ypp_rank.append(td_tags[i-1].get_text())
                
                recent2 = td_tags[i+2].get_text()
                if recent2 == '--':
                    recent2 = 5.2;
                underdog_ypp_recent.append(recent2)
                
    if len(favorite_ypp) == counter:
        favorite_ypp.append("NaN")
    if len(underdog_ypp) == counter:
        underdog_ypp.append("NaN")
    counter+=1

In [133]:
fav['favorite_ppg'] = favorite_ppg
fav['favorite_ppg'] = fav['favorite_ppg'].astype(float)
fav['underdog_ppg'] = underdog_ppg
fav['underdog_ppg'] = fav['underdog_ppg'].astype(float)
fav['favorite_ppg_recent'] = favorite_ppg_recent
fav['favorite_ppg_recent'] = fav['favorite_ppg_recent'].astype(float)
fav['underdog_ppg_recent'] = underdog_ppg_recent
fav['underdog_ppg_recent'] = fav['underdog_ppg_recent'].astype(float)
fav['favorite_ppg_rank'] = favorite_ppg_rank
fav['favorite_ppg_rank'] = fav['favorite_ppg_rank'].astype(float)
fav['underdog_ppg_rank'] = underdog_ppg_rank
fav['underdog_ppg_rank'] = fav['underdog_ppg_rank'].astype(float)

fav['ppg_diff'] = fav['favorite_ppg'] - fav['underdog_ppg']
fav['ppg_recent_diff'] = fav['favorite_ppg_recent'] - fav['underdog_ppg_recent']
fav['ppg_rank_diff'] = fav['favorite_ppg_rank'] - fav['underdog_ppg_rank']


fav['favorite_margin'] = favorite_margin
fav['favorite_margin'] = fav['favorite_margin'].astype(float)
fav['underdog_margin'] = underdog_margin
fav['underdog_margin'] = fav['underdog_margin'].astype(float)
fav['favorite_margin_recent'] = favorite_margin_recent
fav['favorite_margin_recent'] = fav['favorite_margin_recent'].astype(float)
fav['underdog_margin_recent'] = underdog_margin_recent
fav['underdog_margin_recent'] = fav['underdog_margin_recent'].astype(float)
fav['favorite_margin_rank'] = favorite_margin_rank
fav['favorite_margin_rank'] = fav['favorite_margin_rank'].astype(float)
fav['underdog_margin_rank'] = underdog_margin_rank
fav['underdog_margin_rank'] = fav['underdog_margin_rank'].astype(float)

fav['margin_diff'] = fav['favorite_margin'] - fav['underdog_margin']
fav['margin_recent_diff'] = fav['favorite_margin_recent'] - fav['underdog_margin_recent']
fav['margin_rank_diff'] = fav['favorite_margin_rank'] - fav['underdog_margin_rank']

fav['favorite_redzone'] = favorite_redzone
fav['favorite_redzone'] = fav['favorite_redzone'].astype(float)
fav['underdog_redzone'] = underdog_redzone
fav['underdog_redzone'] = fav['underdog_redzone'].astype(float)
fav['favorite_redzone_recent'] = favorite_redzone_recent
fav['favorite_redzone_recent'] = fav['favorite_redzone_recent'].astype(float)
fav['underdog_redzone_recent'] = underdog_redzone_recent
fav['underdog_redzone_recent'] = fav['underdog_redzone_recent'].astype(float)
fav['favorite_redzone_rank'] = favorite_redzone_rank
fav['favorite_redzone_rank'] = fav['favorite_redzone_rank'].astype(float)
fav['underdog_redzone_rank'] = underdog_redzone_rank
fav['underdog_redzone_rank'] = fav['underdog_redzone_rank'].astype(float)

fav['redzone_diff'] = fav['favorite_redzone'] - fav['underdog_redzone']
fav['redzone_recent_diff'] = fav['favorite_redzone_recent'] - fav['underdog_redzone_recent']
fav['redzone_rank_diff'] = fav['favorite_redzone_rank'] - fav['underdog_redzone_rank']

fav['favorite_ypp'] = favorite_ypp
fav['favorite_ypp'] = fav['favorite_ypp'].astype(float)
fav['underdog_ypp'] = underdog_ypp
fav['underdog_ypp'] = fav['underdog_ypp'].astype(float)
fav['favorite_ypp_recent'] = favorite_ypp_recent
fav['favorite_ypp_recent'] = fav['favorite_ypp_recent'].astype(float)
fav['underdog_ypp_recent'] = underdog_ypp_recent
fav['underdog_ypp_recent'] = fav['underdog_ypp_recent'].astype(float)
fav['favorite_ypp_rank'] = favorite_ypp_rank
fav['favorite_ypp_rank'] = fav['favorite_ypp_rank'].astype(float)
fav['underdog_ypp_rank'] = underdog_ypp_rank
fav['underdog_ypp_rank'] = fav['underdog_ypp_rank'].astype(float)

fav['ypp_diff'] = fav['favorite_ypp'] - fav['underdog_ypp']
fav['ypp_recent_diff'] = fav['favorite_ypp_recent'] - fav['underdog_ypp_recent']
fav['ypp_rank_diff'] = fav['favorite_ypp_rank'] - fav['underdog_ypp_rank']

In [134]:
fav.to_csv('favorite_underdog_data(1).csv')

In [5]:
fav = pd.read_csv('favorite_underdog_data(1).csv')
fav = fav.loc[:, 'schedule_date'::]
# change data type of date columns
fav['schedule_date'] = fav['schedule_date'].astype(str)

In [1]:
# summary statistics
pd.set_option('display.max_rows', None)
fav

NameError: name 'pd' is not defined

In [8]:
fav.columns

Index(['schedule_date', 'schedule_season', 'schedule_week', 'schedule_playoff',
       'favorite', 'underdog', 'score_favorite', 'score_underdog',
       'spread_favorite', 'over_under_line', 'stadium', 'stadium_neutral',
       'home_favorite', 'away_favorite', 'over', 'elo_pre_favorite',
       'elo_pre_underdog', 'elo_prob_favorite', 'elo_prob_underdog',
       'qb_favorite', 'qb_underdog', 'qb_value_pre_favorite',
       'qb_value_pre_underdog', 'qb_adj_favorite', 'qb_adj_underdog',
       'qbelo_prob_favorite', 'qbelo_prob_underdog', 'result',
       'favorite_cover', 'elo_pre_diff', 'elo_prob_diff', 'qb_value_pre_diff',
       'qb_adj_diff', 'qbelo_prob_diff', 'margin',
       'favorite_predictive_rating', 'underdog_predictive_rating',
       'predictive_rating_difference', 'favorite_predictive_rank',
       'underdog_predictive_rank', 'predictive_rank_difference',
       'favorite_home_rating', 'underdog_home_rating',
       'home_rating_difference', 'favorite_home_rank', 'under

In [42]:
features = fav[['schedule_season', 'schedule_playoff','home_favorite','elo_pre_favorite', 'elo_pre_underdog',
               'elo_prob_favorite', 'elo_prob_underdog',
               'qb_value_pre_favorite', 'qb_value_pre_underdog', 'qb_adj_favorite',
               'qb_adj_underdog', 'qbelo_prob_favorite', 'qbelo_prob_underdog', 'elo_pre_diff', 'elo_prob_diff',
               'qb_value_pre_diff', 'qb_adj_diff', 'qbelo_prob_diff','favorite_predictive_rating','underdog_predictive_rating',
               'predictive_rating_difference', 'favorite_predictive_rank','underdog_predictive_rank','predictive_rank_difference',
               'favorite_home_rating', 'underdog_home_rating','home_rating_difference', 'favorite_home_rank', 'underdog_home_rank',
               'home_rank_difference', 'favorite_away_rating', 'underdog_away_rating','away_rating_difference', 'favorite_away_rank', 'underdog_away_rank',
               'away_rank_difference', 'favorite_location_rating', 'underdog_location_rating', 'location_difference_rating',
               'favorite_location_rank', 'underdog_location_rank', 'location_difference_rank', 'favorite_ppg', 'underdog_ppg', 'favorite_ppg_recent',
               'underdog_ppg_recent', 'favorite_ppg_rank', 'underdog_ppg_rank', 'ppg_diff', 'ppg_recent_diff', 'ppg_rank_diff', 'favorite_margin',
               'underdog_margin', 'favorite_margin_recent', 'underdog_margin_recent', 'favorite_margin_rank',
               'underdog_margin_rank', 'margin_diff', 'margin_recent_diff',
               'margin_rank_diff', 'favorite_redzone', 'underdog_redzone',
               'favorite_redzone_recent', 'underdog_redzone_recent',
               'favorite_redzone_rank', 'underdog_redzone_rank', 'redzone_diff',
               'redzone_recent_diff', 'redzone_rank_diff', 'favorite_ypp',
               'underdog_ypp', 'favorite_ypp_recent', 'underdog_ypp_recent',
               'favorite_ypp_rank', 'underdog_ypp_rank', 'ypp_diff', 'ypp_recent_diff',
               'ypp_rank_diff' ]]
target = fav["result"]
new = fav[['result', 'favorite_cover','schedule_season', 'schedule_playoff','home_favorite','elo_pre_favorite', 'elo_pre_underdog',
               'elo_prob_favorite', 'elo_prob_underdog',
               'qb_value_pre_favorite', 'qb_value_pre_underdog', 'qb_adj_favorite',
               'qb_adj_underdog', 'qbelo_prob_favorite', 'qbelo_prob_underdog', 'elo_pre_diff', 'elo_prob_diff',
               'qb_value_pre_diff', 'qb_adj_diff', 'qbelo_prob_diff','favorite_predictive_rating','underdog_predictive_rating',
               'predictive_rating_difference', 'favorite_predictive_rank','underdog_predictive_rank','predictive_rank_difference',
               'favorite_home_rating', 'underdog_home_rating','home_rating_difference', 'favorite_home_rank', 'underdog_home_rank',
               'home_rank_difference', 'favorite_away_rating', 'underdog_away_rating','away_rating_difference', 'favorite_away_rank', 'underdog_away_rank',
               'away_rank_difference', 'favorite_location_rating', 'underdog_location_rating', 'location_difference_rating',
               'favorite_location_rank', 'underdog_location_rank', 'location_difference_rank', 'favorite_ppg', 'underdog_ppg', 'favorite_ppg_recent',
               'underdog_ppg_recent', 'favorite_ppg_rank', 'underdog_ppg_rank', 'ppg_diff', 'ppg_recent_diff', 'ppg_rank_diff', 'favorite_margin',
               'underdog_margin', 'favorite_margin_recent', 'underdog_margin_recent', 'favorite_margin_rank',
               'underdog_margin_rank', 'margin_diff', 'margin_recent_diff',
               'margin_rank_diff', 'favorite_redzone', 'underdog_redzone',
               'favorite_redzone_recent', 'underdog_redzone_recent',
               'favorite_redzone_rank', 'underdog_redzone_rank', 'redzone_diff',
               'redzone_recent_diff', 'redzone_rank_diff', 'favorite_ypp',
               'underdog_ypp', 'favorite_ypp_recent', 'underdog_ypp_recent',
               'favorite_ypp_rank', 'underdog_ypp_rank', 'ypp_diff', 'ypp_recent_diff',
               'ypp_rank_diff' ]]

In [43]:
new.corr()

Unnamed: 0,result,favorite_cover,schedule_season,schedule_playoff,home_favorite,elo_pre_favorite,elo_pre_underdog,elo_prob_favorite,elo_prob_underdog,qb_value_pre_favorite,...,redzone_rank_diff,favorite_ypp,underdog_ypp,favorite_ypp_recent,underdog_ypp_recent,favorite_ypp_rank,underdog_ypp_rank,ypp_diff,ypp_recent_diff,ypp_rank_diff
result,1.0,0.6799,-0.019297,-0.009262,0.035676,0.087261,-0.094601,0.17235,-0.17235,0.065578,...,-0.052204,0.048303,-0.056112,0.050653,-0.064255,-0.042179,0.052764,0.079679,0.085809,-0.06974
favorite_cover,0.6799,1.0,-0.019591,-0.009459,-0.019259,0.014072,-0.005749,0.006198,-0.006198,0.009979,...,-0.009616,0.014664,-0.001876,0.012589,-0.007433,-0.010566,-0.005739,0.012543,0.014937,-0.003449
schedule_season,-0.019297,-0.019591,1.0,-0.011309,-0.030967,0.012052,0.010444,-0.023785,0.023785,0.401334,...,0.009953,0.218828,0.238477,0.177817,0.178301,0.006604,-0.010096,-0.01754,-0.001054,0.012277
schedule_playoff,-0.009262,-0.009459,-0.011309,1.0,0.084251,0.239274,0.317917,-0.022594,0.022594,0.115197,...,0.005232,0.052456,0.104734,0.021416,0.081055,-0.100383,-0.145078,-0.040749,-0.044722,0.034212
home_favorite,0.035676,-0.019259,-0.030967,0.084251,1.0,-0.129402,0.189677,0.309358,-0.309358,-0.111853,...,0.069728,-0.097497,0.102491,-0.103078,0.086112,0.095716,-0.118559,-0.152556,-0.141204,0.157389
elo_pre_favorite,0.087261,0.014072,0.012052,0.239274,-0.129402,1.0,0.347867,0.465096,-0.465096,0.491461,...,-0.10336,0.328682,0.150901,0.278356,0.1383,-0.370112,-0.165399,0.132913,0.103746,-0.147166
elo_pre_underdog,-0.094601,-0.005749,0.010444,0.317917,0.189677,0.347867,1.0,-0.46046,0.46046,0.20875,...,0.092094,0.158526,0.365486,0.139495,0.333382,-0.187379,-0.410951,-0.160771,-0.145663,0.167516
elo_prob_favorite,0.17235,0.006198,-0.023785,-0.022594,0.309358,0.465096,-0.46046,1.0,-1.0,0.153276,...,-0.114983,0.069244,-0.117643,0.043235,-0.110388,-0.080483,0.132167,0.142807,0.114815,-0.156362
elo_prob_underdog,-0.17235,-0.006198,0.023785,0.022594,-0.309358,-0.465096,0.46046,-1.0,1.0,-0.153276,...,0.114983,-0.069244,0.117643,-0.043235,0.110388,0.080483,-0.132167,-0.142807,-0.114815,0.156362
qb_value_pre_favorite,0.065578,0.009979,0.401334,0.115197,-0.111853,0.491461,0.20875,0.153276,-0.153276,1.0,...,-0.167746,0.599351,0.191062,0.534435,0.154276,-0.552084,-0.097238,0.306982,0.282462,-0.330087


In [32]:
## import and intialize scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

## fit scaler to data
scaler.fit(features)
features_scaled = scaler.transform(features)

In [33]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import LinearRegression

X_train, X_test, y_train, y_test = train_test_split(features_scaled, target)

def RFE_feature_selection():

    select = RFE(DecisionTreeRegressor(), n_features_to_select=10)
    
    #fit the RFE selector to the training data
    select.fit(X_train, y_train)

    #transform training and testing sets so only the selected features are retained
    X_train_selected = select.transform(X_train)
    X_test_selected = select.transform(X_test)
    
    model = KNeighborsClassifier(n_neighbors=7).fit(X=X_train_selected, y=y_train)
    
    print("Selected features after RFE:")
    for i in range(len(select.get_support())):
        if select.get_support()[i]:
            print(features.columns[i]) #gets the features for which get_support is true
    
    accuracy_train = model.score(X_train_selected, y_train)
    accuracy_test = model.score(X_test_selected, y_test)

    print("\nkNN Classification performance with selected features:")
    print("\t Prediction accuracy on the train data:", f"{accuracy_train:.2%} \n")
    print("\t Prediction accuracy on the test data:", f"{accuracy_test:.2%} \n")

RFE_feature_selection()

Selected features after RFE:
qb_adj_underdog
qbelo_prob_underdog
qb_value_pre_diff
underdog_home_rating
home_rating_difference
favorite_away_rating
underdog_location_rating
underdog_ppg_recent
margin_diff
margin_recent_diff

kNN Classification performance with selected features:
	 Prediction accuracy on the train data: 71.23% 

	 Prediction accuracy on the test data: 63.46% 



In [34]:
estimators = {
    'k-Nearest Neighbor': KNeighborsClassifier(n_neighbors=5), 
    'Gaussian Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Support Vector Machine': LinearSVC(max_iter=100),
    'Logistic Regression': LogisticRegression()}

In [37]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
def classifiers_percentage_split():
    X_train, X_test, y_train, y_test = train_test_split(features, target)
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    for estimator_name, estimator_object in estimators.items():
        estimator_object.fit(X=X_train_scaled, y=y_train)
        accuracy = estimator_object.score(X_test_scaled, y_test)
        predicted = estimator_object.predict(X=X_test_scaled)
        expected = y_test
        class_report = classification_report(y_true=expected, y_pred=predicted)
        print(estimator_name)
        print("\t Prediction accuracy on the test data:", f"{accuracy:.2%} \n")
        print(class_report)

In [38]:
classifiers_percentage_split()

k-Nearest Neighbor
	 Prediction accuracy on the test data: 62.18% 

              precision    recall  f1-score   support

           0       0.41      0.31      0.35       339
           1       0.69      0.78      0.73       679

    accuracy                           0.62      1018
   macro avg       0.55      0.54      0.54      1018
weighted avg       0.60      0.62      0.61      1018

Gaussian Naive Bayes
	 Prediction accuracy on the test data: 57.27% 

              precision    recall  f1-score   support

           0       0.41      0.62      0.49       339
           1       0.74      0.55      0.63       679

    accuracy                           0.57      1018
   macro avg       0.58      0.59      0.56      1018
weighted avg       0.63      0.57      0.58      1018

Decision Tree
	 Prediction accuracy on the test data: 57.56% 

              precision    recall  f1-score   support

           0       0.38      0.41      0.39       339
           1       0.69      0.66   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
