In [1]:
# Used for plotting data
%matplotlib inline
import matplotlib.pyplot as plt
# Used for data storage and manipulation 
import numpy as np
import pandas as pd
# Used for Regression Modelling
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
# Used for Acc metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# For stepwise regression
import statsmodels.api as sm
# box plots
import seaborn as sns
# pairplot
from seaborn import pairplot
# Correlation plot
from statsmodels.graphics.correlation import plot_corr
%autosave 60

Autosaving every 60 seconds


In [2]:
# Load your data 
data = pd.read_csv("spreadspoke_scores.csv")
# Check out shape
print(data.shape)

#data from 2006-2018 is used for training. This will change, but is done this way for intial testing
no_2020 = data[(data['schedule_season'] < 2018)]
current = no_2020[(no_2020['schedule_season'] > 2005)]



# Adding .head() to your dataset allows you to see the first rows in the dataset. Input a number inside the brackets to specificy how many rows you want returned or else 5 rows are returned.
current.head()
current.tail()

#adding a dictionary to convert team names to numbers
#numbers correspond to alphabetical order
team_dict = {}
team_dict['ARI'] = 0
team_dict['ATL'] = 1
team_dict['BAL'] = 2
team_dict['BUF'] = 3
team_dict['CAR'] = 4
team_dict['CHI'] = 5
team_dict['CIN'] = 6
team_dict['CLE'] = 7
team_dict['DAL'] = 8
team_dict['DEN'] = 9
team_dict['DET'] = 10
team_dict['GB'] = 11
team_dict['HOU'] = 12
team_dict['IND'] = 13
team_dict['JAX'] = 14
team_dict['KC'] = 15
team_dict['LAC'] = 16
team_dict['LAR'] = 17
team_dict['MIA'] = 18
team_dict['MIN'] = 19
team_dict['NE'] = 20
team_dict['NO'] = 21
team_dict['NYG'] = 22
team_dict['NYJ'] = 23
team_dict['LVR'] = 24
team_dict['PHI'] = 25
team_dict['PIT'] = 26
team_dict['SF'] = 27
team_dict['SEA'] = 28
team_dict['TB'] = 29
team_dict['TEN'] = 30
team_dict['WAS'] = 31
#needed to add PICK as it is a value for team_favorite_id. we can discard value by checking if values are equal to 99
team_dict['PICK'] = 99
current.head(50)

(12934, 17)


Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail
8940,9/7/2006,2006,1,False,Pittsburgh Steelers,28.0,17.0,Miami Dolphins,PIT,-1.5,34.5,Heinz Field,False,62.0,3.0,75.0,
8941,9/10/2006,2006,1,False,Arizona Cardinals,34.0,27.0,San Francisco 49ers,ARI,-9.5,44.0,University of Phoenix Stadium,False,72.0,0.0,,DOME
8942,9/10/2006,2006,1,False,Carolina Panthers,6.0,20.0,Atlanta Falcons,CAR,-4.5,39.0,Bank of America Stadium,False,73.0,6.0,76.0,
8943,9/10/2006,2006,1,False,Cleveland Browns,14.0,19.0,New Orleans Saints,CLE,-3.0,36.0,FirstEnergy Stadium,False,65.0,13.0,74.0,
8944,9/10/2006,2006,1,False,Detroit Lions,6.0,9.0,Seattle Seahawks,SEA,-6.0,44.0,Ford Field,False,72.0,0.0,,DOME
8945,9/10/2006,2006,1,False,Green Bay Packers,0.0,26.0,Chicago Bears,CHI,-3.5,35.0,Lambeau Field,False,56.0,11.0,62.0,
8946,9/10/2006,2006,1,False,Houston Texans,10.0,24.0,Philadelphia Eagles,PHI,-6.0,37.0,Reliant Stadium,False,72.0,0.0,,DOME
8947,9/10/2006,2006,1,False,Jacksonville Jaguars,24.0,17.0,Dallas Cowboys,DAL,-2.5,36.5,EverBank Field,False,80.0,10.0,75.0,
8948,9/10/2006,2006,1,False,Kansas City Chiefs,10.0,23.0,Cincinnati Bengals,CIN,-1.5,49.5,Arrowhead Stadium,False,69.0,11.0,83.0,
8949,9/10/2006,2006,1,False,New England Patriots,19.0,17.0,Buffalo Bills,NE,-10.0,41.0,Gillette Stadium,False,64.0,13.0,62.0,


In [3]:
clean_current = current[['schedule_season', 'team_home', 'score_home', 'score_away', 'team_away', 'team_favorite_id', 'spread_favorite', 'over_under_line']]
clean_current.head()

Unnamed: 0,schedule_season,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line
8940,2006,Pittsburgh Steelers,28.0,17.0,Miami Dolphins,PIT,-1.5,34.5
8941,2006,Arizona Cardinals,34.0,27.0,San Francisco 49ers,ARI,-9.5,44.0
8942,2006,Carolina Panthers,6.0,20.0,Atlanta Falcons,CAR,-4.5,39.0
8943,2006,Cleveland Browns,14.0,19.0,New Orleans Saints,CLE,-3.0,36.0
8944,2006,Detroit Lions,6.0,9.0,Seattle Seahawks,SEA,-6.0,44.0


In [4]:
#print(sorted(clean_current["team_home"].unique()))
#print(sorted(clean_current["team_favorite_id"].unique()))

# doing the inane and cleaning the data so the team names match the abbreviations used in team_favorite_id
# this is for ease of comparison later on


str = clean_current.at[8940, 'team_home']
str = str[0]+str[1].upper()+str[2].upper()
print(str)
for index, row in clean_current.iterrows():
    # ----------------- HOME -----------------
    if row['team_home'] == "Green Bay Packers":
        clean_current.at[index, 'team_home'] = "GB"
    elif row['team_home'] == "Jacksonville Jaguars":
        clean_current.at[index, 'team_home'] = "JAX"
    elif row['team_home'] == "Kansas City Chiefs":
        clean_current.at[index, 'team_home'] = "KC"
    elif row['team_home'] == "Los Angeles Chargers" or row['team_home'] == "San Diego Chargers":
        clean_current.at[index, 'team_home'] = "LAC" # San Diego Chargers = LA Chargers. Team ID is LAC
    elif row['team_home'] == "Los Angeles Rams" or row['team_home'] == "St. Louis Rams":
        clean_current.at[index, 'team_home'] = "LAR" # St. Louis Rams = LA Rams. Team ID is LAR
    elif row['team_home'] == "New England Patriots":
        clean_current.at[index, 'team_home'] = "NE"
    elif row['team_home'] == "New Orleans Saints":
        clean_current.at[index, 'team_home'] = "NO"
    elif row['team_home'] == "New York Giants":
        clean_current.at[index, 'team_home'] = "NYG"
    elif row['team_home'] == "New York Jets":
        clean_current.at[index, 'team_home'] = "NYJ"
    elif row['team_home'] == "Oakland Raiders":
        clean_current.at[index, 'team_home'] = "LVR" # Oakland Raiders team ID is LVR (LAs Vegas Raiders)
    elif row['team_home'] == "San Francisco 49ers":
        clean_current.at[index, 'team_home'] = "SF"
    elif row['team_home'] == "Tampa Bay Buccaneers":
        clean_current.at[index, 'team_home'] = "TB"
    # ----------------- AWAY -----------------
    if row['team_away'] == "Green Bay Packers":
        clean_current.at[index, 'team_away'] = "GB"
    elif row['team_away'] == "Jacksonville Jaguars":
        clean_current.at[index, 'team_away'] = "JAX"
    elif row['team_away'] == "Kansas City Chiefs":
        clean_current.at[index, 'team_away'] = "KC"
    elif row['team_away'] == "Los Angeles Chargers" or row['team_away'] == "San Diego Chargers":
        clean_current.at[index, 'team_away'] = "LAC" # San Diego Chargers = LA Chargers. Team ID is LAC
    elif row['team_away'] == "Los Angeles Rams" or row['team_away'] == "St. Louis Rams":
        clean_current.at[index, 'team_away'] = "LAR" # St. Louis Rams = LA Rams. Team ID is LAR
    elif row['team_away'] == "New England Patriots":
        clean_current.at[index, 'team_away'] = "NE"
    elif row['team_away'] == "New Orleans Saints":
        clean_current.at[index, 'team_away'] = "NO"
    elif row['team_away'] == "New York Giants":
        clean_current.at[index, 'team_away'] = "NYG"
    elif row['team_away'] == "New York Jets":
        clean_current.at[index, 'team_away'] = "NYJ"
    elif row['team_away'] == "Oakland Raiders":
        clean_current.at[index, 'team_away'] = "LVR" # Oakland Raiders team ID is LVR (LAs Vegas Raiders)
    elif row['team_away'] == "San Francisco 49ers":
        clean_current.at[index, 'team_away'] = "SF"
    elif row['team_away'] == "Tampa Bay Buccaneers":
        clean_current.at[index, 'team_away'] = "TB"
    if len(clean_current.at[index, 'team_home'])>2:
        str = clean_current.at[index, 'team_home']
        # print(str)
        clean_current.at[index, 'team_home'] = str[:3].upper()
    if len(clean_current.at[index, 'team_away'])>2:
        str = clean_current.at[index, 'team_away']
        # print(str)
        clean_current.at[index, 'team_away'] = str[:3].upper()
        
clean_current.head()

# --------------------------------------------------------

PIT


Unnamed: 0,schedule_season,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line
8940,2006,PIT,28.0,17.0,MIA,PIT,-1.5,34.5
8941,2006,ARI,34.0,27.0,SF,ARI,-9.5,44.0
8942,2006,CAR,6.0,20.0,ATL,CAR,-4.5,39.0
8943,2006,CLE,14.0,19.0,NO,CLE,-3.0,36.0
8944,2006,DET,6.0,9.0,SEA,SEA,-6.0,44.0


In [5]:
"""
create x and y variables for features and results
"""

x = clean_current[['team_home', 'team_away', 'team_favorite_id', 'spread_favorite', 'over_under_line']]
scores = clean_current[['score_home', 'score_away']]
y = []

for index, row in x.iterrows():
    x.at[index, 'team_home'] = team_dict[x.at[index, 'team_home']]
    x.at[index, 'team_away'] = team_dict[x.at[index, 'team_away']]
    x.at[index, 'team_favorite_id'] = team_dict[x.at[index, 'team_favorite_id']]
    if row['team_home'] == row['team_favorite_id']:
        x.at[index, 'team_favorite_id'] = 1
    else:
        x.at[index, 'team_favorite_id'] = 0

for index,row in scores.iterrows():
    val = 1 if row['score_home'] > row['score_away'] else 0
    y.append(val)

x = np.array(x)
y = np.array(y)

print("Feature Matrix:\n",x[:20])
print("Win/Lose:\n",y[:20])



Feature Matrix:
 [[26 18 1 -1.5 '34.5']
 [0 27 1 -9.5 '44']
 [4 1 1 -4.5 '39']
 [7 21 1 -3.0 '36']
 [10 28 0 -6.0 '44']
 [11 5 0 -3.5 '35']
 [12 25 0 -6.0 '37']
 [14 8 0 -2.5 '36.5']
 [15 6 0 -1.5 '49.5']
 [20 3 1 -10.0 '41']
 [22 13 0 -3.0 '47.5']
 [17 9 0 -3.5 '46']
 [29 2 1 -3.0 '33']
 [30 23 1 -2.5 '35']
 [24 16 0 -3.0 '41.5']
 [31 19 1 -4.0 '35.5']
 [1 29 1 -4.5 '36']
 [2 24 1 -13.0 '34']
 [5 10 1 -7.0 '32']
 [6 7 1 -10.5 '40.5']]
Win/Lose:
 [1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 1]


In [6]:

def get_error_in_leaf(y, ids):
    """
    Returns the errors in a leaf node of a decision tree.
    This function can be used to answer the previous question automatically.
    
    :@param y: all labels
    :@param ids: the subset of indexes in the leaf node
    """
    # YOUR CODE HERE
    n=0
    p=0
    for id in ids:
        if y[id]==0:
            n=n+1
        else:
            p=p+1    
    return min(n,p)
    #raise NotImplementedError()

def error_criteria(y, root, left_child, right_child):
    """
    Returns the number of errors if we split the root into the left child and the right child.
    
    :@param y: all labels
    :@param root: indexes of all the data points in the root
    :@param left_child: the subset of indexes in the left child
    :@param right_child: the subset of indexes in the right child
    """
    # YOUR CODE HERE
    left_child_minority=get_error_in_leaf(y,left_child);
    right_child_minority=get_error_in_leaf(y,right_child);
    return left_child_minority+right_child_minority;
    #raise NotImplementedError()

def value_split_binary_feature(x, y, fid, root, criteria_func):
    left_child = [i for i in root if x[i,fid] == 0]
    right_child = [i for i in root if x[i,fid] == 1]
    return criteria_func(y, root, left_child, right_child)

# Colorado residency should correpsond to the third column in your data x

fid = 2
print(len(y))
root = list(range(len(y))) # root includes all data points
mistakes = value_split_binary_feature(x, y, fid, root, error_criteria)
mistakes

3204


1089

In [140]:
def entropy(y, ids):
    """
    Returns the entropy in the labels for the data points in ids.
    
    :@param y: all labels
    :@param ids: the indexes of data points
    """
    if len(ids) == 0: # deal with corner case when there is no data point.
        return 0
    # YOUR CODE HERE
    p=0
    for i in ids:
        if y[i]==1:
            p=p+1
    p=p/len(ids)
    n=1-p
    logp=np.log2(p) if p>0 else 0
    logn=np.log2(n) if n>0 else 0
    en=-p*logp-n*logn
    return en
    raise NotImplementedError()
    
def information_gain_criteria(y, root, left_child, right_child):
    """
    Returns the information gain by splitting root into left child and right child.
    
    :@param y: all labels
    :@param root: indexes of all the data points in the root
    :@param left_child: the subset of indexes in the left child
    :@param right_child: the subset of indexes in the right child
    """
    # YOUR CODE HERE
    entropy_D=entropy(y,root);
    entropy_D1=entropy(y,left_child);
    entropy_D2=entropy(y,right_child);
    info_gain=entropy_D-(len(left_child)/len(root))*entropy_D1-(len(right_child)/len(root))*entropy_D2
    return info_gain
    raise NotImplementedError()
    
fid = 2
root = list(range(len(y))) # root includes all data points
IG_favorite = value_split_binary_feature(x, y, fid, root, information_gain_criteria)  
print("Information gain : ",IG_favorite)

Information gain :  0.06293802996930464
