In [1]:
import numpy as np
import sklearn

from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

import matplotlib.pyplot as plt

In [27]:
#Lakers = np.genfromtxt("data/lakers.txt", delimiter=None)
#Celtics = np.genfromtxt("data/celtics2.txt",delimiter=None)

def mse(y_pred, y):
    """
    Mean squared error between predictions and targets.

    Input:
        y_pred: array-like of predicted values
        y:      array-like of true values, same shape as y_pred

    Output:
        The mean of (y - y_pred)^2 taken over the first axis: a scalar for 1-D
        inputs, or one value per column for 2-D inputs.

    Raises:
        ValueError: if the inputs are empty or their shapes differ.
    """
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Fail loudly instead of dividing by zero on an empty sample set.
    if y.shape[0] == 0:
        raise ValueError("mse requires at least one sample")

    # Require identical shapes so that extra predictions are never silently
    # ignored and numpy never broadcasts e.g. (n, 1) against (n,) into (n, n).
    if y_pred.shape != y.shape:
        raise ValueError(
            "y_pred and y must have the same shape, got %s and %s"
            % (y_pred.shape, y.shape)
        )

    return np.mean(np.square(y - y_pred), axis=0)

def _fit_poly_regression(x_train, y_train, degree):
    """Fit polynomial expansion -> standardisation -> linear regression on one 1-D predictor."""
    column = x_train.reshape(-1, 1)
    poly = PolynomialFeatures(degree, include_bias=False).fit(column)
    expanded = poly.transform(column)
    scaler = StandardScaler().fit(expanded)
    model = LinearRegression().fit(scaler.transform(expanded), y_train)
    return poly, scaler, model


def _poly_predict(fitted, x):
    """Run 1-D inputs x through a (poly, scaler, model) triple from _fit_poly_regression."""
    poly, scaler, model = fitted
    return model.predict(scaler.transform(poly.transform(x.reshape(-1, 1))))


def FeatureScores(team1Feature, team1OppDRtg, team2AvgDRtg, degrees):
    """
    Predicts what Team1 will get for a certain basketball statistic, namely one of the
    "Four Factors" of basketball success, when facing a given opponent.

    The statistic is regressed on the defensive rating of the opponent faced in each
    past game, using a polynomial expansion of that rating. One model is fitted per
    candidate degree and the model whose train and test MSE are closest to each other
    is used for the final prediction (the first such degree wins ties).

    Input:
        team1Feature: 1-D array of the first team's per-game values for one statistic,
            i.e. Field Goals, Free Throws, Rebounds, etc.

        team1OppDRtg: 1-D array, same length as team1Feature, with the defensive
            rating of the opponent faced in each of those games (the regression x-value).

        team2AvgDRtg: scalar defensive rating of the upcoming opponent; the x-value
            at which the chosen regression curve is evaluated.

        degrees: iterable of polynomial degrees to try.

    Output:
        The array returned by the regression's predict() for that single x-value
        (one element when team1Feature is 1-D).

    Raises:
        ValueError: if degrees is empty.
    """
    degrees = list(degrees)
    if not degrees:
        raise ValueError("degrees must contain at least one polynomial degree")

    # Coerce to float arrays so array-likes (lists, Series) support the reshape below.
    oppDRtg = np.asarray(team1OppDRtg, dtype=float)
    feature = np.asarray(team1Feature, dtype=float)

    # shuffle=False: the last 25% of the rows become the held-out test set.
    xTrn, xTst, yTrn, yTst = train_test_split(oppDRtg, feature, test_size=0.25, shuffle=False)

    bestFit = None
    bestGap = None

    for d in degrees:
        fitted = _fit_poly_regression(xTrn, yTrn, d)

        errTrain = mse(_poly_predict(fitted, xTrn), yTrn)
        errTest = mse(_poly_predict(fitted, xTst), yTst)

        # Selection rule: smallest |train MSE - test MSE|, NOT lowest test MSE.
        # The strict '<' keeps the earliest degree on ties.
        gap = abs(errTrain - errTest)
        if bestGap is None or gap < bestGap:
            bestFit, bestGap = fitted, gap

    # Reuse the model already fitted for the winning degree (it was trained on the
    # training split only) and evaluate it at the opponent's average rating.
    return _poly_predict(bestFit, np.array([team2AvgDRtg]))



def _load_team_table(path):
    """Load one team's whitespace-delimited game log and check it has the six expected columns."""
    table = np.genfromtxt(path, delimiter=None)

    # Guard against files that do not parse into a 2-D table with all six columns.
    if table.ndim != 2 or table.shape[1] < 6:
        raise ValueError(
            "%s: expected a 2-D table with at least 6 columns, got shape %s"
            % (path, table.shape)
        )

    # Fail early with the file name when any of the six columns holds a NaN.
    if np.isnan(table[:, :6]).any():
        raise ValueError("%s: non-numeric or missing values in the first 6 columns" % path)

    return table


def _predict_four_factors(team, opponentAvgDRtg, degrees):
    """Predict (eFG, TOV, ORB, FT) for one team against an opponent's average defense rating."""
    oppDRtg = team[:, 4]
    return tuple(
        FeatureScores(team[:, col], oppDRtg, opponentAvgDRtg, degrees)
        for col in range(4)
    )


def _print_factors(title, factors):
    """Print one team's predicted four factors under the given heading."""
    fg, tov, orb, ft = factors
    print(title)
    print("-- Field Goal %: ", fg)
    print("-- Turnover %:", tov)
    print("-- Offensive Rebound %:", orb)
    print("-- Free Throw %", ft)
    print()


def _four_factor_score(factors):
    """Combine predicted four factors into a single weighted game score."""
    fg, tov, orb, ft = factors
    # FG and FT are multiplied by 100 to put them on the same scale as TOV and ORB.
    # TOV is subtracted from 100 because fewer turnovers is better.
    return (0.40 * (fg * 100)) + (0.25 * (100 - tov)) + (0.20 * (orb)) + (0.15 * (ft * 100))


def PredictionsMatchup(team1File, team2File, degrees):

    """
    Takes in 2 filepaths for data sets of 2 different teams and performs predictions on each of their features to then
    be summed up as a weighted average to predict who performs better at that particular matchup

    Weights taken from Dean Oliver's "Four Factors of Basketball Success"
        40% shooting/effective Field goals
        25% Turnovers
        20% Offensive Rebounds
        15% Free Throws

    Data formats: delimited by whitespace, one row per past game

        effectiveFieldGoals  |  turnovers  |  offensiveRebounds  |  freeThrows  |  opponentDefenseRating  |  teamDefenseRating

        ** Do Not Include "|" in Data File **

    Uses opponentDefenseRating for each past game as x-axis value to predict how well the team does against certain skill levels

    teamDefenseRating: averaged over all rows and used when predicting the other team's Feature Values
        i.e. if T1 = Lakers, T2 = Celtics:
            Predicting Lakers Features uses Celtics Average Defense Rating as x-value to predict on regression curve

    Input:

        team1File, team2File: paths to the two teams' data files
        degrees: iterable of polynomial degrees passed on to FeatureScores

    Output:

        Prints the predicted four factors of both teams, then returns the
        weighted scores for both teams: (Team1 Weighted Score, Team2 Weighted Score)

    Raises:

        ValueError: if a data file does not load as a 2-D table with at least six
            columns, or has non-numeric/missing values in those columns.
    """

    # Materialise once: degrees is iterated by every one of the eight FeatureScores calls.
    degrees = list(degrees)

    team1 = _load_team_table(team1File)
    team2 = _load_team_table(team2File)

    # Each team's own average defense rating is the x-value for the OTHER team's predictions.
    Team1AvgDRtg = team1[:, 5].mean()
    Team2AvgDRtg = team2[:, 5].mean()

    T1Factors = _predict_four_factors(team1, Team2AvgDRtg, degrees)
    T2Factors = _predict_four_factors(team2, Team1AvgDRtg, degrees)

    print("Predicted Four Factors")
    print("----------------------")

    _print_factors("Team 1 Factors: ", T1Factors)
    _print_factors("Team 2 Factors: ", T2Factors)

    return (_four_factor_score(T1Factors), _four_factor_score(T2Factors))
    

In [28]:
# Candidate polynomial degrees tried for each per-statistic regression.
degrees = [1, 3, 5, 7, 10, 18]

# 2010 season: Lakers are team 1 and Celtics are team 2; predict their end-of-season matchup.
t1, t2 = PredictionsMatchup("data/lakers.txt", "data/celtics2.txt", degrees)

print("t1: ", t1)
print("t2: ", t2)

# The higher weighted score wins; equal scores are reported as a tie.
print("t1 Wins" if (t1 > t2) else ("t2 Wins" if (t2 > t1) else "Tie"))

Predicted Four Factors
----------------------
Team 1 Factors: 
-- Field Goal %:  [0.4803077]
-- Turnover %: [13.11161687]
-- Offensive Rebound %: [26.59332725]
-- Free Throw % [0.22514031]

Team 2 Factors: 
-- Field Goal %:  [0.51092464]
-- Turnover %: [13.98419157]
-- Offensive Rebound %: [21.97258766]
-- Free Throw % [0.23648187]

t1:  [49.63017391]
t2:  [49.88268315]
t2 Wins
