In [1]:
import pandas as pd # pandas dataframes
from sklearn.ensemble import RandomForestClassifier # model that takes in data to train and classify the imports
from sklearn.metrics import f1_score, make_scorer, classification_report 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # one hot encoding of team names
import numpy as np
from collections import defaultdict
import time


In [2]:
# Locations of the input CSV files.
# All three paths are derived from one directory so that running the notebook
# on another machine only requires changing dataDir.
dataDir = '/Users/lorneez/projects/sports_predictor/Sports_Model/Data'

teamPath = dataDir + '/teamData.csv'
playerPath = dataDir + '/playerData.csv'
outcomePath = dataDir + '/outcomeData.csv'

In [3]:
# Read each CSV file into its own DataFrame. The three reads are independent
# of one another; only the player file is parsed with pandas' 'python' engine.
outcomeData = pd.read_csv(outcomePath)
playerData = pd.read_csv(playerPath, engine='python')
teamData = pd.read_csv(teamPath)

In [4]:
# Display the team statistics table as the cell output.
teamData

Unnamed: 0,Team,Year,G_PG,MP_PG,FG_PG,FGA_PG,FG%_PG,3P_PG,3PA_PG,3P%_PG,...,TOV%,ORB%,FT/FGA,eFG%.1,TOV%.1,DRB%,FT/FGA.1,Arena,Attend.,Attend./G
0,Phoenix Suns*,2005,82,241.2,40.9,85.6,0.477,9.7,24.7,0.393,...,12.4,27.5,0.222,0.478,12.0,68.3,0.176,America West Arena,726066,17709
1,Sacramento Kings*,2005,82,242.1,39.1,85.1,0.459,6.4,17.0,0.374,...,12.0,28.8,0.226,0.493,13.3,69.0,0.214,ARCO Arena (II),709997,17317
2,Dallas Mavericks*,2005,82,240.6,37.3,81.6,0.457,5.6,15.5,0.364,...,12.5,28.6,0.273,0.472,14.3,70.2,0.239,American Airlines Center,822533,20062
3,Miami Heat*,2005,82,243.7,37.8,77.7,0.486,5.8,15.4,0.377,...,13.1,27.0,0.260,0.460,12.4,74.1,0.242,AmericanAirlines Arena,815143,19882
4,Boston Celtics*,2005,82,242.4,37.1,79.4,0.468,5.3,15.3,0.349,...,14.7,27.2,0.273,0.481,14.2,70.1,0.274,FleetCenter,656081,16002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,Cleveland Cavaliers,2020,65,241.9,40.3,87.9,0.458,11.2,31.8,0.351,...,14.6,24.6,0.172,0.560,11.7,77.4,0.164,Quicken Loans Arena,643008,17861
476,Chicago Bulls,2020,65,241.2,39.6,88.6,0.447,12.2,35.1,0.348,...,13.7,22.8,0.175,0.546,16.3,75.6,0.239,United Center,639352,18804
477,Orlando Magic,2020,65,240.4,39.2,88.8,0.442,10.9,32.0,0.341,...,11.4,22.4,0.191,0.535,13.5,79.0,0.170,Amway Center,529870,17093
478,Golden State Warriors,2020,65,241.9,38.6,88.2,0.438,10.4,31.3,0.334,...,13.2,21.5,0.212,0.553,13.7,76.4,0.193,Chase Center,614176,18064


### Format All Data

In [5]:
# Remove the columns that are not wanted in the outcome table.
outcomeData = outcomeData.drop(columns=["Start Time", "Box Score", "Notes"])

# Recode "OT?" as the number of overtime rounds:
# missing -> 0, "OT" -> 1, "2OT" -> 2, "3OT" -> 3, "4OT" -> 4.
# Any other value is left exactly as it was.
outcomeData.loc[outcomeData["OT?"].isna(), 'OT?'] = 0
for otLabel, otRounds in (("OT", 1), ("2OT", 2), ("3OT", 3), ("4OT", 4)):
    outcomeData.loc[outcomeData["OT?"] == otLabel, 'OT?'] = otRounds


In [6]:
# Replace former franchise names with a single, consistent name per franchise.
def redefineTeamNames(dataframe):
    """Return a copy of `dataframe` with old team names replaced.

    Each (old, new) pair is applied in order through DataFrame.replace with
    regex=True, so the old name is substituted wherever it occurs inside a
    string cell (e.g. a trailing '*' after the name is kept). The replacement
    is applied to the whole frame, not to a particular column, and the input
    frame itself is not modified.
    """
    renames = (
        ('Seattle SuperSonics', 'Oklahoma City Thunder'),
        ('New Orleans/Oklahoma City Hornets', 'New Orleans Pelicans'),
        ('New Orleans Hornets', 'New Orleans Pelicans'),
        ('Charlotte Bobcats', 'Charlotte Hornets'),
        ('New Jersey Nets', 'Brooklyn Nets'),
    )

    for formerName, replacementName in renames:
        dataframe = dataframe.replace(formerName, replacementName, regex=True)

    return dataframe

In [7]:
# Apply the team-name replacements to both tables so they use the same names.
teamData = redefineTeamNames(teamData)
outcomeData = redefineTeamNames(outcomeData)

# Label-encode the team names as integer ids (this is a LabelEncoder, not a
# one-hot encoding). The encoder is fitted on the Home column only and the
# same fitted encoder is then used for both the Home and Visitor columns.
encoding = LabelEncoder().fit(outcomeData['Home'].values)
homeIds, visitorIds = (
    encoding.transform(outcomeData[side].values) for side in ('Home', 'Visitor')
)
outcomeData['HomeEncoded'] = homeIds
outcomeData['VisitorEncoded'] = visitorIds

In [8]:
# Column names used as features.
# teamDataColumn: every teamData column except the ones listed here.
excludedColumns = ['Team', "Year", "Arena", "Attend.", "Attend./G", "G_PG", "MP_PG"]
teamDataColumn = teamData.columns.drop(excludedColumns)
print(teamDataColumn)

# testTeamDataColumn: a reduced list, kept in teamData's own column order.
# NOTE(review): the original list named "3P_PG" twice; the duplicate had no
# effect and is removed here. Possibly "FG_PG" was intended - confirm.
wantedColumns = ["3P_PG", "FGA_PG", "FG%_PG", "3PA_PG", "3P%_PG",
                 "ORB_PG", "DRB_PG", "TRB_PG", "AST_PG",
                 "STL_PG", "BLK_PG", "TOV_PG", "2P_PG", "2PA_PG"]
testTeamDataColumn = teamData.columns[teamData.columns.isin(wantedColumns)]
print(testTeamDataColumn)
# outcomeData = outcomeData.loc[outcomeData['Year'] == 2020]
outcomeData



Index(['FG_PG', 'FGA_PG', 'FG%_PG', '3P_PG', '3PA_PG', '3P%_PG', '2P_PG',
       '2PA_PG', '2P%_PG', 'FT_PG',
       ...
       '3PAr', 'TS%', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'eFG%.1', 'TOV%.1',
       'DRB%', 'FT/FGA.1'],
      dtype='object', length=113)
Index(['FGA_PG', 'FG%_PG', '3P_PG', '3PA_PG', '3P%_PG', '2P_PG', '2PA_PG',
       'ORB_PG', 'DRB_PG', 'TRB_PG', 'AST_PG', 'STL_PG', 'BLK_PG', 'TOV_PG'],
      dtype='object')


Unnamed: 0,Year,Month,Visitor,VisitorPTS,Home,HomePTS,OT?,Attend,Total Points,Winner,HomeEncoded,VisitorEncoded
0,2005,november,Houston Rockets,79,Detroit Pistons,87,0,22076,166,Detroit Pistons,8,10
1,2005,november,Sacramento Kings,98,Dallas Mavericks,107,0,20041,205,Dallas Mavericks,6,25
2,2005,november,Denver Nuggets,78,Los Angeles Lakers,89,0,18997,167,Los Angeles Lakers,13,7
3,2005,november,Indiana Pacers,109,Cleveland Cavaliers,104,2,19730,213,Indiana Pacers,5,11
4,2005,november,Milwaukee Bucks,92,Orlando Magic,93,0,15138,185,Orlando Magic,21,16
...,...,...,...,...,...,...,...,...,...,...,...,...
20429,2020,march,Los Angeles Clippers,131,Golden State Warriors,107,0,18064,238,Los Angeles Clippers,9,12
20430,2020,march,Detroit Pistons,106,Philadelphia 76ers,124,0,20172,230,Philadelphia 76ers,22,8
20431,2020,march,New York Knicks,136,Atlanta Hawks,131,1,15393,267,New York Knicks,0,19
20432,2020,march,Charlotte Hornets,109,Miami Heat,98,0,19600,207,Charlotte Hornets,15,3


### Feature Engineering

#### y data

In [9]:
# Row-wise helper: did the home team win this game?
def homeWin(dataframe):
    """Return True if the row's 'Winner' value equals its 'Home' value, else False.

    `dataframe` is a single row (anything indexable by 'Winner' and 'Home').
    """
    homeTeamWon = dataframe['Winner'] == dataframe['Home']
    return True if homeTeamWon else False
    
# add in HomeWin column: True where the recorded winner is the home team.
# The two columns are compared directly (one vectorised comparison) instead of
# calling a Python function once per row through DataFrame.apply.
outcomeData['HomeWin'] = outcomeData['Winner'] == outcomeData['Home']

In [None]:
# For every game, record whether each side won its own previous game
# (1 = won, 0 = lost or no earlier game seen). Games are processed in the
# frame's row order, and the value stored for a game is the one known
# *before* that game is played.
#
# The values are collected in lists and assigned once at the end: the frame is
# not written to while it is being iterated, and both columns end up as plain
# int64 0/1 instead of a mix of the int default and bools.
won_last = defaultdict(int)
home_last_win = []
visitor_last_win = []

for home_team, visitor_team, home_won in zip(
        outcomeData["Home"], outcomeData["Visitor"], outcomeData["HomeWin"]):
    home_last_win.append(won_last[home_team])
    visitor_last_win.append(won_last[visitor_team])
    # update the state with this game's result for the teams' next games
    won_last[home_team] = int(bool(home_won))
    won_last[visitor_team] = int(not home_won)

outcomeData['HomeLastWin'] = np.asarray(home_last_win, dtype="int64")
outcomeData['VisitorLastWin'] = np.asarray(visitor_last_win, dtype="int64")

In [None]:
# For every game, record each side's current winning streak going into the
# game (number of consecutive wins so far; 0 after a loss or with no history).
# Games are processed in the frame's row order.
#
# The streaks are collected in lists and assigned once at the end so the frame
# is not written to, cell by cell, while it is being iterated.
win_streak = defaultdict(int)
home_streaks = []
visitor_streaks = []

for home_team, visitor_team, home_won in zip(
        outcomeData["Home"], outcomeData["Visitor"], outcomeData["HomeWin"]):
    home_streaks.append(win_streak[home_team])
    visitor_streaks.append(win_streak[visitor_team])
    if home_won:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1

outcomeData["HomeWinStreak"] = np.asarray(home_streaks, dtype="int64")
outcomeData["VisitorWinStreak"] = np.asarray(visitor_streaks, dtype="int64")

In [None]:
# Display the outcome table, including the columns added above, as the cell output.
outcomeData

In [None]:
def home_team_ranks_higher(row):
    """Return whether the home team's "Rk" value is lower than the visitor's.

    `row` is one game; the teams are read from its "Home" and "Visitor"
    entries, which are the column names the outcome table uses.

    NOTE(review): `ladder` is not defined anywhere in the visible notebook.
    It is presumably a standings table indexed by team name with an "Rk"
    column - it must be created before this function is called.
    """
    home_rank = ladder.loc[row["Home"], "Rk"]
    visitor_rank = ladder.loc[row["Visitor"], "Rk"]
    return home_rank < visitor_rank

In [None]:
# Winner of the most recent meeting of each pair of teams, keyed by the sorted
# (team, team) tuple so that home/visitor order does not matter. A pair with no
# earlier meeting yields the defaultdict's 0, which never equals a team name.
# Defined in this cell so that re-running the cell starts from a clean history.
last_match_winner = defaultdict(int)


def home_team_won_last(row):
    """Return 1 if the home team won the previous meeting of these two teams, else 0.

    `row` is one game with "Home", "Visitor" and "HomeWin" entries. The call
    also records this game's winner in `last_match_winner`, so the function is
    stateful: it must be called once per game, in chronological order.
    """
    home_team = row["Home"]
    visitor_team = row["Visitor"]

    teams = tuple(sorted([home_team, visitor_team]))
    result = 1 if last_match_winner[teams] == home_team else 0

    # remember this game's winner for the pair's next meeting
    last_match_winner[teams] = home_team if row["HomeWin"] else visitor_team

    return result
    
# home_team_won_last updates shared state on every call, so each game must be
# evaluated exactly once and in row order. An explicit pass over the rows
# guarantees that; DataFrame.apply does not promise it (older pandas releases
# could evaluate the first row twice).
outcomeData['HomeTeamWonLast'] = [home_team_won_last(row) for _, row in outcomeData.iterrows()]

In [None]:
# Spread: visitor points minus home points, stored as float.
# A negative value therefore means the home team scored more.
outcomeData["Final Home Spread"] = outcomeData["VisitorPTS"].sub(outcomeData["HomePTS"]).astype(float)

#### x data

In [None]:
def differentialVariableGenerator(outcomeData, teamData, teamDataColumn):
    """Build one row of home/visitor team statistics for every game.

    Parameters
    ----------
    outcomeData : DataFrame with 'Year', 'Home' and 'Visitor' columns, one row per game.
    teamData : DataFrame with 'Year' and 'Team' columns plus the statistic columns.
    teamDataColumn : iterable of statistic column names to copy from teamData.

    Returns
    -------
    DataFrame indexed 0..n-1 (game position in outcomeData) with the columns
    '<stat>_home', '<stat>_visitor' for each statistic, in the given order.
    Each value is taken from the first teamData row whose Year equals the
    game's year and whose Team contains the team name (a plain substring
    test, so a name followed by e.g. '*' still matches).

    Raises
    ------
    IndexError if no teamData row matches a game's (year, team).

    Implementation notes: each (year, team) pair is looked up once and cached,
    and the result frame is built in a single step. No module-level globals
    are used or modified, and progress is no longer printed per game.
    """
    statColumns = list(teamDataColumn)
    featureColumns = []
    for col in statColumns:
        featureColumns.append(col + "_home")
        featureColumns.append(col + "_visitor")

    print(outcomeData.shape[0])
    overallTime = time.time()

    teamNames = teamData["Team"].astype(str)  # hoisted: identical for every lookup
    seasonStats = {}  # (year, team) -> {statistic: value}

    def lookupStats(year, team):
        """Return the cached statistics of `team` in `year`, computing them on first use."""
        key = (year, team)
        if key not in seasonStats:
            matches = teamData.loc[(teamData["Year"] == year)
                                   & teamNames.str.contains(team, regex=False)]
            if matches.empty:
                raise IndexError(
                    "no teamData row for team {team!r} in year {year!r}".format(team=team, year=year))
            # take each statistic from its own column so its dtype is preserved
            seasonStats[key] = {col: matches[col].iloc[0] for col in statColumns}
        return seasonStats[key]

    records = []
    for year, home, visitor in zip(outcomeData["Year"], outcomeData["Home"], outcomeData["Visitor"]):
        homeStats = lookupStats(year, home)
        visitorStats = lookupStats(year, visitor)
        record = {}
        for col in statColumns:
            record[col + "_home"] = homeStats[col]
            record[col + "_visitor"] = visitorStats[col]
        records.append(record)

    differentialFeatureData = pd.DataFrame(records, index=range(len(records)), columns=featureColumns)
    print("Total time {time} seconds".format(time = time.time() - overallTime))
    return differentialFeatureData

In [None]:
# Build the per-game home/visitor feature table from the reduced statistic list.
testOutcomeData = differentialVariableGenerator(
    outcomeData=outcomeData,
    teamData=teamData,
    teamDataColumn=testTeamDataColumn,
)

In [None]:
# Save the generated feature table to disk, then display it as the cell output.
featureDataPath = '/Users/lorneez/projects/sports_predictor/Sports_Model/Data/feature_data.csv'
testOutcomeData.to_csv(featureDataPath)
testOutcomeData

In [None]:
# One "<statistic>_DIFF" name for every column in teamDataColumn.
x_columns = [statName + "_DIFF" for statName in teamDataColumn]
print(x_columns)

### Training Model

In [None]:
# Target: whether the home team won the game.
y = outcomeData["HomeWin"]
# y = outcomeData["Final Home Spread"]

# Feature sets compared below, as plain arrays:
#   X_previous_wins - result of each side's previous game
#   X_win_streak    - the same two columns plus each side's current win streak
# X = outcomeData[x_columns]
# X = testOutcomeData
lastWinColumns = ["HomeLastWin", "VisitorLastWin"]
streakColumns = ["HomeWinStreak", "VisitorWinStreak"]
X_previous_wins = outcomeData[lastWinColumns].values
X_win_streak = outcomeData[lastWinColumns + streakColumns].values

# X = outcomeData[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher", "HomeTeamWonLast"]].values

# Scorer used for evaluation: weighted-average F1.
scorer = make_scorer(f1_score, average="weighted", pos_label=None)

In [None]:
# Set up train test split
# X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y, test_size=0.20, random_state=100)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=100)

In [None]:
# True labels as a plain array.
y_true = outcomeData["HomeWin"].values

# Cross-validate one random forest (fixed random_state) on each feature set,
# scoring with the weighted-F1 scorer.
clf = RandomForestClassifier(random_state=14)
#scores = cross_val_score(clf, X_train, y_train, scoring=scorer)
scores_previous_wins = cross_val_score(clf, X=X_previous_wins, y=y_true, scoring=scorer)
scores_win_streak = cross_val_score(clf, X=X_win_streak, y=y_true, scoring=scorer)

In [None]:
# Baseline to beat: predict a home win (1) for every game and score that
# constant prediction with the same weighted F1 measure.
y_pred = [1] * len(y_true)
baseline_f1 = f1_score(y_true, y_pred, pos_label=None, average='weighted')

print("F1 Score to Beat: {:.4f}".format(baseline_f1))

# Mean cross-validation score of each feature set.
mean_previous_wins = np.mean(scores_previous_wins)
print("F1 Score with Previous Wins: {:.4f}".format(mean_previous_wins))

mean_win_streak = np.mean(scores_win_streak)
print("F1 Score with Win Streaks: {:.4f}".format(mean_win_streak))
