In [30]:
##########################################################################################
# CS349 Final PROJECT: NBA GAME PREDICTION
# JIAZHANG WANG NETID:JWM9272
##########################################################################################

from __future__ import division
import math
import numpy as np
import numpy.matlib
import matplotlib.pyplot as plt
import pandas as pd 

from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import csv

In [31]:
def DataClean(oppo_data,team_data,mis_data):
    '''
    Functions: Strip the columns that carry no predictive signal and join the three
               season tables into a single per-team feature table.
    Input:
        1. oppo_data: Opponent_Per_Game_Stat;
        2. team_data: Team_Per_Game_Stat;
        3. mis_data: Miscellaneous_Stat;
        Every table must have a 'Team' column, which is the join key.
    Output:
        One DataFrame indexed by 'Team'. Rows follow oppo_data (left joins).
        Column names shared by oppo_data and team_data come out with the pandas
        default suffixes: '_x' for the opponent table and '_y' for the team table.
    Notation (per-game tables):
        Rk: Rank;  Team: Team Name;  G: Games;  MP: Minutes Played;
        FG: Field Goals;            FGA: Field Goal Attempts;          FG%: Field Goal Percentage;
        3P: 3-Point Field Goals;    3PA: 3-Point Field Goal Attempts;  3P%: 3-Point Field Goal Percentage;
        2P: 2-Point Field Goals;    2PA: 2-Point Field Goal Attempts;  2P%: 2-Point Field Goal Percentage;
        FT: Free Throws;            FTA: Free Throw Attempts;          FT%: Free Throw Percentage;
        ORB: Offensive Rebounds;    DRB: Defensive Rebounds;           TRB: Total Rebounds;
        AST: Assists;  STL: Steals;  BLK: Blocks;  TOV: Turnovers;  PF: Personal Fouls;  PTS: Points;
    Notation (miscellaneous table):
        Age: Average Age;  W: Wins;  L: Losses;
        PW: Pythagorean Wins;  PL: Pythagorean Losses;  MOV: Margin of Victory;
        SOS: Strength of Schedule;  SRS: Simple Rating System;
        ORtg: Offensive Rating;  DRtg: Defensive Rating;  NRtg: Net Rating;  Pace: Pace Factor;
        FTr: Free Throw Attempt Rate;  3PAr: 3-Point Attempt Rate;  TS%: True Shooting Percentage;
        eFG%: Effective Field Goal Percentage;  TOV%: Turnover Percentage;
        ORB%: Offensive Rebound Percentage;     FT/FGA: Free Throws Per Field Goal Attempt;
        (opponent) eFG%, TOV%, FT/FGA: the same three measures for the opponent;
        DRB%: Defensive Rebound Percentage;
        Arena: Name of the Arena;  Attend.: Attendance;  Attend./G: Attendance Per Home Game
    '''

    # Columns dropped from each source table before joining.
    per_game_unused = ['Rk', 'G', 'MP']          # rank, games, minutes played
    misc_unused = ['Rk', 'PW', 'PL', 'Arena']    # rank, Pythagorean W/L, arena name

    opponent_stats = oppo_data.drop(columns=per_game_unused)
    team_stats = team_data.drop(columns=per_game_unused)
    misc_stats = mis_data.drop(columns=misc_unused)

    # Left joins keep exactly the teams (and row order) of the opponent table.
    combined = (
        opponent_stats
        .merge(team_stats, how='left', on='Team')
        .merge(misc_stats, how='left', on='Team')
    )

    return combined.set_index('Team')

def FeatureExtraction(merged_data,label_result):
    '''
    Functions: Build the training features and labels from a list of played games,
               using each team's Elo score as the leading feature.
               Games are processed in the order they appear in label_result: the
               feature row uses the Elo scores held BEFORE the game, and the scores
               in the module-level team_elos dict are updated AFTER it.
    Input:
        1. merged_data: per-team statistics indexed by team name (rows are read with
           .loc[team]); NaN entries are replaced by 0 through np.nan_to_num.
        2. label_result: one row per game with the columns 'Visitor/Neutral', 'PTSV',
           'Home/Neutral' and 'PTSH'.
    Output:
        train_X: array with one row per game, laid out as
                 [visitor Elo, visitor stats..., home Elo + 100, home stats...].
                 It is always built in one step from the collected rows, so a single
                 game yields a (1, n_features) array and no games yields an empty array.
        train_y: list of ints, 1 when the VISITING team scored more points, else 0.
    Side effects:
        Updates team_elos for both teams after every game.
    '''
    feature_rows=[]
    train_y=[]

    for _, rows in label_result.iterrows():

        ## 1. Note two teams as visit_team and home team
        visit_team = rows['Visitor/Neutral']
        visit_team_score = rows['PTSV']
        home_team = rows['Home/Neutral']
        home_team_score = rows['PTSH']

        ## 2. Get the current Elo score for each team.
        # this score will be updated after each game
        visit_team_elo=GetEloScore(visit_team)
        home_team_elo=GetEloScore(home_team)
        home_team_elo=home_team_elo+100 # add a home buff for home team (feature only)

        ## 3. Create the feature list.
        # first element of each half is the Elo score, followed by the team statistics
        feature_visit_list=[visit_team_elo]
        feature_visit_list.extend(np.nan_to_num(merged_data.loc[visit_team]))
        feature_home_list=[home_team_elo]
        feature_home_list.extend(np.nan_to_num(merged_data.loc[home_team]))

        ## 4. Collect the sample; the matrix is assembled once after the loop,
        # which avoids re-copying the whole array for every game.
        feature_rows.append(feature_visit_list+feature_home_list)

        # Label: 1 if the visiting team wins, 0 otherwise.
        # Built-in int() is used because the np.int alias no longer exists in NumPy >= 1.24.
        game_result=int(visit_team_score>home_team_score)
        train_y.append(game_result)

        ## 5. Update elo scores of the team.
        # CalcEloScore takes (winner, loser) and returns the new scores in that order.
        if visit_team_score>home_team_score:
            visit_team_elo_update, home_team_elo_update = CalcEloScore(visit_team, home_team)
        else:
            home_team_elo_update, visit_team_elo_update = CalcEloScore(home_team, visit_team)

        team_elos[visit_team] = visit_team_elo_update
        team_elos[home_team] = home_team_elo_update

    train_X=np.array(feature_rows)
    return train_X,train_y
    
    
def GeneratePredictFeature(pred_data,merged_data):
    '''
    Functions: Build the feature matrix for games that have not been played yet.
               No prediction is made here; the result is meant to be handed to a
               fitted classifier.
    Input:
        1. pred_data: team schedule, one row per game with the columns
           'Visitor/Neutral' and 'Home/Neutral'.
        2. merged_data: per-team statistics indexed by team name (rows are read with
           .loc[team]); NaN entries are replaced by 0 through np.nan_to_num.
    Output:
        pred_X: array with one row per scheduled game, laid out as
                [visitor Elo, visitor stats..., home Elo + 100, home stats...].
                A single game yields a (1, n_features) array; an empty schedule
                yields an empty array instead of raising.
    '''
    feature_rows=[]

    for _, rows in pred_data.iterrows():

        ## 1. Get the name of both visiting and home teams
        visit_team = rows['Visitor/Neutral']
        home_team = rows['Home/Neutral']

        ## 2. Get the current elo score for each team.
        visit_team_elo = GetEloScore(visit_team)
        home_team_elo = GetEloScore(home_team)

        ## 3. Create the feature: Elo score first, then the team statistics.
        feature_visit_list=[visit_team_elo]
        feature_visit_list.extend(np.nan_to_num(merged_data.loc[visit_team]))
        feature_home_list=[home_team_elo+100] # home buff
        feature_home_list.extend(np.nan_to_num(merged_data.loc[home_team]))

        ## 4. Collect the sample; the matrix is assembled once after the loop.
        feature_rows.append(feature_visit_list+feature_home_list)

    pred_X=np.array(feature_rows)
    return pred_X


def CalcEloScore(winteam, loseteam):
    '''
    Functions: Work out the Elo scores both teams should hold after a game.
               Nothing is stored here; the caller writes the returned values back.
    Input:
        winteam, loseteam: names of the winning and the losing team, used as keys
        to look up their current scores through GetEloScore.
    Output:
        (new score of the winner, new score of the loser), each passed through round().
    '''

    def expected_score(own, other):
        # chance of `own` beating `other` under the Elo model (base 10, scale 400)
        return 1/(1 + math.pow(10,(other - own)/400))

    rating_w = GetEloScore(winteam)
    rating_l = GetEloScore(loseteam)

    expected_w = expected_score(rating_w, rating_l)
    expected_l = expected_score(rating_l, rating_w)

    # K factor is chosen from the winner's score alone and applied to both teams:
    # 16 at 2400 and above, 32 at 2100 and below, 24 in between.
    K = 16 if rating_w>=2400 else (32 if rating_w<=2100 else 24)

    # actual result is 1 for the winner and 0 for the loser
    updated_w = round(rating_w + K*(1 - expected_w))
    updated_l = round(rating_l + K*(0 - expected_l))
    return updated_w, updated_l


def GetEloScore(team):
    '''
    Functions: Read the Elo Score of the team, registering the team on first use.
    Input:
        Team name, which would be used as the key into the module-level team_elos dict.
    Output:
        Elo Score of the team. A team that has no entry yet is stored with the
        module-level init_elo and that value is returned.
    '''

    try:
        return team_elos[team]
    except KeyError:
        # Only a missing key means "team not seen yet"; any other error
        # (for example team_elos not being defined) should surface unchanged.
        team_elos[team] = init_elo
    return team_elos[team]



def UpdateEloScore(merged_data,label_result):
    '''
    Functions: Replay a list of played games and update the Elo score of each team
               in the module-level team_elos dict. No features are built.
    Input:
        1. merged_data: not used by this function; kept so existing calls keep working.
        2. label_result: one row per game with the columns 'Visitor/Neutral', 'PTSV',
           'Home/Neutral' and 'PTSH'. Games are applied in row order.
    Output:
        None. The result is the updated team_elos dict.
    '''

    for _, rows in label_result.iterrows():

        visit_team = rows['Visitor/Neutral']
        visit_team_score = rows['PTSV']
        home_team = rows['Home/Neutral']
        home_team_score = rows['PTSH']

        # CalcEloScore looks up both current scores itself through GetEloScore,
        # so no separate lookup (or home buff) is needed for the update.
        # It takes (winner, loser) and returns the new scores in that order;
        # a game that is not a visitor win is treated as a home win.
        if visit_team_score>home_team_score:
            visit_team_elo_update, home_team_elo_update = CalcEloScore(visit_team, home_team)
        else:
            home_team_elo_update, visit_team_elo_update = CalcEloScore(home_team, visit_team)

        team_elos[visit_team] = visit_team_elo_update
        team_elos[home_team] = home_team_elo_update

# Load data and process features

In [33]:
if __name__ == '__main__':
    
    # Elo bookkeeping shared with the helper functions through module-level names:
    # GetEloScore reads both, FeatureExtraction and UpdateEloScore write team_elos.
    # Re-running this cell resets every team's score.
    init_elo = 1600  # score given to a team the first time it is looked up
    team_elos = {}   # team name -> current Elo score, filled on first lookup
    
    ## 1. Data Collection. Source: https://www.basketball-reference.com/leagues/NBA_2019.html
    # Three per-team season tables plus the game-by-game results of the 18-19 season.
    # Paths are relative to the notebook's working directory.
    oppo_data=pd.read_csv('new_data/train/18-19Opponent_Per_Game_Stat.csv')
    team_data=pd.read_csv('new_data/train/18-19Team_Per_Game_Stat.csv')
    mis_data=pd.read_csv('new_data/train/18-19Miscellaneous_Stat.csv')
    last_year_result = pd.read_csv( 'new_data/train/18-19TeamResult.csv')
    
    ## 2. Data Pre-processing
    # Drop unused columns and join the three tables into one frame indexed by team name.
    clean_data=DataClean(oppo_data, team_data, mis_data)
    
    ## 3. Feature Extraction
    # One feature row and one label per game; this also updates team_elos game by game,
    # so after this call team_elos holds the scores at the end of the listed games.
    print('Extracting features...')
    train_X,train_y=FeatureExtraction(clean_data,last_year_result)
    print('Done.')


Extracting features...
Done.
Fitting on game samples...
Fitting done.
Doing cross-validation..
The accuracy is: 
0.648780487804878


# Try different classifiers

In [35]:
    ## 4. Model Training.
    # Fit one classifier on the full training set. Logistic regression is the active
    # choice; the commented-out pairs below are the alternatives that were tried.
    # To switch model, comment out the active clf/fit pair and uncomment one other pair.
    # Labels come from FeatureExtraction: train_y is 1 when the visiting team won.
    print('Fitting on game samples...')
    clf = linear_model.LogisticRegression(solver='lbfgs')
    clf.fit(train_X,train_y)
    #clf = GradientBoostingClassifier(n_estimators=50)
    #clf.fit(train_X, train_y)
    #clf = KNeighborsClassifier()
    #clf.fit(train_X, train_y)
    #clf = RandomForestClassifier(n_estimators=8)
    #clf.fit(train_X, train_y)
    #clf = tree.DecisionTreeClassifier()
    #clf.fit(train_X, train_y)
    print('Fitting done.')
    
    ## 5. 10-fold cross-validation
    # Prints the mean accuracy over the 10 folds (cv = 10).
    # n_jobs=-1 asks scikit-learn to use all available CPU cores.
    # NOTE(review): cross_val_score is documented to fit its own copies of clf per fold,
    # so the score should not depend on the fit above -- confirm against the
    # scikit-learn version in use.
    print("Doing cross-validation..")
    print('The accuracy is: ')
    print(cross_val_score(clf, train_X, train_y, cv = 10, scoring='accuracy', n_jobs=-1).mean())

Fitting on game samples...
Fitting done.
Doing cross-validation..
The accuracy is: 
0.648780487804878
