In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils import *

# First Dataset: "games_details.csv"

We look at the datasets one at a time, starting with "games_details.csv".

In [None]:
# Load games_details.csv. low_memory=False makes pandas parse the file in one
# pass, so each column's dtype is inferred from all of its values.
games_details = pd.read_csv('../dataset/games_details.csv', low_memory=False)
games_details.head()

In [None]:
# Pairwise plots of the columns of the raw table.
# NOTE(review): this runs on the full, uncleaned frame and may be slow —
# consider plotting a row sample and a subset of columns instead.
sns.pairplot(games_details)

In [None]:
# ratio_missing_values_column comes from utils (star import); it is expected to
# print the percentage of missing values in the given column — TODO confirm.
key1 = 'COMMENT'
ratio_missing_values_column(games_details, key1)

key2 = 'START_POSITION'
ratio_missing_values_column(games_details, key2)

'COMMENT': since the number of valid values is very small relative to the size of the dataset, and there is no clear way to fill the empty cells, we drop it.

'START_POSITION': same thing.

Moreover, we drop all the columns that we think could lead to information leakage, as well as all the columns that we think are not useful for training the model, such as the nickname of the player. Our cleaned dataset is the following:

In [None]:
# Rows of game 10500109 are excluded (the authors flagged this game as anomalous).
Weird_rows = games_details['GAME_ID'].eq(10500109)

# Columns removed before modelling: free-text / identity columns and every
# box-score statistic except FG3M, which is used as the regression target later.
# If a column does not matter it is dropped; otherwise changing this list
# changes the result.
columns_to_drop = [
    'MIN', 'COMMENT', 'PLAYER_NAME', 'NICKNAME', 'START_POSITION',
    'TEAM_CITY', 'TEAM_ABBREVIATION',
    'FGA', 'FG_PCT', 'FGM', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TO', 'PF', 'PTS', 'PLUS_MINUS',
]

# Filter, drop, remove incomplete rows and renumber in one chain; the fresh
# 0..n-1 index is relied upon by the positional loop that fills the
# DATE_DIFF / OPPOSING_TEAM / WINRATE columns later on.
games_details = (
    games_details.loc[~Weird_rows]
    .drop(columns=columns_to_drop)
    .dropna()
    .reset_index(drop=True)
)

games_details.info()
games_details.head()

# Second Dataset: "games.csv"

Next, we move on to the second dataset, "games.csv".

In [None]:
# Load games.csv (one row per game) and preview it.
games = pd.read_csv('../dataset/games.csv')
games.head()

In [None]:
# Convert every GAME_DATE_EST value with StringToDate (from utils) in one pass.
# The previous loop called Series.replace once per row, rescanning the whole
# column for each of its values (quadratic in the number of games).
games['GAME_DATE_EST'] = games['GAME_DATE_EST'].map(StringToDate)

In [None]:
# Newest game first, renumbered 0..n-1 so that row labels match row positions.
games = games.sort_values(by='GAME_DATE_EST', ascending=False).reset_index(drop=True)

In [None]:
# Load teams.csv, print its schema and preview the first rows.
teams = pd.read_csv('../dataset/teams.csv')
teams.info()
teams.head()

# Third Dataset: "ranking.csv"
Add only the most recent game of the opposing team and of the player's team.

In [None]:
# Load ranking.csv and print its schema before any column is removed.
ranking = pd.read_csv('../dataset/ranking.csv')
ranking.info()
ranking = ranking.drop(columns=['LEAGUE_ID', 'RETURNTOPLAY']) # removed on the assumption that they hold only zeros/NaN — TODO confirm
ranking.head()

In [None]:
# Descriptive team metadata that is not used by the feature engineering below.
team_metadata_columns = [
    'LEAGUE_ID', 'MAX_YEAR', 'MIN_YEAR', 'ABBREVIATION', 'NICKNAME',
    'YEARFOUNDED', 'CITY', 'ARENA', 'ARENACAPACITY', 'OWNER',
    'GENERALMANAGER', 'HEADCOACH', 'DLEAGUEAFFILIATION',
]
teams = teams.drop(columns=team_metadata_columns)

In [None]:
# Builds, for every team in `teams`, three parallel lists that follow the row
# order of `games` (newest game first, since `games` is sorted by date in
# descending order before this cell runs). Entry t of each outer list belongs
# to the t-th row of `teams`.

WINRATE_WINDOW = 3  # how many immediately preceding games are averaged into a winrate

gamesTeamPlayed = []    # per team: GAME_IDs of the games the team took part in
gamesTeamDates = []     # per team: GAME_DATE_EST of those same games
gamesTeamWinrates = []  # per team: share of wins over the previous (up to) 3 games, -1 if none
for team_id in teams['TEAM_ID']:
    # Boolean masks select the team's games directly, instead of walking the
    # column with a Python counter and reusing that counter as an index label.
    is_home = games['HOME_TEAM_ID'] == team_id
    played = is_home | (games['VISITOR_TEAM_ID'] == team_id)
    team_games = games[played]

    gamesTeamPlayed.append(team_games['GAME_ID'].tolist())
    gamesTeamDates.append(team_games['GAME_DATE_EST'].tolist())

    # 1 where this team won, 0 where it lost: HOME_TEAM_WINS as-is for home
    # games, flipped for away games.
    home_wins = team_games['HOME_TEAM_WINS'].to_numpy()
    team_won = np.where(is_home[played].to_numpy(), home_wins, 1 - home_wins)

    # Rows are newest first, so the games that precede game `pos` in time are
    # the ones that follow it in the array.
    winrates = []
    for pos in range(len(team_won)):
        earlier = team_won[pos + 1:pos + 1 + WINRATE_WINDOW]
        winrates.append(earlier.mean() if len(earlier) else -1)
    gamesTeamWinrates.append(winrates)
    

        
def DiffOppWin(game_id,team_id):
    """Return ``[diff, opposing, winrate]`` for one team in one game.

    Reads the module-level ``games`` and ``teams`` frames and the per-team
    lists ``gamesTeamPlayed``, ``gamesTeamDates`` and ``gamesTeamWinrates``.

    * ``diff``: date of this game minus the date stored right after it in the
      team's date list, or ``-1`` when this game is the last entry of that list.
    * ``opposing``: id of the other team taking part in the game.
    * ``winrate``: the value stored in ``gamesTeamWinrates`` at this game's
      position in ``gamesTeamPlayed``.
    """
    game_row = games.index[games['GAME_ID'] == game_id].tolist()[0]
    # The label of the team's row in `teams` doubles as the position in the
    # per-team lists.
    team_pos = teams['TEAM_ID'].index[teams['TEAM_ID'] == team_id].tolist()[0]

    team_dates = gamesTeamDates[team_pos]
    game_date = games['GAME_DATE_EST'][game_row]
    date_pos = team_dates.index(game_date)
    if date_pos == len(team_dates) - 1:
        diff = -1
    else:
        diff = game_date - team_dates[team_dates.index(game_date) + 1]

    home_team = games['HOME_TEAM_ID'][game_row]
    opposing = games['VISITOR_TEAM_ID'][game_row] if home_team == team_id else home_team

    game_pos = gamesTeamPlayed[team_pos].index(game_id)
    return [diff, opposing, gamesTeamWinrates[team_pos][game_pos]]

def RefinedWinrate(game_id,home_id,visitor_id): #This is meant to edit games, not games_details
    """Head-to-head winrate of ``home_id`` against ``visitor_id`` before a game.

    Reads the module-level ``games`` and ``teams`` frames and the per-team
    ``gamesTeamPlayed`` lists (GAME_IDs, newest first).

    Parameters
    ----------
    game_id : id of the game being described; it is excluded from the result.
    home_id, visitor_id : TEAM_IDs of the two teams.

    Returns
    -------
    The fraction of the earlier games common to both teams that ``home_id``
    won, or ``-1`` when the two teams share no earlier game.

    Raises
    ------
    IndexError if a team id is not in ``teams``; ValueError if ``game_id`` is
    not in one of the two teams' game lists.
    """
    indTeamId = teams['TEAM_ID'].index[teams['TEAM_ID'] == home_id].tolist()[0]
    indOppTeamId = teams['TEAM_ID'].index[teams['TEAM_ID'] == visitor_id].tolist()[0]
    homeTeamGames = gamesTeamPlayed[indTeamId]
    oppTeamGames = gamesTeamPlayed[indOppTeamId]

    # The lists hold GAME_IDs, so they are searched by game_id (not by the row
    # index of the game). Slicing from the next position skips the current game:
    # including it would leak its outcome and make the -1 branch unreachable.
    previousGames1 = homeTeamGames[homeTeamGames.index(game_id) + 1:]
    previousGames2 = set(oppTeamGames[oppTeamGames.index(game_id) + 1:])
    commonGames = [gid for gid in previousGames1 if gid in previousGames2]
    if len(commonGames) == 0:
        return -1

    wins = 0
    for common_id in commonGames:
        # Translate the GAME_ID into its row label before reading the result.
        row = games.index[games['GAME_ID'] == common_id].tolist()[0]
        home_won = games['HOME_TEAM_WINS'][row]
        # Count the game for home_id whichever side it played on that day.
        if games['HOME_TEAM_ID'][row] == home_id:
            wins = wins + home_won
        else:
            wins = wins + (1 - home_won)
    return wins / len(commonGames)


In [None]:
# Adds, for every entry in games_details, the distance from the team's previous
# game, the opposing team and the team's recent winrate.
# (The original version, which called DiffOppWin three times per row, took about
# 20 minutes on a laptop according to the authors.)

# Start from empty-string columns so that they keep the object dtype while the
# per-row values are written in.
games_details['DATE_DIFF'] = ''
games_details['OPPOSING_TEAM'] = ''
games_details['WINRATE'] = ''

# DiffOppWin depends only on (GAME_ID, TEAM_ID), and every player of a team in
# a game shares that pair: compute each pair once and reuse the result.
team_game_features = {}
for i in range(len(games_details['DATE_DIFF'])):
    pair = (games_details['GAME_ID'][i], games_details['TEAM_ID'][i])
    if pair not in team_game_features:
        team_game_features[pair] = DiffOppWin(pair[0], pair[1])
    date_diff, opposing_team, winrate = team_game_features[pair]
    games_details.loc[i,'DATE_DIFF'] = date_diff
    games_details.loc[i,'OPPOSING_TEAM'] = opposing_team
    games_details.loc[i,'WINRATE'] = winrate

# TODO: add the average three-pointers here (using the fact that averageThreePointers
# is ordered like game_details with respect to each player)
# REMEMBER TO REMOVE THE ROWS WITH -1

In [None]:
# Preview the first 10 rows of games_details.
games_details.head(10)

In [None]:
# Prints True when every row of games has GAME_STATUS_TEXT == 'Final', i.e. the
# column carries no information and can be dropped.
print(games[games['GAME_STATUS_TEXT']=='Final'].equals(games)) # so no useful information, Final is the content of each cell
games = games.drop(columns=['GAME_STATUS_TEXT'])

# complete_games_details comes from utils (star import); presumably it adds
# columns from games to games_details (OPPOSING_TEAM_ID is read right below) —
# TODO confirm against utils.
games_details = complete_games_details(games_details, games)

# NOTE(review): the two astype(str) results below are discarded, so both columns
# keep their original dtype. Assigning them back would change what dropna() and
# the later label encoding see, so decide deliberately before "fixing" this.
games_details['TEAM_ID'].astype(str)
games_details['OPPOSING_TEAM_ID'].astype(str) # intent: turn the ids into strings so that they are
# handled by the encoder instead of being treated as huge, close-together numbers

ratio_missing_values_df(games_details) # print percentage of rows in which there is at least one
# missing value
games_details = games_details.dropna() # drop all the rows with nans
games_details = games_details.reset_index(drop=True) # adjust the indexing

games_details.head()

In [None]:
# Preview the first 60 rows of games_details after the merge and the NaN removal.
games_details.head(60)

# *Note*:
When the dataset is incomplete, the functions I wrote do not work correctly, because dropping rows can remove the information that links one dataset to another. As a result, there are many missing values.

In [None]:
# Sanity check: the opponent computed by DiffOppWin must agree, row by row,
# with the OPPOSING_TEAM_ID column.
opposing_matches = games_details['OPPOSING_TEAM'] == games_details['OPPOSING_TEAM_ID']
print(opposing_matches.all())

In [None]:
# OPPOSING_TEAM_ID was compared with OPPOSING_TEAM in the previous cell; keep only one of the two.
games_details = games_details.drop(columns=['OPPOSING_TEAM_ID'])

In [None]:
# Preview the first 60 rows of the table that is about to be saved.
games_details.head(60)

In [None]:
# Persist the completed table.
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first
# column by default; pass index=False if that column is not wanted on reload.
games_details.to_csv('../dataset/dataset_completo.csv')

# Fourth Dataset: "teams.csv"

Looking at the dataset, we notice that some arena capacity values are missing. We decided to fill them in with values found through a Google search:\
-Smoothie King Center: 17,805 seats;\
-Barclays Center: 17,732 seats;\
-Wells Fargo Center: 20,318 seats;\
-Talking Stick Resort Arena: 17,071 seats;

Moreover, the Amway Center capacity appears to be wrong, because its value is 0. So, we correct it:\
-Amway Center: 18,846 seats

In [None]:
# Load players.csv and print its schema; the bare `players` on the last line
# renders the frame as the cell output.
players = pd.read_csv('../dataset/players.csv')
players.info()
players

In [None]:
# Manually set arena capacities by row label of `teams`.
# NOTE(review): ARENACAPACITY is among the columns dropped from `teams` in an
# earlier cell, so when the notebook runs top to bottom these assignments create
# a new ARENACAPACITY column that is NaN everywhere else. This cell probably
# belongs before that drop.
# The row labels are presumably those of the arenas listed in the markdown
# above — verify against teams.csv.
teams.loc[2, 'ARENACAPACITY'] = 17805.0
teams.loc[12, 'ARENACAPACITY'] = 17732.0
teams.loc[14, 'ARENACAPACITY'] = 18846.0
teams.loc[16, 'ARENACAPACITY'] = 20318.0
teams.loc[17, 'ARENACAPACITY'] = 17071.0

# Fifth Dataset: 'players.csv'

# Learning phase

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
# label encoder e non one-hot encoding per evitare di aumentare di molto
# dimensionalità del dataset e perché le date e simili hanno effettivamente un ordine

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

We will evaluate the performance of the regression algorithms via k-fold cross-validation.
Before doing that, we choose the hyperparameters by means of hyperparameter tuning (the tuning for RF is actually more or less unnecessary, because the trend as $n_{trees}$ varies is already known and the optimal $p$ is already available; but since we were at it, we tuned them anyway).

To understand the importance of each variable in the decision process, Gini importance is not as effective as feature ablation. However, the latter is not implemented in scikit-learn, so we use Gini importance as our measure of variable importance.

In [None]:
# Fixed seed so that the 5% subsample, and therefore every model fitted on it,
# is drawn from the same rows on each run.
SAMPLE_SEED = 42

games_details = games_details.drop(columns=['GAME_ID'])
games_details = games_details.sample(frac=0.05, random_state=SAMPLE_SEED)

# Integer-encode every object-dtype column; apply() refits the same encoder on
# each column in turn, so the codes are independent per column.
categorical_columns = games_details.select_dtypes(include=['object']).columns
label_encoder = LabelEncoder()
games_details[list(categorical_columns)] = games_details[list(categorical_columns)].apply(label_encoder.fit_transform)

games_details = games_details.dropna()

# FG3M is the regression target; every other remaining column is a feature.
X = games_details.loc[:, games_details.columns != 'FG3M']
y = games_details['FG3M']

# One row per (model, fold) is appended to this frame by the evaluation loops.
reg_metrics = pd.DataFrame(columns=['Model', 'MAE', 'MSE'])

In [None]:
# Number of cross-validation folds, reused by the grid searches and evaluation loops.
num_folds = 8

num_features = X.shape[1]
# One third of the features (rounded up), offered as a max_features candidate
# next to 'sqrt' and 'log2'.
p = int(np.ceil(num_features / 3))
rf_param_grid = {
    'n_estimators': [50, 100, 500],
    'max_features': ['sqrt', 'log2', p] 
}

rf_regressor = RandomForestRegressor()
scaler1 = StandardScaler()
# NOTE(review): the scaler is fitted on all rows of X, so any cross-validation
# run on X_scaled sees statistics computed from its own validation folds.
X_scaled = scaler1.fit_transform(X) # original author's note: this would not be legitimate

# The random forest itself is tuned on the unscaled X.
grid_search = GridSearchCV(rf_regressor, rf_param_grid, cv=num_folds, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

rf_best_params = grid_search.best_params_
print(rf_best_params)

In [None]:
# Shuffled k-fold evaluation of the tuned random forest. Each fold appends one
# 'RF' row (MAE, MSE) to reg_metrics and one row of feature importances to
# gini_importances.
rf_kfolds = KFold(n_splits=num_folds, shuffle=True)
gini_importances = pd.DataFrame(columns=X.columns)

for fold, (train_idx, test_idx) in enumerate(rf_kfolds.split(X)):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # The scaler is fitted on the training rows only and then applied to both parts.
    scaler = StandardScaler().fit(X.iloc[train_idx])
    X_train = scaler.transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])

    rf_regressor = RandomForestRegressor(
        n_estimators=rf_best_params['n_estimators'],
        max_features=rf_best_params['max_features'],
    )
    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)

    reg_metrics.loc[len(reg_metrics)] = [
        'RF',
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
    ]
    gini_importances.loc[fold] = rf_regressor.feature_importances_

In [None]:
# Baseline: the same shuffled k-fold protocol with a DummyRegressor. Each fold
# appends one 'DUMMY' row (MAE, MSE) to reg_metrics.
dummy_kfolds = KFold(n_splits=num_folds, shuffle=True)

for train_idx, test_idx in dummy_kfolds.split(X):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # The scaler is fitted on the training rows only and then applied to both parts.
    scaler = StandardScaler().fit(X.iloc[train_idx])
    X_train = scaler.transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])

    dummy_regressor = DummyRegressor()
    dummy_regressor.fit(X_train, y_train)
    y_dummy_pred = dummy_regressor.predict(X_test)

    reg_metrics.loc[len(reg_metrics)] = [
        'DUMMY',
        mean_absolute_error(y_test, y_dummy_pred),
        mean_squared_error(y_test, y_dummy_pred),
    ]

In [None]:
# Hyperparameter search for k-nearest neighbours.
knn_param_grid = {
    'n_neighbors': [1, 5, 10],
    'p': [1, 2]
}

knn_regressor = KNeighborsRegressor()

# Scaling is done inside the pipeline, so GridSearchCV refits the scaler on the
# training part of every split. Searching on a matrix scaled with all rows
# would let validation-fold statistics leak into training.
knn_pipeline = Pipeline([('scaler', StandardScaler()), ('knn', knn_regressor)])
knn_pipeline_grid = {f'knn__{name}': values for name, values in knn_param_grid.items()}

grid_search = GridSearchCV(knn_pipeline, knn_pipeline_grid, cv=num_folds, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Strip the 'knn__' step prefix so the result keeps the plain KNeighborsRegressor
# parameter names ('n_neighbors', 'p') that the evaluation cell reads.
knn_best_params = {name.split('__', 1)[1]: value for name, value in grid_search.best_params_.items()}
print(knn_best_params)

In [None]:
# Shuffled k-fold evaluation of the tuned k-nearest-neighbours regressor. Each
# fold appends one 'KNN' row (MAE, MSE) to reg_metrics.
knn_kfolds = KFold(n_splits=num_folds, shuffle=True)

for train_idx, test_idx in knn_kfolds.split(X):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # The scaler is fitted on the training rows only and then applied to both parts.
    scaler = StandardScaler().fit(X.iloc[train_idx])
    X_train = scaler.transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])

    knn_regressor = KNeighborsRegressor(
        n_neighbors=knn_best_params['n_neighbors'],
        p=knn_best_params['p'],
    )
    knn_regressor.fit(X_train, y_train)
    y_pred = knn_regressor.predict(X_test)

    reg_metrics.loc[len(reg_metrics)] = [
        'KNN',
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
    ]

In [None]:
# Hyperparameter search for support vector regression.
svm_param_grid = {
    'C': [0.1, 1, 5],
    'gamma': [0.01, 0.1, 1]
}

sv_regressor = SVR()

# Scaling is done inside the pipeline, so GridSearchCV refits the scaler on the
# training part of every split. Searching on a matrix scaled with all rows
# would let validation-fold statistics leak into training.
svm_pipeline = Pipeline([('scaler', StandardScaler()), ('svr', sv_regressor)])
svm_pipeline_grid = {f'svr__{name}': values for name, values in svm_param_grid.items()}

grid_search_svm = GridSearchCV(svm_pipeline, svm_pipeline_grid, cv=num_folds, scoring='neg_mean_squared_error')
grid_search_svm.fit(X, y)

# Strip the 'svr__' step prefix so the result keeps the plain SVR parameter
# names ('C', 'gamma') that the evaluation cell reads.
svm_best_params = {name.split('__', 1)[1]: value for name, value in grid_search_svm.best_params_.items()}
print(svm_best_params)

In [None]:
# Shuffled k-fold evaluation of the tuned support vector regressor. Each fold
# appends one 'SVM' row (MAE, MSE) to reg_metrics.
sv_kfolds = KFold(n_splits=num_folds, shuffle=True)

for train_idx, test_idx in sv_kfolds.split(X):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # The scaler is fitted on the training rows only and then applied to both parts.
    scaler = StandardScaler().fit(X.iloc[train_idx])
    X_train = scaler.transform(X.iloc[train_idx])
    X_test = scaler.transform(X.iloc[test_idx])

    sv_regressor = SVR(C=svm_best_params['C'], gamma=svm_best_params['gamma'])
    sv_regressor.fit(X_train, y_train)
    y_pred = sv_regressor.predict(X_test)

    reg_metrics.loc[len(reg_metrics)] = [
        'SVM',
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
    ]

In [None]:
print(reg_metrics)

# One box per model, MAE on the left panel and MSE on the right one.
model_palette = sns.color_palette('Paired')[1::2]
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
for metric, panel in zip(('MAE', 'MSE'), axs):
    sns.boxplot(data=reg_metrics, x=metric, y="Model", hue='Model', ax=panel, palette=model_palette)

# The right panel repeats the model names of the left one: blank them out.
axs[1].set_yticklabels('')
axs[1].set_ylabel('')
plt.tight_layout()
plt.show()

In [None]:
print(gini_importances)

# Long format: one row per (fold, feature) pair, feature name in 'Column'.
melted_gini = gini_importances.melt(var_name='Column')

# One box per feature, showing how its importance varies across the folds.
gini_fig, gini_ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='Column', y='value', data=melted_gini, hue='Column', palette='Set3', ax=gini_ax)

gini_ax.set_title('Boxplot of Gini importances')
gini_ax.set_xlabel('Columns')
gini_ax.set_ylabel('Values')
gini_ax.legend(title='Column')
gini_ax.grid(True)
plt.show()