# NCAAM match predictions

## Imports

In [1]:
import os
import re
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

## Import data

In [5]:
path_datasets = 'data/MDataFiles_Stage1/'

df_regular_compact = pd.read_csv(path_datasets + 'MRegularSeasonCompactResults.csv')
df_regular_detailed = pd.read_csv(path_datasets + 'MRegularSeasonDetailedResults.csv')
df_teams = pd.read_csv(path_datasets + 'MTeams.csv')
df_seeds = pd.read_csv(path_datasets + 'MNCAATourneySeeds.csv')
coaches = pd.read_csv(path_datasets + 'MTeamCoaches.csv')
df_tourney_compact = pd.read_csv(path_datasets + 'MNCAATourneyCompactResults.csv')
df_tourney_detailed = pd.read_csv(path_datasets + 'MNCAATourneyDetailedResults.csv')

In [7]:
df_regular_compact.describe()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,NumOT
count,161552.0,161552.0,161552.0,161552.0,161552.0,161552.0,161552.0
mean,2003.228731,74.665359,1286.919004,76.795125,1282.637498,64.692526,0.045137
std,10.05866,33.670352,104.52163,12.06626,104.742961,11.34961,0.249751
min,1985.0,0.0,1101.0,34.0,1101.0,20.0,0.0
25%,1995.0,47.0,1198.0,69.0,1191.0,57.0,0.0
50%,2004.0,77.0,1284.0,76.0,1280.0,64.0,0.0
75%,2012.0,103.0,1380.0,84.0,1375.0,72.0,0.0
max,2019.0,132.0,1466.0,186.0,1466.0,150.0,6.0


### Check data sanity

In [9]:
print('Regular Season compact Results :\n', df_regular_compact.isnull().sum().T)
print("-"*10)

print('Regular Season detailed Results :\n', df_regular_detailed.isnull().sum().T)
print("-"*10)

print('Teamsts :\n', df_teams.isnull().sum().T)
print("-"*10)

print('Seeds :\n', df_seeds.isnull().sum().T)
print("-"*10)

print('Coaches :\n', coaches.isnull().sum().T)
print("-"*10)

print('Tourney compact Results :\n', df_tourney_compact.isnull().sum().T)
print("-"*10)

print('Tourney compact Results :\n', df_tourney_detailed .isnull().sum().T)
print("-"*10)

Regular Season compact Results :
 Season     0
DayNum     0
WTeamID    0
WScore     0
LTeamID    0
LScore     0
WLoc       0
NumOT      0
dtype: int64
----------
Regular Season detailed Results :
 Season     0
DayNum     0
WTeamID    0
WScore     0
LTeamID    0
LScore     0
WLoc       0
NumOT      0
WFGM       0
WFGA       0
WFGM3      0
WFGA3      0
WFTM       0
WFTA       0
WOR        0
WDR        0
WAst       0
WTO        0
WStl       0
WBlk       0
WPF        0
LFGM       0
LFGA       0
LFGM3      0
LFGA3      0
LFTM       0
LFTA       0
LOR        0
LDR        0
LAst       0
LTO        0
LStl       0
LBlk       0
LPF        0
dtype: int64
----------
Teamsts :
 TeamID           0
TeamName         0
FirstD1Season    0
LastD1Season     0
dtype: int64
----------
Seeds :
 Season    0
Seed      0
TeamID    0
dtype: int64
----------
Coaches :
 Season         0
TeamID         0
FirstDayNum    0
LastDayNum     0
CoachName      0
dtype: int64
----------
Tourney compact Results :
 Season    

### Concat match details in one dataframe

In [14]:
# add match type
df_regular_detailed["Type"] = "regular"
df_tourney_detailed["Type"] = "tourney"

# We join the data from the regular matchs and tourney matchs
df_match_detailed = pd.concat([df_regular_detailed, df_tourney_detailed])

# We create an idea for a match between two teams : team1_team2(team1 < team2)
df_match_detailed["Match"] = df_match_detailed \
                                .apply(lambda row: "_".join(map(str, sorted([row["WTeamID"], row["LTeamID"]]))), axis=1)

df_match_detailed["Team1"] = df_match_detailed["Match"].apply(lambda x: int(x.split("_")[0]))
df_match_detailed["Team2"] = df_match_detailed["Match"].apply(lambda x: int(x.split("_")[1]))

df_match_detailed["Label"] = df_match_detailed.apply(lambda row: 1 if row["WTeamID"] == row["Team1"] else 0, axis=1)

# WLoc to numeric type
df_match_detailed["WLoc"] = df_match_detailed["WLoc"].map({"H": 0, "A": 1, "N": 3})
# Type to numeric type
df_match_detailed["Type"] = df_match_detailed["Type"].map({"regular": 0, "tourney": 1})

## Feature engineering

Features initiales Luc:

* % de victoires
* % de victoires à domicile et à l'exterieur
* Nombre de points marqués et encaissés en moyenne
* Nombre d'interceptions et de rebonds
* % de réussites à 3pts
* Nombre de lancers francs tentés/marqués

Features à ajouter possiblement:

* L'équipe joue à domicile ?
* % de victoire contre une équipe de rang équivalent (à 1 ou deux seed près?)

Le travail se répartie entre les différentes personnes du groupe (je prend les trois dernières)

### Nombre d'interceptions et de rebonds par match et par saison

Pour créer ces paramètres nous avons besoin des dataframes suivant :

Comptés commme une interception : blocks, steals

Comptés comme rebond : offensive_rebounds, defensive_rebounds

In [16]:
df_match_detailed.head()

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,WFGM,WFGA,...,LAst,LTO,LStl,LBlk,LPF,Type,Match,Team1,Team2,Label
0,2003,10,1104,68,1328,62,3,0,27,58,...,8,18,9,2,20,0,1104_1328,1104,1328,1
1,2003,10,1272,70,1393,63,3,0,26,62,...,7,12,8,6,16,0,1272_1393,1272,1393,1
2,2003,11,1266,73,1437,61,3,0,24,58,...,9,12,2,5,23,0,1266_1437,1266,1437,1
3,2003,11,1296,56,1457,50,3,0,18,38,...,9,19,4,3,23,0,1296_1457,1296,1457,1
4,2003,11,1400,77,1208,71,3,0,30,61,...,12,10,7,1,14,0,1208_1400,1208,1400,0


In [18]:
df_match_detailed.columns

Index(['Season', 'DayNum', 'WTeamID', 'WScore', 'LTeamID', 'LScore', 'WLoc',
       'NumOT', 'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR',
       'WAst', 'WTO', 'WStl', 'WBlk', 'WPF', 'LFGM', 'LFGA', 'LFGM3', 'LFGA3',
       'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF',
       'Type', 'Match', 'Team1', 'Team2', 'Label'],
      dtype='object')

#### Nombre de rebond en moyenne par match de chaque équipe sur chaque saison

In [38]:
# studied_parameter = "DR"
# data = df_match_detailed[["Season", "WTeamID", "LTeamID", "WDR" , "LDR"]].values
# winners_rebound_count = np.zeros(data.shape[0])
# losers_rebound_count = np.zeros(data.shape[0])
# for i, match in enumerate(data):
#     season = match[0]
#     winner = match[1]
#     loser = match[2]
    
#     previous_seasons = data[data[:,0] < season]
#     winner_previous_wins = previous_seasons[previous_seasons[:,1] == winner]
#     winner_previous_loses = previous_seasons[previous_seasons[:,2] == winner]
#     loser_previous_wins = previous_seasons[previous_seasons[:,1] == loser]
#     loser_previous_loses = previous_seasons[previous_seasons[:,2] == loser]
    
#     winner_rebound_count = np.sum(winner_previous_wins) + np.sum(winner_previous_loses)
#     loser_rebound_count = np.sum(loser_previous_wins) + np.sum(winner_previous_loses)
    
#     winners_rebound_count[i] = winner_rebound_count
#     losers_rebound_count[i] = loser_rebound_count

In [89]:
df_tmp = df_match_detailed[["Season", "WTeamID", "LTeamID", "WDR" , "LDR", "WOR", 'LOR']]
df_rebound_count_metric = pd.DataFrame(columns=["Season", "TeamID", "rebound_count"])
for season in df_tmp.Season.unique():
    df_season = df_tmp.loc[df_metric.Season == season]
    for team in df_teams.TeamID.unique():
        rebound_count_win = df_season[["WDR", "WOR"]].loc[df_season.WTeamID == team].mean().sum()
        rebound_count_lose = df_season[["LDR", "LOR"]].loc[df_season.LTeamID == team].mean().sum()
        new_line = pd.DataFrame({"Season":[season], 
                                 "TeamID":[team], 
                                 "rebound_count":[rebound_count_win + rebound_count_lose]})                                                                 
        df_rebound_count_metric = df_rebound_count_metric.append(new_line, ignore_index=True)

In [87]:
df_rebound_count_metric

Unnamed: 0,Season,TeamID,rebound_count
0,2003,1101,0.000000
1,2003,1102,42.541667
2,2003,1103,59.489011
3,2003,1104,73.274510
4,2003,1105,75.639098
5,2003,1106,72.902564
6,2003,1107,60.476190
7,2003,1108,72.710526
8,2003,1109,0.000000
9,2003,1110,66.178571


#### Nombre de d'intercéptions en moyenne par match de chaque équipe sur chaque saison

In [91]:
df_tmp = df_match_detailed[["Season", "WTeamID", "LTeamID", "WBlk" , "LBlk", "WStl", 'LStl']]
df_intercept_count_metric = pd.DataFrame(columns=["Season", "TeamID", "intercept_count"])
for season in df_tmp.Season.unique():
    df_season = df_tmp.loc[df_metric.Season == season]
    for team in df_teams.TeamID.unique():
        intercept_count_win = df_season[["WBlk", "WStl"]].loc[df_season.WTeamID == team].mean().sum()
        intercept_count_lose = df_season[["LBlk", "LStl"]].loc[df_season.LTeamID == team].mean().sum()
        new_line = pd.DataFrame({"Season":[season], 
                                 "TeamID":[team], 
                                 "intercept_count":[intercept_count_win + intercept_count_lose]})                                                                 
        df_intercept_count_metric = df_intercept_count_metric.append(new_line, ignore_index=True)

In [93]:
df_intercept_count_metric.shape

(6239, 3)

### Pourcentage de réussites à 3 points par match et par saison

In [101]:
df_tmp = df_match_detailed[["Season", "WTeamID", "LTeamID", "WFGM3" , "LFGM3", "WFGA3", 'LFGA3']]
df_3_points_rate_metric = pd.DataFrame(columns=["Season", "TeamID", "3_points_rate"])
for season in df_tmp.Season.unique():
    df_season = df_tmp.loc[df_metric.Season == season]
    for team in df_teams.TeamID.unique():
        
        three_points_made_count_win = df_season["WFGM3"].loc[df_season.WTeamID == team].mean()
        three_points_made_count_lose = df_season["LFGM3"].loc[df_season.LTeamID == team].mean()
        
        three_points_attempted_count_win = df_season["WFGA3"].loc[df_season.WTeamID == team].mean()
        three_points_attempted_count_lose = df_season["LFGA3"].loc[df_season.LTeamID == team].mean()
        
        three_points_rate = (three_points_made_count_win + three_points_made_count_lose)/ (three_points_attempted_count_win + three_points_attempted_count_lose)
        new_line = pd.DataFrame({"Season":[season], 
                                 "TeamID":[team], 
                                 "3_points_rate":[three_points_rate]})                                                                 
        df_3_points_rate_metric = df_3_points_rate_metric.append(new_line, ignore_index=True)

In [102]:
df_3_points_rate_metric

Unnamed: 0,Season,TeamID,3_points_rate
0,2003,1101,
1,2003,1102,0.386952
2,2003,1103,0.340024
3,2003,1104,0.318412
4,2003,1105,0.379061
5,2003,1106,0.347425
6,2003,1107,0.365027
7,2003,1108,0.327446
8,2003,1109,
9,2003,1110,0.374904
