In [538]:
import matplotlib.pyplot as plt
import pandas as pd

In [539]:
df_game_base = pd.read_csv('game.csv')
df_game_base = df_game_base.dropna(how='all', axis=0, subset=['wl_home'])
# on garde que la "Regular Season"
df_game = df_game_base[df_game_base['season_type'].isin(['Regular Season'])]
# on garde que les années supérieures à 2010
df_game= df_game[df_game['season_id'] >= 21989]
# on garde que les colonnes suivantes : 'season_id', 'team_name_home', 'wl_home', 'team_name_away', 'wl_away'
df_game = df_game.loc[:,['season_id', 'team_name_home', 'wl_home', 'team_name_away', 'wl_away']]
df_game

Unnamed: 0,season_id,team_name_home,wl_home,team_name_away,wl_away
23329,21989,Phoenix Suns,W,Golden State Warriors,L
23330,21989,Dallas Mavericks,L,Los Angeles Lakers,W
23331,21989,Seattle SuperSonics,W,Minnesota Timberwolves,L
23332,21989,Detroit Pistons,W,New York Knicks,L
23333,21989,Los Angeles Clippers,W,Houston Rockets,L
...,...,...,...,...,...
65537,22022,New York Knicks,L,Indiana Pacers,W
65538,22022,Brooklyn Nets,L,Philadelphia 76ers,W
65539,22022,Chicago Bulls,W,Detroit Pistons,L
65540,22022,Toronto Raptors,W,Milwaukee Bucks,L


In [540]:
# conversion de season_id en année
def supprimer_premier_2(valeur):
    valeur_str = str(valeur)
    return int(valeur_str.replace('2', '', 1)) if '2' in valeur_str else valeur

df_game['season'] = df_game['season_id'].apply(supprimer_premier_2)
df_game['season'] = pd.to_datetime(df_game['season'], format='%Y').dt.year
df_game

Unnamed: 0,season_id,team_name_home,wl_home,team_name_away,wl_away,season
23329,21989,Phoenix Suns,W,Golden State Warriors,L,1989
23330,21989,Dallas Mavericks,L,Los Angeles Lakers,W,1989
23331,21989,Seattle SuperSonics,W,Minnesota Timberwolves,L,1989
23332,21989,Detroit Pistons,W,New York Knicks,L,1989
23333,21989,Los Angeles Clippers,W,Houston Rockets,L,1989
...,...,...,...,...,...,...
65537,22022,New York Knicks,L,Indiana Pacers,W,2022
65538,22022,Brooklyn Nets,L,Philadelphia 76ers,W,2022
65539,22022,Chicago Bulls,W,Detroit Pistons,L,2022
65540,22022,Toronto Raptors,W,Milwaukee Bucks,L,2022


In [541]:
# total des matchs 'away' gagnés par saison
df_new_game_away = df_game.groupby(['team_name_away', 'season'])['wl_away'].value_counts().reset_index(name='total_away')
df_new_game_away = df_new_game_away.sort_values(by=['team_name_away', 'season', 'wl_away'])
# on conserve que les matchs gagnés
df_new_game_away = df_new_game_away.loc[df_new_game_away['wl_away'] == 'W'].reset_index()
df_new_game_away

Unnamed: 0,index,team_name_away,season,wl_away,total_away
0,1,Atlanta Hawks,1989,W,16
1,3,Atlanta Hawks,1990,W,14
2,5,Atlanta Hawks,1991,W,15
3,7,Atlanta Hawks,1992,W,18
4,8,Atlanta Hawks,1993,W,21
...,...,...,...,...,...
958,1917,Washington Wizards,2018,W,10
959,1919,Washington Wizards,2019,W,9
960,1921,Washington Wizards,2020,W,15
961,1923,Washington Wizards,2021,W,14


In [542]:
# total des matchs 'home' gagnés par saison
df_new_game_home = df_game.groupby(['team_name_home', 'season'])['wl_home'].value_counts().reset_index(name='total_home')
df_new_game_home = df_new_game_home.sort_values(by=['team_name_home', 'season', 'wl_home'])
# on conserve que les matchs gagnés
df_new_game_home = df_new_game_home.loc[df_new_game_home['wl_home'] == 'W'].reset_index()
df_new_game_home

Unnamed: 0,index,team_name_home,season,wl_home,total_home
0,0,Atlanta Hawks,1989,W,25
1,2,Atlanta Hawks,1990,W,29
2,4,Atlanta Hawks,1991,W,23
3,6,Atlanta Hawks,1992,W,25
4,8,Atlanta Hawks,1993,W,36
...,...,...,...,...,...
958,1916,Washington Wizards,2018,W,22
959,1919,Washington Wizards,2019,W,16
960,1920,Washington Wizards,2020,W,19
961,1922,Washington Wizards,2021,W,21


In [543]:
df_new_game_home.insert(loc=len(df_new_game_home.columns), column='total_away', value=df_new_game_away['total_away'])
df_new_game_home

Unnamed: 0,index,team_name_home,season,wl_home,total_home,total_away
0,0,Atlanta Hawks,1989,W,25,16
1,2,Atlanta Hawks,1990,W,29,14
2,4,Atlanta Hawks,1991,W,23,15
3,6,Atlanta Hawks,1992,W,25,18
4,8,Atlanta Hawks,1993,W,36,21
...,...,...,...,...,...,...
958,1916,Washington Wizards,2018,W,22,10
959,1919,Washington Wizards,2019,W,16,9
960,1920,Washington Wizards,2020,W,19,15
961,1922,Washington Wizards,2021,W,21,14


In [544]:
colomns_list = ['total_home', 'total_away']
df_new_game_home['somme'] = df_new_game_home[colomns_list].sum(axis=1)
df_new_game_home

Unnamed: 0,index,team_name_home,season,wl_home,total_home,total_away,somme
0,0,Atlanta Hawks,1989,W,25,16,41
1,2,Atlanta Hawks,1990,W,29,14,43
2,4,Atlanta Hawks,1991,W,23,15,38
3,6,Atlanta Hawks,1992,W,25,18,43
4,8,Atlanta Hawks,1993,W,36,21,57
...,...,...,...,...,...,...,...
958,1916,Washington Wizards,2018,W,22,10,32
959,1919,Washington Wizards,2019,W,16,9,25
960,1920,Washington Wizards,2020,W,19,15,34
961,1922,Washington Wizards,2021,W,21,14,35


In [545]:
# dataframe du total des matchs gagnés par équipe par année

df_new_game = df_new_game_home.loc[:,['team_name_home', 'season', 'somme']]
df_new_game['Team'] = df_new_game['team_name_home']
df_new_game = df_new_game.loc[:,['Team', 'season', 'somme']]
df_new_game

Unnamed: 0,Team,season,somme
0,Atlanta Hawks,1989,41
1,Atlanta Hawks,1990,43
2,Atlanta Hawks,1991,38
3,Atlanta Hawks,1992,43
4,Atlanta Hawks,1993,57
...,...,...,...
958,Washington Wizards,2018,32
959,Washington Wizards,2019,25
960,Washington Wizards,2020,34
961,Washington Wizards,2021,35


In [546]:
df_draft_history = pd.read_csv('draft_history.csv')

In [547]:
df_draft_history['Team']  = df_draft_history['team_city'] + ' ' + df_draft_history['team_name']
# on garde que les années supérieures à 2010
df_draft_history= df_draft_history[df_draft_history['season'] >= 1989]
df_draft_history

Unnamed: 0,person_id,player_name,season,round_number,round_pick,overall_pick,draft_type,team_id,team_city,team_name,team_abbreviation,organization,organization_type,player_profile_flag,Team
6012,442,Pervis Ellison,1989,1,1,1,Draft,1610612758,Sacramento,Kings,SAC,Louisville,College/University,1,Sacramento Kings
6013,198,Danny Ferry,1989,1,2,2,Draft,1610612746,Los Angeles,Clippers,LAC,Duke,College/University,1,Los Angeles Clippers
6014,251,Sean Elliott,1989,1,3,3,Draft,1610612759,San Antonio,Spurs,SAN,Arizona,College/University,1,San Antonio Spurs
6015,779,Glen Rice,1989,1,4,4,Draft,1610612748,Miami,Heat,MIA,Michigan,College/University,1,Miami Heat
6016,462,J.R. Reid,1989,1,5,5,Draft,1610612766,Charlotte,Hornets,CHH,North Carolina,College/University,1,Charlotte Hornets
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7985,1641771,Jalen Slawson,2023,2,24,54,Draft,1610612758,Sacramento,Kings,SAC,Furman,College/University,1,Sacramento Kings
7986,1631209,Isaiah Wong,2023,2,25,55,Draft,1610612754,Indiana,Pacers,IND,Miami (FL),College/University,1,Indiana Pacers
7987,1641844,Tarik Biberovic,2023,2,26,56,Draft,1610612763,Memphis,Grizzlies,MEM,Fenerbahce S.K. (Turkey),Other Team/Club,1,Memphis Grizzlies
7988,1631218,Trayce Jackson-Davis,2023,2,27,57,Draft,1610612764,Washington,Wizards,WAS,Indiana,College/University,1,Washington Wizards


In [548]:
df_draft_history = df_draft_history.loc[:, ['player_name', 'season', 'Team', 'overall_pick']]
df_draft_history

Unnamed: 0,player_name,season,Team,overall_pick
6012,Pervis Ellison,1989,Sacramento Kings,1
6013,Danny Ferry,1989,Los Angeles Clippers,2
6014,Sean Elliott,1989,San Antonio Spurs,3
6015,Glen Rice,1989,Miami Heat,4
6016,J.R. Reid,1989,Charlotte Hornets,5
...,...,...,...,...
7985,Jalen Slawson,2023,Sacramento Kings,54
7986,Isaiah Wong,2023,Indiana Pacers,55
7987,Tarik Biberovic,2023,Memphis Grizzlies,56
7988,Trayce Jackson-Davis,2023,Washington Wizards,57


In [549]:
df_common_player_info = pd.read_csv('common_player_info.csv')

In [550]:

df_common_player_info = df_common_player_info.loc[:,['position', 'display_first_last']]
df_common_player_info = df_common_player_info.fillna('other')
df_common_player_info

Unnamed: 0,position,display_first_last
0,Forward,Alaa Abdelnaby
1,Center,Kareem Abdul-Jabbar
2,Forward-Guard,Tariq Abdul-Wahad
3,Forward,Shareef Abdur-Rahim
4,Forward,Tom Abernethy
...,...,...
4166,Forward,Paul Zipser
4167,Center,Ante Zizic
4168,Center,Jim Zoet
4169,Center,Ivica Zubac


In [551]:
df_draft = df_draft_history.merge(df_common_player_info, left_on='player_name', right_on='display_first_last')
df_draft

Unnamed: 0,player_name,season,Team,overall_pick,position,display_first_last
0,Pervis Ellison,1989,Sacramento Kings,1,Center,Pervis Ellison
1,Danny Ferry,1989,Los Angeles Clippers,2,Forward,Danny Ferry
2,Sean Elliott,1989,San Antonio Spurs,3,Forward,Sean Elliott
3,Glen Rice,1989,Miami Heat,4,Forward-Guard,Glen Rice
4,Glen Rice,1989,Miami Heat,4,Guard-Forward,Glen Rice
...,...,...,...,...,...,...
1468,Vince Williams Jr.,2022,Memphis Grizzlies,47,Guard,Vince Williams Jr.
1469,Kendall Brown,2022,Minnesota Timberwolves,48,Guard,Kendall Brown
1470,Isaiah Mobley,2022,Cleveland Cavaliers,49,Forward,Isaiah Mobley
1471,Tyrese Martin,2022,Golden State Warriors,51,Guard,Tyrese Martin


In [552]:
# dataframe des joueurs avec leur position et numéro de draft
df_draft = df_draft.loc[:, ['overall_pick', 'season', 'position', 'Team']]
df_draft

Unnamed: 0,overall_pick,season,position,Team
0,1,1989,Center,Sacramento Kings
1,2,1989,Forward,Los Angeles Clippers
2,3,1989,Forward,San Antonio Spurs
3,4,1989,Forward-Guard,Miami Heat
4,4,1989,Guard-Forward,Miami Heat
...,...,...,...,...
1468,47,2022,Guard,Memphis Grizzlies
1469,48,2022,Guard,Minnesota Timberwolves
1470,49,2022,Forward,Cleveland Cavaliers
1471,51,2022,Guard,Golden State Warriors


In [553]:
merged_df = pd.merge(df_new_game, df_draft, on=['Team', 'season'], how='outer')
merged_df

Unnamed: 0,Team,season,somme,overall_pick,position
0,Atlanta Hawks,1989,41.0,23.0,Guard
1,Atlanta Hawks,1989,41.0,49.0,Guard
2,Atlanta Hawks,1990,43.0,10.0,Guard
3,Atlanta Hawks,1990,43.0,36.0,Forward
4,Atlanta Hawks,1990,43.0,41.0,Guard
...,...,...,...,...,...
1624,Washington Wizards,2019,25.0,9.0,Forward
1625,Washington Wizards,2020,34.0,37.0,Guard
1626,Washington Wizards,2021,35.0,15.0,Forward
1627,Washington Wizards,2022,35.0,10.0,Guard


In [554]:
merged_df['somme'] = merged_df['somme'].interpolate()
merged_df['overall_pick'] = merged_df['overall_pick'].fillna(61)
merged_df.loc[merged_df['overall_pick'] == 61, 'position'] = 'Nothing'
merged_df

Unnamed: 0,Team,season,somme,overall_pick,position
0,Atlanta Hawks,1989,41.0,23.0,Guard
1,Atlanta Hawks,1989,41.0,49.0,Guard
2,Atlanta Hawks,1990,43.0,10.0,Guard
3,Atlanta Hawks,1990,43.0,36.0,Forward
4,Atlanta Hawks,1990,43.0,41.0,Guard
...,...,...,...,...,...
1624,Washington Wizards,2019,25.0,9.0,Forward
1625,Washington Wizards,2020,34.0,37.0,Guard
1626,Washington Wizards,2021,35.0,15.0,Forward
1627,Washington Wizards,2022,35.0,10.0,Guard


In [555]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

In [556]:
features_list = ['Team', 'season', 'overall_pick', 'position' ]

X = merged_df.loc[:,features_list] 
y = merged_df.loc[:,"somme"] 

In [557]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0)

In [558]:
numeric_features = [1, 2] # Choose which column index we are going to scale
numeric_transformer = StandardScaler()

categorical_features = [0, 3 ]
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop='first')


# Apply ColumnTransformer to create a pipeline that will apply the above preprocessing
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),    
        ('num', numeric_transformer, numeric_features)
        ]
    )

X_train = feature_encoder.fit_transform(X_train)

In [559]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [560]:
y_train_pred = regressor.predict(X_train)

In [561]:
X_test = feature_encoder.transform(X_test)

In [562]:
y_test_pred = regressor.predict(X_test)

In [563]:
y_test_pred = regressor.predict(X_test)

In [564]:
mse = mean_squared_error(y_test, y_test_pred)

In [565]:
print("R2 score on training set : ", regressor.score(X_train, y_train))
print("R2 score on test set : ", regressor.score(X_test, y_test))
print("MSE est de :", mse)

R2 score on training set :  0.19064129651902095
R2 score on test set :  0.1477105478312809
MSE est de : 150.74759185372642


In [566]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [567]:
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)

In [568]:
y_pred_gb = gb_model.predict(X_test)

In [569]:
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print("MSE :", mse_gb)
print("Score R2 :", r2_gb)

MSE : 124.42615013828278
Score R2 : 0.29652544340662124


In [570]:
regressor.coef_

array([  6.23351132,  -1.98787751, -10.3381452 ,  -0.84843595,
         2.45256253,  -0.3357962 ,  -1.42126848,  -3.9999266 ,
        -0.70717414,  -2.03296024,   3.96302186,   3.57979846,
         9.00750847,  -5.07603061,   3.3911289 ,   1.51562492,
         0.26482969,   0.68903044, -10.76291737,  -3.27932464,
         3.66609383,  -2.97586414,   1.32385129,  -0.81576479,
         3.0558474 ,  -0.86359989,  -1.70835494,   3.95878058,
         6.19980648,  -4.75329049,   9.2318263 ,   1.47544216,
         0.36008014,   8.28615713, -22.82812484,  -9.66366615,
        -5.19588207,  -0.33538561,  -0.54250573,  -1.26535424,
        -3.41567296,  -0.41407546,   1.03439762,  -0.15835232,
        -4.76517732,  -0.92094076,   2.29647042])