In [424]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier

In [425]:
merged_df = pd.read_csv('dataset_NBA_cat.csv')
merged_df = merged_df.dropna()
merged_df

Unnamed: 0.1,Unnamed: 0,season,Team,win_match,position,total_pick
0,0,2000,Atlanta Hawks,C,Guard,40.0
1,1,2000,Boston Celtics,E,Guard,59.0
2,2,2000,Boston Celtics,E,Guard-Forward,56.0
3,3,2000,Charlotte Hornets,G,Nothing,1.0
4,4,2000,Chicago Bulls,B,Forward,34.0
...,...,...,...,...,...,...
998,998,2022,San Antonio Spurs,C,Forward,43.0
999,999,2022,Toronto Raptors,F,Center-Forward,3.0
1000,1000,2022,Utah Jazz,E,Forward,39.0
1001,1001,2022,Utah Jazz,E,Guard,9.0


In [426]:
features_list = ['Team', 'season', 'total_pick', 'position' ]

X = merged_df.loc[:,features_list]
y = merged_df.loc[:,"win_match"]

In [427]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0)

In [428]:
X_test = X_test.reset_index().drop(columns=["index"])
X_test_phoenix = X_test[X_test["Team"]=="Phoenix Suns"]
display(X_test_phoenix)

Unnamed: 0,Team,season,total_pick,position
59,Phoenix Suns,2005,53.0,Forward-Center
74,Phoenix Suns,2022,30.0,Forward-Guard
75,Phoenix Suns,2021,31.0,Forward-Guard
103,Phoenix Suns,2000,20.0,Guard-Forward
120,Phoenix Suns,2011,47.0,Center
156,Phoenix Suns,2020,58.0,Guard-Forward
157,Phoenix Suns,2002,53.0,Guard-Forward
184,Phoenix Suns,2022,56.0,Guard-Forward


In [429]:
X_test_phoenix.index.tolist()

[59, 74, 75, 103, 120, 156, 157, 184]

In [430]:
numeric_features = ['season', 'total_pick'] # Choose which column index we are going to scale
numeric_transformer = StandardScaler()

categorical_features = ['Team', 'position']
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop='first')


# Apply ColumnTransformer to create a pipeline that will apply the above preprocessing
feature_encoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),    
        ('num', numeric_transformer, numeric_features)
        ]
    )

X_train = feature_encoder.fit_transform(X_train)

In [431]:
regressor = LogisticRegression()
regressor.fit(X_train, y_train)

In [432]:
y_train_pred = regressor.predict(X_train)
X_test2 = feature_encoder.transform(X_test)
X_test2



<188x45 sparse matrix of type '<class 'numpy.float64'>'
	with 718 stored elements in Compressed Sparse Row format>

In [433]:
y_test_pred = regressor.predict(X_test2)
y_test_pred

array(['F', 'F', 'G', 'F', 'C', 'D', 'D', 'G', 'G', 'D', 'F', 'H', 'E',
       'F', 'C', 'F', 'C', 'H', 'E', 'C', 'G', 'B', 'F', 'C', 'H', 'F',
       'D', 'C', 'G', 'C', 'C', 'F', 'G', 'C', 'C', 'G', 'E', 'C', 'G',
       'E', 'G', 'F', 'B', 'C', 'I', 'D', 'F', 'B', 'C', 'G', 'F', 'F',
       'F', 'C', 'B', 'C', 'G', 'G', 'B', 'C', 'E', 'E', 'I', 'F', 'C',
       'D', 'F', 'G', 'H', 'G', 'D', 'D', 'I', 'C', 'E', 'E', 'G', 'D',
       'F', 'F', 'D', 'F', 'I', 'I', 'G', 'G', 'F', 'G', 'H', 'F', 'F',
       'D', 'G', 'H', 'B', 'F', 'F', 'G', 'D', 'G', 'G', 'H', 'F', 'J',
       'E', 'C', 'F', 'F', 'E', 'B', 'D', 'F', 'H', 'D', 'H', 'I', 'E',
       'F', 'G', 'F', 'G', 'C', 'J', 'H', 'D', 'G', 'E', 'C', 'F', 'D',
       'F', 'B', 'B', 'F', 'H', 'F', 'C', 'G', 'H', 'G', 'F', 'F', 'I',
       'F', 'G', 'C', 'D', 'G', 'D', 'C', 'G', 'H', 'I', 'G', 'H', 'C',
       'D', 'J', 'F', 'F', 'F', 'C', 'C', 'G', 'F', 'F', 'F', 'F', 'F',
       'C', 'F', 'I', 'F', 'G', 'F', 'G', 'F', 'E', 'G', 'J', 'H

In [434]:
# mse = mean_squared_error(y_test, y_test_pred)

In [435]:
print("R2 score on training set : ", regressor.score(X_train, y_train))
print("R2 score on test set : ", regressor.score(X_test2, y_test))
print("MSE est de :", mse)

R2 score on training set :  0.3391188251001335
R2 score on test set :  0.19148936170212766
MSE est de : 137.31815806306238


In [436]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [437]:
# gb_model = GradientBoostingRegressor(random_state=42)
# gb_model.fit(X_train, y_train)

In [438]:
# y_pred_gb = gb_model.predict(X_test2)

In [439]:
# mse_gb = mean_squared_error(y_test, y_pred_gb)
# r2_gb = r2_score(y_test, y_pred_gb)
# print("MSE :", mse_gb)
# print("Score R2 :", r2_gb)

In [440]:
regressor.coef_

array([[-1.87339413e-01, -6.19404607e-02,  1.74840133e+00,
        -4.79227345e-02, -1.09936184e-01, -1.25424347e-01,
        -1.10036127e-01, -9.99292568e-02, -1.26887152e-01,
        -1.46693066e-01, -1.04359810e-01, -9.06464685e-02,
        -2.34019211e-02, -6.54195291e-02, -8.38396448e-02,
        -9.74388937e-02, -7.93688384e-02, -1.23227122e-01,
        -1.09263545e-01, -4.76866976e-02, -2.52224222e-02,
        -5.03786438e-02, -5.73571735e-03, -7.37013376e-02,
        -8.16057512e-02, -1.00735952e-01,  1.27288606e+00,
        -1.28922007e-01, -7.85677569e-02, -9.77679051e-02,
        -8.40190389e-02, -3.17378953e-02, -9.07409483e-02,
        -1.02481618e-01, -9.76546988e-02, -1.60863785e-01,
        -2.38132269e-02, -2.05211955e-01, -1.03017626e-01,
        -8.39206531e-02, -3.08607834e-01, -4.92551640e-01,
        -3.95697661e-02,  2.41469564e-01, -7.61701717e-02],
       [-9.51699390e-01,  2.57049366e-01,  3.21090049e-01,
        -3.54297235e-01, -1.63502874e-01,  9.39471167e-

In [456]:
# on crée notre prédiction pour une équipe Team	season	win_match	overall_pick	position
data_dict = {"Team":["Sacramento Kings"], "season":[2023], "total_pick":[61], "position":["Forward-Center"]}
data_to_pred = pd.DataFrame(data_dict)
data_to_pred

Unnamed: 0,Team,season,total_pick,position
0,Sacramento Kings,2023,61,Forward-Center


In [457]:
data_to_pred_encoded = feature_encoder.transform(data_to_pred)
data_to_pred_encoded

<1x45 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [458]:
pred = regressor.predict(data_to_pred_encoded)
pred[0]

'C'