# Logestic Regression Model - PRUNE

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from config import db_password
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [2]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/wnba_champs"
engine = create_engine(db_string)

In [3]:
stats_df = pd.read_sql("select * from \"stats\"", db_string);
stats_df.head()

Unnamed: 0,index,Position,Season,Tm,Age,W,L,Win_pct,PW,PL,...,3PAr,TS_pct,eFG_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,0,1,1997-98,HOC,28.8,18,10,0.6429,22,6,...,0.292,0.523,0.473,20.4,36.2,0.266,0.444,22.3,64.7,0.236
1,1,0,1997-98,PHM,27.9,16,12,0.5714,20,8,...,0.247,0.473,0.411,18.8,33.2,0.274,0.441,24.2,68.5,0.245
2,2,0,1997-98,NYL,27.5,17,11,0.6071,17,11,...,0.198,0.482,0.439,21.3,34.6,0.236,0.422,22.7,64.7,0.246
3,3,0,1997-98,LAS,25.5,14,14,0.5,17,11,...,0.136,0.504,0.464,20.4,32.7,0.235,0.423,19.4,67.9,0.263
4,4,0,1997-98,CLR,28.1,15,13,0.5357,16,12,...,0.108,0.515,0.464,21.6,32.2,0.276,0.449,20.2,67.1,0.219


In [4]:
test_df= pd.read_sql("select * from \"cy_stats\"", db_string)
test_df.head()

Unnamed: 0,index,Position,Season,Tm,Age,W,L,Win_pct,PW,PL,...,3PAr,TS_pct,eFG_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,0,0,2021-22,COS,28.3,26,6,0.8125,28,4,...,0.298,0.543,0.497,15.9,31.2,0.22,0.459,16.1,82.1,0.201
1,1,1,2021-22,LVA,26.4,24,8,0.75,26,6,...,0.192,0.559,0.508,12.6,21.3,0.255,0.464,12.5,80.0,0.165
2,2,0,2021-22,SES,28.1,21,11,0.6563,22,10,...,0.328,0.556,0.512,13.9,20.9,0.206,0.483,14.4,78.6,0.18
3,3,0,2021-22,MLY,27.6,22,10,0.6875,21,11,...,0.296,0.553,0.508,15.8,22.3,0.218,0.476,14.6,79.7,0.194
4,4,0,2021-22,PHM,28.4,19,13,0.5938,19,13,...,0.332,0.553,0.507,14.7,23.8,0.235,0.466,11.3,75.1,0.167


In [5]:
# Pull out Season, TM and Team_Name into DF
name_df = stats_df[['Season', 'Tm',]]
name_df.head()

Unnamed: 0,Season,Tm
0,1997-98,HOC
1,1997-98,PHM
2,1997-98,NYL
3,1997-98,LAS
4,1997-98,CLR


In [6]:
#list(stats_df.columns)

In [7]:
all_features_df = stats_df
all_features_df.head()

Unnamed: 0,index,Position,Season,Tm,Age,W,L,Win_pct,PW,PL,...,3PAr,TS_pct,eFG_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,0,1,1997-98,HOC,28.8,18,10,0.6429,22,6,...,0.292,0.523,0.473,20.4,36.2,0.266,0.444,22.3,64.7,0.236
1,1,0,1997-98,PHM,27.9,16,12,0.5714,20,8,...,0.247,0.473,0.411,18.8,33.2,0.274,0.441,24.2,68.5,0.245
2,2,0,1997-98,NYL,27.5,17,11,0.6071,17,11,...,0.198,0.482,0.439,21.3,34.6,0.236,0.422,22.7,64.7,0.246
3,3,0,1997-98,LAS,25.5,14,14,0.5,17,11,...,0.136,0.504,0.464,20.4,32.7,0.235,0.423,19.4,67.9,0.263
4,4,0,1997-98,CLR,28.1,15,13,0.5357,16,12,...,0.108,0.515,0.464,21.6,32.2,0.276,0.449,20.2,67.1,0.219


In [8]:
stats_df.head()

Unnamed: 0,index,Position,Season,Tm,Age,W,L,Win_pct,PW,PL,...,3PAr,TS_pct,eFG_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,0,1,1997-98,HOC,28.8,18,10,0.6429,22,6,...,0.292,0.523,0.473,20.4,36.2,0.266,0.444,22.3,64.7,0.236
1,1,0,1997-98,PHM,27.9,16,12,0.5714,20,8,...,0.247,0.473,0.411,18.8,33.2,0.274,0.441,24.2,68.5,0.245
2,2,0,1997-98,NYL,27.5,17,11,0.6071,17,11,...,0.198,0.482,0.439,21.3,34.6,0.236,0.422,22.7,64.7,0.246
3,3,0,1997-98,LAS,25.5,14,14,0.5,17,11,...,0.136,0.504,0.464,20.4,32.7,0.235,0.423,19.4,67.9,0.263
4,4,0,1997-98,CLR,28.1,15,13,0.5357,16,12,...,0.108,0.515,0.464,21.6,32.2,0.276,0.449,20.2,67.1,0.219


In [9]:
# Drop un needed columns
all_features_df.drop(['index',
 'Season',
 'Tm',
 'W',
 'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 #'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 #'TS_pct',
 #'TOV_pct',
 #'ORB_pct',
 #'FT_FGA',
 #'Opp_eFG_pct',
 #'Opp_TS_pct',
 #'Opp_TOV_pct',
 #'Opp_ORB_pct',
 #'Opp_FT_FGA'
              ], axis=1, inplace=True)

In [10]:
all_features_df.head()

Unnamed: 0,Position,Age,Win_pct,PW,PL,MOV,SOS,SRS,ORtg,DRtg,...,3PAr,TS_pct,eFG_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,1,28.8,0.6429,22,6,6.32,-0.79,5.53,99.4,90.7,...,0.292,0.523,0.473,20.4,36.2,0.266,0.444,22.3,64.7,0.236
1,0,27.9,0.5714,20,8,4.0,-0.48,3.38,91.4,86.1,...,0.247,0.473,0.411,18.8,33.2,0.274,0.441,24.2,68.5,0.245
2,0,27.5,0.6071,17,11,2.39,-0.32,2.22,90.6,87.4,...,0.198,0.482,0.439,21.3,34.6,0.236,0.422,22.7,64.7,0.246
3,0,25.5,0.5,17,11,2.21,-0.23,1.63,93.9,91.1,...,0.136,0.504,0.464,20.4,32.7,0.235,0.423,19.4,67.9,0.263
4,0,28.1,0.5357,16,12,1.79,-0.22,1.56,94.4,92.0,...,0.108,0.515,0.464,21.6,32.2,0.276,0.449,20.2,67.1,0.219


In [11]:
# Remove Position target from features data
y = all_features_df.Position
X = all_features_df.drop(columns=["Position"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)


In [12]:
feature_names = X.columns

In [13]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [14]:
# Create a logistic regression model
classifier_AF = LogisticRegression(max_iter=1000)
classifier_AF

LogisticRegression(max_iter=1000)

In [15]:
# Fit (train) our model by using the training data
classifier_AF.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000)

In [16]:
# Validate the model by using the test data
print(f"Logistic Regression Training Data Score: {classifier_AF.score(X_train_scaled, y_train)}")
print(f"Logistic Regression Testing Data Score: {classifier_AF.score(X_test_scaled, y_test)}")

Logistic Regression Training Data Score: 0.9282700421940928
Logistic Regression Testing Data Score: 0.9240506329113924


All features score = 0.92405

Find accuracy using top 10 RFE features

In [17]:
stats_df = pd.read_sql("select * from \"stats\"", db_string);
stats_df.head()

Unnamed: 0,index,Position,Season,Tm,Age,W,L,Win_pct,PW,PL,...,3PAr,TS_pct,eFG_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,0,1,1997-98,HOC,28.8,18,10,0.6429,22,6,...,0.292,0.523,0.473,20.4,36.2,0.266,0.444,22.3,64.7,0.236
1,1,0,1997-98,PHM,27.9,16,12,0.5714,20,8,...,0.247,0.473,0.411,18.8,33.2,0.274,0.441,24.2,68.5,0.245
2,2,0,1997-98,NYL,27.5,17,11,0.6071,17,11,...,0.198,0.482,0.439,21.3,34.6,0.236,0.422,22.7,64.7,0.246
3,3,0,1997-98,LAS,25.5,14,14,0.5,17,11,...,0.136,0.504,0.464,20.4,32.7,0.235,0.423,19.4,67.9,0.263
4,4,0,1997-98,CLR,28.1,15,13,0.5357,16,12,...,0.108,0.515,0.464,21.6,32.2,0.276,0.449,20.2,67.1,0.219


In [19]:
# Drop un needed columns
stats_df.drop(['index',
 'Season',
 'Tm',
 'W',
 'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 #'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 #'TS_pct',
 #'TOV_pct',
 #'ORB_pct',
 #'FT_FGA',
 #'Opp_eFG_pct',
 #'Opp_TS_pct',
 #'Opp_TOV_pct',
 #'Opp_ORB_pct',
 #'Opp_FT_FGA'
              ], axis=1, inplace=True)

In [20]:
# Remove Position target from features data
y = stats_df.Position
X = stats_df.drop(columns=["Position"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [21]:
# https://towardsdatascience.com/a-look-into-feature-importance-in-logistic-regression-models-a4aa970f9b0f
from sklearn.feature_selection import RFE

predictors = X_train
selector = RFE(classifier_AF, n_features_to_select = 1)
selector = selector.fit(predictors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [22]:
order = selector.ranking_
order

array([17,  1,  5, 16, 13,  2,  4,  8,  9,  3, 14,  7, 12, 20, 22, 15, 18,
        6, 21, 10, 11, 19])

In [23]:
order_df = pd.DataFrame(order, columns = ['importance'])
order_df.head()

Unnamed: 0,importance
0,17
1,1
2,5
3,16
4,13


In [24]:
feature_importance = pd.DataFrame(feature_names, columns = ["feature"])
feature_importance.head()

Unnamed: 0,feature
0,Age
1,Win_pct
2,PW
3,PL
4,MOV


In [25]:
feature_imp_df = order_df.join(feature_importance)
feature_imp_df

Unnamed: 0,importance,feature
0,17,Age
1,1,Win_pct
2,5,PW
3,16,PL
4,13,MOV
5,2,SOS
6,4,SRS
7,8,ORtg
8,9,DRtg
9,3,NRtg


In [26]:
rfe_df = stats_df
rfe_df.head()

Unnamed: 0,Position,Age,Win_pct,PW,PL,MOV,SOS,SRS,ORtg,DRtg,...,3PAr,TS_pct,eFG_pct,TOV_pct,ORB_pct,FT_FGA,Opp_eFG_pct,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,1,28.8,0.6429,22,6,6.32,-0.79,5.53,99.4,90.7,...,0.292,0.523,0.473,20.4,36.2,0.266,0.444,22.3,64.7,0.236
1,0,27.9,0.5714,20,8,4.0,-0.48,3.38,91.4,86.1,...,0.247,0.473,0.411,18.8,33.2,0.274,0.441,24.2,68.5,0.245
2,0,27.5,0.6071,17,11,2.39,-0.32,2.22,90.6,87.4,...,0.198,0.482,0.439,21.3,34.6,0.236,0.422,22.7,64.7,0.246
3,0,25.5,0.5,17,11,2.21,-0.23,1.63,93.9,91.1,...,0.136,0.504,0.464,20.4,32.7,0.235,0.423,19.4,67.9,0.263
4,0,28.1,0.5357,16,12,1.79,-0.22,1.56,94.4,92.0,...,0.108,0.515,0.464,21.6,32.2,0.276,0.449,20.2,67.1,0.219


In [28]:
# Drop un needed columns
rfe_df.drop([#'index',
 #'Season',
 #'Tm',
 #'G',
 #'W',
 #'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 'TS_pct',
 'TOV_pct',
 'ORB_pct',
 #'FT_FGA',
 'Opp_eFG_pct',
 #'Opp_TOV_pct',
 #'Opp_FT_FGA'
              ], axis=1, inplace=True)

In [29]:
rfe_df.head()

Unnamed: 0,Position,Age,Win_pct,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,FTr,3PAr,eFG_pct,FT_FGA,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,1,28.8,0.6429,22,6,6.32,-0.79,5.53,99.4,90.7,8.7,0.357,0.292,0.473,0.266,22.3,64.7,0.236
1,0,27.9,0.5714,20,8,4.0,-0.48,3.38,91.4,86.1,5.3,0.359,0.247,0.411,0.274,24.2,68.5,0.245
2,0,27.5,0.6071,17,11,2.39,-0.32,2.22,90.6,87.4,3.2,0.354,0.198,0.439,0.236,22.7,64.7,0.246
3,0,25.5,0.5,17,11,2.21,-0.23,1.63,93.9,91.1,2.8,0.348,0.136,0.464,0.235,19.4,67.9,0.263
4,0,28.1,0.5357,16,12,1.79,-0.22,1.56,94.4,92.0,2.4,0.382,0.108,0.464,0.276,20.2,67.1,0.219


In [30]:
stats_df.head()

Unnamed: 0,Position,Age,Win_pct,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,FTr,3PAr,eFG_pct,FT_FGA,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,1,28.8,0.6429,22,6,6.32,-0.79,5.53,99.4,90.7,8.7,0.357,0.292,0.473,0.266,22.3,64.7,0.236
1,0,27.9,0.5714,20,8,4.0,-0.48,3.38,91.4,86.1,5.3,0.359,0.247,0.411,0.274,24.2,68.5,0.245
2,0,27.5,0.6071,17,11,2.39,-0.32,2.22,90.6,87.4,3.2,0.354,0.198,0.439,0.236,22.7,64.7,0.246
3,0,25.5,0.5,17,11,2.21,-0.23,1.63,93.9,91.1,2.8,0.348,0.136,0.464,0.235,19.4,67.9,0.263
4,0,28.1,0.5357,16,12,1.79,-0.22,1.56,94.4,92.0,2.4,0.382,0.108,0.464,0.276,20.2,67.1,0.219


In [31]:
# Remove Position target from features data
y = rfe_df.Position
X = rfe_df.drop(columns=["Position"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [32]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [33]:
# Create a logistic regression model
classifier_RFE = LogisticRegression(max_iter=1000)
classifier_RFE

LogisticRegression(max_iter=1000)

In [34]:
# Fit (train) our model by using the training data
classifier_RFE.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=1000)

In [35]:
# Validate the model by using the test data
print(f"Logistic Regression with RFE features only Training Data Score: {classifier_RFE.score(X_train_scaled, y_train)}")
print(f"Logistic Regression with RFE features only Testing Data Score: {classifier_RFE.score(X_test_scaled, y_test)}")

Logistic Regression with RFE features only Training Data Score: 0.9324894514767933
Logistic Regression with RFE features only Testing Data Score: 0.9240506329113924


RFE features score = 0.92405

In [36]:
# Pull out Season, TM and Team_Name into DF
test_name_df = test_df[['Season', 'Tm']]
test_name_df.head()

Unnamed: 0,Season,Tm
0,2021-22,COS
1,2021-22,LVA
2,2021-22,SES
3,2021-22,MLY
4,2021-22,PHM


In [39]:
test_df_pos= test_df['Position']
test_df.drop(['index',
 'Position',
 'Season',
 'Tm',
 'W',
 'L',
 #'Win_pct',
 #'MOV',
 #'SOS',
 #'SRS',
 'Pace',
 #'ORtg',
 #'DRtg',
 #'eFG_pct',
 'TS_pct',
 'TOV_pct',
 'ORB_pct',
 #'FT_FGA',
 'Opp_eFG_pct',
 #'Opp_TOV_pct',
 #'Opp_FT_FGA'
             ], axis=1, inplace=True)
test_df

Unnamed: 0,Age,Win_pct,PW,PL,MOV,SOS,SRS,ORtg,DRtg,NRtg,FTr,3PAr,eFG_pct,FT_FGA,Opp_TOV_pct,opp_DRB%,Opp_FT_FGA
0,28.3,0.8125,28,4,9.81,-0.95,8.86,106.9,93.7,13.2,0.27,0.298,0.497,0.22,16.1,82.1,0.201
1,26.4,0.75,26,6,9.06,-0.62,8.44,109.1,98.0,11.1,0.312,0.192,0.508,0.255,12.5,80.0,0.165
2,28.1,0.6563,22,10,4.44,-0.63,3.8,106.3,100.7,5.6,0.246,0.328,0.512,0.206,14.4,78.6,0.18
3,27.6,0.6875,21,11,4.0,-0.37,3.63,104.2,99.1,5.1,0.26,0.296,0.508,0.218,14.6,79.7,0.194
4,28.4,0.5938,19,13,2.56,-0.12,2.45,106.5,103.2,3.3,0.295,0.332,0.507,0.235,11.3,75.1,0.167
5,28.4,0.5,18,14,1.31,-0.22,1.1,102.9,101.3,1.6,0.243,0.304,0.493,0.205,15.9,73.9,0.212
6,24.8,0.4375,15,17,-0.53,0.3,-0.23,104.3,105.0,-0.7,0.233,0.327,0.477,0.193,14.1,77.6,0.206
7,28.7,0.375,11,21,-3.84,0.26,-3.58,101.5,106.4,-4.9,0.251,0.396,0.475,0.203,15.3,76.4,0.208
8,27.5,0.375,10,22,-4.34,0.55,-3.79,93.7,99.3,-5.6,0.237,0.306,0.462,0.181,19.2,71.4,0.267
9,27.4,0.25,9,23,-5.63,0.24,-5.38,99.3,106.4,-7.1,0.214,0.269,0.459,0.154,16.8,73.7,0.234


In [40]:
df_X_test_scaled = scaler.fit_transform(test_df)

In [41]:
df_test_df=classifier_RFE.predict(df_X_test_scaled)
df_test_df

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [42]:
cy_test_df = pd.DataFrame(df_test_df, columns = ['Position'])
cy_test_df.head()

Unnamed: 0,Position
0,0
1,1
2,0
3,0
4,0


In [43]:
joined_df = test_name_df.join(cy_test_df)
joined_df

Unnamed: 0,Season,Tm,Position
0,2021-22,COS,0
1,2021-22,LVA,1
2,2021-22,SES,0
3,2021-22,MLY,0
4,2021-22,PHM,0
5,2021-22,CSK,0
6,2021-22,DAW,0
7,2021-22,WAM,0
8,2021-22,LAS,0
9,2021-22,ATD,0


In [44]:
classifier_RFE.predict_proba(df_X_test_scaled)

array([[5.97526798e-01, 4.02473202e-01],
       [4.00212807e-01, 5.99787193e-01],
       [8.34604477e-01, 1.65395523e-01],
       [8.56750660e-01, 1.43249340e-01],
       [8.73400949e-01, 1.26599051e-01],
       [9.66584545e-01, 3.34154554e-02],
       [9.89550066e-01, 1.04499343e-02],
       [9.95954190e-01, 4.04580957e-03],
       [9.98656634e-01, 1.34336639e-03],
       [9.98937430e-01, 1.06256990e-03],
       [9.97730228e-01, 2.26977174e-03],
       [9.99371088e-01, 6.28911500e-04]])