In [247]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [248]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [249]:
train_data.head(1000)

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs,Id
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,1.6,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0,0
1,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,0.9,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0,3
2,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,1.3,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0,4
3,Don MacLean,62,10.9,6.6,2.5,5.8,43.5,0.0,0.1,50.0,1.5,1.8,81.1,0.5,1.4,2.0,0.6,0.2,0.1,0.7,1.0,6
4,Tracy Murray,48,10.3,5.7,2.3,5.4,41.5,0.4,1.5,30.0,0.7,0.8,87.5,0.8,0.9,1.7,0.2,0.2,0.1,0.7,1.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,Lee Mayberry,82,18.3,5.2,2.1,4.6,45.6,0.5,1.3,39.1,0.5,0.8,57.4,0.3,1.1,1.4,3.3,0.7,0.1,1.0,1.0,1331
939,Chris Smith,80,15.8,4.3,1.6,3.6,43.3,0.0,0.2,14.3,1.2,1.5,79.2,0.4,0.8,1.2,2.5,0.6,0.2,0.8,0.0,1335
940,Brent Price,68,12.6,3.9,1.5,4.1,35.8,0.1,0.7,16.7,0.8,1.0,79.4,0.4,1.1,1.5,2.3,0.8,0.0,1.3,1.0,1336
941,Litterial Green,52,12.0,4.5,1.7,3.8,43.9,0.0,0.2,10.0,1.2,1.8,62.5,0.2,0.4,0.7,2.2,0.4,0.1,0.8,1.0,1338


In [250]:
train_data.sample(5)

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs,Id
215,Dell Curry,67,9.5,4.9,2.1,4.9,42.6,0.3,0.9,28.3,0.4,0.6,78.9,0.4,0.7,1.2,0.9,0.4,0.1,0.7,1.0,307
697,Terence Morris,68,16.3,3.8,1.6,4.3,38.4,0.2,1.1,19.2,0.3,0.4,64.3,1.1,2.0,3.1,0.9,0.3,0.4,0.7,0.0,1009
423,Orlando Johnson,51,12.1,4.0,1.4,3.5,40.0,0.7,1.8,38.3,0.5,0.6,71.9,0.4,1.8,2.2,0.9,0.2,0.2,0.6,0.0,615
289,Dale Ellis,67,15.8,8.2,3.4,7.4,45.6,0.2,0.4,41.4,1.3,1.8,71.9,1.6,2.1,3.7,0.8,0.6,0.1,1.2,1.0,421
278,Doc Rivers,81,23.9,9.3,3.1,6.7,46.2,0.0,0.1,16.7,3.1,4.0,78.5,0.9,1.8,2.7,3.9,1.6,0.4,2.1,1.0,408


In [251]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         943 non-null    object 
 1   GP           943 non-null    int64  
 2   MIN          943 non-null    float64
 3   PTS          943 non-null    float64
 4   FGM          943 non-null    float64
 5   FGA          943 non-null    float64
 6   FG%          943 non-null    float64
 7   3P Made      943 non-null    float64
 8   3PA          943 non-null    float64
 9   3P%          937 non-null    float64
 10  FTM          943 non-null    float64
 11  FTA          943 non-null    float64
 12  FT%          943 non-null    float64
 13  OREB         943 non-null    float64
 14  DREB         943 non-null    float64
 15  REB          943 non-null    float64
 16  AST          943 non-null    float64
 17  STL          943 non-null    float64
 18  BLK          943 non-null    float64
 19  TOV     

In [252]:
X = train_data.drop(['Name', 'Id', 'TARGET_5Yrs'], axis=1)  # Features
y = train_data['TARGET_5Yrs']  # Target variable

imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

print(X_scaled.shape)

(943, 19)


In [253]:
logistic_model = LogisticRegression(max_iter=10)
logistic_model.fit(X_scaled, y)

logistic_model.coef_


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[ 0.58272665, -0.52963914,  0.10064125,  0.05659237,  0.04604831,
         0.18451327,  0.21766915, -0.10841743,  0.11834039,  0.11409168,
         0.04731141,  0.19067162,  0.71429696, -0.31816695,  0.07118146,
         0.49372266, -0.06543358,  0.28773088, -0.11356678]])

In [254]:
y_pred = logistic_model.predict(X_scaled)
accuracy = accuracy_score(y, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 71.69%


In [255]:
data_test = test_data.drop(['Name', 'Id'], axis=1)
data_test_imputed = imputer.transform(data_test)
data_test_scaled = scaler.transform(data_test_imputed)

In [256]:
logistic_model.predict(data_test_scaled)

array([0., 1., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
       1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0.,
       0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0.,
       0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1.,
       1., 1., 1., 1., 1.