In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import recall_score, precision_score, balanced_accuracy_score, f1_score, classification_report, roc_auc_score

# CLASSIFICATION NBA PLAYERS

In [5]:
df = pd.read_csv("all_seasons.csv")
df

Unnamed: 0.1,Unnamed: 0,player_name,team_abbreviation,age,player_height,player_weight,college,country,draft_year,draft_round,...,pts,reb,ast,net_rating,oreb_pct,dreb_pct,usg_pct,ts_pct,ast_pct,season
0,0,Chris Robinson,VAN,23.0,195.58,90.71840,Western Kentucky,USA,1996,2,...,4.6,1.7,1.6,-11.4,0.039,0.088,0.155,0.486,0.156,1996-97
1,1,Matt Fish,MIA,27.0,210.82,106.59412,North Carolina-Wilmington,USA,1992,2,...,0.3,0.8,0.0,-15.1,0.143,0.267,0.265,0.333,0.000,1996-97
2,2,Matt Bullard,HOU,30.0,208.28,106.59412,Iowa,USA,Undrafted,Undrafted,...,4.5,1.6,0.9,0.9,0.016,0.115,0.151,0.535,0.099,1996-97
3,3,Marty Conlon,BOS,29.0,210.82,111.13004,Providence,USA,Undrafted,Undrafted,...,7.8,4.4,1.4,-9.0,0.083,0.152,0.167,0.542,0.101,1996-97
4,4,Martin Muursepp,DAL,22.0,205.74,106.59412,,USA,1996,1,...,3.7,1.6,0.5,-14.5,0.109,0.118,0.233,0.482,0.114,1996-97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9556,9556,Elijah Millsap,PHX,29.0,200.66,97.52228,Alabama-Birmingham,USA,Undrafted,Undrafted,...,1.5,3.0,0.5,-1.8,0.125,0.150,0.199,0.190,0.067,2016-17
9557,9557,Elfrid Payton,ORL,23.0,193.04,83.91452,Louisiana-Lafayette,USA,2014,1,...,12.8,4.7,6.5,-2.7,0.039,0.136,0.213,0.520,0.329,2016-17
9558,9558,Edy Tavares,CLE,25.0,220.98,120.20188,,Cabo Verde,2014,2,...,4.0,5.5,0.5,15.2,0.160,0.241,0.137,0.633,0.071,2016-17
9559,9559,Willy Hernangomez,NYK,23.0,210.82,108.86208,,Spain,2015,2,...,8.2,7.0,1.3,-2.8,0.142,0.263,0.202,0.564,0.116,2016-17


In [7]:
df.isnull().sum()

Unnamed: 0           0
player_name          0
team_abbreviation    0
age                  0
player_height        0
player_weight        0
college              0
country              0
draft_year           0
draft_round          0
draft_number         0
gp                   0
pts                  0
reb                  0
ast                  0
net_rating           0
oreb_pct             0
dreb_pct             0
usg_pct              0
ts_pct               0
ast_pct              0
season               0
dtype: int64

In [9]:
df_new = df[['player_name','team_abbreviation','age','player_height','player_weight','pts','reb']]
df_new = df_new.rename(columns={"player_name":"name","team_abbreviation":"club","player_height":"height","player_weight":"weight"})
df_new

Unnamed: 0,name,club,age,height,weight,pts,reb
0,Chris Robinson,VAN,23.0,195.58,90.71840,4.6,1.7
1,Matt Fish,MIA,27.0,210.82,106.59412,0.3,0.8
2,Matt Bullard,HOU,30.0,208.28,106.59412,4.5,1.6
3,Marty Conlon,BOS,29.0,210.82,111.13004,7.8,4.4
4,Martin Muursepp,DAL,22.0,205.74,106.59412,3.7,1.6
...,...,...,...,...,...,...,...
9556,Elijah Millsap,PHX,29.0,200.66,97.52228,1.5,3.0
9557,Elfrid Payton,ORL,23.0,193.04,83.91452,12.8,4.7
9558,Edy Tavares,CLE,25.0,220.98,120.20188,4.0,5.5
9559,Willy Hernangomez,NYK,23.0,210.82,108.86208,8.2,7.0


In [10]:
df_pilih = df_new[(df_new['age']<=25)&(df_new['height']>=180)&(df_new['weight']<=90)&(df_new['pts']>=6)&(df_new['reb']>=3)]
ind_pilih = df_pilih.index.tolist() #index yang masuk kriteria
df_pilih #Ada 191 Pemain Target

Unnamed: 0,name,club,age,height,weight,pts,reb
105,Kerry Kittles,NJN,23.0,195.58,81.192968,16.4,3.9
290,Allen Iverson,PHI,22.0,182.88,74.842680,23.5,4.1
432,Eddie Jones,LAL,25.0,198.12,86.182480,17.2,4.1
659,Kerry Kittles,NJN,24.0,195.58,81.192968,17.2,4.7
835,Allen Iverson,PHI,23.0,182.88,74.842680,22.0,3.7
...,...,...,...,...,...,...,...
9345,Brandon Ingram,LAL,19.0,205.74,86.182480,9.4,4.0
9371,D'Angelo Russell,LAL,21.0,195.58,88.450440,15.6,3.5
9485,Jeremy Lamb,CHA,25.0,195.58,83.914520,9.7,4.3
9522,Dennis Schroder,ATL,23.0,185.42,78.017824,17.9,3.1


In [40]:
df_new['status'] = ['Target' if i in ind_pilih else 'Non-Target' for i in range(len(df_new.index))]
df_new

Unnamed: 0,name,club,age,height,weight,pts,reb,status_en,status
0,Chris Robinson,VAN,23.0,195.58,90.71840,4.6,1.7,0,Non-Target
1,Matt Fish,MIA,27.0,210.82,106.59412,0.3,0.8,0,Non-Target
2,Matt Bullard,HOU,30.0,208.28,106.59412,4.5,1.6,0,Non-Target
3,Marty Conlon,BOS,29.0,210.82,111.13004,7.8,4.4,0,Non-Target
4,Martin Muursepp,DAL,22.0,205.74,106.59412,3.7,1.6,0,Non-Target
...,...,...,...,...,...,...,...,...,...
9556,Elijah Millsap,PHX,29.0,200.66,97.52228,1.5,3.0,0,Non-Target
9557,Elfrid Payton,ORL,23.0,193.04,83.91452,12.8,4.7,1,Target
9558,Edy Tavares,CLE,25.0,220.98,120.20188,4.0,5.5,0,Non-Target
9559,Willy Hernangomez,NYK,23.0,210.82,108.86208,8.2,7.0,0,Non-Target


### Label Encoder

In [173]:
### Labeling Target
le = LabelEncoder()
df_new1 = df_new #mau di drop kolom statusnya
df_new1['status_en'] = le.fit_transform(df_new1['status']) ####0 bukan target, 1 target
# df_new1 = df_new1.drop(['status'], axis=1)
df_new1.head(10)

Unnamed: 0,name,club,age,height,weight,pts,reb,status_en,status
0,Chris Robinson,VAN,23.0,195.58,90.7184,4.6,1.7,0,Non-Target
1,Matt Fish,MIA,27.0,210.82,106.59412,0.3,0.8,0,Non-Target
2,Matt Bullard,HOU,30.0,208.28,106.59412,4.5,1.6,0,Non-Target
3,Marty Conlon,BOS,29.0,210.82,111.13004,7.8,4.4,0,Non-Target
4,Martin Muursepp,DAL,22.0,205.74,106.59412,3.7,1.6,0,Non-Target
5,Martin Lewis,TOR,22.0,198.12,102.0582,1.6,0.7,0,Non-Target
6,Mark West,CLE,36.0,208.28,111.583632,3.2,2.7,0,Non-Target
7,Mark Strickland,MIA,26.0,208.28,99.79024,2.0,1.2,0,Non-Target
8,Mark Price,GSW,33.0,182.88,81.64656,11.3,2.6,0,Non-Target
9,Mark Jackson,IND,32.0,190.5,83.91452,9.9,4.8,0,Non-Target


### Standarisasi

In [46]:
#Standarisasi
ss = StandardScaler()

In [47]:
std = ss.fit_transform(df_new1.loc[:,'age':'reb'])
std

array([[-0.99674183, -0.57899275, -0.80260413, -0.58938997, -0.74785002],
       [-0.07727165,  1.06312962,  0.43960663, -1.31961423, -1.10871173],
       [ 0.61233098,  0.78944256,  0.43960663, -0.60637193, -0.78794577],
       ...,
       [-0.53700674,  2.15787786,  1.50435871, -0.69128173,  0.7757883 ],
       [-0.99674183,  1.06312962,  0.61706531,  0.02196056,  1.37722448],
       [ 0.61233098,  0.24206844,  0.08468927,  1.29560752,  1.17674575]])

In [48]:
dfStd = pd.DataFrame(std, columns=['age','height','weight','pts','reb'])
dfStd

Unnamed: 0,age,height,weight,pts,reb
0,-0.996742,-0.578993,-0.802604,-0.589390,-0.747850
1,-0.077272,1.063130,0.439607,-1.319614,-1.108712
2,0.612331,0.789443,0.439607,-0.606372,-0.787946
3,0.382463,1.063130,0.794524,-0.045967,0.334735
4,-1.226609,0.515755,0.439607,-0.742228,-0.787946
...,...,...,...,...,...
9556,0.382463,-0.031619,-0.270228,-1.115831,-0.226605
9557,-0.996742,-0.852680,-1.334980,0.803131,0.455022
9558,-0.537007,2.157878,1.504359,-0.691282,0.775788
9559,-0.996742,1.063130,0.617065,0.021961,1.377224


In [49]:
#Check outlier, -2.5 < z-score > 2.5
Std_atas = dfStd[dfStd<2.5]
Std_atas

Unnamed: 0,age,height,weight,pts,reb
0,-0.996742,-0.578993,-0.802604,-0.589390,-0.747850
1,-0.077272,1.063130,0.439607,-1.319614,-1.108712
2,0.612331,0.789443,0.439607,-0.606372,-0.787946
3,0.382463,1.063130,0.794524,-0.045967,0.334735
4,-1.226609,0.515755,0.439607,-0.742228,-0.787946
...,...,...,...,...,...
9556,0.382463,-0.031619,-0.270228,-1.115831,-0.226605
9557,-0.996742,-0.852680,-1.334980,0.803131,0.455022
9558,-0.537007,2.157878,1.504359,-0.691282,0.775788
9559,-0.996742,1.063130,0.617065,0.021961,1.377224


In [50]:
no_outliers = Std_atas[Std_atas>-2.5]
no_outliers

Unnamed: 0,age,height,weight,pts,reb
0,-0.996742,-0.578993,-0.802604,-0.589390,-0.747850
1,-0.077272,1.063130,0.439607,-1.319614,-1.108712
2,0.612331,0.789443,0.439607,-0.606372,-0.787946
3,0.382463,1.063130,0.794524,-0.045967,0.334735
4,-1.226609,0.515755,0.439607,-0.742228,-0.787946
...,...,...,...,...,...
9556,0.382463,-0.031619,-0.270228,-1.115831,-0.226605
9557,-0.996742,-0.852680,-1.334980,0.803131,0.455022
9558,-0.537007,2.157878,1.504359,-0.691282,0.775788
9559,-0.996742,1.063130,0.617065,0.021961,1.377224


In [51]:
no_outliers.isnull().sum()

age        82
height     62
weight     80
pts       209
reb       267
dtype: int64

In [52]:
std_new = outliers.dropna()
std_new.shape

(8948, 5)

### Splitting

In [53]:
std_new['status_en'] = df_new1['status_en']
std_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,age,height,weight,pts,reb,status_en
0,-0.996742,-0.578993,-0.802604,-0.589390,-0.747850,0
1,-0.077272,1.063130,0.439607,-1.319614,-1.108712,0
2,0.612331,0.789443,0.439607,-0.606372,-0.787946,0
3,0.382463,1.063130,0.794524,-0.045967,0.334735,0
4,-1.226609,0.515755,0.439607,-0.742228,-0.787946,0
...,...,...,...,...,...,...
9556,0.382463,-0.031619,-0.270228,-1.115831,-0.226605,0
9557,-0.996742,-0.852680,-1.334980,0.803131,0.455022,1
9558,-0.537007,2.157878,1.504359,-0.691282,0.775788,0
9559,-0.996742,1.063130,0.617065,0.021961,1.377224,0


In [54]:
x = std_new[['age','height','weight','pts','reb']]
y = std_new['status_en']
print(x.shape)
print(y.shape)

(8948, 5)
(8948,)


In [57]:
xtr, xts, ytr, yts = train_test_split(x,y,test_size=.18)
print(xtr.shape)
print(ytr.shape)
print(xts.shape)
print(yts.shape)

(7337, 5)
(7337,)
(1611, 5)
(1611,)


### Hyperparameter Logistic Regression

In [69]:
#parameter logistic regression
penalty = ['l1','l2','elasticnet','none']
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
max_iter = [1,10,50,100,500,1000,5000,100000]
param = {
    'penalty': penalty, 'solver': solver, 'max_iter': max_iter
}
param

{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
 'max_iter': [1, 10, 50, 100, 500, 1000, 5000, 100000]}

In [70]:
# Logistic Regression
lr = LogisticRegression()
lrRS = RandomizedSearchCV(estimator=lr, param_distributions=param, cv=5, n_iter=10)
lrRS

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'max_iter': [1, 10, 50, 100, 500, 1000,
                                                     5000, 100000],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    'no

In [71]:
lrRS.fit(xtr,ytr)

ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)



RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='auto', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'max_iter': [1, 10, 50, 100, 500, 1000,
                                                     5000, 100000],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    'no

In [72]:
lrRS.best_params_

{'solver': 'sag', 'penalty': 'l2', 'max_iter': 1000}

In [75]:
lrNew = LogisticRegression(solver= 'sag', penalty= 'l2', max_iter= 1000)
lrNew.fit(xtr,ytr)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [86]:
lrScore = lrNew.score(xts,yts)
lrScore

0.9813780260707635

### Hyperparameter Decision Tree

In [78]:
#parameter Decision Tree
criterion = ['gini','entropy']
splitter = ['best', 'random']
max_features = ['int','float','auto','sqrt','log2']
param = {
    'criterion': criterion, 'splitter': splitter, 'max_features': max_features
}
param

{'criterion': ['gini', 'entropy'],
 'splitter': ['best', 'random'],
 'max_features': ['int', 'float', 'auto', 'sqrt', 'log2']}

In [79]:
# Decision Tree
dtc = DecisionTreeClassifier()
dtcRS = RandomizedSearchCV(estimator=dtc, param_distributions=param, cv=5, n_iter=10)
dtcRS

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=None,
          

In [80]:
dtcRS.fit(xtr,ytr)

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for max_features. Allowed string values are 'auto', 'sqrt' or 'log2'.

ValueError: Invalid value for 

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=None,
          

In [81]:
dtcRS.best_params_

{'splitter': 'best', 'max_features': 'log2', 'criterion': 'entropy'}

In [82]:
dtcNew = DecisionTreeClassifier(splitter= 'best', max_features= 'log2', criterion= 'entropy')
dtcNew.fit(xtr,ytr)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [85]:
dtcScore = dtcNew.score(xts,yts)
dtcScore

0.9981378026070763

### Hyperparameter KNN

In [103]:
#parameter KNN
n_neighbors = [5,11,15,21,25,31,35,41,45,51,55,61,65,71,75,81,85,87,95]
algorithm = ['auto','ball_tree','kd_tree','brute']
param = {
    'n_neighbors': n_neighbors, 'algorithm': algorithm
    }
param

{'n_neighbors': [5,
  11,
  15,
  21,
  25,
  31,
  35,
  41,
  45,
  51,
  55,
  61,
  65,
  71,
  75,
  81,
  85,
  87,
  95],
 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

In [104]:
knc = KNeighborsClassifier()
kncRS = RandomizedSearchCV(estimator=knc, param_distributions=param, cv=5, n_iter=10)
kncRS

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'n_neighbors': [5, 11, 15, 21, 25, 31,
                                                        35, 41, 45, 51, 55, 61,
                                                        65, 71, 75, 81, 85, 87,
                                                        95]},
                   pr

In [105]:
kncRS.fit(xtr,ytr)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=5,
                                                  p=2, weights='uniform'),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'n_neighbors': [5, 11, 15, 21, 25, 31,
                                                        35, 41, 45, 51, 55, 61,
                                                        65, 71, 75, 81, 85, 87,
                                                        95]},
                   pr

In [106]:
kncRS.best_params_

{'n_neighbors': 5, 'algorithm': 'auto'}

In [107]:
kncNew = KNeighborsClassifier(n_neighbors=5,algorithm='auto')
kncNew.fit(xtr,ytr)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [111]:
kncScore = kncNew.score(xts,yts)
kncScore

0.9888268156424581

### Predict

In [120]:
predLR = lrNew.predict(xts)
predDTC = dtcNew.predict(xts)
predKNN = kncNew.predict(xts)

In [127]:
ypLR = pd.DataFrame(predLR)
ypDTC = pd.DataFrame(predDTC)
ypKNN = pd.DataFrame(predKNN)
ytest = pd.DataFrame(yts)
print(ypLR.shape)
print(ypDTC.shape)
print(ypKNN.shape)
print(ytest.shape)

(1611, 1)
(1611, 1)
(1611, 1)
(1611, 1)


### Evaluation Metrics

In [115]:
print(f'Logistic Regression Score: {lrScore}')
print(f'Decision Tree Score: {dtcScore}')
print(f'KNN Score: {kncScore}')

Logistic Regression Score: 0.9813780260707635
Decision Tree Score: 0.9981378026070763
KNN Score: 0.9888268156424581


In [130]:
print(f"BAS Logistic Regression: {round(balanced_accuracy_score(ytest, ypLR),2)}")
print(f"BAS Decision Tree: {round(balanced_accuracy_score(ytest, ypDTC),2)}")
print(f"BAS KNN: {round(balanced_accuracy_score(ytest, ypKNN),2)}")

BAS Logistic Regression: 0.72
BAS Decision Tree: 0.98
BAS KNN: 0.82


In [133]:
print(f"Precision Score Logistic Regression (+): {precision_score(ytest,ypLR)}")
print(f"Precision Score Decision Tree (+): {precision_score(ytest,ypDTC)}")
print(f"Precision Score KNN (+): {precision_score(ytest,ypKNN)}")

Precision Score Logistic Regression (+): 0.8
Precision Score Decision Tree (+): 0.9772727272727273
Precision Score KNN (+): 0.9354838709677419


In [135]:
print(f"Recall Logistic Regression (+): {recall_score(ytest,ypLR)}")
print(f"Recall Logistic Decision Tree (+): {recall_score(ytest,ypDTC)}")
print(f"Recall Logistic KNN (+): {recall_score(ytest,ypKNN)}")

Recall Logistic Regression (+): 0.4444444444444444
Recall Logistic Decision Tree (+): 0.9555555555555556
Recall Logistic KNN (+): 0.6444444444444445


In [137]:
print(f"F1 Score Logistic Regression (+): {f1_score(ytest, ypLR)}")
print(f"F1 Score Logistic Decision Tree (+): {f1_score(ytest, ypDTC)}")
print(f"F1 Score Logistic KNN (+): {f1_score(ytest, ypKNN)}")

F1 Score Logistic Regression (+): 0.5714285714285714
F1 Score Logistic Decision Tree (+): 0.9662921348314608
F1 Score Logistic KNN (+): 0.763157894736842


In [140]:
print(f'ROC AUC Score Logistic Regression: {round(roc_auc_score(ytest,ypLR),2)}')
print(f'ROC AUC Score Decision Tree: {round(roc_auc_score(ytest,ypDTC),2)}')
print(f'ROC AUC Score KNN: {round(roc_auc_score(ytest,ypKNN),2)}')

ROC AUC Score Logistic Regression: 0.72
ROC AUC Score Decision Tree: 0.98
ROC AUC Score KNN: 0.82


### DARI 3 MODEL TERSEBUT, SETELAH DIANALISA MELALUI BEBERAPA EVALUATION METRICS, METODE DECISION TREE CLASSIFIER UNGGUL DISELURUH NILAI EVALUATION METRICS. MAKA DAPAT DISIMPULKAN BAHWA MODEL TERBAIK ADALAH DECISION TREE CLASSIFIER.


# CLASSIFICATION IBL PLAYERS

In [152]:
dfIBL = pd.read_excel('indo_players.xlsx')
dfIBL = dfIBL.rename(columns={"Avg Points":"Pts","Avg Rebounds":"Reb"})
dfIBL

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Pts,Reb
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5


In [155]:
df_rekom = dfIBL[(dfIBL['Age']<=25)&(dfIBL['Height']>=180)&(dfIBL['Weight']<=90)&(dfIBL['Pts']>=6)&(dfIBL['Reb']>=3)]
ind_rekom = df_rekom.index.tolist() #index yang masuk kriteria
df_rekom #Ada 191 Pemain Target

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Pts,Reb
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5


In [157]:
dfIBL['Status'] = ['Target' if i in ind_rekom else 'Non-Target' for i in range(len(dfIBL.index))]
dfIBL

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Pts,Reb,Status
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6,Target
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3,Target
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3,Non-Target
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4,Non-Target
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2,Non-Target
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10,Non-Target
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5,Target
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2,Non-Target
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8,Non-Target
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5,Target


In [191]:
### Labeling Target
le = LabelEncoder()
df_IBL = dfIBL #mau di drop kolom statusnya
df_IBL['Status_En'] = le.fit_transform(df_IBL['Status']) ####0 bukan target, 1 target
df_IBL = df_IBL.drop(["Status_en","Status"], axis=1)
df_IBL['Status'] = dfIBL['Status']
df_IBL

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Pts,Reb,Status_En,Status
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6,1,Target
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3,1,Target
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3,0,Non-Target
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4,0,Non-Target
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2,0,Non-Target
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10,0,Non-Target
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5,1,Target
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2,0,Non-Target
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8,0,Non-Target
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5,1,Target


### Standardization

In [178]:
std = ss.fit_transform(df_IBL.loc[:,'Age':'Reb'])
std

array([[ 0.63599873,  1.32911403,  1.31632171,  0.14285714,  0.48349378],
       [-0.95399809,  0.25724788,  0.42390021, -0.21428571, -0.72524067],
       [ 0.10599979, -1.24336474, -0.24541591,  1.21428571, -0.72524067],
       [ 1.69599661,  0.25724788, -2.0302589 ,  1.57142857, -0.32232919],
       [-1.48399703, -0.17149859, -0.91473203, -0.57142857, -1.12815215],
       [-0.95399809,  1.54348727,  0.20079484, -0.92857143,  2.09513971],
       [ 1.16599767,  0.90036757,  1.31632171,  0.14285714,  0.0805823 ],
       [ 0.10599979, -1.02899151,  0.64700558, -2.        , -1.12815215],
       [-0.95399809, -1.45773797, -0.91473203,  0.85714286,  1.28931674],
       [ 0.63599873, -0.38587182,  0.20079484, -0.21428571,  0.0805823 ]])

In [179]:
dfStd = pd.DataFrame(std, columns=['age','height','weight','pts','reb'])
dfStd

Unnamed: 0,age,height,weight,pts,reb
0,0.635999,1.329114,1.316322,0.142857,0.483494
1,-0.953998,0.257248,0.4239,-0.214286,-0.725241
2,0.106,-1.243365,-0.245416,1.214286,-0.725241
3,1.695997,0.257248,-2.030259,1.571429,-0.322329
4,-1.483997,-0.171499,-0.914732,-0.571429,-1.128152
5,-0.953998,1.543487,0.200795,-0.928571,2.09514
6,1.165998,0.900368,1.316322,0.142857,0.080582
7,0.106,-1.028992,0.647006,-2.0,-1.128152
8,-0.953998,-1.457738,-0.914732,0.857143,1.289317
9,0.635999,-0.385872,0.200795,-0.214286,0.080582


In [197]:
dfList = df_IBL[['Name','Club']]
dfList = pd.concat([dfList,dfStd],axis=1)
dfList['status_en'] = df_IBL['Status_En']
dfList['status'] = df_IBL['Status']
dfList

Unnamed: 0,Name,Club,age,height,weight,pts,reb,status_en,status
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,0.635999,1.329114,1.316322,0.142857,0.483494,1,Target
1,Reggie Mononimbar,Pelita Jaya Bakrie,-0.953998,0.257248,0.4239,-0.214286,-0.725241,1,Target
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,0.106,-1.243365,-0.245416,1.214286,-0.725241,0,Non-Target
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,1.695997,0.257248,-2.030259,1.571429,-0.322329,0,Non-Target
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,-1.483997,-0.171499,-0.914732,-0.571429,-1.128152,0,Non-Target
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,-0.953998,1.543487,0.200795,-0.928571,2.09514,0,Non-Target
6,Mei Joni,Stapac,1.165998,0.900368,1.316322,0.142857,0.080582,1,Target
7,Vincent Rivaldi Kosasih,Stapac,0.106,-1.028992,0.647006,-2.0,-1.128152,0,Non-Target
8,Hardian Wicaksono,Pacific Caesar Surabaya,-0.953998,-1.457738,-0.914732,0.857143,1.289317,0,Non-Target
9,Brandon Jawato,Louvre Surabaya,0.635999,-0.385872,0.200795,-0.214286,0.080582,1,Target


In [205]:
x = dfList[['age','height','weight','pts','reb']]
y = dfList['status_en']
print(x.shape)
print(y.shape)

(10, 5)
(10,)


### PREDICT

In [206]:
dtsFix = DecisionTreeClassifier(splitter= 'best', max_features= 'log2', criterion= 'entropy')
dtsFix.fit(x,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [213]:
dtsPred = dtsFix.predict(x)
dtsPred

array([1, 1, 0, 0, 0, 0, 1, 0, 0, 1])

In [211]:
levels = {0: 'Non-Target', 1: 'Target'}
dfPred_IBL = df_IBL
dfPred_IBL = df_IBL.drop("Status_En", axis=1)
dfPred_IBL

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Pts,Reb,Status
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6,Target
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3,Target
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3,Non-Target
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4,Non-Target
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2,Non-Target
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10,Non-Target
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5,Target
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2,Non-Target
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8,Non-Target
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5,Target


## FINAL RESULT RECOMMENDATION HUNTING
### - TARGET -> REKRUT
### - NON-TARGET -> TIDAK REKRUT

In [214]:
dfPred_IBL['Status Predict'] = [levels[x] for x in dtsPred]
dfPred_IBL

Unnamed: 0,Name,Club,Country,Age,Height,Weight,Pts,Reb,Status,Status Predict
0,Andakara Prastawa Dyaksa,Pelita Jaya Bakrie,Indonesia,24,190,90,7,6,Target,Target
1,Reggie Mononimbar,Pelita Jaya Bakrie,Indonesia,21,185,86,6,3,Target,Target
2,Hardianus Lakudu,Satria Muda Pertamina Jakarta,Indonesia,23,178,83,10,3,Non-Target,Non-Target
3,Kevin Yonas Sitorus,Satria Muda Pertamina Jakarta,Indonesia,26,185,75,11,4,Non-Target,Non-Target
4,Arki Dikania Wisnu,Satria Muda Pertamina Jakarta,Indonesia,20,183,80,5,2,Non-Target,Non-Target
5,Laurentius Steven Oei,Satria Muda Pertamina Jakarta,Indonesia,21,191,85,4,10,Non-Target,Non-Target
6,Mei Joni,Stapac,Indonesia,25,188,90,7,5,Target,Target
7,Vincent Rivaldi Kosasih,Stapac,Indonesia,23,179,87,1,2,Non-Target,Non-Target
8,Hardian Wicaksono,Pacific Caesar Surabaya,Indonesia,21,177,80,9,8,Non-Target,Non-Target
9,Brandon Jawato,Louvre Surabaya,Indonesia,24,182,85,6,5,Target,Target
