In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dill

from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import SelectKBest, f_classif, chi2, SelectFromModel
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline        
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.linear_model import LogisticRegression

%matplotlib inline
SEED=50

In [2]:
prostate = pd.read_csv('prostate.csv')
prostate.head()

Unnamed: 0,x.V1,x.V2,x.V3,x.V4,x.V5,x.V6,x.V7,x.V8,x.V9,x.V10,...,x.V12592,x.V12593,x.V12594,x.V12595,x.V12596,x.V12597,x.V12598,x.V12599,x.V12600,y
1,-9.0,1.0,1.0,15.0,-2.0,-3.0,4.0,8.0,-12.0,-12.0,...,5.0,3.0,21,15.0,1.0,0,14.0,-23.0,14.0,2
2,-2.0,1.0,1.0,4.0,-2.0,-5.0,0.0,8.0,-5.0,-9.0,...,3.0,2.0,12,7.0,4.0,5,4.0,-10.0,25.0,2
3,-6.0,17.0,6.0,29.0,4.0,-11.0,-8.0,10.0,-24.0,-32.0,...,-5.0,11.0,11,36.0,8.0,7,21.0,-82.0,10.0,2
4,0.0,9.0,4.0,19.0,-10.0,-18.0,-18.0,5.0,-33.0,-31.0,...,-7.0,6.0,9,22.0,3.0,20,-7.0,-62.0,24.0,2
5,-1.0,0.0,1.0,5.0,0.0,-4.0,1.0,6.0,-4.0,-9.0,...,6.0,1.0,140,10.0,3.0,8,8.0,-27.0,20.0,2


In [3]:
print(prostate.isnull().values.any())


False


In [4]:
features=prostate.iloc[:,0:-1]
labels=prostate.iloc[:,-1]

In [5]:
# Estandarizar

min_max_scaler = preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(features)

In [6]:
X

array([[0.75949367, 0.29577465, 0.43333333, ..., 0.34836066, 0.87700535,
        0.62698413],
       [0.84810127, 0.29577465, 0.43333333, ..., 0.30737705, 0.94652406,
        0.71428571],
       [0.79746835, 0.52112676, 0.6       , ..., 0.37704918, 0.56149733,
        0.5952381 ],
       ...,
       [0.82278481, 0.28169014, 0.4       , ..., 0.30327869, 0.96256684,
        0.70634921],
       [0.87341772, 0.30985915, 0.46666667, ..., 0.29918033, 1.        ,
        0.6984127 ],
       [0.84810127, 0.28169014, 0.4       , ..., 0.28278689, 0.95721925,
        0.66666667]])

In [7]:
y = np.transpose(np.asarray(labels.values - 1, dtype=int))


In [8]:
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# 1. SVC

In [9]:
%%time
clf_svc = SVC(kernel='linear', C=1, probability=True, random_state=SEED) 
scores_clf_svc = cross_validate(clf_svc, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 16.8 s, sys: 199 ms, total: 17 s
Wall time: 11 s


In [10]:
print('Accuracy:', np.mean(scores_clf_svc['test_accuracy']))
print('Precision:', np.mean(scores_clf_svc['test_precision']))
print('Recall:', np.mean(scores_clf_svc['test_recall']))
print('F1:', np.mean(scores_clf_svc['test_f1']))

Accuracy: 0.9109090909090909
Precision: 0.9457142857142857
Recall: 0.8800000000000001
F1: 0.9078632478632478


# 2. NAIVE BAYES

In [11]:
%%time
clf_nb = GaussianNB()
scores_clf_nb = cross_validate(clf_nb, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)


CPU times: user 2.59 s, sys: 104 ms, total: 2.69 s
Wall time: 693 ms


In [12]:
print('Accuracy:', np.mean(scores_clf_nb['test_accuracy']))
print('Precision:', np.mean(scores_clf_nb['test_precision']))
print('Recall:', np.mean(scores_clf_nb['test_recall']))
print('F1:', np.mean(scores_clf_nb['test_f1']))

Accuracy: 0.6245454545454545
Precision: 0.6132539682539683
Recall: 0.8666666666666666
F1: 0.7028371628371629


# 3. RANDOM FORESTS

In [13]:
%%time
clf_rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced', max_depth=5, random_state=SEED)
scores_clf_rf = cross_validate(clf_rf, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)



CPU times: user 4.93 s, sys: 836 ms, total: 5.76 s
Wall time: 11.5 s


In [14]:
print('Accuracy:', np.mean(scores_clf_rf['test_accuracy']))
print('Precision:', np.mean(scores_clf_rf['test_precision']))
print('Recall:', np.mean(scores_clf_rf['test_recall']))
print('F1:', np.mean(scores_clf_rf['test_f1']))



Accuracy: 0.89
Precision: 0.9085714285714284
Recall: 0.9
F1: 0.8966666666666667


## SELECCIONAR VARIABLES USANDO PUNTUACION F

In [15]:
feat_sel_f = SelectKBest(f_classif, k=10) 



In [16]:
pipe_f_svc = Pipeline([('F Score',feat_sel_f), ('SVM',clf_svc)])


In [17]:
%%time

scores_f_svc = cross_validate(pipe_f_svc, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 2.12 s, sys: 145 ms, total: 2.27 s
Wall time: 588 ms


In [18]:
print('Accuracy SVC:', np.mean(scores_f_svc['test_accuracy']))
print('Precision SVC:', np.mean(scores_f_svc['test_precision']))
print('Recall SVC:', np.mean(scores_f_svc['test_recall']))
print('F1 SVC:', np.mean(scores_f_svc['test_f1']))


Accuracy SVC: 0.9200000000000002
Precision SVC: 0.9633333333333335
Recall SVC: 0.8800000000000001
F1 SVC: 0.9153535353535354


In [19]:
pipe_f_nb = Pipeline([('F Score',feat_sel_f), ('NB',clf_nb)])


In [20]:
%%time

scores_f_nb = cross_validate(pipe_f_nb, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)


CPU times: user 2.19 s, sys: 125 ms, total: 2.31 s
Wall time: 578 ms


In [21]:
print('Accuracy NB:', np.mean(scores_f_nb['test_accuracy']))
print('Precision NB:', np.mean(scores_f_nb['test_precision']))
print('Recall NB:', np.mean(scores_f_nb['test_recall']))
print('F1 NB:', np.mean(scores_f_nb['test_f1']))


Accuracy NB: 0.93
Precision NB: 0.9633333333333335
Recall NB: 0.9
F1 NB: 0.9264646464646464


In [22]:
pipe_f_rf = Pipeline([('F Score',feat_sel_f), ('RF',clf_rf)])


In [23]:
%%time


scores_f_rf = cross_validate(pipe_f_rf, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 10.2 s, sys: 830 ms, total: 11 s
Wall time: 10.7 s


In [24]:
print('Accuracy RF:', np.mean(scores_f_rf['test_accuracy']))
print('Precision RF:', np.mean(scores_f_rf['test_precision']))
print('Recall RF:', np.mean(scores_f_rf['test_recall']))
print('F1 RF:', np.mean(scores_f_rf['test_f1']))

Accuracy RF: 0.93
Precision RF: 0.9633333333333335
Recall RF: 0.9
F1 RF: 0.9236868686868686


## SELECCIONAR VARIABLES USANDO Chi-Cuadrado

In [25]:
feat_sel_chi = SelectKBest(chi2, k=10) 



In [26]:
pipe_chi_svc = Pipeline([('Chi-cuadrado',feat_sel_chi), ('SVM',clf_svc)])


In [27]:
%%time

scores_chi_svc = cross_validate(pipe_f_svc, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 2.17 s, sys: 104 ms, total: 2.27 s
Wall time: 588 ms


In [28]:
print('Accuracy SVC:', np.mean(scores_chi_svc['test_accuracy']))
print('Precision SVC:', np.mean(scores_chi_svc['test_precision']))
print('Recall SVC:', np.mean(scores_chi_svc['test_recall']))
print('F1 SVC:', np.mean(scores_chi_svc['test_f1']))


Accuracy SVC: 0.9200000000000002
Precision SVC: 0.9633333333333335
Recall SVC: 0.8800000000000001
F1 SVC: 0.9153535353535354


In [29]:
pipe_chi_nb = Pipeline([('Chi-cuadrado',feat_sel_chi), ('NB',clf_nb)])


In [30]:
%%time


scores_chi_nb = cross_validate(pipe_chi_nb, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)


CPU times: user 2.84 s, sys: 98.4 ms, total: 2.94 s
Wall time: 735 ms


In [31]:
print('Accuracy NB:', np.mean(scores_chi_nb['test_accuracy']))
print('Precision NB:', np.mean(scores_chi_nb['test_precision']))
print('Recall NB:', np.mean(scores_chi_nb['test_recall']))
print('F1 NB:', np.mean(scores_chi_nb['test_f1']))


Accuracy NB: 0.9200000000000002
Precision NB: 0.93
Recall NB: 0.9199999999999999
F1 NB: 0.9214141414141415


In [32]:
pipe_chi_rf = Pipeline([('Chi-cuadrado',feat_sel_chi), ('RF',clf_rf)])


In [33]:
%%time

scores_chi_rf = cross_validate(pipe_chi_rf, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 10.4 s, sys: 871 ms, total: 11.2 s
Wall time: 10.9 s


In [34]:
print('Accuracy RF:', np.mean(scores_chi_rf['test_accuracy']))
print('Precision RF:', np.mean(scores_chi_rf['test_precision']))
print('Recall RF:', np.mean(scores_chi_rf['test_recall']))
print('F1 RF:', np.mean(scores_chi_rf['test_f1']))

Accuracy RF: 0.9200000000000002
Precision RF: 0.9333333333333333
Recall RF: 0.9199999999999999
F1 RF: 0.9212121212121211


## SELECCIONAR VARIABLES USANDO ARBOLES DE DECISION

In [35]:
pipe_dt_svc = Pipeline([('DT', SelectFromModel(clf_rf, max_features=10, threshold=-np.inf)), ('SVC',clf_svc)])


In [36]:
%%time

scores_dt_svc = cross_validate(pipe_dt_svc, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 5.32 s, sys: 544 ms, total: 5.86 s
Wall time: 7.55 s


In [37]:
print('Accuracy DT:', np.mean(scores_dt_svc['test_accuracy']))
print('Precision DT:', np.mean(scores_dt_svc['test_precision']))
print('Recall DT:', np.mean(scores_dt_svc['test_recall']))
print('F1 DT:', np.mean(scores_dt_svc['test_f1']))

Accuracy DT: 0.9
Precision DT: 0.9550000000000001
Recall DT: 0.8400000000000001
F1 DT: 0.8815873015873017


In [38]:
pipe_dt_nb = Pipeline([('DT', SelectFromModel(clf_rf, max_features=10, threshold=-np.inf)), ('SVC',clf_nb)])


In [39]:
%%time

scores_dt_nb = cross_validate(pipe_dt_nb, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 5.34 s, sys: 408 ms, total: 5.75 s
Wall time: 7.46 s


In [40]:
print('Accuracy DT:', np.mean(scores_dt_nb['test_accuracy']))
print('Precision DT:', np.mean(scores_dt_nb['test_precision']))
print('Recall DT:', np.mean(scores_dt_nb['test_recall']))
print('F1 DT:', np.mean(scores_dt_nb['test_f1']))

Accuracy DT: 0.93
Precision DT: 0.9666666666666666
Recall DT: 0.9
F1 DT: 0.9255050505050505


In [41]:
pipe_dt_rf = Pipeline([('DT', SelectFromModel(clf_rf, max_features=10, threshold=-np.inf)), ('SVC',clf_rf)])


In [42]:
%%time

scores_dt_rf = cross_validate(pipe_dt_rf, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 10.2 s, sys: 880 ms, total: 11.1 s
Wall time: 18.9 s


In [43]:
print('Accuracy DT:', np.mean(scores_dt_rf['test_accuracy']))
print('Precision DT:', np.mean(scores_dt_rf['test_precision']))
print('Recall DT:', np.mean(scores_dt_rf['test_recall']))
print('F1 DT:', np.mean(scores_dt_rf['test_f1']))

Accuracy DT: 0.9200000000000002
Precision DT: 0.9433333333333334
Recall DT: 0.9
F1 DT: 0.9147979797979797


## SELECCIONAR VARIABLES USANDO LOGISTIC REGRESSION Y L1

In [44]:
pipe_l1_svc = Pipeline([('LR', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'), max_features=10, threshold=-np.inf)), ('SVC',clf_svc)])


In [45]:
%%time

scores_l1_svc = cross_validate(pipe_l1_svc, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 888 ms, sys: 96.8 ms, total: 984 ms
Wall time: 983 ms


In [46]:
print('Accuracy LR:', np.mean(scores_l1_svc['test_accuracy']))
print('Precision LR:', np.mean(scores_l1_svc['test_precision']))
print('Recall LR:', np.mean(scores_l1_svc['test_recall']))
print('F1 LR:', np.mean(scores_l1_svc['test_f1']))

Accuracy LR: 0.9209090909090909
Precision LR: 0.949047619047619
Recall LR: 0.9
F1 LR: 0.9159945609945609


In [47]:
pipe_l1_nb = Pipeline([('LR', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'), max_features=10, threshold=-np.inf)), ('NB',clf_nb)])


In [48]:
%%time

scores_l1_nb = cross_validate(pipe_l1_nb, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 934 ms, sys: 96.9 ms, total: 1.03 s
Wall time: 1.03 s


In [49]:
print('Accuracy LR:', np.mean(scores_l1_nb['test_accuracy']))
print('Precision LR:', np.mean(scores_l1_nb['test_precision']))
print('Recall LR:', np.mean(scores_l1_nb['test_recall']))
print('F1 LR:', np.mean(scores_l1_nb['test_f1']))

Accuracy LR: 0.940909090909091
Precision LR: 0.9657142857142856
Recall LR: 0.9199999999999999
F1 LR: 0.938974358974359


In [50]:
pipe_l1_rf = Pipeline([('LR', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'), max_features=10, threshold=-np.inf)), ('RF',clf_rf)])


In [51]:
%%time

scores_l1_rf = cross_validate(pipe_l1_rf, X, y, cv=10, scoring=('accuracy','precision','recall','f1'),return_train_score=True)

CPU times: user 4.79 s, sys: 820 ms, total: 5.61 s
Wall time: 11 s


In [52]:
print('Accuracy LR:', np.mean(scores_l1_rf['test_accuracy']))
print('Precision LR:', np.mean(scores_l1_rf['test_precision']))
print('Recall LR:', np.mean(scores_l1_rf['test_recall']))
print('F1 LR:', np.mean(scores_l1_rf['test_f1']))

Accuracy LR: 0.9209090909090909
Precision LR: 0.949047619047619
Recall LR: 0.9
F1 LR: 0.9159945609945609


In [53]:
dill.dump_session('PROSTATE-CV.db')


In [54]:
dill.load_session('PROSTATE-CV.db')
