In [None]:
#Importamos las librerias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.metrics import matthews_corrcoef
from sklearn import decomposition
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
#Leemos el dataset y arreglamos los datos, y volvemos a crear el indice.
df = pd.read_csv("healthcare-dataset-stroke-data.csv",sep=',')
df=df.drop([3116],axis=0)
df=df.dropna()
df.reset_index(inplace=True, drop=True)


In [None]:
#Reemplazamos los datos no numericos, por datos numericos
df['gender'] = df['gender'].replace({'Male':0,'Female':1,}).astype(np.uint8)
df['ever_married'] = df['ever_married'].replace({'Yes':1,'No':0,}).astype(np.uint8)

df['Residence_type'] = df['Residence_type'].replace({'Rural':0,'Urban':1}).astype(np.uint8)
df['smoking_status'] = df['smoking_status'].replace({'Unknown':0,'never smoked':1,'formerly smoked':2,'smokes':3})
df['work_type'] = df['work_type'].replace({'Private':0,'Self-employed':1,'Govt_job':2,'children':-1,'Never_worked':-2}).astype(np.uint8)
df.head(-5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,0,67.0,0,1,1,0,1,228.69,36.6,2,1
1,31112,0,80.0,0,1,1,0,0,105.92,32.5,1,1
2,60182,1,49.0,0,0,1,0,1,171.23,34.4,3,1
3,1665,1,79.0,1,0,1,1,0,174.12,24.0,1,1
4,56669,0,81.0,0,0,1,0,1,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4898,579,0,9.0,0,0,0,255,1,71.88,17.5,0,0
4899,68398,0,82.0,1,0,1,1,0,71.97,28.3,1,0
4900,36901,1,45.0,0,0,1,0,1,97.95,24.5,0,0
4901,45010,1,57.0,0,0,1,0,0,77.93,21.7,1,0


In [None]:
#Convertimos el dataset ya limpio a un archivo CSV
df1=pd.DataFrame({'gender':df['gender'],'age':df['age'],'hypertension':df['hypertension'],'heart_disease':df['heart_disease'],'ever_married':df['ever_married'],'work_type':df['work_type'],'Residence_type':df['Residence_type'],'avg_glucose_level':df['avg_glucose_level'],'bmi':df['bmi'],'smoking_status':df['smoking_status'],'stroke':df['stroke']})
df1.to_csv('healthcare-dataset-stroke-data-clean.csv',index=False)

In [None]:
#Seleccionamos las caracteristicas con las que trabajaremos
X = df[['gender','age','hypertension','heart_disease','ever_married','work_type','Residence_type','avg_glucose_level','bmi','smoking_status']].values
y = df['stroke'].values
#Partimos nuestro conjunto de datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,train_size=0.75,random_state=57)
#Como el nuestro conjunto de datos esta desbalanceadom, podemos solucionarlo mediante la funcion SMOTE(), la cual crea datos segun nuestras entradas.
sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel())
X_test_res, y_test_res = sm.fit_resample(X_test, y_test.ravel())

#Normalizamos
scaler = StandardScaler()
scaler.fit(X_train_res)
X_train_res = scaler.transform(X_train_res)
X_test_res = scaler.transform(X_test_res)

#Aplicamos reduccion dimencional
n=9
pca=decomposition.PCA(n_components=n)
pca.fit(X_train)
X_pca_train=pca.fit_transform(X_train)
X_pca_test=pca.fit_transform(X_test)
print("Pesos de PCA(train):", pca.explained_variance_ratio_)
print("Suma de pesos de PCA (train):",sum(pca.explained_variance_ratio_))


Pesos de PCA(train): [7.63670611e-01 2.07471849e-01 2.44688533e-02 4.24100167e-03
 8.01327981e-05 2.32120184e-05 2.26134600e-05 1.03615125e-05
 7.27670929e-06]
Suma de pesos de PCA (train): 0.9999959113741266


In [None]:
#KNN
#Definimos los parametros que queremos optimizar, y aplicamos un grid search
parametros = {'weights':["distance","uniform"],'n_neighbors':range(1, 30),'algorithm':["auto", "ball_tree", "kd_tree", "brute"],'metric':["manhattan","euclidean","minkowski","chebyshev"]}
neigh = KNeighborsClassifier()
clf = GridSearchCV(neigh, parametros,scoring='f1')

clf.fit(X_train_res, y_train_res)
print(clf.best_params_)
print(clf.best_score_)



{'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'distance'}
0.9766924990981083


In [None]:
#Ya con los parametros utilizados, volvemos a entrenar y revisamos el MCC y F1
new_neigh= KNeighborsClassifier(n_neighbors=1,algorithm="auto",metric="manhattan",weights="distance")
new_neigh.fit(X_train_res, y_train_res)
y_predict=new_neigh.predict(X_test_res)
print(y_predict)
print(y_test)
print("MCC:",matthews_corrcoef(y_test_res, y_predict))
print("F1 score:",f1_score(y_test_res, y_predict))

[0 0 1 ... 1 0 1]
[0 0 0 ... 0 0 0]
MCC: 0.4926951245302342
F1 score: 0.623229461756374


In [None]:
parameters = {'activation':('identity', 'logistic', 'tanh', 'relu'), 'hidden_layer_sizes':[5,15],'solver':('lbfgs', 'sgd', 'adam')}
ANN=MLPClassifier()

grid = GridSearchCV(ANN, parameters,scoring='f1')
grid.fit(X_train_res, y_train_res)
best_estimator = grid.best_estimator_
print(best_estimator)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

MLPClassifier(activation='tanh', hidden_layer_sizes=15, solver='lbfgs')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
clf = MLPClassifier(hidden_layer_sizes=15,activation='tanh',random_state=1, max_iter=300, solver = 'lbfgs')
clf.fit(X_train_res, y_train_res)
print('El MCC de ANN en test es: {:.3f}'
     .format(matthews_corrcoef(y_test_res,clf.predict(X_test_res) )))
print('El F1 score de ANN en entrenamiento es: {:.3f}'
     .format(f1_score(y_test_res,clf.predict(X_test_res) )))

El MCC de ANN en test es: 0.722
El F1 score de ANN en entrenamiento es: 0.848


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
parameters = {'class_weight':('balanced', 'balanced_subsample'), 'n_estimators':[10,100],'criterion':('gini', 'entropy', 'log_loss')}
rfc= RandomForestClassifier()

grid = GridSearchCV(rfc, parameters,scoring='f1')
grid.fit(X_train_res, y_train_res)
best_estimator=grid.best_estimator_
print(best_estimator)

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/usr/local/lib/python3.7/dist-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/usr/local/lib/python3.7/d

RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy')


In [None]:
rd_clf = RandomForestClassifier(class_weight='balanced_subsample',criterion='entropy')
rd_clf.fit(X_train_res, y_train_res)
print("MCC:",matthews_corrcoef(y_test_res, rd_clf.predict(X_test_res)))
print("F1 score:",f1_score(y_test_res, rd_clf.predict(X_test_res)))


MCC: 0.8718541144268
F1 score: 0.9284085727314182
