In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input/predict-diabities'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/predict-diabities/diabetes.csv


In [2]:
df = pd.read_csv('/kaggle/input/predict-diabities/diabetes.csv')

In [None]:
df.isna().sum()
df.duplicated().sum()

Checking for duplicated rows

In [None]:
for i in df.columns:
    print(df[i].value_counts())

In [None]:
corr = df.corr()
corr.style.background_gradient(cmap = 'coolwarm')

Checking for correlation among features and target variable

In [3]:
y= df.Outcome
df.drop(columns = ['Outcome'], inplace = True)

In [4]:
ros = RandomOverSampler(random_state=0)
X, y = ros.fit_resample(df, y)

Balancing Class
as we have many values of a class and few valuesof others, we will balance the classes using OverSampling to make the classes balanced

In [5]:
sc = StandardScaler()
scaled = sc.fit_transform(X)

Standardizing the features

In [None]:
m1 = IsolationForest()
m1.fit(df)
df['anamoly'] = m1.predict(df)

Isolation forest to separate the outliers. not used here

In [7]:
trainx,testx,trainy, testy = train_test_split(scaled, y, test_size =.35, random_state = 100)


Splitting the train set into train and test

In [None]:
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
c = np.array([0.001, 0.01, 0.1, 1, 10, 100])
solv =  ['lbfgs','liblinear','newton-cg', 'newton-cholesky','sag','saga']
max_iter = np.array(range(100,1000))
grid = {'penalty': penalty, 'C': c, 'solver' : solv, 'max_iter': max_iter}
gridlogistic = GridSearchCV(model, grid, cv = 5)
gridlogistic.fit(trainx, trainy)
print('penalty', gridlogistic.best_estimator_.penalty)
print('C', gridlogistic.best_estimator_.C)
print('solver', gridlogistic.best_estimator_.solver)
print('max_iter', gridlogistic.best_estimator_.max_iter)

many fits failed because not all penalty are compatible with solver. Thus not a major impact using grid or random search
297000 fits failed out of a total of 486000.


In [None]:
model = LogisticRegression(penalty = 'l2', C = 0.1 , solver = 'saga', max_iter = 648,  random_state = 100)
model.fit(trainx, trainy)
model.score(testx, testy)

Used LogisticRegression The accuracy was 75%

In [None]:
model = RandomForestClassifier()
min_split = np.array([2, 3, 4, 5, 6, 7])
max_nvl = np.array([3, 4, 5, 6, 7, 9, 11])
alg = ['entropy', 'gini']
nest = np.array(range(1,101))
values_grid = {'min_samples_split': min_split, 'max_depth': max_nvl, 'criterion': alg, 'n_estimators': nest}
gridRandomTree = GridSearchCV(estimator = model, param_grid = values_grid, cv = 5)
gridRandomTree.fit(trainx, trainy)
print('Mín Split: ', gridRandomTree.best_estimator_.min_samples_split)
print('Max Nvl: ', gridRandomTree.best_estimator_.max_depth)
print('Algorithm: ', gridRandomTree.best_estimator_.criterion)
print('n_estimators', gridRandomTree.best_estimator_.n_estimators)
print('Score: ', gridRandomTree.best_score_)

In [None]:
model = RandomForestClassifier(criterion = 'entropy', min_samples_split = 2,  max_depth = 9, n_estimators = 34, random_state = 100)
model.fit(trainx, trainy)
model.score(testx, testy)

Used random forest The accuracy was 82%

In [None]:
model = DecisionTreeClassifier()
min_split = np.array([2, 3, 4, 5, 6, 7])
max_nvl = np.array([3, 4, 5, 6, 7, 9, 11])
alg = ['entropy', 'gini']
values_grid = {'min_samples_split': min_split, 'max_depth': max_nvl, 'criterion': alg}
gridDecisionTree = GridSearchCV(estimator = model, param_grid = values_grid, cv = 5)
gridDecisionTree.fit(trainx, trainy)
print('Mín Split: ', gridDecisionTree.best_estimator_.min_samples_split)
print('Max Nvl: ', gridDecisionTree.best_estimator_.max_depth)
print('Algorithm: ', gridDecisionTree.best_estimator_.criterion)
print('Score: ', gridDecisionTree.best_score_)

In [None]:
model = DecisionTreeClassifier(criterion = 'gini', min_samples_split = 3, max_depth= 11, random_state= 100)
model.fit(trainx, trainy)
model.score(testx, testy)

Used DecisionTree The accuracy was 79%

In [15]:
level_0_estimators = dict()
level_0_estimators["logreg"] = LogisticRegression( random_state=100)
level_0_estimators["forest"] = RandomForestClassifier(criterion = 'entropy', min_samples_split = 2,  max_depth = 9, n_estimators = 34, random_state = 100)
 
level_0_columns = [f"{name}_prediction" for name in level_0_estimators.keys()]
 
level_1_estimator = RandomForestClassifier(criterion = 'entropy', min_samples_split = 2,  max_depth = 9, n_estimators = 34, random_state = 100)

kfold = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 100)
model = StackingClassifier(estimators=list(level_0_estimators.items()), 
                                    final_estimator=level_1_estimator, 
                                    passthrough=True, cv=kfold, stack_method="predict_proba")

model.fit(trainx, trainy)
model.score(testx, testy)

0.8457142857142858

Using stacking classifier yielded the best accuracy 85% 