### Importing pandas library

In [1]:
import pandas as pd

### Loading Dataset

In [2]:
diabetes_data = pd.read_csv('dataset/PimaIndians_processed.csv')

diabetes_data.head(10)

Unnamed: 0,pregnant,glucose,diastolic,triceps,insulin,bmi,diabetes,age,test
0,-0.717427,-1.091046,-0.373655,-0.58511,-0.522842,-0.710421,-1.031876,-0.968299,0
1,-1.029213,0.466314,-2.456964,0.557421,0.100631,1.42673,5.115111,0.209585,1
2,-0.093854,-1.447941,-1.655691,0.271788,-0.573394,-0.297238,-0.797126,-0.477514,1
3,-0.40564,2.413014,-0.053146,1.50953,3.260122,-0.368477,-1.05796,2.172726,1
4,-0.717427,2.153454,-0.854419,-0.58511,5.81299,-0.425468,-0.362402,2.761668,1
5,0.529718,1.407219,0.107109,-0.965953,0.159608,-1.038117,0.185349,1.976412,1
6,-1.029213,-0.150141,1.068636,1.699951,0.623,1.811417,0.081015,0.013271,1
7,-0.717427,-0.636816,-3.258237,0.843053,-0.61552,1.455225,-0.985506,0.209585,0
8,-0.717427,-0.247476,-0.053146,0.081366,-0.505991,0.215678,0.017256,0.111428,1
9,-0.093854,0.109419,1.389146,1.128686,0.665127,0.885318,0.524433,-0.379357,0


In [3]:
diabetes_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pregnant   392 non-null    float64
 1   glucose    392 non-null    float64
 2   diastolic  392 non-null    float64
 3   triceps    392 non-null    float64
 4   insulin    392 non-null    float64
 5   bmi        392 non-null    float64
 6   diabetes   392 non-null    float64
 7   age        392 non-null    float64
 8   test       392 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 27.7 KB


In [4]:
X = diabetes_data.drop('test', axis=1)

Y = diabetes_data['test']

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [6]:
x_train.shape, y_train.shape

((313, 8), (313,))

In [7]:
x_test.shape, y_test.shape

((79, 8), (79,))

### Voting Classifier
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

In [8]:
from sklearn.ensemble import VotingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [9]:
log_clf = LogisticRegression(C=1, solver='liblinear')

svc_clf = SVC(C=1, kernel='linear', gamma='auto')

naive_clf = GaussianNB()

#### Hard voting
Hard voting classifier (voting='hard') aggregate the predictions of each classifier and predict the class that gets the most votes

In [10]:
voting_clf_hard = VotingClassifier(estimators=[('lr', log_clf), 
                                               ('svc', svc_clf), 
                                               ('naive', naive_clf)],
                                   voting='hard')

In [11]:
voting_clf_hard.fit(x_train, y_train)

In [12]:
y_pred = voting_clf_hard.predict(x_test)

In [17]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [19]:
accuracy_score(y_test, y_pred)

0.7341772151898734

In [20]:
precision_score(y_test, y_pred)

0.75

In [21]:
recall_score(y_test, y_pred)

0.41379310344827586

In [14]:
voting_clf_hard.classes_

array([0, 1])

In [15]:
pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

pred_results.head()

Unnamed: 0,y_test,y_pred
43,0,0
382,0,1
232,0,0
331,1,1
243,0,0


In [22]:
for clf_hard in (log_clf, svc_clf, naive_clf, voting_clf_hard):
    
    clf_hard.fit(x_train, y_train)
    y_pred = clf_hard.predict(x_test)
    
    print(clf_hard.__class__.__name__, accuracy_score(y_test, y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred))

LogisticRegression 0.7341772151898734 0.75 0.41379310344827586
SVC 0.7215189873417721 0.7333333333333333 0.3793103448275862
GaussianNB 0.759493670886076 0.7272727272727273 0.5517241379310345
VotingClassifier 0.7341772151898734 0.75 0.41379310344827586


#### Soft voting
Soft voting classifier (voting='soft') predict the class with the highest class probability, averaged over all the individual classifiers. 

All of the above classifiers are by default enabled for probability estimates except SVC.    
So in SVC we have to pass one more parameter as probability=True

In [20]:
svc_clf_soft = SVC(C=1, kernel='linear', gamma='auto', probability=True)

In [22]:
voting_clf_soft = VotingClassifier(estimators=[('lr', log_clf), 
                                               ('svc', svc_clf_soft), 
                                               ('naive', naive_clf)],
                                   voting='soft', 
                                   weights = [0.25,0.5,0.25])

In [23]:
for clf_soft in (log_clf, svc_clf_soft, naive_clf, voting_clf_soft):
    
    clf_soft.fit(x_train, y_train)
    y_pred = clf_soft.predict(x_test)
    
    print(clf_soft.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.810126582278481
SVC 0.8354430379746836
GaussianNB 0.810126582278481
VotingClassifier 0.7974683544303798
