## Setting up our Data Set

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df= pd.read_csv('diabetes_balanced_dataset.csv')
y = df[['result']]
y = y.values
y= y.T[0]
X = df.drop(["result"], axis=1)
X = X.values

history=X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=45, stratify=y)
print("Training data size: \t%d samples with %d features" %(X_train.shape[0], X_train.shape[1]))
print("Testing data size: \t%d samples" %(X_test.shape[0]))
y_train = (2* y_train )-1
y_test = (2* y_test) -1

Training data size: 	2250 samples with 25 features
Testing data size: 	563 samples


## Decision Trees

In [2]:
import time

def Decision_Tree(X_train, y_train, X_test, y_test):
    from sklearn.tree import DecisionTreeClassifier

    clf1 = DecisionTreeClassifier(criterion="entropy", max_depth=2)
    clf1.fit(X_train, y_train)
    y_train1 = clf1.predict(X_train)
    y_test1 = clf1.predict(X_test)

    from sklearn.metrics import accuracy_score

    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train1)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test1)))
    
    return clf1
t=time.time()
clf1 = Decision_Tree(X_train, y_train, X_test, y_test)
elapsed_DT = time.time()-t
elapsed_DT

Accuracy for training data: 	 0.628
Accuracy for test data: 	 0.6412078152753108


0.10707211494445801

## Random Forest


In [3]:
def Random_Forest(X_train, y_train, X_test, y_test):
    from sklearn.ensemble import RandomForestClassifier

    clf2 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
    clf2.fit(X_train, y_train)
    y_train2 = clf2.predict(X_train)
    y_test2 = clf2.predict(X_test)

    from sklearn.metrics import accuracy_score

    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train2)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test2)))
    
    return clf2
t=time.time()    
clf2 =Random_Forest(X_train, y_train, X_test, y_test)

elapsed_rf = time.time()-t
elapsed_rf

Accuracy for training data: 	 0.7075555555555556
Accuracy for test data: 	 0.6642984014209592


2.8432815074920654

# XGB

In [4]:
def XGB(X_train, y_train, X_test, y_test):
    from xgboost import XGBClassifier

    clf3 = XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                         max_depth = 3,
                         gamma =10, n_estimators = 1000)
    clf3.fit(X_train, y_train)
    y_train3 = clf3.predict(X_train)
    y_test3 = clf3.predict(X_test)

    from sklearn.metrics import accuracy_score

    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train3)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test3)))
    
    return clf3
t=time.time()     
clf3 = XGB(X_train, y_train, X_test, y_test)
   
elapsed_xgb = time.time()-t
elapsed_xgb

ModuleNotFoundError: No module named 'xgboost'

## AdaBoost


In [None]:
def AdaBoost(X_train, y_train, X_test, y_test):
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.tree import DecisionTreeClassifier
    
    clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=1000,
                              algorithm="SAMME.R", learning_rate=0.1)
    clf4.fit(X_train, y_train)
    y_train4 = clf4.predict(X_train)
    y_test4 = clf4.predict(X_test)

    from sklearn.metrics import accuracy_score

    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train4)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test4)))
    
    return clf4
t=time.time()    
clf4 = AdaBoost(X_train, y_train, X_test, y_test)
elapsed_ab = time.time()-t
elapsed_ab

# voting model

In [5]:
def VotingModel (X_train, y_train, X_test, y_test, clf1, clf2, clf3, clf4):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from xgboost import XGBClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import BaggingClassifier
    from sklearn.ensemble import VotingClassifier 
    from sklearn.metrics import accuracy_score
    

    clf1 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=999,
                             max_samples =999, bootstrap =True, n_jobs=-1)  
    clf2 = RandomForestClassifier(n_estimators=999, max_leaf_nodes=16, n_jobs=-1)
    clf3 = XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                         max_depth = 3,
                         gamma =10, n_estimators = 999)
    clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=999,
                              algorithm="SAMME.R", learning_rate=0.1)
    
    

    voting_clf = VotingClassifier(estimators= [('DTC', clf1), ('RF', clf2),('XGB', clf3), 
                                               ('ada', clf4)], 
                                  voting='soft',
                                  weights= [0.1, 0.2, 0.8, 0.7],flatten_transform=True)
    voting_clf.fit(X_train, y_train)
    for clf5 in([clf1, clf2, clf3, clf4, voting_clf]):
        clf5.fit(X_train, y_train)
        y_train5 = clf5.predict(X_train)
        y_test5 = clf5.predict(X_test)
    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train5)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test5)))
   
    
    return clf5
t=time.time() 
clf5 = VotingModel(X_train, y_train, X_test, y_test, clf1, clf2, clf3, clf4)

elapsed_vt = time.time()-t
elapsed_vt

NameError: name 'clf3' is not defined

In [7]:
import time

t=time.time()
Elapsed = time.time()-t
print(Elapsed)

0.0


## QBoost

In [5]:
import time
def QBoost(X_train, y_train, X_test, y_test):
    NUM_READS = 3000
    DW_PARAMS = {'num_reads': NUM_READS,
                 'auto_scale': True,
                 'num_spin_reversal_transforms': 10,
                 'postprocess': 'optimization',
                 }

    from dwave.system.samplers import DWaveSampler
    from dwave.system.composites import EmbeddingComposite

    dwave_sampler = DWaveSampler(solver={'qpu': True}) # Some accounts need to replace this line with the next:
    # dwave_sampler = DWaveSampler(token='ENTER TOKEN HERE', solver='ENTER SOLVER NAME HERE')
    emb_sampler = EmbeddingComposite(dwave_sampler)

    from qboost import WeakClassifiers, QBoostClassifier

    clf6 = QBoostClassifier(n_estimators=50, max_depth=2)
    clf6.fit(X_train, y_train, emb_sampler, lmd=1.0, **DW_PARAMS)
    y_train6 = clf6.predict(X_train)
    y_test6 = clf6.predict(X_test)

    from sklearn.metrics import accuracy_score

    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train6)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test6)))
    
    return clf6
t=time.time()     
clf6 = QBoost(X_train, y_train, X_test, y_test)
elapsed_qb = time.time()-t
elapsed_qb

KeyError: 'postprocess is not a parameter of this solver.'

## QBoostPlus

In [None]:
def QBoostPlus(X_train, y_train, X_test, y_test, clf1, clf2, clf3, clf4):
    NUM_READS = 1000
    DW_PARAMS = {'num_reads': NUM_READS,
                 'auto_scale': True,
                 'num_spin_reversal_transforms': 10,
                 'postprocess': 'optimization',
                 }

    from dwave.system.samplers import DWaveSampler
    from dwave.system.composites import EmbeddingComposite
    # dwave_sampler = DWaveSampler(token='ENTER TOKEN HERE', solver='ENTER SOLVER NAME HERE')
    dwave_sampler = DWaveSampler(solver={'qpu': True}) # Some accounts need to replace this line with the next:
    
    emb_sampler = EmbeddingComposite(dwave_sampler)
    
    from qboost import QboostPlus

    clf7 = QboostPlus([clf1, clf2, clf3, clf4])
    clf7.fit(X_train, y_train, emb_sampler, lmd=1.0, **DW_PARAMS)
    y_train7 = clf7.predict(X_train)
    y_test7 = clf7.predict(X_test)

    from sklearn.metrics import accuracy_score

    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train7)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test7)))
    
    return clf7
t=time.time()    
clf7 = QBoostPlus(X_train, y_train, X_test, y_test, clf1, clf2, clf3, clf4)
elapsed_qbp = time.time()-t
elapsed_qbp

# new model

In [None]:
def NewModel(X_train, y_train, X_test, y_test, clf5, clf6):
    NUM_READS = 1000
    DW_PARAMS = {'num_reads': NUM_READS,
                 'auto_scale': True,
                 'num_spin_reversal_transforms': 10,
                 'postprocess': 'optimization',
                 }

    from dwave.system.samplers import DWaveSampler
    from dwave.system.composites import EmbeddingComposite

    dwave_sampler = DWaveSampler(solver={'qpu': True}) # Some accounts need to replace this line with the next:
    # dwave_sampler = DWaveSampler(token='ENTER TOKEN HERE', solver='ENTER SOLVER NAME HERE')
    emb_sampler = EmbeddingComposite(dwave_sampler)
    
    from qboost import QboostPlus

    clf8 = QboostPlus([clf5, clf6])
    clf8.fit(X_train, y_train, emb_sampler, lmd=1.0, **DW_PARAMS)
    y_train8 = clf8.predict(X_train)
    y_test8 = clf8.predict(X_test)

    from sklearn.metrics import accuracy_score

    print('Accuracy for training data: \t', (accuracy_score(y_train, y_train8)))
    print('Accuracy for test data: \t', (accuracy_score(y_test, y_test8)))
    
    return clf8
t=time.time()    
clf8 = NewModel(X_train, y_train, X_test, y_test, clf5, clf6)
elapsed_NM = time.time()-t
elapsed_NM

In [None]:
print('=======================================')
#Decision Tree
print('Decision Tree: ')
clf1 = Decision_Tree(X_train, y_train, X_test, y_test)
print('---------------------------------------')
# Random Forest
print('Random Forest: ')
clf2 = Random_Forest(X_train, y_train, X_test, y_test)
print('---------------------------------------')
#XGB
print('XGB:')
clf3 =  XGB(X_train, y_train, X_test, y_test)
print('---------------------------------------')
# AdaBoost
print('AdaBoost: ')
clf4 = AdaBoost(X_train, y_train, X_test, y_test)
print('---------------------------------------')
# VotingMODEL
print('VotingModel: ')
clf5 = VotingModel(X_train, y_train, X_test, y_test, clf1, clf2, clf3, clf4) 
print('---------------------------------------')
print('QBoost: ')
clf6 = QBoost(X_train, y_train, X_test, y_test) 
print('---------------------------------------')
# QBoostPlus
print('QBoostPlus: ')
clf7 = QBoostPlus(X_train, y_train, X_test, y_test, clf1, clf2, clf3, clf4) 
print('---------------------------------------')
# QBoostPlusPlus
print('NewModel: ')
clf8 = NewModel(X_train, y_train, X_test, y_test,  clf5,clf6) 
print('=======================================')

In [None]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
  
actual = y_test
predicted  = clf7.predict(X_test)
results = confusion_matrix(actual, predicted)

In [None]:
import numpy as np
predicted = np.where(predicted >0.0, 1, -1)
#print(predicted)
#print(y_test.T)

In [None]:
print ('Confusion Matrix :')
print(results) 
print ('Accuracy Score :',accuracy_score(actual, predicted)) 
print ('Report : ')
print( classification_report(actual, predicted))

In [None]:
def getCM(actuals, predictions, NUM_LABELS=2):
    import numpy
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    
    def error_rate(predictions, labels):
        """Return the error rate and confusions."""
        correct = numpy.sum(predictions == labels)
        total = predictions.shape[0]

        error = 100.0 - (100 * float(correct) / float(total))

        confusions = numpy.zeros([NUM_LABELS, NUM_LABELS], numpy.int32)
        bundled = zip(predictions, labels)
        for predicted, actual in bundled:
            confusions[int(predicted), int(actual)] += 1

        return error, confusions

    %matplotlib inline  

    test_error, confusions = error_rate(numpy.asarray(predictions), numpy.asarray(actuals))
    
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.grid(False)
    plt.xticks(numpy.arange(NUM_LABELS))
    plt.yticks(numpy.arange(NUM_LABELS))
    plt.imshow(confusions, cmap=plt.cm.jet, interpolation='nearest');

    for i, cas in enumerate(confusions):
        for j, count in enumerate(cas):
            if count > 0:
                xoff = .07 * len(str(count))
                plt.text(j-xoff, i+.2, int(count), fontsize=10, color='white')
                
    print('Accuracy Score: {}'.format(accuracy_score(actuals, predictions)))
    print()

    print(classification_report(actuals, predictions, digits=4))

In [None]:
y_pred = clf7.predict(X_test)
import numpy as np
predicted = np.where(predicted >0.0, 1, -1)

In [None]:
getCM(y_test.astype(int), y_pred)

In [None]:
import scikitplot as skplt
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)
skplt.metrics.plot_confusion_matrix(y_test, y_pred)

plt.savefig('C:/Users/danyal.maheshwari/Documents/Results/confusion_matrix142.png', transparent=True, dpi = 1000)
plt.show()
