# Voting

In [None]:
import pandas as pd

In [None]:
df_cryotherapydata = pd.read_csv("data/Cryotherapy.csv")
df_cryotherapydata.head(5)

Unnamed: 0,sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment
0,1,35,12.0,5,1,100,0
1,1,29,7.0,5,1,96,1
2,1,50,8.0,1,3,132,0
3,1,32,11.75,7,3,750,0
4,1,67,9.25,1,1,42,0


In [None]:
df_cryotherapydata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   sex                  90 non-null     int64  
 1   age                  90 non-null     int64  
 2   Time                 90 non-null     float64
 3   Number_of_Warts      90 non-null     int64  
 4   Type                 90 non-null     int64  
 5   Area                 90 non-null     int64  
 6   Result_of_Treatment  90 non-null     int64  
dtypes: float64(1), int64(6)
memory usage: 5.0 KB


In [None]:
# Import required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

In [None]:
from sklearn.model_selection import train_test_split

# create feature & response sets
feature_columns = ['sex', 'age', 'Time', 'Number_of_Warts', 'Type', 'Area']
X = df_cryotherapydata[feature_columns]
Y = df_cryotherapydata['Result_of_Treatment']

# Create train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(72, 6)
(18, 6)
(72,)
(18,)


## Build our models with the decision tree, SVM, Logistics regression

In [None]:
# create the sub models
estimators = []

dt_model = DecisionTreeClassifier(random_state=42)
estimators.append(('DecisionTree', dt_model))

svm_model = SVC(random_state=42)
estimators.append(('SupportVector', svm_model))

logit_model = LogisticRegression(random_state=42)
estimators.append(('Logistic Regression', logit_model))


In [None]:
from sklearn.metrics import accuracy_score

for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, Y_train)
    Y_pred = each_estimator.predict(X_test)
    print(each_estimator.__class__.__name__, accuracy_score(Y_test, Y_pred))

DecisionTreeClassifier 0.8888888888888888
SVC 0.5555555555555556
LogisticRegression 0.7777777777777778


## Hard Voting

In [None]:
ensemble_model = VotingClassifier(estimators=estimators, voting='hard')

# fit model
ensemble_model.fit(X_train, Y_train)
predicted_labels = ensemble_model.predict(X_test)

print('Classifier Accuracy using hard voting: ', accuracy_score(Y_test, predicted_labels))

Classifier Accuracy using hard voting:  0.8333333333333334


## Soft Voting

In [None]:
# create the sub models
estimators = []

dt_model = DecisionTreeClassifier(random_state=42)
estimators.append(('DecisionTree', dt_model))

svm_model = SVC(random_state=42, probability=True)
estimators.append(('SupportVector', svm_model))

logit_model = LogisticRegression(random_state=42)
estimators.append(('Logistic Regression', logit_model))

for each_estimator in (dt_model, svm_model, logit_model):
    each_estimator.fit(X_train, Y_train)
    Y_pred = each_estimator.predict(X_test)
    print(each_estimator.__class__.__name__, accuracy_score(Y_test, Y_pred))

# Using Voting classifier with soft voting
ensemble_model = VotingClassifier(estimators=estimators, voting='soft')

# fit model
ensemble_model.fit(X_train, Y_train)
predicted_labels = ensemble_model.predict(X_test)

print('Classifier Accuracy using soft voting: ', accuracy_score(Y_test, predicted_labels))


DecisionTreeClassifier 0.8888888888888888
SVC 0.5555555555555556
LogisticRegression 0.7777777777777778
Classifier Accuracy using soft voting:  0.8333333333333334


# Average

 (Regression problems)

In [None]:
df_winedata = pd.read_csv("data/whitewines.csv")

In [None]:
df_winedata.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.7,0.62,0.24,1.1,0.039,6.0,62.0,0.9934,3.41,0.32,10.4,5
1,5.7,0.22,0.2,16.0,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6
2,5.9,0.19,0.26,7.4,0.034,33.0,123.0,0.995,3.49,0.42,10.1,6
3,5.3,0.47,0.1,1.3,0.036,11.0,74.0,0.99082,3.48,0.54,11.2,4
4,6.4,0.29,0.21,9.65,0.041,36.0,119.0,0.99334,2.99,0.34,10.933333,6


In [None]:
df_winedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [None]:
# Import required libraries
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [None]:
# Create feature and response variable set
from sklearn.model_selection import train_test_split

# create feature & response variables
feature_columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide','density', 'pH', 'sulphates', 'alcohol']
X = df_winedata[feature_columns]
Y = df_winedata['quality']

# Create train & test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(3918, 11)
(980, 11)
(3918,)
(980,)


In [None]:
# Build base learners

linreg_model = LinearRegression()
svr_model = SVR()
regressiontree_model = DecisionTreeRegressor()

# Fitting the model
linreg_model.fit(X_train, Y_train)
svr_model.fit(X_train, Y_train)
regressiontree_model.fit(X_train, Y_train)

DecisionTreeRegressor()

In [None]:
linreg_predictions = linreg_model.predict(X_test)
svr_predictions = svr_model.predict(X_test)
regtree_predictions = regressiontree_model.predict(X_test)

In [None]:
# we divide the summation of the predictions by the number of the base learners
average_predictions = (linreg_predictions + svr_predictions + regtree_predictions) / 3

In [None]:
average_predictions

array([5.84011501, 5.9103697 , 6.3669403 , 5.90656001, 5.26765694,
       6.09258835, 5.17081792, 5.75054595, 5.45971195, 6.32958511,
       5.66515637, 4.63823751, 5.91678332, 5.53740599, 4.89447533,
       5.75238972, 5.50803639, 6.5030796 , 5.26175151, 5.99045139,
       5.74030422, 5.93398178, 5.92589144, 5.94635247, 6.24873204,
       5.45669202, 5.84545186, 6.42190058, 6.18122655, 6.51951345,
       5.89520667, 5.51577231, 5.6197979 , 6.3483835 , 5.71390886,
       6.10340091, 6.52617462, 6.09013704, 5.93345336, 5.589475  ,
       6.44913559, 5.44032035, 6.17932255, 6.00317694, 5.85766719,
       6.32084379, 6.63375859, 5.34686638, 6.01597503, 6.11726063,
       5.43021173, 5.82292997, 6.05577415, 6.53472186, 7.03755538,
       5.37171247, 6.22322737, 5.52828301, 6.98643183, 6.19364396,
       4.84603677, 6.53541501, 6.1191462 , 5.82781235, 5.29460297,
       5.81930112, 6.14322744, 5.4485638 , 5.94800453, 6.20554224,
       5.65906125, 6.00659888, 6.06175936, 6.42190058, 5.47339

# Weighted Averaging

In [None]:
# Weighted Averaging
df_cancerdata = pd.read_csv("data/wisc_bc_data.csv")

In [None]:
df_cancerdata.tail(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
564,911320502,B,13.17,18.22,84.28,537.3,0.07466,0.05994,0.04859,0.0287,...,14.9,23.89,95.1,687.6,0.1282,0.1965,0.1876,0.1045,0.2235,0.06925
565,898677,B,10.26,14.71,66.2,321.6,0.09882,0.09159,0.03581,0.02037,...,10.88,19.48,70.89,357.1,0.136,0.1636,0.07162,0.04074,0.2434,0.08488
566,873885,M,15.28,22.41,98.92,710.6,0.09057,0.1052,0.05375,0.03263,...,17.8,28.03,113.8,973.1,0.1301,0.3299,0.363,0.1226,0.3175,0.09772
567,911201,B,14.53,13.98,93.86,644.2,0.1099,0.09242,0.06895,0.06495,...,15.8,16.93,103.1,749.9,0.1347,0.1478,0.1373,0.1069,0.2606,0.0781
568,9012795,M,21.37,15.1,141.3,1386.0,0.1001,0.1515,0.1932,0.1255,...,22.69,21.84,152.1,1535.0,0.1192,0.284,0.4024,0.1966,0.273,0.08666


In [None]:
df_cancerdata.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
0,87139402,B,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,...,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
1,8910251,B,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,...,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
2,905520,B,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,...,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
3,868871,B,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,...,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
4,9012568,B,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,...,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766


In [None]:
df_cancerdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 569 non-null    int64  
 1   diagnosis          569 non-null    object 
 2   radius_mean        569 non-null    float64
 3   texture_mean       569 non-null    float64
 4   perimeter_mean     569 non-null    float64
 5   area_mean          569 non-null    float64
 6   smoothness_mean    569 non-null    float64
 7   compactness_mean   569 non-null    float64
 8   concavity_mean     569 non-null    float64
 9   points_mean        569 non-null    float64
 10  symmetry_mean      569 non-null    float64
 11  dimension_mean     569 non-null    float64
 12  radius_se          569 non-null    float64
 13  texture_se         569 non-null    float64
 14  perimeter_se       569 non-null    float64
 15  area_se            569 non-null    float64
 16  smoothness_se      569 non

In [None]:
# Import required libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Create feature and response variable set
# We create train & test sample from our dataset
from sklearn.model_selection import train_test_split

# create feature & response variables
X = df_cancerdata.iloc[:,2:32]
Y = df_cancerdata['diagnosis']

# Create train & test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

In [None]:
# create the sub models
estimators = []

dt_model = DecisionTreeClassifier()
estimators.append(('DecisionTree', dt_model))

# Kindly note that we passed probability=True to our SVC function to allow SVC() to return class probabilities.
# In the SVC class, the default is probability=False.
svm_model = SVC(probability=True)
estimators.append(('SupportVector', svm_model))

logit_model = LogisticRegression()
estimators.append(('Logistic Regression', logit_model))

In [None]:
# fit the models on test data
dt_model.fit(X_train, Y_train)
svm_model.fit(X_train, Y_train)
logit_model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [None]:
dt_predictions = dt_model.predict_proba(X_test)
svm_predictions = svm_model.predict_proba(X_test)
logit_predictions = logit_model.predict_proba(X_test)

In [None]:
weighted_avg_predictions = (dt_predictions * 0.5 + svm_predictions * 0.1 + logit_predictions * 0.4) / 3

In [None]:
weighted_avg_predictions[0:5]

array([[0.1705947 , 0.16273863],
       [0.00076155, 0.33257178],
       [0.01969249, 0.31364085],
       [0.00342243, 0.3299109 ],
       [0.32834889, 0.00498445]])

In [None]:
svm_model.classes_

array(['B', 'M'], dtype=object)