### Import Libraries

In [129]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier


from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import accuracy_score

### Import Dataset

In [130]:
# Read the auto dataset from the working directory and preview every 40th row.
df = pd.read_csv("auto.csv")
df.iloc[::40]

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
40,35.1,81.0,60,1760,16.1,Asia,10.0
80,30.7,145.0,76,3160,19.6,Europe,15.0
120,27.0,101.0,83,2202,15.3,Europe,10.0
160,31.0,91.0,68,1970,17.6,Asia,10.0
200,19.0,232.0,90,3211,17.0,US,15.0
240,25.4,183.0,77,3530,20.1,Europe,12.5
280,26.6,151.0,84,2635,16.4,US,10.0
320,35.7,98.0,80,1915,14.4,US,10.0
360,24.0,200.0,81,3012,17.6,US,15.0


In [131]:
# One-hot encode the non-numeric columns (the preview shows `origin` replaced
# by origin_* indicator columns), then preview every 40th row again.
df = pd.get_dummies(data = df)
df.iloc[::40]

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Asia,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,0,1
40,35.1,81.0,60,1760,16.1,10.0,1,0,0
80,30.7,145.0,76,3160,19.6,15.0,0,1,0
120,27.0,101.0,83,2202,15.3,10.0,0,1,0
160,31.0,91.0,68,1970,17.6,10.0,1,0,0
200,19.0,232.0,90,3211,17.0,15.0,0,0,1
240,25.4,183.0,77,3530,20.1,12.5,0,1,0
280,26.6,151.0,84,2635,16.4,10.0,0,0,1
320,35.7,98.0,80,1915,14.4,10.0,0,0,1
360,24.0,200.0,81,3012,17.6,15.0,0,0,1


In [132]:
# Features: every column except the target and the origin indicator columns.
# NOTE(review): the origin_* dummies created in the previous cell are discarded
# here, so they never reach the model -- confirm that this is intended.
X = df.drop(columns = ["mpg", "origin_Asia", "origin_Europe", "origin_US"])

# Target: fuel efficiency.
y = df.loc[:, "mpg"]

In [133]:
# Train Test Split
# NOTE: an integer test_size is an absolute number of held-out rows (30 rows),
# not a fraction of the data.
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 30,
                                                    random_state = 21)


# Hyper-parameter grid for the regression tree.
# A float min_samples_leaf is a fraction of the training rows and has to lie
# strictly between 0 and 1: 0.0 and 1.0 are rejected by scikit-learn (those fits
# fail and are silently scored as NaN), and any fraction above 0.5 can never
# produce a split because two leaves would each need more than half the rows.
# The grid therefore covers the unique fractions 0.01, 0.02, ..., 0.50 instead
# of rounding 110 linspace points, which also produced duplicate candidates.
parameters = {"max_depth": np.arange(1, 10),
              "min_samples_leaf": np.arange(1, 51) / 100}


dt = DecisionTreeRegressor(random_state = 21)

# Exhaustive search over the grid with 10-fold cross-validation.
cv = GridSearchCV(dt, parameters, cv = 10)

cv.fit(x_train, y_train)

print("Best CV parameters =", cv.best_params_)
# best_score_ is the mean cross-validated score of the estimator's default
# scorer, which for a regressor is R^2 -- not a classification accuracy.
print("Best CV R^2 =", cv.best_score_)

Best CV parameters = {'max_depth': 5, 'min_samples_leaf': 0.08}
Best CV accuracy = 0.7232437587316721


In [134]:
# Predict the held-out rows with the refitted best estimator and show the
# first 20 actual/predicted pairs side by side.
y_pred = cv.predict(x_test)
pd.DataFrame({"Actual": y_test, "Predicted": y_pred}).head(20)

Unnamed: 0,Actual,Predicted
195,12.0,13.234211
55,30.9,28.980392
279,16.0,13.234211
347,25.5,25.2
192,15.5,15.398039
127,16.0,19.938462
175,15.0,13.234211
40,35.1,32.7
89,13.0,13.234211
16,36.0,32.7


In [135]:
# Mean squared error of the test-set predictions.
score = MSE(y_true = y_test, y_pred = y_pred)
score

20.258550363041206

In [136]:
# Root mean squared error: the square root of the MSE computed above.
rmse = score ** 0.5
rmse

4.500949940072785

### Voting Classifier in sklearn

In [137]:
from sklearn.datasets import load_breast_cancer

In [138]:
# Load scikit-learn's bundled breast-cancer dataset; the cells below read its
# "data", "feature_names" and "target" entries.
data = load_breast_cancer()

In [139]:
# Feature matrix as a DataFrame, with the dataset's feature names as column labels
X = pd.DataFrame(data["data"], columns = data["feature_names"])
X.iloc[::100]

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
100,13.61,24.98,88.05,582.7,0.09488,0.08511,0.08625,0.04489,0.1609,0.05871,...,16.99,35.27,108.6,906.5,0.1265,0.1943,0.3169,0.1184,0.2651,0.07397
200,12.23,19.56,78.54,461.0,0.09586,0.08087,0.04187,0.04107,0.1979,0.06013,...,14.44,28.36,92.15,638.4,0.1429,0.2042,0.1377,0.108,0.2668,0.08174
300,19.53,18.9,129.5,1217.0,0.115,0.1642,0.2197,0.1062,0.1792,0.06552,...,25.93,26.24,171.1,2053.0,0.1495,0.4116,0.6121,0.198,0.2968,0.09929
400,17.91,21.02,124.4,994.0,0.123,0.2576,0.3189,0.1198,0.2113,0.07115,...,20.8,27.78,149.6,1304.0,0.1873,0.5917,0.9034,0.1964,0.3245,0.1198
500,15.04,16.74,98.73,689.4,0.09883,0.1364,0.07721,0.06142,0.1668,0.06869,...,16.76,20.43,109.7,856.9,0.1135,0.2176,0.1856,0.1018,0.2177,0.08549


In [140]:
# Target vector (class label per row); preview every 100th label
y = data.target
y[::100]

array([0, 0, 1, 0, 0, 1])

In [141]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 10
)


In [142]:
# Classifiers: (display name, estimator) pairs, reused later by the VotingClassifier
classifiers = [("Logistic Regression", LogisticRegression(random_state = 10)),
               ("K Nearest Neighbours", KNN()),
               ("Classification Tree", DecisionTreeClassifier(random_state = 10))]

# Fit each model on the training split and report its test-set accuracy.
for name, clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    # Use the built-in round(): it works for both a plain Python float and a
    # NumPy scalar, whereas `score.round(3)` fails on a plain float.
    print(f"{name} : {round(score, 3)}")

Logistic Regression : 0.947
K Nearest Neighbours : 0.942
Classification Tree : 0.93


In [143]:
# Combine the three (name, estimator) pairs into one voting ensemble
vc = VotingClassifier(estimators = classifiers)

# fit() returns the fitted ensemble, so training and prediction can be chained
y_pred = vc.fit(x_train, y_train).predict(x_test)

# Test-set accuracy of the ensemble
score = accuracy_score(y_test, y_pred)
print(score)

0.9473684210526315


### Indian Liver Patient Dataset 

In [144]:
# Read the Indian Liver Patient data from the working directory and preview
# every 50th row
df = pd.read_csv("ILPD.csv")
df.iloc[::50]

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
50,45,Female,0.7,0.2,170,21,14,5.7,2.5,0.7,1
100,27,Male,0.6,0.2,161,27,28,3.7,1.6,0.76,2
150,56,Male,1.1,0.5,180,30,42,6.9,3.8,1.2,2
200,49,Male,0.6,0.1,218,50,53,5.0,2.4,0.9,1
250,33,Male,2.1,1.3,480,38,22,6.5,3.0,0.8,1
300,58,Male,0.8,0.2,180,32,25,8.2,4.4,1.1,2
350,37,Male,1.8,0.8,145,62,58,5.7,2.9,1.0,1
400,66,Female,0.7,0.2,162,24,20,6.4,3.2,1.0,2
450,65,Male,7.9,4.3,282,50,72,6.0,3.0,1.0,1


In [145]:
# Modification Function
def patient(x):
    """Recode a raw ``is_patient`` value: 1 becomes 0, any other value becomes 1.

    NOTE(review): the preview of the raw column shows the values 1 and 2;
    confirm which of them denotes a liver patient before interpreting the
    recoded labels.
    """
    return 0 if x == 1 else 1

In [146]:
# Recode the target column element by element with patient().
# NOTE: running this cell twice flips the labels again, because patient()
# maps 1 -> 0 and everything else (including 0) -> 1.
df["is_patient"] = df["is_patient"].map(patient)

In [147]:
# One-hot encode the non-numeric columns (the preview shows `gender` replaced
# by gender_Female / gender_Male), then preview every 50th row.
df = pd.get_dummies(data = df)
df.iloc[::50]

Unnamed: 0,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient,gender_Female,gender_Male
0,65,0.7,0.1,187,16,18,6.8,3.3,0.9,0,1,0
50,45,0.7,0.2,170,21,14,5.7,2.5,0.7,0,1,0
100,27,0.6,0.2,161,27,28,3.7,1.6,0.76,1,0,1
150,56,1.1,0.5,180,30,42,6.9,3.8,1.2,1,0,1
200,49,0.6,0.1,218,50,53,5.0,2.4,0.9,0,0,1
250,33,2.1,1.3,480,38,22,6.5,3.0,0.8,0,0,1
300,58,0.8,0.2,180,32,25,8.2,4.4,1.1,1,0,1
350,37,1.8,0.8,145,62,58,5.7,2.9,1.0,0,0,1
400,66,0.7,0.2,162,24,20,6.4,3.2,1.0,1,1,0
450,65,7.9,4.3,282,50,72,6.0,3.0,1.0,0,0,1


In [148]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis = 0, how = "any")

In [149]:
# Features: all columns except the label
X = df.drop(columns = "is_patient")

# Target: the recoded label column
y = df.loc[:, "is_patient"]

In [150]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 5
)

**Find best parameters for logistic regression**

In [151]:
# Tune the inverse regularisation strength C with a cross-validated grid search
lr = LogisticRegression(random_state = 5)
cv_lr = GridSearchCV(estimator = lr,
                     param_grid = {"C": [0.001, 0.01, 0.1, 1, 10]})
cv_lr.fit(x_train, y_train)

print("Best CV params =", cv_lr.best_params_)

# Score the refitted best model on the held-out split
yhat = cv_lr.predict(x_test)
print("Accuracy =", accuracy_score(y_test, yhat))

Best CV params = {'C': 0.001}
Accuracy = 0.7413793103448276


**Find best parameters for the K-Nearest Neighbours Classifier**

In [152]:
# Tune the neighbourhood size (1 to 99) with a cross-validated grid search
knn = KNN()
cv_knn = GridSearchCV(estimator = knn,
                      param_grid = {"n_neighbors": np.arange(1, 100)})
cv_knn.fit(x_train, y_train)

print("Best CV params =", cv_knn.best_params_)

# Score the refitted best model on the held-out split
yhat = cv_knn.predict(x_test)
print("Accuracy =", accuracy_score(y_test, yhat))

Best CV params = {'n_neighbors': 56}
Accuracy = 0.7241379310344828


**Find best parameters for the Decision Tree Classifier**

In [153]:
tree = DecisionTreeClassifier(random_state = 5)

# Tune tree depth and minimum leaf size with a cross-validated grid search.
# A float min_samples_leaf is a fraction of the training rows and has to lie
# strictly between 0 and 1: 0.0 and 1.0 are rejected by scikit-learn (those fits
# fail and are silently scored as NaN), and any fraction above 0.5 can never
# produce a split. The grid therefore covers the unique fractions
# 0.01, 0.02, ..., 0.50 instead of rounding 110 linspace points, which also
# produced duplicate candidates.
cv_dt = GridSearchCV(tree, {"max_depth": np.arange(1, 10),
                      "min_samples_leaf": np.arange(1, 51) / 100})

cv_dt.fit(x_train, y_train)

# Score the refitted best model on the held-out split
yhat = cv_dt.predict(x_test)
print("Best CV params =", cv_dt.best_params_)
print("Accuracy =", accuracy_score(y_test, yhat))

Best CV params = {'max_depth': 2, 'min_samples_leaf': 0.01}
Accuracy = 0.7183908045977011


**Define the ensemble**

In [154]:
# The hyper-parameters below are copied from the grid-search results printed above.

# Instantiate lr
lr = LogisticRegression(C = 0.001, random_state = 5)

# Instantiate knn
knn = KNN(n_neighbors = 56)

# Instantiate dt
# random_state matches the tree that was tuned in the grid search; without it the
# tree permutes features randomly at each split and the result is not reproducible.
dt = DecisionTreeClassifier(max_depth = 2, min_samples_leaf = 0.01, random_state = 5)

In [155]:
# (display name, estimator) pairs, reused later by the VotingClassifier
classifiers = [
    ("Logistic Regression", lr),
    ("K Nearest Neighbours", knn),
    ("Decision Tree Classifier", dt),
]

# Train every tuned model and report its accuracy on the held-out split
for label, model in classifiers:
    yhat = model.fit(x_train, y_train).predict(x_test)
    score = accuracy_score(y_test, yhat)
    print(f"{label} : {score}")

Logistic Regression : 0.7413793103448276
K Nearest Neighbours : 0.7241379310344828
Decision Tree Classifier : 0.7183908045977011


In [156]:
# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators = classifiers)

# Fit vc to the training set
vc.fit(x_train, y_train)

# Evaluate the test set predictions
y_pred = vc.predict(x_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Use the built-in round(): it works for both a plain Python float and a NumPy
# scalar, whereas `accuracy.round(3)` fails on a plain float.
print("Voting Classifier Accuracy =", round(accuracy, 3))

Voting Classifier Accuracy = 0.724
