## Ensemble Learning
- Hard votingClassifier: select a class that appears most.
- Soft votingClassifier: select a probability which is higher of the batch.

In [1]:
import pandas as pd 
import numpy as np

# NOTE: sl_no = serial number, ssc_p = secondary education percentage, ssc_b = boards of education, hsc_b = high secondary education percentage.

In [2]:
data_import = pd.read_csv("Placement_Data_Full_Class.csv")
data_import.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [3]:
data_import.describe()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,215.0,215.0,215.0,215.0,215.0,215.0,148.0
mean,108.0,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,62.209324,10.827205,10.897509,7.358743,13.275956,5.833385,93457.45242
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,54.5,60.6,60.9,61.0,60.0,57.945,240000.0
50%,108.0,67.0,65.0,66.0,71.0,62.0,265000.0
75%,161.5,75.7,73.0,72.0,83.5,66.255,300000.0
max,215.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [5]:
# Checking data types

for column in data_import.columns:
    print(column, data_import[column].dtype, len(data_import[column].unique()))

sl_no int64 215
gender object 2
ssc_p float64 103
ssc_b object 2
hsc_p float64 97
hsc_b object 2
hsc_s object 3
degree_p float64 89
degree_t object 3
workex object 2
etest_p float64 100
specialisation object 2
mba_p float64 205
status object 2
salary float64 46


In [6]:
# drop data sl_column, salary

dropped_columns = data_import.copy()

dropped_columns = dropped_columns.drop(["sl_no", "salary"], axis=1)
dropped_columns

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed
1,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed
3,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed
211,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed
212,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed
213,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed


In [7]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

label_encoded_data = dropped_columns.copy()
columns_to_label_encode = ["gender", "workex", "ssc_b", "hsc_b", "specialisation"]

label_encoded_data[columns_to_label_encode] = label_encoded_data[columns_to_label_encode].apply(encoder.fit_transform)
label_encoded_data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,67.00,1,91.00,1,Commerce,58.00,Sci&Tech,0,55.0,1,58.80,Placed
1,1,79.33,0,78.33,1,Science,77.48,Sci&Tech,1,86.5,0,66.28,Placed
2,1,65.00,0,68.00,0,Arts,64.00,Comm&Mgmt,0,75.0,0,57.80,Placed
3,1,56.00,0,52.00,0,Science,52.00,Sci&Tech,0,66.0,1,59.43,Not Placed
4,1,85.80,0,73.60,0,Commerce,73.30,Comm&Mgmt,0,96.8,0,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,1,82.00,1,Commerce,77.60,Comm&Mgmt,0,91.0,0,74.49,Placed
211,1,58.00,1,60.00,1,Science,72.00,Sci&Tech,0,74.0,0,53.62,Placed
212,1,67.00,1,67.00,1,Commerce,73.00,Comm&Mgmt,1,59.0,0,69.72,Placed
213,0,74.00,1,66.00,1,Commerce,58.00,Comm&Mgmt,0,70.0,1,60.23,Placed


In [8]:
hot_encoded_data = label_encoded_data.copy()

hot_encoded_data_y_placeholder = hot_encoded_data["status"]

# Removes the prediction column so that we don't encode it, pd.get_dummies are like one hot encoding
hot_encoded_data = hot_encoded_data.drop("status", axis=1)
hot_encoded_data = pd.get_dummies(hot_encoded_data)

hot_encoded_data = pd.concat([hot_encoded_data, hot_encoded_data_y_placeholder], axis=1)
hot_encoded_data

Unnamed: 0,gender,ssc_p,ssc_b,hsc_p,hsc_b,degree_p,workex,etest_p,specialisation,mba_p,hsc_s_Arts,hsc_s_Commerce,hsc_s_Science,degree_t_Comm&Mgmt,degree_t_Others,degree_t_Sci&Tech,status
0,1,67.00,1,91.00,1,58.00,0,55.0,1,58.80,0,1,0,0,0,1,Placed
1,1,79.33,0,78.33,1,77.48,1,86.5,0,66.28,0,0,1,0,0,1,Placed
2,1,65.00,0,68.00,0,64.00,0,75.0,0,57.80,1,0,0,1,0,0,Placed
3,1,56.00,0,52.00,0,52.00,0,66.0,1,59.43,0,0,1,0,0,1,Not Placed
4,1,85.80,0,73.60,0,73.30,0,96.8,0,55.50,0,1,0,1,0,0,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,1,80.60,1,82.00,1,77.60,0,91.0,0,74.49,0,1,0,1,0,0,Placed
211,1,58.00,1,60.00,1,72.00,0,74.0,0,53.62,0,0,1,0,0,1,Placed
212,1,67.00,1,67.00,1,73.00,1,59.0,0,69.72,0,1,0,1,0,0,Placed
213,0,74.00,1,66.00,1,58.00,0,70.0,1,60.23,0,1,0,1,0,0,Placed


In [11]:
# Scaling our data

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_data = hot_encoded_data.copy()
scaled_data_temp = scaled_data.iloc[:, -1]
scaled_data = scaled_data.iloc[:, :-1]

scaled_data = scaler.fit_transform(scaled_data)
scaled_data

array([[ 0.73943397, -0.02808697,  1.08245885, ..., -1.43924583,
        -0.23221018,  1.62605898],
       [ 0.73943397,  1.11336869, -0.92382264, ..., -1.43924583,
        -0.23221018,  1.62605898],
       [ 0.73943397, -0.21323793, -0.92382264, ...,  0.69480833,
        -0.23221018, -0.61498384],
       ...,
       [ 0.73943397, -0.02808697,  1.08245885, ...,  0.69480833,
        -0.23221018, -0.61498384],
       [-1.35238581,  0.61994138,  1.08245885, ...,  0.69480833,
        -0.23221018, -0.61498384],
       [ 0.73943397, -0.49096436, -0.92382264, ...,  0.69480833,
        -0.23221018, -0.61498384]])

In [12]:
# Train-test split

from sklearn.model_selection import train_test_split

X = scaled_data
y = scaled_data_temp

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
X_train

array([[-1.35238581,  0.5273659 , -0.92382264, ...,  0.69480833,
        -0.23221018, -0.61498384],
       [-1.35238581,  0.15706399,  1.08245885, ...,  0.69480833,
        -0.23221018, -0.61498384],
       [ 0.73943397, -1.07604139,  1.08245885, ...,  0.69480833,
        -0.23221018, -0.61498384],
       ...,
       [-1.35238581,  0.80509234, -0.92382264, ..., -1.43924583,
        -0.23221018,  1.62605898],
       [ 0.73943397,  0.85138008,  1.08245885, ..., -1.43924583,
        -0.23221018,  1.62605898],
       [ 0.73943397,  0.15706399,  1.08245885, ...,  0.69480833,
        -0.23221018, -0.61498384]])

In [13]:
# Ensemble Predictors

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(estimators = [('lr', log_clf), ('rf', rnd_clf), ('sv', svm_clf)], voting="hard")

In [14]:
# Training the Ensemble classifier

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('sv', SVC())])

In [15]:
# Measure the accuracy of our classifier

from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf,svm_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.8309859154929577
RandomForestClassifier 0.8450704225352113
SVC 0.8028169014084507


## Bagging and Pasting
- Bagging: Picking a subset and train it then put it back. This is generally preferred.
- Pasting: Picking a subset with replacement. select subset cannot be used again. 
- Out-of-Bag Evaluation: Samples that are not used when selecting a sample.
- Random patches & Random subspaces: randon features & training instances used to speed up training.

In [24]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bootstrap determine whether this is bagging or pasting.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8309859154929577


In [22]:
# Random Forests

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

for name, score in zip(hot_encoded_data.iloc[:,:-1].columns, rf_clf.feature_importances_):
    print(name, ":" , score)

gender : 0.022470317005024786
ssc_p : 0.2764541592611236
ssc_b : 0.01546090106675862
hsc_p : 0.17741881839469115
hsc_b : 0.013560606151609476
degree_p : 0.18069780311873448
workex : 0.03614792609135309
etest_p : 0.0735936737436032
specialisation : 0.03355336400411124
mba_p : 0.10651579882048205
hsc_s_Arts : 0.0014469517865412192
hsc_s_Commerce : 0.008319176940017714
hsc_s_Science : 0.011475124611953286
degree_t_Comm&Mgmt : 0.015859521308699057
degree_t_Others : 0.015067847719588079
degree_t_Sci&Tech : 0.01195800997570903


In [23]:
# adaboost: trains more underfit data each time.

from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))



0.9014084507042254
