# Ensemble Learning and Bagging,Boosting

<div  style="color:blue;font-family:Candara,arial,helvetica;line-height:20px"><strong>


## Ensemble learning is the process by which multiple models, such as classifiers or experts, are strategically generated and combined to solve a particular computational intelligence problem. Ensemble learning is primarily used to improve the (classification, prediction, function approximation, etc.) performance of a model, or reduce the likelihood of an unfortunate selection of a poor one. 

## Bagging is a way to decrease the variance in the prediction by generating additional data for training from dataset using combinations with repetitions to produce multi-sets of the original data. Boosting is an iterative technique which adjusts the weight of an observation based on the last classification.  
    
    
<img src="https://miro.medium.com/max/1169/1*_pfQ7Xf-BAwfQXtaBbNTEg.png" alt="drawing" width="600" height="300"/>     
    
<img src="https://miro.medium.com/max/850/1*DwvwMlOcT1T9hZwIJvMfng.png" alt="drawing" width="600" height="300"/>     
   

</strong></div>

# Evaluate the Adult Income Dataset using Random forests

## Import Library and Split into Test/Train

In [1]:
# Import libraries
import pandas as pd
from sklearn.metrics import confusion_matrix

# Read dataset
data = pd.read_csv('04 - decisiontreeAdultIncome.csv')

# Check for Null values
data.isnull().sum(axis=0)

# Create Dummy variables
data.dtypes
data_prep = pd.get_dummies(data, drop_first=True)


# Create X and Y Variables
X = data_prep.iloc[:, :-1]
Y = data_prep.iloc[:, -1]


# Split the X and Y dataset into training and testing set
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size = 0.3, random_state = 1234, stratify=Y)

## Evaluate the model

In [2]:

# Import and train Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1234)
rfc.fit(X_train, Y_train)


# Test the RFC model
Y_predict = rfc.predict(X_test)

# Evaluate the RFC model
cm2 = confusion_matrix(Y_test, Y_predict)
score2 = rfc.score(X_test, Y_test)

print(cm2)
print(score2)

[[3882  491]
 [ 712  852]]
0.7973724103082365




## Solving Iris problem using Random forest

In [3]:
# import and load the Iris Dataset
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
Y = iris.target

display(X)
display(Y)


# split, train test....
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = \
train_test_split(X, Y, test_size = 0.3, random_state = 1234, stratify=Y)

# Train the SVC 
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [4]:
# Import and train Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1234)
rfc.fit(X_train, Y_train)


# Test the model
Y_predict = rfc.predict(X_test)
Y_prob = rfc.predict_proba(X_test)[:,:]
print("Printing Y_Probability")
display(Y_prob)

# Evaluate the model
from sklearn.metrics import confusion_matrix
cm_iris = confusion_matrix(Y_test, Y_predict)
score_iris = rfc.score(X_test, Y_test)

print(score_iris)
print(cm_iris)

Printing Y_Probability




array([[0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.1, 0.9],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.9, 0.1],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 0.5, 0.5],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0. , 1. ],
       [0. , 0.7, 0.3],
       [0. , 1. , 0. ],
       [0. , 0.1, 0.9],
       [0. , 0.5, 0.5],
       [0. , 1. , 0. ],
       [0. , 0.9, 0.1],
       [0. , 0. , 1. ],
       [1. , 0. , 0. ],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ],
       [0. , 1. , 0. ],
       [0. , 0.9, 0.1],
       [0. , 0. , 1. ],
       [1. , 0. 

0.9555555555555556
[[15  0  0]
 [ 0 14  1]
 [ 0  1 14]]
