# Imports

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

# Preprocessing

In [9]:
# Preprocessed Breast Cancer data!
df = pd.read_csv('https://raw.githubusercontent.com/karthikb19/data/master/breastcancer.csv')
X = df.drop('target', inplace=False, axis=1)
y = df['target']
df.head()

Unnamed: 0.1,Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,0,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,2.217515,...,-1.359293,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015,0
1,1,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,0.001392,...,-0.369203,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119,0
2,2,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,0.939685,...,-0.023974,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391,0
3,3,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,2.867383,...,0.133984,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501,0
4,4,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,-0.00956,...,-1.46677,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               569 non-null    int64  
 1   mean radius              569 non-null    float64
 2   mean texture             569 non-null    float64
 3   mean perimeter           569 non-null    float64
 4   mean area                569 non-null    float64
 5   mean smoothness          569 non-null    float64
 6   mean compactness         569 non-null    float64
 7   mean concavity           569 non-null    float64
 8   mean concave points      569 non-null    float64
 9   mean symmetry            569 non-null    float64
 10  mean fractal dimension   569 non-null    float64
 11  radius error             569 non-null    float64
 12  texture error            569 non-null    float64
 13  perimeter error          569 non-null    float64
 14  area error               5

In [None]:
features = ['Unamed', 'mean radius', 'mean texture', '']

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forests vs. Bagging 

In this Notebook, we are gonna cover Random Forests which is a very important ensemble model


Here we will compare bagging classifiers, and random forests to truly show how effective RF in terms of ML Models


RF is really cool because you can also view the feature importance of each class!

## Bagging CLF

In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

bag = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500, 
    max_samples=150, bootstrap=True, random_state=42
)

bag.fit(X_train, y_train)
bag_pred = bag.predict(X_test)
print("Bagging DT Accuracy: ", accuracy_score(y_test, bag_pred))

Bagging DT Accuracy:  0.9649122807017544


## Random Forests

In [7]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("Random Forests Accuracy: ", accuracy_score(y_test, rf_pred))

Random Forests Accuracy:  0.9707602339181286


#### Our Random Forests Classifier forms better than the Bagging model with the same hyper params!

### Random Forests Feature Importance(Breast Cancer)

In [13]:
## Shows the % importance of each feature 

rf.feature_importances_

array([0.01088686, 0.02952874, 0.01293538, 0.03853933, 0.04171815,
       0.00586171, 0.0134993 , 0.05614174, 0.12457716, 0.00409843,
       0.00448393, 0.01701003, 0.00464086, 0.01206647, 0.03493116,
       0.00491687, 0.00462889, 0.00783283, 0.00428836, 0.00528087,
       0.00505469, 0.10779097, 0.0200913 , 0.11314135, 0.10846539,
       0.01135294, 0.01422086, 0.03538667, 0.12876845, 0.01230302,
       0.00555729])

To show feature importance in greater detail, I will switch over to the Iris Dataset!

# Feature Importance

In [23]:
from sklearn.datasets import load_iris

iris = load_iris()
iris_rf = RandomForestClassifier(random_state=42)
iris_rf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], iris_rf.feature_importances_):
    print("Name: " + str(name) + "   -    Score: " + str(score))

Name: sepal length (cm)   -    Score: 0.10612761987750428
Name: sepal width (cm)   -    Score: 0.02167809317736852
Name: petal length (cm)   -    Score: 0.4361295069034437
Name: petal width (cm)   -    Score: 0.43606478004168353


The above data shows the specific feature followed by the importance of that specific feature to predicting the correct flower type