<a href="https://www.kaggle.com/code/jatin2055/ensemble-bagging?scriptVersionId=262615543" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and echo the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score
from sklearn .datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier

In [None]:
# Synthetic 3-class dataset: 10,000 rows and 20 features, 10 of them informative.
# The fixed seed keeps the generated data identical between runs.
X, y = make_classification(
    n_samples=10000,
    n_features=20,
    n_informative=10,
    n_classes=3,
    random_state=42,
)

In [None]:
# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: one unpruned decision tree, used as the reference point for the ensembles below.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the tree on the held-out split.
y_pred = dt.predict(X_test)
cr = classification_report(y_test, y_pred)

# Per-class report first, then overall accuracy on its own line.
print(cr, accuracy_score(y_test, y_pred), sep="\n")

# Accuracy recorded from the author's run: 0.7915


# BAGGING

In [None]:
# Bagging: fit many decision trees, each on its own bootstrap sample of the
# training rows, and aggregate their predictions.
bag = BaggingClassifier(
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
    # removed in 1.4; the new name works on every release from 1.2 onward.
    estimator=DecisionTreeClassifier(),
    n_estimators=500,  # number of trees in the ensemble
    max_samples=0.25,  # each tree is fit on a sample 25% the size of X_train
    bootstrap=True,    # rows are drawn WITH replacement, so a sample may contain duplicates
    random_state=42,   # reproducible sampling
)

bag.fit(X_train, y_train)

In [None]:
# Score the bagged decision trees on the held-out split.
y_pred = bag.predict(X_test)
cr = classification_report(y_test, y_pred)

# Per-class report first, then overall accuracy on its own line.
print(cr, accuracy_score(y_test, y_pred), sep="\n")
# Accuracy recorded from the author's run: 0.8755


In [None]:
# Indices of the training rows drawn for the first tree; with max_samples=0.25
# the shape is expected to show a quarter of the training-set size.
first_tree_rows = bag.estimators_samples_[0]
first_tree_rows.shape

# Bagging with SVM (SVC) base estimators

In [None]:
# Bagging with support-vector classifiers as the base model instead of trees.
bag_svc = BaggingClassifier(
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
    # removed in 1.4; the new name works on every release from 1.2 onward.
    estimator=SVC(),
    n_estimators=500,  # number of SVCs in the ensemble
    max_samples=0.25,  # each SVC is fit on a sample 25% the size of X_train
    bootstrap=True,    # rows are drawn WITH replacement, so a sample may contain duplicates
    random_state=42,   # reproducible sampling
    verbose=1,         # report progress while fitting
    n_jobs=-1,         # fit the base estimators in parallel on all available cores
)

bag_svc.fit(X_train, y_train)

In [None]:
# Score the bagged SVCs on the held-out split.
y_pred = bag_svc.predict(X_test)

cr = classification_report(y_test, y_pred)

print(cr)
print(accuracy_score(y_test, y_pred))
# Accuracy recorded from the author's run: 0.8905
# (Kept as a comment: a bare `0.8905` here would be displayed as the cell's
# output on every run, regardless of what the model actually scored.)



# Pasting

In [None]:
# Pasting: like bagging, but each tree's rows are drawn WITHOUT replacement.
bag_pasting = BaggingClassifier(
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
    # removed in 1.4; the new name works on every release from 1.2 onward.
    estimator=DecisionTreeClassifier(),
    n_estimators=500,  # number of trees in the ensemble
    max_samples=0.25,  # each tree is fit on a sample 25% the size of X_train
    bootstrap=False,   # rows are drawn without replacement: no duplicates within a sample
    random_state=42,   # reproducible sampling
    verbose=1,         # report progress while fitting
    n_jobs=-1,         # fit the trees in parallel on all available cores
)

bag_pasting.fit(X_train, y_train)

In [None]:
# Score the pasting ensemble on the held-out split.
y_pred = bag_pasting.predict(X_test)
cr = classification_report(y_test, y_pred)

# Per-class report first, then overall accuracy on its own line.
print(cr, accuracy_score(y_test, y_pred), sep="\n")
# Accuracy recorded from the author's run: 0.8765



# Random SubSpaces

In [None]:
# Random Subspaces: every tree sees all training rows but only a random
# subset of the features.
bag_rss = BaggingClassifier(
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
    # removed in 1.4; the new name works on every release from 1.2 onward.
    estimator=DecisionTreeClassifier(),
    n_estimators=500,         # number of trees in the ensemble
    max_samples=1.0,          # 100% of the training rows go to every tree
    bootstrap=False,          # rows are drawn without replacement: no duplicate rows
    random_state=42,          # reproducible sampling
    max_features=0.5,         # each tree is given half of the features
    bootstrap_features=True,  # features are drawn WITH replacement, so a tree may get a feature twice
    verbose=1,                # report progress while fitting
    n_jobs=-1,                # fit the trees in parallel on all available cores
)

bag_rss.fit(X_train, y_train)

In [None]:
# Score the random-subspaces ensemble on the held-out split.
y_pred = bag_rss.predict(X_test)
cr = classification_report(y_test, y_pred)

# Per-class report first, then overall accuracy on its own line.
print(cr, accuracy_score(y_test, y_pred), sep="\n")
# Accuracy recorded from the author's run: 0.8985



# Random Patches

In [None]:
# Random Patches: each tree is fit on a random subset of BOTH the rows and
# the features.
bag_rp = BaggingClassifier(
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
    # removed in 1.4; the new name works on every release from 1.2 onward.
    estimator=DecisionTreeClassifier(),
    n_estimators=500,         # number of trees in the ensemble
    max_samples=0.25,         # each tree is fit on a sample 25% the size of X_train
    bootstrap=True,           # rows are drawn WITH replacement, so a sample may contain duplicates
    random_state=42,          # reproducible sampling
    max_features=0.5,         # each tree is given half of the features
    bootstrap_features=True,  # features are drawn WITH replacement, so a tree may get a feature twice
    verbose=1,                # report progress while fitting
    n_jobs=-1,                # fit the trees in parallel on all available cores
)

bag_rp.fit(X_train, y_train)

In [None]:
# Score the random-patches ensemble on the held-out split.
y_pred = bag_rp.predict(X_test)
cr = classification_report(y_test, y_pred)

# Per-class report first, then overall accuracy on its own line.
print(cr, accuracy_score(y_test, y_pred), sep="\n")
# Accuracy recorded from the author's run: 0.8785



# OOB Score

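Out-of-bag (OOB) samples are the training rows that a given estimator never drew into its bootstrap sample; because that estimator has not seen them, they can be used to score the ensemble without a separate validation set.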

In [None]:
# Bagging with out-of-bag scoring: each tree is evaluated on the training rows
# that were left out of its own bootstrap sample.
bag_oob = BaggingClassifier(
    # `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and
    # removed in 1.4; the new name works on every release from 1.2 onward.
    estimator=DecisionTreeClassifier(),
    n_estimators=500,  # number of trees in the ensemble
    max_samples=0.25,  # each tree is fit on a sample 25% the size of X_train
    bootstrap=True,    # rows are drawn WITH replacement (required for oob_score)
    random_state=42,   # reproducible sampling
    oob_score=True,    # score on out-of-bag rows, i.e. rows that never made it into a tree's sample
)

bag_oob.fit(X_train, y_train)

In [None]:
# Score the OOB-enabled bagging ensemble on the held-out split.
y_pred = bag_oob.predict(X_test)

cr = classification_report(y_test, y_pred)

print(cr)
print(accuracy_score(y_test, y_pred))
# Accuracy recorded from the author's run: 0.8755
# (Kept as a comment: a bare `0.8755` here would be displayed as the cell's
# output on every run, regardless of what the model actually scored.)



In [None]:
# Out-of-bag score computed during fit; compare it with the test accuracy above.
oob_accuracy = bag_oob.oob_score_
oob_accuracy

# GRID SEARCH CV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for BaggingClassifier: 10 x 6 x 2 x 6 = 720 candidates.
# Fractions are kept as floats on purpose: an integer 1 would mean "one
# sample/feature", whereas 1.0 means "all of them".
parameters = dict(
    n_estimators=list(range(50, 501, 50)),
    max_samples=[0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    bootstrap=[True, False],
    max_features=[0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
)

In [None]:
# Exhaustive 10-fold cross-validated search over the `parameters` grid.
# NOTE: this is expensive (every candidate is fit once per fold), so the folds
# and candidates are spread over all available cores.
search = GridSearchCV(
    # Seed the ensemble so the search result is reproducible, as with every
    # other model in this notebook.
    BaggingClassifier(random_state=42),
    parameters,
    cv=10,
    n_jobs=-1,
)

search.fit(X_train, y_train)

In [None]:
# Report the best mean cross-validated score and the parameter combination
# that achieved it. The `f` prefix is required: without it the braces are
# printed literally instead of being replaced by the values.
print(f'best_score_: {search.best_score_}')
print(f'best_params_: {search.best_params_}')