In [17]:
import pandas as pd
import numpy as np

In [18]:
# create sample classification dataset with 1000 rows and 20 features
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=1)
print(X.shape, y.shape)

# converting to dataframes
X = pd.DataFrame(X)
y = pd.Series(y)

(1000, 20) (1000,)


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the base classifier (you can use any classifier here)
base_classifier = DecisionTreeClassifier(random_state=42)

# Define the bagging classifier
bagging_classifier = BaggingClassifier(base_estimator=base_classifier, n_estimators=10, random_state=42)

# Train the bagging classifier
bagging_classifier.fit(X_train, y_train)

# Make predictions
y_pred = bagging_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [19]:
# preprocessing to make all inputs between 0 and 1
from sklearn.preprocessing import MinMaxScaler

scaler= MinMaxScaler()
X= scaler.fit_transform(X)


In [20]:
# logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

lr = LogisticRegression(solver="liblinear", random_state=1)
cross_val_score(lr, X, y).mean()


0.866

In [21]:
# random forest model
from sklearn.ensemble import RandomForestClassifier

rf= RandomForestClassifier(n_estimators=100, random_state=1)
cross_val_score(rf, X, y).mean()

0.909

In [22]:
# multinomial nb model
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
cross_val_score(nb, X, y).mean()

0.7969999999999999

In [24]:
# ensemble
from sklearn.ensemble import VotingClassifier

vc = VotingClassifier([("clf1", lr), ("clf2", rf), ("clf3", nb)])
cross_val_score(vc, X, y).mean()

0.8629999999999999

In [25]:
# finetuning votingclassifier
params= {'voting': ['hard', 'soft'], 
         'weights': [[1,1,1], [1,2,1], [1,1,2], [1,2,2]],
         'clf1__C': [0.01, 0.1, 1, 10, 100],
         'clf2__n_estimators': [10, 100, 1000],
         'clf3__alpha': [0.01, 0.1, 1, 10, 100]}


In [26]:
# grid
from sklearn.model_selection import GridSearchCV

grid= GridSearchCV(vc, params, cv=5, n_jobs=-1)
grid.fit(X, y)
grid.best_params_

{'clf1__C': 0.01,
 'clf2__n_estimators': 1000,
 'clf3__alpha': 100,
 'voting': 'soft',
 'weights': [1, 2, 1]}

In [27]:
# voring classifier with best parameters
vc = VotingClassifier(estimators=[("clf1", LogisticRegression(C=0.01, solver="liblinear", random_state=1)), 
                       ("clf2", RandomForestClassifier(n_estimators=1000, random_state=1)), 
                       ("clf3", MultinomialNB(alpha=0.01))], voting="soft", weights=[1,2,1])

cross_val_score(vc, X, y).mean()

0.9149999999999998

In [40]:
x_sample= X[100]
vc.fit(X,y)
vc.predict(x_sample.reshape(1, -1))

array([1])

In [32]:
# to do create custom estimator

from sklearn.utils.estimator_checks import check_estimator
from sklearn.svm import LinearSVC

check_estimator(LinearSVC())



In [34]:
from sklearn.base import BaseEstimator

class customEstimator(BaseEstimator):
    def __init__(self, C=1):
        self.C = C

    def fit(self, X, y):
        self.clf = LinearSVC(C=self.C)
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
    
    def score(self, X, y):
        return cross_val_score(self.clf, X, y).mean()

In [37]:
ce= customEstimator()
ce.fit(X, y)
ce.score(X, y)




0.8699999999999999

In [None]:
# param setting for ensembling

# clf1 raw-gat-st
# clf2 raw-pc

params= {'voting': ['hard', 'soft'], 
        'weights': [[1,1,1], [1,2,1], [1,1,2], [1,2,2]],
        'clf1__lr': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.90],
        'clf1__momentum': [1e-4, 1e-3, 1e-2, 0.90],
        'clf1__lr': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.90],
        'clf1__momentum': [1e-4, 1e-3, 1e-2, 0.90],
        'clf2__n_estimators': [10, 100, 1000],
        'clf3__alpha': [0.01, 0.1, 1, 10, 100]}