# Ensemble Techniques

# 6 no. answer

In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [55]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

In [56]:
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target

In [57]:
model = RandomForestClassifier(random_state=1)
model.fit(X, y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [58]:
feature_importances = pd.Series(model.feature_importances_, index=X.columns)

In [59]:
top_5_features = feature_importances.nlargest(5)
print("Top 5 most important features:")
print(top_5_features)

Top 5 most important features:
worst concave points    0.123350
worst perimeter         0.115661
worst area              0.105248
worst radius            0.102798
mean concave points     0.100735
dtype: float64


# 7 no. answer

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.datasets import load_iris

In [4]:
iris = load_iris()
X = iris.data
y = iris.target

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [6]:
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier(random_state=1)
dt_classifier.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [8]:
y_pred_dt = dt_classifier.predict(X_test)

In [9]:
from sklearn.metrics import accuracy_score
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Accuracy of single Decision Tree: {accuracy_dt:.4f}")

Accuracy of single Decision Tree: 0.9556


In [17]:
from sklearn.ensemble import BaggingClassifier
bagging_classifier = BaggingClassifier(estimator = DecisionTreeClassifier(random_state=1),n_estimators=10, random_state=1)
bagging_classifier.fit(X_train, y_train)

0,1,2
,estimator,DecisionTreeC...andom_state=1)
,n_estimators,10
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,1

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [18]:
y_pred_bagging = bagging_classifier.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
print(f"Accuracy of Bagging Classifier: {accuracy_bagging:.4f}")

Accuracy of Bagging Classifier: 0.9556


In [20]:
print(f"\nComparison:")
print(f"Single Decision Tree Accuracy: {accuracy_dt:.4f}")
print(f"Bagging Classifier Accuracy: {accuracy_bagging:.4f}")


Comparison:
Single Decision Tree Accuracy: 0.9556
Bagging Classifier Accuracy: 0.9556


In [22]:
if accuracy_bagging > accuracy_dt:
    print("The Bagging Classifier performed better than the single Decision Tree.")
elif accuracy_bagging < accuracy_dt:
     print("The single Decision Tree performed better than the Bagging Classifier.")
else:
    print("Both classifiers performed equally well.")

Both classifiers performed equally well.


# 8 no. answer

In [42]:
import pandas as pd 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score

In [44]:
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [46]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [47]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
params = {'max_depth': [1, 2, 3, 4, 5, 10, None],
         'n_estimators': [30, 40, 50, 100, 200, 300],
         'criterion': ['gini', 'entropy']}

params

{'max_depth': [1, 2, 3, 4, 5, 10, None],
 'n_estimators': [30, 40, 50, 100, 200, 300],
 'criterion': ['gini', 'entropy']}

In [48]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

In [49]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'max_depth': [None, 5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [50]:
print("Best parameters found: ", grid_search.best_params_)

Best parameters found:  {'max_depth': 5, 'n_estimators': 200}


In [51]:
best_rf_model = grid_search.best_estimator_

In [52]:
y_pred = best_rf_model.predict(X_test)

In [53]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Final accuracy on the test set: {accuracy:.4f}")

Final accuracy on the test set: 0.9532


# 9 no. answer

In [23]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [24]:
housing = fetch_california_housing()
X = housing.data
y = housing.target

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [26]:
bagging_reg = BaggingRegressor(random_state=1)
bagging_reg.fit(X_train, y_train)

0,1,2
,estimator,
,n_estimators,10
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,1


In [27]:
y_pred_bagging = bagging_reg.predict(X_test)
mse_bagging = mean_squared_error(y_test, y_pred_bagging)

In [28]:
rf_reg = RandomForestRegressor(random_state=1)
rf_reg.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
y_pred_rf = rf_reg.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)

In [30]:
print(f"Mean Squared Error (Bagging Regressor): {mse_bagging:.4f}")
print(f"Mean Squared Error (Random Forest Regressor): {mse_rf:.4f}")

Mean Squared Error (Bagging Regressor): 0.2843
Mean Squared Error (Random Forest Regressor): 0.2542


# 10 no. answer

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("Loan_default.csv")

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

In [5]:
data.isna().sum()

LoanID            0
Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
dtype: int64

In [6]:
X = data.drop(columns=['LoanID', 'Default'])
y = data['Default']

In [7]:
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(exclude=['object']).columns.tolist()

In [8]:
num_transformer = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='median'))
])

cat_transformer = Pipeline(steps=[
    ('imputation', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore'))
])

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)

In [10]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=1)

In [11]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

In [13]:
clf.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
y_pred_proba = clf.predict_proba(X_test)[:, 1]

In [15]:
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC AUC Score: {roc_auc:.3f}")

ROC AUC Score: 0.731


In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
cv_scores = cross_val_score(clf, X, y, cv=5, scoring='roc_auc')

In [18]:
print(f"Cross-Validation ROC AUC scores: {cv_scores}")
print(f"Mean ROC AUC: {cv_scores.mean():.3f}")

Cross-Validation ROC AUC scores: [0.733663   0.72892762 0.72913652 0.72864312 0.7243754 ]
Mean ROC AUC: 0.729
