In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Load the dataset
churn_data = pd.read_csv('Churn_Modelling.csv')
insurance_data = pd.read_csv('insurance.csv')

In [4]:
print(churn_data.columns)
print(insurance_data.columns)


Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')


In [5]:
# Drop unnecessary columns (CustomerId, Surname, and RowNumber)
churn_data = churn_data.drop(['CustomerId', 'Surname', 'RowNumber'], axis=1)


In [6]:
print(churn_data.columns)

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')


In [7]:
# Preprocessing for Churn dataset
churn_data = pd.get_dummies(churn_data, columns=['Gender', 'Geography'], drop_first=True)
X_churn = churn_data.drop(['Exited'], axis=1)
y_churn = churn_data['Exited']
X_train_churn, X_test_churn, y_train_churn, y_test_churn = train_test_split(X_churn, y_churn, test_size=0.2, random_state=42)

In [8]:
# Preprocessing for Insurance dataset
insurance_data = pd.get_dummies(insurance_data, columns=['sex', 'smoker', 'region'], drop_first=True)
X_insurance = insurance_data.drop(['charges'], axis=1)
y_insurance = insurance_data['charges']
X_train_insurance, X_test_insurance, y_train_insurance, y_test_insurance = train_test_split(X_insurance, y_insurance, test_size=0.2, random_state=42)

In [9]:
# Standardize the features using StandardScaler
scaler = StandardScaler()
X_churn = scaler.fit_transform(X_churn)

In [10]:

scaler = StandardScaler()
X_insurance = scaler.fit_transform(X_insurance)

In [11]:
# Homogeneous ensemble with Random Forest for Churn dataset
rf_model_churn = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model_churn.fit(X_train_churn, y_train_churn)
y_pred_churn = rf_model_churn.predict(X_test_churn)
print('Accuracy for Random Forest (Churn):', accuracy_score(y_test_churn, y_pred_churn))
print('Precision for Random Forest (Churn):', precision_score(y_test_churn, y_pred_churn))
print('Recall for Random Forest (Churn):', recall_score(y_test_churn, y_pred_churn))
print('F1-score for Random Forest (Churn):', f1_score(y_test_churn, y_pred_churn))

Accuracy for Random Forest (Churn): 0.8675
Precision for Random Forest (Churn): 0.7758620689655172
Recall for Random Forest (Churn): 0.4580152671755725
F1-score for Random Forest (Churn): 0.576


In [17]:
from sklearn.metrics import accuracy_score


In [18]:
# Convert the target variable to categorical values
y_train_insurance = pd.cut(y_train_insurance, bins=[-float("inf"), 0.5, float("inf")], labels=[0, 1])

In [22]:
threshold = 0.5
y_test_insurance_binary = (y_test_insurance >= threshold).astype(int)

rf_model_insurance = RandomForestClassifier(random_state=42)
rf_model_insurance.fit(X_train_insurance, y_train_insurance)
y_pred_insurance = rf_model_insurance.predict(X_test_insurance)

print('Accuracy for Random Forest (Insurance):', accuracy_score(y_test_insurance_binary, y_pred_insurance))
print('Precision for Random Forest (Insurance):', precision_score(y_test_insurance_binary, y_pred_insurance))
print('Recall for Random Forest (Insurance):', recall_score(y_test_insurance_binary, y_pred_insurance))
print('F1-score for Random Forest (insurance):', f1_score(y_test_insurance_binary, y_pred_insurance))


Accuracy for Random Forest (Insurance): 1.0
Precision for Random Forest (Insurance): 1.0
Recall for Random Forest (Insurance): 1.0
F1-score for Random Forest (insurance): 1.0


In [24]:
# Heterogeneous ensemble for Churn dataset
dt_model_churn = DecisionTreeClassifier(random_state=42)
lr_model_churn = LogisticRegression(random_state=42)
svm_model_churn = SVC(kernel='rbf', random_state=42)

hetero_model_churn = VotingClassifier(
    estimators=[('dt', dt_model_churn), ('lr', lr_model_churn), ('svm', svm_model_churn)], 
    voting='hard'
)

hetero_model_churn.fit(X_train_churn, y_train_churn)
y_pred_churn = hetero_model_churn.predict(X_test_churn)

print('Accuracy for Heterogeneous ensemble (Churn):', accuracy_score(y_test_churn, y_pred_churn))
print('Precision for Heterogeneous ensemble (Churn):', precision_score(y_test_churn, y_pred_churn))
print('Recall for Heterogeneous ensemble (Churn):', recall_score(y_test_churn, y_pred_churn))
print('F1-score for Heterogeneous ensemble (Churn):', f1_score(y_test_churn, y_pred_churn))

Accuracy for Heterogeneous ensemble (Churn): 0.8075
Precision for Heterogeneous ensemble (Churn): 0.6333333333333333
Recall for Heterogeneous ensemble (Churn): 0.04834605597964377
F1-score for Heterogeneous ensemble (Churn): 0.08983451536643025


In [26]:
#Heterogeneous ensemble for Insurance dataset
# define models
dt_model_insurance = DecisionTreeClassifier(random_state=42)
lr_model_insurance = LogisticRegression(random_state=42)
svm_model_insurance = SVC(kernel='rbf', random_state=42)

In [27]:
# define heterogeneous ensemble model
hetero_model_insurance = VotingClassifier(estimators=[
    ('dt', dt_model_insurance), 
    ('lr', lr_model_insurance), 
    ('svm', svm_model_insurance)], 
    voting='hard')

In [29]:
import numpy as np

In [33]:
from sklearn.datasets import make_classification



In [34]:
# Generate some random binary classification data
X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=42)

In [35]:
# Check if there are at least two unique classes in the data
if len(np.unique(y)) < 2:
    raise ValueError("The data must contain at least two classes")

In [36]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Create a voting classifier using logistic regression and decision tree
logistic = LogisticRegression(random_state=42)
tree = DecisionTreeClassifier(random_state=42)
voting_clf = VotingClassifier(estimators=[('lr', logistic), ('dt', tree)], voting='hard')

In [38]:
# Fit the model on the training data
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('dt', DecisionTreeClassifier(random_state=42))])

In [39]:
# Evaluate the model on the test data
accuracy = voting_clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.85


In [48]:
# check number of unique values in y
unique_classes = np.unique(y_train_insurance)
num_classes = len(unique_classes)
print("Number of unique classes:", num_classes)

Number of unique classes: 1


In [43]:
# Get the unique classes in your dataset
classes = np.unique(y)


# Check if there is only one class in your dataset
if len(classes) == 1:
    # Duplicate your existing class
    X_new = np.concatenate((X, X), axis=0)
    y_new = np.concatenate((y, y+1), axis=0)
    
    # Add some random noise to the duplicated class
    noise = np.random.normal(size=(X.shape[0], X.shape[1]))
    X_new[X.shape[0]:] += noise
    
    # Update your dataset
    X = X_new
    y = y_new