## Prepare the Data

In [1]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
bank_data_df = pd.read_csv('cardio_train.csv')

# Review the DataFrame
bank_data_df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,y
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
# Split the features and target data
y = bank_data_df['y']
X = bank_data_df.drop(columns='y')

In [4]:
# Encode the features dataset's categorical variables using get_dummies
X = pd.get_dummies(X)

# Review the features DataFrame
X.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0


In [5]:
# Split data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
# Review the distinct values from y
y_train.value_counts()

y
0    26412
1    26088
Name: count, dtype: int64

In [7]:
# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

---

## RandomForestClassifier

### Create and fit a `RandomForestClassifier` to the **scaled** training data.

In [8]:
# Import the RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train_scaled, y_train)

In [9]:
# Predict labels for original scaled testing features
y_pred = model.predict(X_test_scaled)

---

## Random Undersampler

### Import `RandomUnderSampler` from `imblearn`.

In [10]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

In [11]:
# Fit the training data to the random undersampler model
X_undersampled, y_undersampled = rus.fit_resample(X_train_scaled, y_train)

In [12]:
# Count distinct values for the resampled target data
y_undersampled.value_counts()

y
0    26088
1    26088
Name: count, dtype: int64

###  Create and fit a `RandomForestClassifier` to the **undersampled** training data.

In [13]:
# Instantiate a new RandomForestClassier model
model_undersampled = RandomForestClassifier()

# Fit the undersampled data the new model
model_undersampled.fit(X_undersampled, y_undersampled)

In [14]:
# Predict labels for oversampled testing features
y_pred_undersampled = model_undersampled.predict(X_test_scaled)

In [15]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))

Classification Report - Original Data
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      8609
           1       0.75      0.70      0.72      8891

    accuracy                           0.73     17500
   macro avg       0.73      0.73      0.73     17500
weighted avg       0.73      0.73      0.73     17500

---------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

           0       0.71      0.75      0.73      8609
           1       0.75      0.70      0.72      8891

    accuracy                           0.73     17500
   macro avg       0.73      0.73      0.73     17500
weighted avg       0.73      0.73      0.73     17500



---

## Random Oversampler

###  Import `RandomOverSampler` from `imblearn`.

In [16]:
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler

# Instantiate a RandomOversampler instance
ros = RandomOverSampler(random_state=1)

In [17]:
# Fit the training data to the `RandomOverSampler` model
X_oversampled, y_oversampled = ros.fit_resample(X_train_scaled, y_train)

In [18]:
# Count distinct values
y_oversampled.value_counts()

y
0    26412
1    26412
Name: count, dtype: int64

###  Create and fit a `RandomForestClassifier` to the **oversampled** training data.

In [19]:
# Instantiate a new RandomForestClassier model
model_oversampled = RandomForestClassifier()

# Fit the oversampled data the new model
model_oversampled.fit(X_oversampled, y_oversampled)

In [20]:
# Predict labels for oversampled testing features
y_pred_oversampled = model_oversampled.predict(X_test_scaled)

In [21]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))
print("---------")
print(f"Classification Report - Oversampled Data")
print(classification_report(y_test, y_pred_oversampled))

Classification Report - Original Data
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      8609
           1       0.75      0.70      0.72      8891

    accuracy                           0.73     17500
   macro avg       0.73      0.73      0.73     17500
weighted avg       0.73      0.73      0.73     17500

---------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

           0       0.71      0.75      0.73      8609
           1       0.75      0.70      0.72      8891

    accuracy                           0.73     17500
   macro avg       0.73      0.73      0.73     17500
weighted avg       0.73      0.73      0.73     17500

---------
Classification Report - Oversampled Data
              precision    recall  f1-score   support

           0       0.70      0.75      0.73      8609
           1       0.74      0.69      0.72      8891

    accuracy                           0.

---

## Cluster Centroids

In [22]:
# Import ClusterCentroids from imblearn
from imblearn.under_sampling import ClusterCentroids

# Instantiate a ClusterCentroids instance
cc_sampler = ClusterCentroids(random_state=1)

In [23]:
# Fit the training data to the cluster centroids model
X_resampled, y_resampled = cc_sampler.fit_resample(X_train_scaled, y_train)



KeyboardInterrupt: 

In [None]:
# Count distinct values for the resampled target data
y_resampled.value_counts()

In [None]:
# Instantiate a new RandomForestClassier model
cc_model = RandomForestClassifier()

# Fit the resampled data the new model
cc_model.fit(X_resampled, y_resampled)

In [None]:
# Predict labels for resampled testing features
cc_y_pred = cc_model.predict(X_test_scaled)

In [None]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Resampled Data - CentroidClusters")
print(classification_report(y_test, cc_y_pred))

---

## SMOTE

In [24]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE instance 
# Set the sampling_strategy parameter equal to auto
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [25]:
# Fit the training data to the smote_sampler model
X_resampled, y_resampled = smote_sampler.fit_resample(X_train_scaled, y_train)

In [26]:
# Count distinct values for the resampled target data
y_resampled.value_counts()

y
0    26412
1    26412
Name: count, dtype: int64

In [27]:
# Instantiate a new RandomForestClassier model 
smote_model = RandomForestClassifier()

# Fit the resampled data to the new model
smote_model.fit(X_resampled, y_resampled)

In [28]:
# Predict labels for resampled testing features
smote_y_pred = smote_model.predict(X_test_scaled)

In [29]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTE")
print(classification_report(y_test, smote_y_pred))

Classification Report - Original Data
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      8609
           1       0.75      0.70      0.72      8891

    accuracy                           0.73     17500
   macro avg       0.73      0.73      0.73     17500
weighted avg       0.73      0.73      0.73     17500

---------
Classification Report - Resampled Data - SMOTE
              precision    recall  f1-score   support

           0       0.71      0.75      0.73      8609
           1       0.74      0.70      0.72      8891

    accuracy                           0.72     17500
   macro avg       0.73      0.72      0.72     17500
weighted avg       0.73      0.72      0.72     17500



---

## SMOTEENN

In [30]:
# Import SMOTEEN from imblearn
from imblearn.combine import SMOTEENN

# Instantiate the SMOTEENN instance
smote_enn = SMOTEENN(random_state=1)

In [31]:
# Fit the model to the training data
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)

In [32]:
# Instantiate a new RandomForestClassier model
smoteenn_model = RandomForestClassifier()

# Fit the resampled data the new model
smoteenn_model.fit(X_resampled, y_resampled)

In [33]:
# Predict labels for resampled testing features
smoteenn_y_pred = smoteenn_model.predict(X_test_scaled)

In [34]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTEENN")
print(classification_report(y_test, smoteenn_y_pred))

Classification Report - Original Data
              precision    recall  f1-score   support

           0       0.71      0.76      0.73      8609
           1       0.75      0.70      0.72      8891

    accuracy                           0.73     17500
   macro avg       0.73      0.73      0.73     17500
weighted avg       0.73      0.73      0.73     17500

---------
Classification Report - Resampled Data - SMOTEENN
              precision    recall  f1-score   support

           0       0.71      0.73      0.72      8609
           1       0.73      0.71      0.72      8891

    accuracy                           0.72     17500
   macro avg       0.72      0.72      0.72     17500
weighted avg       0.72      0.72      0.72     17500

