# Import Required Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import tree
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

# Load Data

In [None]:
# Load the raw, semicolon-delimited dataset.
df = pd.read_csv('./dataset/predict_students_dropout_and_academic_success.csv', delimiter=';')

# Normalise the headers to snake_case identifiers: trim, lower-case, spaces -> "_",
# then drop brackets, slashes and apostrophes.
# regex=False makes every replacement a literal one. "(" and ")" are regex
# metacharacters, and the default of `regex` differs between pandas versions, so
# relying on the default is fragile.
clean_columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
for unwanted_char in ("(", ")", "/", "'"):
    clean_columns = clean_columns.str.replace(unwanted_char, "", regex=False)
df.columns = clean_columns
df.head()

# Preliminary Data QA

Ensuring the Integrity of Your Data

Key Areas to Address:
- Table Structure
- Variable Types
- Null Values
- Range Calculations
- Count Calculations

`info()` gives general information about the DataFrame.

```python
df.info(verbose=False)
```

**Verbose Information**
- number of records: `4424`
- number of columns: `37`
- column data types: `float64(7), int64(29), object(1)`
- non-null values in each column: `4424` (same for all)


In [None]:
# Condensed summary of the frame; verbose=False omits the per-column listing.
df.info(verbose=False)

**Info**

In [None]:
# Full summary of the frame, including the per-column dtype and non-null count.
df.info()

**Null Values**

In [None]:
# Number of missing entries per column.
df.isnull().sum()

**Describe Stats**

In [None]:
# Descriptive statistics, transposed so that each column of df becomes a row.
df.describe().transpose()

# Data Preprocessing

During data preprocessing, the dataset was evaluated comprehensively. It contains no null values, and all categorical variables are already suitably encoded. The imbalance ratio between the dropout and graduate classes (dropout count divided by graduate count) is 0.64. Although this ratio is not extreme, it indicates a mild imbalance in the dataset. Consequently, we address it by oversampling the dataset with the SMOTE technique.

## Imbalance Ratio

The dataset is imbalanced, although the difference is not extreme. The "graduate" class has more instances than the "dropout" class.

- Majority Class: `graduate`
- Minority Class: `dropout`

Imbalance Ratio: 0.64

In [None]:
# Bar chart of the number of students in each target class, with the count
# printed on top of every bar.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(x=df['target'], palette="Blues", ax=ax)
for bar_group in ax.containers:
    ax.bar_label(bar_group)
ax.set_title("Target")
plt.show()

In [None]:
# Row count per target class.
target_distribution = df.groupby('target').size()
display(target_distribution.reset_index())

# Imbalance ratio = dropout (minority) count divided by graduate (majority) count.
dropout_count = target_distribution.loc['Dropout']
graduate_count = target_distribution.loc['Graduate']
imbalance_ratio = dropout_count / graduate_count
print(f"Imbalance Ratio: {imbalance_ratio:.2f}")

## Separate Features & Target values

In [None]:
# Drop the 'Enrolled' rows so that only 'Dropout' and 'Graduate' remain,
# then separate the predictors (X) from the label column (y).
is_not_enrolled = df['target'].ne('Enrolled')
working_df = df[is_not_enrolled]
X = working_df.drop('target', axis=1)
y = working_df['target']

## SMOTE Oversample Dataset

In [None]:
# Class counts of the label column before resampling.
print("Class distribution before oversampling:")
print(Counter(y))

# SMOTE with the 'auto' sampling strategy and a fixed seed.
smote = SMOTE(random_state=42, sampling_strategy='auto')

# Resample features and labels together.
resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled

# Class counts after resampling.
print("Class distribution after oversampling:")
print(Counter(y_resampled))

## Split train-test dataset

In [None]:
# Binary encoding of the fully resampled labels. It is no longer used for the
# split below; kept only so the name `y_binary` stays defined.
y_binary = np.where(y_resampled == 'Dropout', 1, 0)

# Hold out the test set from the ORIGINAL (non-resampled) data first. Oversampling
# before splitting lets SMOTE interpolate synthetic training rows from rows that
# later land in the test set, and puts synthetic rows into the test set itself,
# which inflates every test metric. A fixed random_state keeps the split reproducible.
x_train_raw, x_test, y_train_raw, y_test_raw = train_test_split(
    X, y, stratify=y, random_state=42
)

# Oversample the minority class on the training portion only.
x_train, y_train_labels = SMOTE(sampling_strategy='auto', random_state=42).fit_resample(
    x_train_raw, y_train_raw
)

# Encode the target: 1 = 'Dropout' (positive class), 0 = any other label.
y_train = np.where(y_train_labels == 'Dropout', 1, 0)
y_test = np.where(y_test_raw == 'Dropout', 1, 0)

print("Train Dataset:",x_train.shape)
print("Test Dataset:",x_test.shape)

# EDA
Most of the column values are coded. You can find their meanings at the end of the [dataset source](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success). The references were scraped from that page and saved as `code_reference.csv`.

In [None]:
# Lookup table used to translate coded column values into readable labels.
reference_df = pd.read_csv('./dataset/code_reference.csv', sep=';')
print("reference table shape:", reference_df.shape)
reference_df.head()

## Correlation Matrix

In [None]:
# Work on a copy of df in which the target is numeric:
# 1 for 'Dropout', 0 for every other label.
is_dropout = df['target'] == 'Dropout'
corr_df = df.assign(target=is_dropout.astype('int64'))
corr_df.head()

In [None]:
# Correlation heatmap across every column of corr_df, drawn with square cells.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_df.corr(), square=True, cmap='Blues', ax=ax)

In [None]:
# Annotated correlation heatmap for columns 0-7 (by position).
selected_cols = corr_df.iloc[:, 0:8]
correlation_matrix = selected_cols.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap='Blues', ax=ax)
ax.set_title('Corr with First 8 Cols Heatmap')
plt.show()

In [None]:
# Annotated correlation heatmap for columns 8-11 (by position).
selected_cols = corr_df.iloc[:, slice(8, 12)]
correlation_matrix = selected_cols.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap='Blues', ax=ax)
ax.set_title('Parent Occupation & Qualification Heatmap')
plt.show()

In [None]:
# Annotated correlation heatmap for columns 12-20 (by position).
selected_cols = corr_df.iloc[:, slice(12, 21)]
correlation_matrix = selected_cols.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap='Blues', ax=ax)
ax.set_title('Corr with Unrelated Middle Cols Heatmap')
plt.show()

In [None]:
# Annotated correlation heatmap for columns 21-32 (by position).
selected_cols = corr_df.iloc[:, slice(21, 33)]
correlation_matrix = selected_cols.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap='Blues', ax=ax)
ax.set_title('Curricular Units')
plt.show()

In [None]:
# Two heatmaps side by side: columns 21-26 on the left and columns 27-32 on the
# right, both relabelled with the same short names so the panels are comparable.
short_names = ["credited","enrolled","evaluations","approved","grade","no_evaluations"]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
left_ax, right_ax = axes

# Left panel
selected_cols_1 = corr_df.iloc[:, 21:27].set_axis(short_names, axis=1)
correlation_matrix_1 = selected_cols_1.corr()
left_ax.set_title('1st semester Curricular Units')
sns.heatmap(correlation_matrix_1, ax=left_ax, annot=True, cmap='Blues', vmin=-1, vmax=1)

# Right panel
selected_cols_2 = corr_df.iloc[:, 27:33].set_axis(short_names, axis=1)
correlation_matrix_2 = selected_cols_2.corr()
right_ax.set_title('2nd semester Curricular Units')
sns.heatmap(correlation_matrix_2, ax=right_ax, annot=True, cmap='Blues', vmin=-1, vmax=1)

# Keep the two panels from overlapping.
plt.tight_layout()
plt.show()


In [None]:
# Annotated correlation heatmap for column 33 onwards (by position).
selected_cols = corr_df.iloc[:, slice(33, None)]
correlation_matrix = selected_cols.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, annot=True, cmap='Blues', ax=ax)
ax.set_title('Corr with global index columns')
plt.show()

In [None]:
correlation_matrix = corr_df.corr()

# Keep each unordered column pair exactly once by masking everything except the
# strict upper triangle (k=1 also removes the diagonal, i.e. self-correlations).
# This replaces de-duplicating on the correlation VALUE and exact `!= 1` float
# comparisons: those could drop distinct pairs that share a value and could let a
# self-correlation through if it was not exactly 1.0.
upper_triangle = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
pairwise_corr = correlation_matrix.where(upper_triangle).stack().dropna()

# Ten largest (signed) correlations between distinct columns.
top_corr_cols = pairwise_corr.sort_values(ascending=False).head(10)
top_corr_cols.name = "correlation"
display(top_corr_cols.reset_index())


## Features Vs Target Distribution

In [None]:
# Attach the readable label for each marital_status code, then count rows per
# (label, target) combination.
marital_lookup = reference_df[reference_df['column'] == 'marital_status']
merged = working_df.merge(marital_lookup, how='left',
                          left_on='marital_status', right_on='code')
result = merged.groupby(['value', 'target']).size().reset_index(name='count')

# Grouped bar chart: one group per label, one bar per target class.
fig, ax = plt.subplots(figsize=(6, 4))
result_pivot = result.pivot(index='value', columns='target', values='count')
result_pivot.plot.bar(ax=ax)

# Labels, title and legend.
ax.set_xlabel('marital_status')
ax.set_ylabel('Count')
ax.set_title('Marital Status vs. Target Distribution')
ax.legend(title='Target')

plt.show()

In [None]:
# Attach the readable label for each application_mode code, then count rows per
# (label, target) combination.
application_lookup = reference_df[reference_df['column'] == 'application_mode']
merged = working_df.merge(application_lookup, how='left',
                          left_on='application_mode', right_on='code')
result = merged.groupby(['value', 'target']).size().reset_index(name='count')

# Grouped bar chart: one group per label, one bar per target class.
fig, ax = plt.subplots(figsize=(6, 4))
result_pivot = result.pivot(index='value', columns='target', values='count')
result_pivot.plot.bar(ax=ax)

# Labels, title and legend.
ax.set_xlabel('application_mode')
ax.set_ylabel('Count')
ax.set_title('Application Mode vs. Target Distribution')
ax.legend(title='Target')

plt.show()

In [None]:
# Attach the readable label for each educational_special_needs code, then count
# rows per (label, target) combination.
special_needs_lookup = reference_df[reference_df['column'] == 'educational_special_needs']
merged = working_df.merge(special_needs_lookup, how='left',
                          left_on='educational_special_needs', right_on='code')
result = merged.groupby(['value', 'target']).size().reset_index(name='count')

# Grouped bar chart: one group per label, one bar per target class.
fig, ax = plt.subplots(figsize=(6, 4))
result_pivot = result.pivot(index='value', columns='target', values='count')
result_pivot.plot.bar(ax=ax)

# Labels, title and legend.
ax.set_xlabel('educational_special_needs')
ax.set_ylabel('Count')
ax.set_title('Educational Special Needs vs. Target Distribution')
ax.legend(title='Target')

plt.show()

In [None]:
# Attach the readable label for each gender code, then count rows per
# (label, target) combination.
gender_lookup = reference_df[reference_df['column'] == 'gender']
merged = working_df.merge(gender_lookup, how='left',
                          left_on='gender', right_on='code')
result = merged.groupby(['value', 'target']).size().reset_index(name='count')

# Grouped bar chart: one group per label, one bar per target class.
fig, ax = plt.subplots(figsize=(6, 4))
result_pivot = result.pivot(index='value', columns='target', values='count')
result_pivot.plot.bar(ax=ax)

# Labels, title and legend.
ax.set_xlabel('gender')
ax.set_ylabel('Count')
ax.set_title('Gender vs. Target Distribution')
ax.legend(title='Target')

plt.show()

## Outlier Detection

In [None]:
# Box plot of admission_grade; points beyond the whiskers are outlier candidates.
grade_ax = sns.boxplot(x=df['admission_grade'])
grade_ax.set_title('Outlier Detection for Admission Grade')
plt.show()

In [None]:
# Box plot of age_at_enrollment; points beyond the whiskers are outlier candidates.
age_ax = sns.boxplot(x=df['age_at_enrollment'])
age_ax.set_title('Outlier Detection for Age at Enrollment')
plt.show()

# Model Selection

## Decision Tree

### Simplified

In [None]:
# Baseline decision tree with default hyperparameters and a fixed seed,
# fitted on the training split.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X=x_train, y=y_train)

In [None]:
plt.figure(figsize=(20,20))
# Names of the columns the models are trained on. Taken from X rather than df,
# because df still contains the 'target' column, which is not a model feature.
# Converted to a plain list because not every scikit-learn release accepts a
# pandas Index for `feature_names`.
features = X.columns.tolist()
# Display names for the encoded labels: index 0 -> 'Graduate', index 1 -> 'Dropout'.
classes = ['Graduate','Dropout']

In [None]:
# Size the figure in the same cell that draws the tree. With the inline notebook
# backend a figure created in an earlier cell is rendered and closed when that
# cell finishes, so the tree would otherwise be drawn at the small default size.
# plt.gcf() reuses the current figure if one is still open and creates one otherwise.
tree_fig = plt.gcf()
tree_fig.set_size_inches(20, 20)
tree.plot_tree(dt,feature_names=features,class_names=classes,filled=True)
plt.show()

In [None]:
# Predictions and accuracy of the unpruned tree on both splits.
y_test_pred = dt.predict(x_test)
y_train_pred = dt.predict(x_train)
test_accuracy_score =  round(accuracy_score(y_test, y_test_pred),2)
train_accuracy_score =  round(accuracy_score(y_train, y_train_pred),2)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which transposed
# the matrix.
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)

# One panel per split: training on the right, test on the left.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (axes[1], cm_train, 'Training Dataset\nAccuracy: ' + str(train_accuracy_score)),
    (axes[0], cm_test, 'Test Dataset\nAccuracy: ' + str(test_accuracy_score)),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, yticklabels=classes, xticklabels=classes,
                cmap='Blues', fmt='g', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Label the axes so the orientation of the matrix is unambiguous.
    panel_ax.set_xlabel('Predicted label')
    panel_ax.set_ylabel('True label')

plt.suptitle('Confusion Matrix', fontsize=16)

### Pre-pruning

In [None]:
# Pre-pruning search space: limits on depth and on split / leaf sizes.
params = {'max_depth': [2,4,6,8,10,12],
         'min_samples_split': [2,3,4],
         'min_samples_leaf': [1,2]}

# Seed the estimator so the search picks the same hyperparameters on every run
# (same seed as the baseline tree).
pre_dt = DecisionTreeClassifier(random_state=0)
gcv = GridSearchCV(estimator=pre_dt,param_grid=params)
# Cross-validate each combination to estimate its performance.
gcv.fit(x_train,y_train)
# Best-scoring hyperparameters and the estimator refitted with them.
best_hyperparameters = gcv.best_params_
gcv_dt = gcv.best_estimator_
print(best_hyperparameters)

In [None]:
# Predictions and accuracy of the pre-pruned (grid-searched) tree on both splits.
y_test_pred = gcv_dt.predict(x_test)
y_train_pred = gcv_dt.predict(x_train)
test_accuracy_score =  round(accuracy_score(y_test, y_test_pred),2)
train_accuracy_score =  round(accuracy_score(y_train, y_train_pred),2)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which transposed
# the matrix.
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)

# One panel per split: training on the right, test on the left.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (axes[1], cm_train, 'Training Dataset\nAccuracy: ' + str(train_accuracy_score)),
    (axes[0], cm_test, 'Test Dataset\nAccuracy: ' + str(test_accuracy_score)),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, yticklabels=classes, xticklabels=classes,
                cmap='Blues', fmt='g', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Label the axes so the orientation of the matrix is unambiguous.
    panel_ax.set_xlabel('Predicted label')
    panel_ax.set_ylabel('True label')

plt.suptitle('Confusion Matrix', fontsize=16)

In [None]:
# Draw the pre-pruned tree, with nodes colour-filled.
tree.plot_tree(
    gcv_dt,
    filled=True,
    class_names=classes,
    feature_names=features,
)
plt.show()

In [None]:
# Plain-text dump of the pre-pruned tree's decision rules.
pre_pruned_rules = tree.export_text(gcv_dt, feature_names=list(X.columns))
print(pre_pruned_rules)

### Post-pruning

In [None]:
# Candidate alphas (and matching impurities) from the unpruned tree's
# cost-complexity pruning path.
path = dt.cost_complexity_pruning_path(x_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities

# Fit one tree per candidate alpha, in path order.
clfs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_train, y_train)
    for alpha in ccp_alphas
]

In [None]:
# The largest alpha on the pruning path collapses the tree to a single node,
# which is uninformative, so drop it. The guard makes the cell idempotent:
# re-running it no longer strips one more (useful) tree each time.
if len(clfs) > 0 and clfs[-1].tree_.node_count == 1:
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

# Size and depth of every remaining pruned tree, as functions of alpha.
node_counts = [clf.tree_.node_count for clf in clfs]
depth = [clf.tree_.max_depth for clf in clfs]
plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes',drawstyle="steps-post")
plt.plot(ccp_alphas,depth,label='depth',drawstyle="steps-post")
plt.legend()
plt.title('Nodes vs Depth')
plt.show()

In [None]:
# Accuracy of each pruned tree on the training and test splits, in alpha order.
train_acc = [accuracy_score(pruned.predict(x_train), y_train) for pruned in clfs]
test_acc = [accuracy_score(pruned.predict(x_test), y_test) for pruned in clfs]

# Markers first, then step lines, for both series.
accuracy_series = ((train_acc, 'train_accuracy'), (test_acc, 'test_accuracy'))
for scores, _ in accuracy_series:
    plt.scatter(ccp_alphas, scores)
for scores, legend_name in accuracy_series:
    plt.plot(ccp_alphas, scores, label=legend_name, drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

In [None]:
# Post-pruned tree: cost-complexity pruning with ccp_alpha fixed at 0.005.
dt_ccp = DecisionTreeClassifier(random_state=0,ccp_alpha=0.005)
dt_ccp.fit(x_train, y_train)
y_test_pred = dt_ccp.predict(x_test)
y_train_pred = dt_ccp.predict(x_train)
test_accuracy_score =  round(accuracy_score(y_test, y_test_pred), 2)
train_accuracy_score =  round(accuracy_score(y_train, y_train_pred),2)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which transposed
# the matrix.
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)

# One panel per split: training on the right, test on the left.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (axes[1], cm_train, 'Training Dataset\nAccuracy: ' + str(train_accuracy_score)),
    (axes[0], cm_test, 'Test Dataset\nAccuracy: ' + str(test_accuracy_score)),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, yticklabels=classes, xticklabels=classes,
                cmap='Blues', fmt='g', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Label the axes so the orientation of the matrix is unambiguous.
    panel_ax.set_xlabel('Predicted label')
    panel_ax.set_ylabel('True label')

plt.suptitle('Confusion Matrix', fontsize=16)

In [None]:
# Draw the post-pruned tree, with nodes colour-filled.
tree.plot_tree(
    dt_ccp,
    filled=True,
    class_names=classes,
    feature_names=features,
)
plt.show()

In [None]:
# Plain-text dump of the post-pruned tree's decision rules.
post_pruned_rules = tree.export_text(dt_ccp, feature_names=list(X.columns))
print(post_pruned_rules)

### ROC & AUC Analysis

- The Receiver Operating Characteristic (ROC) curve and its associated area under the curve (AUC) provide valuable insights into the performance of a binary classification model, such as a Decision Tree. 
- The AUC ranges from 0 to 1, where a higher value indicates better classification performance. 

In [None]:

# For each decision-tree variant: probability of the positive class (label 1,
# i.e. 'Dropout'), the ROC curve on the test split, and the area under it.

# Unpruned tree
y_probs = dt.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label=1)
roc_auc = auc(fpr, tpr)

# Pre-pruned (grid-searched) tree
y_probs_gcv = gcv_dt.predict_proba(x_test)[:, 1]
fpr_gcv, tpr_gcv, thresholds_gcv = roc_curve(y_test, y_probs_gcv, pos_label=1)
roc_auc_gcv = auc(fpr_gcv, tpr_gcv)

# Post-pruned (cost-complexity) tree
y_probs_cpp = dt_ccp.predict_proba(x_test)[:, 1]
fpr_cpp, tpr_cpp, thresholds_cpp = roc_curve(y_test, y_probs_cpp, pos_label=1)
roc_auc_cpp = auc(fpr_cpp, tpr_cpp)

# Overlay the three curves; the dashed diagonal is the chance level.
roc_curves = [
    (fpr, tpr, 'darkorange', 'AUC UnPrun Decision Tree  (area = %0.2f)' % roc_auc),
    (fpr_gcv, tpr_gcv, 'green', 'AUC PrePrun Decision Tree (area = %0.2f)' % roc_auc_gcv),
    (fpr_cpp, tpr_cpp, 'red', 'AUC PostPrun Decision Tree (area = %0.2f)' % roc_auc_cpp),
]
plt.figure()
for false_pos, true_pos, colour, legend_text in roc_curves:
    plt.plot(false_pos, true_pos, color=colour, lw=2, label=legend_text)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

### Result

The AUC of the normal (unpruned) Decision Tree indicates that the model has a good ability to discriminate between the positive and negative classes. 
Pre-pruning stops the tree construction process before the tree becomes too complex, which can help prevent overfitting.
Post-pruning grows the tree to its full depth and then removes branches that do not contribute significantly to performance.
Compared to the normal Decision Tree, the pre-pruned and post-pruned Decision Trees have a slightly higher AUC value, which suggests they are better at distinguishing between positive and negative instances, potentially leading to better generalization.

In [None]:
# Test-split predictions and accuracy of the post-pruned and pre-pruned trees.
y_test_pred_ccp = dt_ccp.predict(x_test)
y_test_pred_gcv = gcv_dt.predict(x_test)
ccp_accuracy_score =  round(accuracy_score(y_test, y_test_pred_ccp), 2)
gcv_accuracy_score =  round(accuracy_score(y_test, y_test_pred_gcv),2)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which transposed
# the matrix.
cm_ccp = confusion_matrix(y_test, y_test_pred_ccp)
cm_gcv = confusion_matrix(y_test, y_test_pred_gcv)

# One panel per model: post-pruned on the right, pre-pruned on the left.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (axes[1], cm_ccp, 'PostPrunning\nAccuracy: ' + str(ccp_accuracy_score)),
    (axes[0], cm_gcv, 'PrePrunning\nAccuracy: ' + str(gcv_accuracy_score)),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, yticklabels=classes, xticklabels=classes,
                cmap='Blues', fmt='g', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Label the axes so the orientation of the matrix is unambiguous.
    panel_ax.set_xlabel('Predicted label')
    panel_ax.set_ylabel('True label')

plt.suptitle('Confusion Matrix', fontsize=16)

## Random Forest

In [None]:
# Random Forest with 100 trees, each limited to depth 10. The seed makes the
# bootstrap samples and feature subsets, and therefore every reported metric,
# reproducible across runs (same seed as the decision trees).
rfclf = RandomForestClassifier(n_estimators=100, max_depth= 10, random_state=0)
# Train the Random Forest classifier on the training data
rfclf.fit(x_train, y_train)

In [None]:
# Predictions and accuracy of the baseline random forest on both splits.
y_test_pred = rfclf.predict(x_test)
y_train_pred = rfclf.predict(x_train)
test_accuracy_score =  round(accuracy_score(y_test, y_test_pred),2)
train_accuracy_score =  round(accuracy_score(y_train, y_train_pred),2)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which transposed
# the matrix.
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)

# One panel per split: training on the right, test on the left.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (axes[1], cm_train, 'Training Dataset\nAccuracy: ' + str(train_accuracy_score)),
    (axes[0], cm_test, 'Test Dataset\nAccuracy: ' + str(test_accuracy_score)),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, yticklabels=classes, xticklabels=classes,
                cmap='Blues', fmt='g', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Label the axes so the orientation of the matrix is unambiguous.
    panel_ax.set_xlabel('Predicted label')
    panel_ax.set_ylabel('True label')

plt.suptitle('Confusion Matrix', fontsize=16)

### Parameter Tuning

In [None]:
# Define a grid of hyperparameters to search
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
}

# Perform a grid search with 3-fold cross-validation to find the best hyperparameters
grid_search = GridSearchCV(estimator=rfclf, param_grid=param_grid, cv=3)
grid_search.fit(x_train, y_train)

# Get the optimal hyperparameters and the forest refitted with them.
# Both must come from `grid_search`. Reading `gcv.best_estimator_` here returned
# the pre-pruned DECISION TREE from the earlier search, so every "random forest"
# result below was actually a decision-tree result.
best_params = grid_search.best_params_
gcv_rfclf = grid_search.best_estimator_

print("Optimal Hyperparameters:", best_params)

In [None]:
# Predictions and accuracy of the `gcv_rfclf` estimator on both splits.
y_test_pred = gcv_rfclf.predict(x_test)
y_train_pred = gcv_rfclf.predict(x_train)
test_accuracy_score =  round(accuracy_score(y_test, y_test_pred),2)
train_accuracy_score =  round(accuracy_score(y_train, y_train_pred),2)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which transposed
# the matrix.
cm_test = confusion_matrix(y_test, y_test_pred)
cm_train = confusion_matrix(y_train, y_train_pred)

# One panel per split: training on the right, test on the left.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (axes[1], cm_train, 'Training Dataset\nAccuracy: ' + str(train_accuracy_score)),
    (axes[0], cm_test, 'Test Dataset\nAccuracy: ' + str(test_accuracy_score)),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, yticklabels=classes, xticklabels=classes,
                cmap='Blues', fmt='g', ax=panel_ax)
    panel_ax.set_title(panel_title)
    # Label the axes so the orientation of the matrix is unambiguous.
    panel_ax.set_xlabel('Predicted label')
    panel_ax.set_ylabel('True label')

plt.suptitle('Confusion Matrix', fontsize=16)

In [None]:
# Probability of the positive class (label 1, i.e. 'Dropout') from `gcv_rfclf`,
# its ROC curve on the test split, and the area under that curve.
y_probs_rf = gcv_rfclf.predict_proba(x_test)[:, 1]
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_probs_rf, pos_label=1)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# Overlay this curve on the three decision-tree curves computed earlier;
# the dashed diagonal is the chance level.
roc_curves = [
    (fpr, tpr, 'darkorange', 'AUC UnPrun Decision Tree  (area = %0.2f)' % roc_auc),
    (fpr_gcv, tpr_gcv, 'green', 'AUC PrePrun Decision Tree (area = %0.2f)' % roc_auc_gcv),
    (fpr_cpp, tpr_cpp, 'red', 'AUC PostPrun Decision Tree (area = %0.2f)' % roc_auc_cpp),
    (fpr_rf, tpr_rf, 'blue', 'AUC Random Forest (area = %0.2f)' % roc_auc_rf),
]
plt.figure()
for false_pos, true_pos, colour, legend_text in roc_curves:
    plt.plot(false_pos, true_pos, color=colour, lw=2, label=legend_text)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Test-split predictions and accuracy of the three candidate models.
y_test_pred_ccp = dt_ccp.predict(x_test)
y_test_pred_gcv = gcv_dt.predict(x_test)
y_test_pred_rf = gcv_rfclf.predict(x_test)
ccp_accuracy_score =  round(accuracy_score(y_test, y_test_pred_ccp), 2)
gcv_accuracy_score =  round(accuracy_score(y_test, y_test_pred_gcv),2)
rf_accuracy_score =  round(accuracy_score(y_test, y_test_pred_rf),2)

# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which transposed
# the matrices.
cm_ccp = confusion_matrix(y_test, y_test_pred_ccp)
cm_gcv = confusion_matrix(y_test, y_test_pred_gcv)
cm_rf = confusion_matrix(y_test, y_test_pred_rf)

# One panel per model, left to right: pre-pruned tree, post-pruned tree, random forest.
fig, axes = plt.subplots(1, 3, figsize=(22, 6))
panels = (
    (axes[2], cm_rf, 'Random Forset\nAccuracy: ' + str(rf_accuracy_score)),
    (axes[1], cm_ccp, 'Postprunning Decision Tree\nAccuracy: ' + str(ccp_accuracy_score)),
    (axes[0], cm_gcv, 'Preprunning Decision Tree\nAccuracy: ' + str(gcv_accuracy_score)),
)
for panel_ax, matrix, panel_title in panels:
    sns.heatmap(matrix, annot=True, yticklabels=classes, xticklabels=classes,
                cmap='Blues', fmt='g', ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=16)
    # Label the axes so the orientation of the matrix is unambiguous.
    panel_ax.set_xlabel('Predicted label')
    panel_ax.set_ylabel('True label')

# Conclusion

- Prepruning and postpruning techniques both played a role in enhancing the decision tree model's accuracy while maintaining a good AUC score. Prepruning, in particular, had a significant impact on improving AUC.
- Random Forest demonstrated consistent accuracy on both train and test data, and it consistently performed well in terms of AUC as well.
- Prepruned decision tree and random forest models are the most promising options for this dataset, as they achieve higher accuracy and maintain strong AUC scores.

# Pickle Model

In [None]:
import pickle

# Persist the post-pruned decision tree to model.pkl. The context manager
# guarantees the file is flushed and closed even if pickling raises; the bare
# open() used before left the handle open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(dt_ccp, model_file)