<img src="mmu_logo.png" style="height: 80px;" align=left> 

# Learning Objectives

By the end of this lesson, you should be able to:
- experiment with two different feature selection algorithms (Boruta and RFE)
- define different ways to evaluate a model


### Load Libraries

In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

from sklearn.tree import DecisionTreeClassifier 
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve

from tqdm import tqdm 

import warnings
# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.filterwarnings(action='ignore')

# Allow pandas to show up to 500 columns before it truncates a wide frame.
pd.options.display.max_columns = 500

# Feature Selection

### Function for Ranking Feature

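This helper rescales the raw feature ranks with min-max scaling, rounds them to two decimals, and returns a dictionary that maps each feature name to its score.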

In [None]:
def ranking(ranks, names, order=1):
    """Map each name to its min-max scaled, two-decimal rank.

    ``ranks`` is multiplied by ``order`` (pass ``-1`` to invert the
    ordering), rescaled as a single column with ``MinMaxScaler`` and
    rounded to two decimals. The result pairs ``names`` with the scaled
    values in order.
    """
    # MinMaxScaler works column-wise, so present the ranks as one column.
    column = order * np.array([ranks]).T
    scaled = MinMaxScaler().fit_transform(column).T[0]
    rounded = [round(value, 2) for value in scaled]
    return dict(zip(names, rounded))

### Read Dataset

In [None]:
# Load the banking CSV into `df`; the trailing expression previews its first rows.
df = pd.read_csv("banking.csv")
df.head()

In [None]:
# Count how many rows carry each value of the `y` column.
df.y.value_counts()

### One-Hot Encoding

In [None]:
# your codes here...



### Separate into X, y

In [None]:
# Target vector: the `y` column of the frame.
y = df.y
# Feature matrix: every column except `y`. `axis` is passed by keyword because
# the positional form drop("y", 1) is rejected by pandas 2.x.
X = df.drop("y", axis=1)
# Feature names in column order, used to label the ranking scores later on.
colnames = X.columns

### Prepare Boruta classifier

In [None]:
# using the BorutaPy function
# your codes here...
 

### Fit Boruta classifier to data

In [None]:
# your codes here...


### Get the ranking of the features returned by Boruta

In [None]:
# Convert the selector's ranks to floats and scale them with ranking();
# order=-1 negates the ranks first, so the smallest rank gets the largest score.
boruta_score = ranking(
    [float(rank) for rank in feat_selector.ranking_],
    colnames,
    order=-1,
)
# Two-column table (feature name, score), highest score first.
boruta_score = (
    pd.DataFrame(list(boruta_score.items()), columns=['Features', 'Score'])
    .sort_values("Score", ascending=False)
)

### Top and Bottom 10 features


In [None]:
# First ten rows of the score table (sorted by descending score). display() is
# called explicitly because this is not the last expression of the cell.
print('---------Top 10----------')
display(boruta_score.head(10))

# Last ten rows of the score table, shown as the cell's final expression.
print('---------Bottom 10----------')
boruta_score.tail(10)

### Plot it out

In [None]:
# Horizontal bar chart with one bar per feature; the [:] slice keeps every row.
sns_boruta_plot = sns.catplot(
    data=boruta_score[:],
    x="Score",
    y="Features",
    kind="bar",
    palette='coolwarm',
    height=14,
    aspect=1.9,
)
plt.title("Boruta all Features")

## RFE

### Prepare RFE classifier

In [None]:
# Your Task:
# Try the following params:
# 1. class_weight={0:1,1:2}
# 2. class_weight="balanced"
# 3. max_depth=4
# 4. max_depth=6
# 5. n_estimators = 100
# observe the feature importance ranking.

# your codes here...



### Fit RFE classifier to data

In [None]:
# Fit the selector on the full feature matrix and target.
# NOTE(review): `rfe` is expected to be created in the exercise cell above — confirm.
rfe.fit(X, y)

### Get the ranking of the features returned by RFE

In [None]:
# Convert the selector's ranks to floats and scale them with ranking();
# order=-1 negates the ranks first, so the smallest rank gets the largest score.
rfe_score = ranking(
    [float(rank) for rank in rfe.ranking_],
    colnames,
    order=-1,
)
# Two-column table (feature name, score), highest score first.
rfe_score = (
    pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
    .sort_values("Score", ascending=False)
)

### Plot it out

In [None]:
# Bar chart of the last ten rows of the score table, i.e. the lowest scores
# once the table has been sorted in descending order.
sns_rfe_plot = sns.catplot(
    data=rfe_score[-10:],
    x="Score",
    y="Features",
    kind="bar",
    palette='coolwarm',
    height=14,
    aspect=1.9,
)
plt.title("RFE bottom-10 Features")

# Modeling

In [None]:
# Setup model list using only NB and DT

model_list = ["NB", "DT"]
feature_num, acc_nb, acc_dt = [], [], []

# The target and the full feature frame do not depend on the loop variables,
# so they are built once instead of once per (feature count, model) pair.
y = df.y
X_full = df.drop("y", axis=1)

for i in range(1, 30):
    feature_num.append(i)

    # Keep the first i rows of the Boruta score table, i.e. the i
    # highest-scoring features after the descending sort.
    cols = boruta_score.Features[0:i]
    X = X_full[cols].copy()

    # Split once per feature count with a fixed seed so that both models are
    # trained and scored on exactly the same rows and reruns give the same
    # numbers.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=7
    )

    for model in model_list:
        # Pick the classifier and the list that collects its accuracies.
        if model == "NB":
            clf, acc_list = GaussianNB(), acc_nb
        elif model == "DT":
            clf, acc_list = DecisionTreeClassifier(max_depth=3), acc_dt

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Accuracy as a percentage, rounded to two decimals.
        acc = round((accuracy_score(y_test, y_pred)*100), 2)
        acc_list.append(acc)

# your codes here...



In [None]:
# Plot the line charts

# Set the figure size through seaborn's rc settings.
sns.set(rc={'figure.figsize': (11.7, 8.27)})

# One accuracy line per model, plotted against the number of features used.
ax = sns.lineplot(
    data=boruta_acc_result,
    x="No_Of_Features",
    y="Accuracy",
    hue="Model",
)
# Pin the y-axis to the 0-100 range.
ax.set(ylim=(0, 100))
ax.set(title="Accuracy Trend for Different Classifiers (Boruta)")

In [None]:
# Get the model accuracy using different number of features

# your codes here...
 

# Model Evaluation

In [None]:
# Read dataset

# Rebinds `df` to a freshly loaded frame, replacing the one used in the
# sections above; the trailing expression previews its first rows.
df = pd.read_csv('banking.csv') 
df.head()

In [None]:
# Prepare X and y

# Target labels come from the `y` column; every other column stays in `df_X`
# as the raw feature frame.
y = df['y']
df_X = df.drop(columns='y')

In [None]:
# dummification of variables

# your codes here...


In [None]:
# List the column labels of the feature matrix.
# NOTE(review): `X` is expected to come from the dummification exercise above — confirm.
X.columns

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Prepare the train and test dataset

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

### Training a Decision Tree Model


In [None]:
# construct Decision Tree Model

# your codes here...

# Predict labels for the held-out rows.
# NOTE(review): `model_DT` must be created and fitted by the exercise code above.
y_pred = model_DT.predict(X_test)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
# Model Accuracy
# your codes here...



In [None]:
# Confusion matrix report

confusion_majority = confusion_matrix(y_test, y_pred)

print('Mjority classifier Confusion Matrix\n', confusion_majority)

# Print the four cells of the 2x2 matrix, labelled by their (row, column) position.
print('**********************')
for label, row, col in (
    ('Mjority TN= ', 0, 0),
    ('Mjority FP=', 0, 1),
    ('Mjority FN= ', 1, 0),
    ('Mjority TP= ', 1, 1),
):
    print(label, confusion_majority[row][col])
print('**********************')

# Summary metrics on the same predictions, two decimals each.
for template, scorer in (
    ('Precision= {:.2f}', precision_score),
    ('Recall= {:.2f}', recall_score),
    ('F1= {:.2f}', f1_score),
    ('Accuracy= {:.2f}', accuracy_score),
):
    print(template.format(scorer(y_test, y_pred)))

In [None]:
# Calculate AUC

# your codes here...

# NOTE(review): `auc_DT` is expected to be computed by the exercise code above.
print('AUC: %.2f' % auc_DT)

In [None]:
# Plot ROC Curve 

# your codes here...

# Model curve first, then a dashed diagonal from (0, 0) to (1, 1) as reference.
# NOTE(review): `fpr_DT` / `tpr_DT` are expected from the exercise code above.
plt.plot(fpr_DT, tpr_DT, label='DT', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='green')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()


In [None]:
# Plot Precision-Recall Curve
from sklearn import metrics

# your codes here...

# Recall goes on the x-axis and precision on the y-axis so the plotted data
# match the axis labels set below.
# NOTE(review): assumes `prec_DT` holds precision and `rec_DT` holds recall,
# as their names suggest — confirm against the exercise code above.
plt.plot(rec_DT, prec_DT, color='blue', label='DT')
# Dashed horizontal reference line at precision 0.1.
plt.plot([1, 0], [0.1, 0.1], color='green', linestyle='--')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()

 
# calculate precision-recall AUC 
# your codes here...


### Training a Naive Bayes Model

In [None]:
# your codes here...


In [None]:
# Model Accuracy

# Report the mean accuracy of `nb` on the training rows and on the held-out rows.
# NOTE(review): `nb` is expected to be fitted by the exercise cell above.
for message, features, labels in (
    ("Accuracy on training set: {:.3f}", X_train, y_train),
    ("Accuracy on test set: {:.3f}", X_test, y_test),
):
    print(message.format(nb.score(features, labels)))

In [None]:
# Recompute the predictions from the Naive Bayes model so this report cannot
# silently reuse the Decision Tree predictions that an earlier cell left in
# `y_pred`.
y_pred = nb.predict(X_test)

confusion_majority=confusion_matrix(y_test, y_pred)

print('Mjority classifier Confusion Matrix\n', confusion_majority)

# The four cells of the 2x2 matrix, labelled by their (row, column) position.
print('**********************')
print('Mjority TN= ', confusion_majority[0][0])
print('Mjority FP=', confusion_majority[0][1])
print('Mjority FN= ', confusion_majority[1][0])
print('Mjority TP= ', confusion_majority[1][1])
print('**********************')

# Summary metrics on the same predictions, two decimals each.
print('Precision= {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall= {:.2f}'. format(recall_score(y_test, y_pred)))
print('F1= {:.2f}'. format(f1_score(y_test, y_pred)))
print('Accuracy= {:.2f}'. format(accuracy_score(y_test, y_pred)))

In [None]:
# Calculate AUC

# your codes here...

# NOTE(review): `auc_NB` is expected to be computed by the exercise code above.
print('AUC: %.2f' % auc_NB)

In [None]:
# Plot ROC Curve 

# ROC points computed from the scores in `prob_NB`.
# NOTE(review): `prob_NB` is expected to be defined in the AUC exercise cell above.
fpr_NB, tpr_NB, thresholds_NB = roc_curve(y_test, prob_NB)

# Model curve first, then a dashed diagonal from (0, 0) to (1, 1) as reference.
plt.plot(fpr_NB, tpr_NB, label='NB', color='orange')
plt.plot([0, 1], [0, 1], linestyle='--', color='green')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()


In [None]:
# precision_recall_curve returns precision, recall and thresholds, in that order.
prec_NB, rec_NB, threshold_NB = precision_recall_curve(y_test, prob_NB)

# Recall goes on the x-axis and precision on the y-axis so the plotted data
# match the axis labels set below (and the argument order of metrics.auc).
plt.plot(rec_NB, prec_NB, color='orange', label='NB')
# Dashed horizontal reference line at precision 0.1.
plt.plot([1, 0], [0.1, 0.1], color='green', linestyle='--')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()

# calculate precision-recall AUC 
# `metrics` is the sklearn.metrics module imported in the Decision Tree
# precision-recall cell.
print(metrics.auc(rec_NB, prec_NB))

### Performance Comparison

In [None]:
# Overlay both ROC curves (NB first, then DT) with the dashed diagonal as reference.
plt.plot(fpr_NB, tpr_NB, label='NB', color='orange')
plt.plot(fpr_DT, tpr_DT, label='DT', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='green')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

In [None]:

# Overlay both precision-recall curves. Recall goes on the x-axis and
# precision on the y-axis so the plotted data match the axis labels below.
# NOTE(review): assumes the prec_* arrays hold precision and the rec_* arrays
# hold recall, as their names suggest.
plt.plot(rec_DT, prec_DT, color='blue', label='DT')
plt.plot(rec_NB, prec_NB, color='orange', label='NB')

# Dashed horizontal reference line at precision 0.1.
plt.plot([1, 0], [0.1, 0.1], color='black', linestyle='--')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()