## Ensemble Methods

## 1. ESSENTIAL IMPORTS

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score


## 2. Data Import and View

In [None]:
# Load Finance.csv into a DataFrame.
# NOTE(review): absolute, machine-specific Windows path — point it at the local copy of the file before running.
data = pd.read_csv("D:/FTI/Module 3 Supervised Machine Learning/Lecture3/Finance.csv") 

In [None]:
# Display the loaded DataFrame (the notebook renders the cell's last expression).
data

Dataset source: UCI Machine Learning Repository, Bank Marketing data set — https://archive.ics.uci.edu/ml/datasets/bank+marketing

## 3. Checking for Missing Values

In [None]:
# Count the missing values in each column.
data.isnull().sum()

## 4. Checking for Data Types

In [None]:
# Show the dtype of each column.
data.dtypes

## 5. Apply Standardization (z-score scaling) to Numeric Columns

In [None]:
# Split the column names into numeric ones (to be scaled) and everything
# else (treated as categorical).
# select_dtypes(include='number') is used instead of an exact
# `dtype == 'int64'` comparison so that float columns and integers of other
# widths are also recognised as numeric; with the exact comparison they fall
# into the categorical list, where they are neither scaled nor one-hot
# encoded.
num_col = data.select_dtypes(include='number').columns.tolist()
_numeric_names = set(num_col)
cat_col = [col for col in data.columns if col not in _numeric_names]

In [None]:
# Materialise the numeric and the categorical parts of the table as two
# separate DataFrames, selected by the column-name lists.
df_num_col, df_cat_col = data[num_col], data[cat_col]

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns (zero mean, unit variance) and round the
# result to three decimals.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the features; consider
# fitting it on the training rows only.
std = StandardScaler()
scaled = std.fit_transform(data[num_col]).round(3)
# Re-attach the original row index: pd.concat(axis=1) aligns on the index, so
# a fresh 0..n-1 index here would misalign rows (and introduce NaNs) whenever
# `data` does not carry a plain default index.
scaled = pd.DataFrame(scaled, columns=num_col, index=data.index)

In [None]:
# Put the untouched categorical columns and the scaled numeric columns back
# together side by side (column-wise concatenation).
df = pd.concat([df_cat_col, scaled], axis='columns')

In [None]:
# Display the recombined DataFrame.
df

In [None]:
# Rebind `data` to an independent copy of `df`, so later edits to `data` leave `df` untouched.
data=df.copy()

## 6. Preprocessing

In [None]:
# Encode the target column as integers: 'yes' -> 1, 'no' -> 0.
# The result is assigned back explicitly because
# `data['y'].replace(..., inplace=True)` works on an intermediate Series:
# pandas warns about that pattern and, with Copy-on-Write enabled, it leaves
# `data` unchanged.
# Values that are already 0/1 map to themselves so the cell can be re-run
# safely; any other label becomes NaN and makes the integer cast fail loudly
# instead of leaving strings in the target.
data['y'] = data['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0}).astype('int64')

In [None]:
# Preview the first rows to check the encoded target column.
data.head()

### Encoding 

In [None]:
# One-hot encode the DataFrame with pandas' default get_dummies behaviour
# (string/categorical columns are expanded into indicator columns; other
# columns are passed through), then preview the first rows.
df_dummies = pd.get_dummies(data)
df_dummies.head()

### View Correlation 

In [None]:
# Bar chart of each column's correlation with the target `y`, ordered from
# the most positive to the most negative correlation.
plt.figure(figsize=(15, 8))
df_dummies.corr()['y'].sort_values(ascending=False).plot.bar()

In [None]:
# Separate the feature matrix (every column except `y`) from the target
# vector, which is kept as a NumPy array.
X = df_dummies.drop(columns='y')
y = df_dummies['y'].to_numpy()

## 7. Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## 8. Model Fitting and Evaluation (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a 200-tree random forest on the training split, then predict labels
# for the held-out test split.
clf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
prediction_test = clf.predict(X_test)

## Classification Report

In [None]:
def classification_report(model, X=None, y=None):
    """Print an evaluation summary for a fitted classifier.

    Prints the classifier itself, its accuracy, the confusion matrix, the
    text report produced by ``metrics.classification_report`` and the ROC AUC
    (computed from column 1 of ``predict_proba``).

    NOTE(review): this function has the same name as
    ``sklearn.metrics.classification_report`` imported at the top of the
    notebook and therefore shadows that import; the sklearn function is
    reached through the ``metrics`` module below.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : features to evaluate on. Defaults to the notebook-level ``X_test``.
    y : true labels matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None. Everything is printed.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working exactly as before.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    predicted = model.predict(X)
    accuracy = metrics.accuracy_score(y, predicted)
    conf_mat = metrics.confusion_matrix(y, predicted)
    report = metrics.classification_report(y, predicted)
    # Column 1 of predict_proba is the probability of the second class.
    probs = model.predict_proba(X)[:, 1]
    auc = roc_auc_score(y, probs)

    print('============================== Model Evaluation ==============================')
    print('Classifier :', model)
    print("\n Model Accuracy:\n", accuracy)
    print()
    print("Confusion matrix:\n", conf_mat)
    print()
    print("Classification report:\n", report)
    print('AUC: %.3f' % auc)
   

In [None]:
# Print the evaluation summary for the Random Forest currently held in `clf`.
classification_report(clf)

## Confusion Matrix

In [None]:
# Class labels in the order they appear on both axes of the matrix
# (ascending, matching the order sklearn uses by default).
class_names = [0, 1]


def plot_conf_matrix(y_test, prediction_test, class_names):
    """Draw the confusion matrix of a set of predictions as a heatmap.

    Parameters
    ----------
    y_test : true labels.
    prediction_test : predicted labels, aligned with ``y_test``.
    class_names : class labels, in the order in which they should appear on
        the rows (actual) and columns (predicted) of the matrix.

    Returns
    -------
    None. The heatmap is drawn on a new matplotlib figure.
    """
    # `labels` pins the row/column order of the matrix to class_names.
    cnf_matrix = metrics.confusion_matrix(
        y_test, prediction_test, labels=class_names
    )
    # Carry the labels on the DataFrame so seaborn uses them as tick labels;
    # ticks set with plt.xticks/plt.yticks before the heatmap call would be
    # replaced by seaborn's own ticks.
    labelled = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)

    _fig, ax = plt.subplots()
    sns.heatmap(labelled, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    

In [None]:
# Plot the confusion matrix for the Random Forest predictions stored in `prediction_test`.
class_names=[0,1]
plot_conf_matrix(y_test, prediction_test, class_names)

## ROC Curve

In [None]:
def plot_roc(model,X_test,y_test):
    """Plot the ROC curve of a fitted classifier on the given data.

    The curve is labelled and titled with the model's class name, so the
    same function gives a correctly labelled plot for every classifier it is
    called with.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X_test : features to score.
    y_test : true labels matching ``X_test``.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # Column 1 of predict_proba is the probability of the second class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    model_name = type(model).__name__

    # Dashed diagonal: reference line for a classifier with no skill.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label=model_name)
    plt.xlabel('fpr')
    plt.ylabel('tpr')
    plt.title('%s ROC curve' % model_name)
    # Without a legend the curve label is never displayed.
    plt.legend()
    plt.show()

In [None]:
# Plot the ROC curve of the Random Forest held in `clf` on the test split.
plot_roc(clf,X_test,y_test)

## Feature Importance using Random Forest

In [None]:
# Pair each importance score with its feature (column) name and print the
# pairs ordered from the highest score to the lowest.
importances = clf.feature_importances_
print("Sorted Feature Importance:")
sorted_feature_importance = sorted(zip(importances, X_train.columns), reverse=True)
print(sorted_feature_importance)

In [None]:
# Importance scores as a Series indexed by feature name, sorted from largest to smallest.
feature_imp = pd.Series(clf.feature_importances_,index=X_train.columns).sort_values(ascending=False)

In [None]:
# Bar plot of the feature importances: scores on the x-axis, feature names on
# the y-axis, in the sorted order of `feature_imp`.
plt.figure(figsize=(16,12))
sns.barplot(x=feature_imp, y=feature_imp.index)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Fit AdaBoost with its default settings and predict the test split.
# The predictions are stored in `predictions_test` (plural), a different
# variable from the Random Forest's `prediction_test`.
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)
predictions_test = clf.predict(X_test)

In [None]:
# Print the evaluation summary for the AdaBoost model now held in `clf`.
classification_report(clf)

In [None]:
# Confusion matrix for the AdaBoost model. Its predictions are stored in
# `predictions_test`; `prediction_test` (singular) still holds the Random
# Forest output, so passing that would re-plot the Random Forest matrix.
class_names = [0, 1]
plot_conf_matrix(y_test, predictions_test, class_names)

In [None]:
# Plot the ROC curve of the AdaBoost model held in `clf` on the test split.
plot_roc(clf,X_test,y_test)

## Gradient Boosting 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit gradient boosting with its default settings and predict the test
# split. The predictions are stored in `predictions_test` (plural).
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
predictions_test = clf.predict(X_test)

In [None]:
# Print the evaluation summary for the Gradient Boosting model now held in `clf`.
classification_report(clf)

In [None]:
# Confusion matrix for the Gradient Boosting model. Its predictions are
# stored in `predictions_test`; `prediction_test` (singular) still holds the
# Random Forest output, so passing that would re-plot the Random Forest
# matrix.
class_names = [0, 1]
plot_conf_matrix(y_test, predictions_test, class_names)

In [None]:
# Plot the ROC curve of the Gradient Boosting model held in `clf` on the test split.
plot_roc(clf,X_test,y_test)