In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from sklearn.metrics import mean_squared_error
from matplotlib.legend_handler import HandlerLine2D
from sklearn.inspection import PartialDependenceDisplay

In [None]:
# Load the raw churn dataset from the working directory.
df = pd.read_csv('BankChurners.csv')

# Keep the columns at positions 1..20 (drops the first column and anything
# from position 21 onward).
df = df[df.columns[1:21]]

# Encode the target: 0 = existing customer, 1 = attrited customer.
# Any label outside these two becomes NaN.
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})

# One-hot encode the categorical columns. dtype=int makes the dummy columns
# 0/1 integers directly.
# NOTE: the previous follow-up step `df.iloc[14:] = df.iloc[14:].astype(int)`
# sliced ROWS 14 onward (not columns), which cast every column of those rows
# to int and so truncated any float-valued features. It has been removed.
df = pd.get_dummies(
    df,
    columns=["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"],
    dtype=int,
)

In [None]:
#Creating Training/Test Sets
# Features are every column except the target; the target is kept as a
# single-column DataFrame of ints (later cells call .values.ravel() on it).
X = df.drop(columns='Attrition_Flag')
y = df[['Attrition_Flag']].astype('int')

# 70% training and 30% test.
# random_state makes the split reproducible across kernel restarts;
# stratify keeps the class ratio of the target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Create a random forest classifier with 100 trees.
# random_state is fixed so that re-running the notebook gives the same model.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model; ravel() flattens the single-column target frame to 1-D.
clf.fit(X_train, y_train.values.ravel())

# Hard class predictions (0/1) for the held-out test set.
y_pred = clf.predict(X_test)

In [None]:
# Report accuracy, precision and recall of the test-set predictions,
# one "<label> <value>" line per metric.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
#New Data Prediction
# The values are positional and must follow the column order the model was
# trained on. Labelling the row with the fitted feature names avoids sklearn's
# "X does not have valid feature names" warning and raises immediately if the
# number of values does not match the number of training features.
new_customer = pd.DataFrame(
    [[27, 1, 12, 1, 0, 1, 8000, 150, 7850, .2, 3500, 52, .3, .45, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]],
    columns=clf.feature_names_in_,
)
clf.predict(new_customer)

In [None]:
# Calculating the most important features: the fitted forest's importances,
# labelled by feature column and ordered from largest to smallest.
feature_imp = (
    pd.Series(data=clf.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
feature_imp

In [None]:
%matplotlib inline

# Creating a bar plot
rcParams['figure.figsize'] = 15,10
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()

In [None]:
# Creating New Random Forest Model with a Random Variable Included

In [None]:
# New Dataframe
df_random = df

In [None]:
# Creating Random Variable
df_random['random'] = np.random.randint(1, 100, df_random.shape[0])

In [None]:
# Train/Test Split
X_rand = df.loc[:, df_random.columns != 'Attrition_Flag']
y_rand = df.loc[:, df_random.columns == 'Attrition_Flag']

y_rand = y_rand.astype('int')

X_train_rand, X_test_rand, y_train_rand, y_test_rand = train_test_split(X_rand, y_rand, test_size=0.3)

In [None]:
# Create a random forest classifier with 500 trees for the data set that
# includes the random noise column.
# random_state is fixed so that the importances are reproducible.
clf_rand = RandomForestClassifier(n_estimators=500, random_state=42)

# Train the model; ravel() flattens the single-column target frame to 1-D.
clf_rand.fit(X_train_rand, y_train_rand.values.ravel())

# Hard class predictions (0/1) for the held-out test set.
y_pred_rand = clf_rand.predict(X_test_rand)

In [None]:
# Feature importances of the model trained with the random column included,
# labelled by column name and sorted in descending order.
feature_imp_rand = pd.Series(
    data=clf_rand.feature_importances_,
    index=X_rand.columns,
).sort_values(ascending=False)
feature_imp_rand

In [None]:
%matplotlib inline

# Creating a bar plot
rcParams['figure.figsize'] = 15,10
sns.barplot(x=feature_imp_rand, y=feature_imp_rand.index)

# Adding Labels
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()

In [None]:
# Creating New Random Forest Model with only features that did better than Random Variable

In [None]:
# New Dataframe
df_2 = df

In [None]:
# Only Keeping Variables that did better than random
df_2 = df_2[["Attrition_Flag", "Total_Trans_Amt", "Total_Trans_Ct", "Total_Revolving_Bal", "Total_Relationship_Count", "Credit_Limit", "Avg_Open_To_Buy", "Customer_Age", "Contacts_Count_12_mon", "Months_on_book"]]

In [None]:
# Test/Train Split
# Features are every column of df_2 except the target; the target stays a
# single-column DataFrame of ints (the fit cell calls .values.ravel() on it).
X_2 = df_2.drop(columns='Attrition_Flag')
y_2 = df_2[['Attrition_Flag']].astype('int')

# 70% training and 30% test.
# random_state makes the split reproducible; stratify keeps the class ratio
# of the target the same in both partitions.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.3, random_state=42, stratify=y_2
)

In [None]:
# Create a random forest classifier with 100 trees for the reduced feature set.
# random_state is fixed so that re-running the notebook gives the same model.
clf_2 = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model; ravel() flattens the single-column target frame to 1-D.
clf_2.fit(X_train_2, y_train_2.values.ravel())

# Hard class predictions (0/1) for the held-out test set.
y_pred_2 = clf_2.predict(X_test_2)

In [None]:
# Model metrics for the reduced-feature model: each scorer is evaluated on the
# test labels and predictions and printed as "<label> <value>".
metric_labels_2 = ("Accuracy:", "Precision:", "Recall:")
metric_fns_2 = (metrics.accuracy_score, metrics.precision_score, metrics.recall_score)
for label_2, scorer_2 in zip(metric_labels_2, metric_fns_2):
    print(label_2, scorer_2(y_test_2, y_pred_2))

In [None]:
# Importances of the reduced-feature model, indexed by feature name and
# ordered from most to least important.
feature_imp_2 = (
    pd.Series(data=clf_2.feature_importances_, index=X_2.columns)
    .sort_values(ascending=False)
)
feature_imp_2

In [None]:
%matplotlib inline

# Creating a bar plot
rcParams['figure.figsize'] = 15,10
sns.barplot(x=feature_imp_2, y=feature_imp_2.index)

# Adding labels
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.show()

In [None]:
#ROC Curve
# Score each test row with the predicted probability of the positive class
# (column 1 of predict_proba corresponds to label 1, the attrited class).
# Feeding hard 0/1 predictions into roc_curve yields a single operating point
# and an AUC that does not reflect the model's ranking quality.
y_score_2 = clf_2.predict_proba(X_test_2)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test_2, y_score_2)
auc = metrics.roc_auc_score(y_test_2, y_score_2)

plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc=4)
plt.show()