In [None]:
#Importing Libraries
# All third-party dependencies plus the global plotting configuration for the notebook.
# NOTE(review): preprocessing, RFE, visuz, SelectKBest and chi2 are not referenced
# by any cell visible in this notebook - confirm before pruning them.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
# Default matplotlib font size for the figures drawn below.
# NOTE(review): the sns.set(...) calls further down may reset font sizes - confirm.
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
# Two consecutive theme calls: the second one ("whitegrid") runs last, so it is
# the style that stays in effect.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.feature_selection import RFE
# Repeats the LogisticRegression import made a few lines above.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from bioinfokit import visuz
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
# Load the raw export from the working directory.
df = pd.read_csv('BankChurners.csv')

# Keep only the columns at positions 1..20 (the slice end, 21, is exclusive);
# the first column and everything from position 21 onward are dropped.
kept_columns = df.columns[slice(1, 21)]
df = df.loc[:, kept_columns]

# Recode the target as 0 (existing customer) / 1 (attrited customer).
# Labels missing from the mapping come out of Series.map as NaN.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
df['Attrition_Flag'] = df['Attrition_Flag'].map(attrition_codes)

In [None]:
# EDA: class balance of the target - one bar per Attrition_Flag value.
countplot_options = dict(x='Attrition_Flag', data=df, palette='Paired')
sns.countplot(**countplot_options)
plt.show()

In [None]:
# Share of each target class, computed over the rows whose flag is 0 or 1.
is_retained = df['Attrition_Flag'] == 0
is_attrited = df['Attrition_Flag'] == 1

# int(...) keeps these plain Python ints, exactly like len() of a filtered frame.
count_no_sub = int(is_retained.sum())
count_sub = int(is_attrited.sum())
labelled_rows = count_no_sub + count_sub

pct_of_no_sub = count_no_sub / labelled_rows
print("percentage of no attrition is", pct_of_no_sub*100)
pct_of_sub = count_sub / labelled_rows
print("percentage of attrition is", pct_of_sub*100)

In [None]:
#EDA Viz
%matplotlib inline
pd.crosstab(df.Education_Level,df.Attrition_Flag).plot(kind='bar')
plt.title('Attrition Frequency for Education Level')
plt.xlabel('Education Level')
plt.ylabel('Count')

In [None]:
#EDA Viz
%matplotlib inline
pd.crosstab(df.Income_Category,df.Attrition_Flag).plot(kind='bar')
plt.title('Attrition Frequency for Income Category')
plt.xlabel('Income Category')
plt.ylabel('Count')

In [None]:
#EDA Viz
%matplotlib inline
pd.crosstab(df.Card_Category,df.Attrition_Flag).plot(kind='bar')
plt.title('Attrition Frequency for Card Category')
plt.xlabel('Card Category')
plt.ylabel('Count')

In [None]:
#EDA Viz
%matplotlib inline
pd.crosstab(df.Gender,df.Attrition_Flag).plot(kind='bar')
plt.title('Attrition Frequency for Gender')
plt.xlabel('Gender')
plt.ylabel('Count')

In [None]:
#EDA Viz
%matplotlib inline
pd.crosstab(df.Marital_Status,df.Attrition_Flag).plot(kind='bar')
plt.title('Attrition Frequency for Marital Status')
plt.xlabel('Marital Status')
plt.ylabel('Count')

In [None]:
# EDA: distribution of customer age.
ax = df['Customer_Age'].hist()
ax.set_title('Histogram of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')

In [None]:
#One Hot Encoding for Categorical Variables
# Each listed column is replaced by one 0/1 indicator column per category.
categorical_cols = ["Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category"]

# dtype=int produces integer indicators directly, so no follow-up cast is needed.
# Do NOT cast with `df.iloc[14:] = df.iloc[14:].astype(int)`: `iloc[14:]` slices
# ROWS (position 14 onward, every column), so that cast truncates the fractional
# part of all continuous features in those rows instead of touching only the
# dummy columns.
df = pd.get_dummies(df, columns=categorical_cols, dtype=int)

In [None]:
# Creating Predictor and Response DFs
# Boolean mask over the columns: True only for the target column.
is_target = df.columns == 'Attrition_Flag'

# X: every column except the target; y: a single-column DataFrame holding the
# target, cast to integer.
X = df.loc[:, ~is_target]
y = df.loc[:, is_target].astype('int')

In [None]:
# Hold out 25% of the rows for testing; random_state=0 makes the shuffle repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Logistic regression with the iteration cap raised to 1000.
# NOTE(review): features are passed in unscaled - confirm the solver converges
# within max_iter on this data.
logreg = LogisticRegression(max_iter=1000)

# y_train is a one-column DataFrame; flatten it to the 1-D array passed to fit().
train_labels = y_train.to_numpy().ravel()

# `model` is whatever fit() returns (presumably the fitted `logreg` itself);
# later cells read coefficients from `model` and predictions from `logreg`.
model = logreg.fit(X_train, train_labels)

# Hard class predictions for the held-out rows.
y_pred = logreg.predict(X_test)

In [None]:
# Confusion matrix of held-out labels vs. model predictions; displayed as the
# cell output.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
#Confusion Matrix Visualization
class_names = [0, 1]  # name of classes
# Not needed for the heatmap itself; kept so any later cell that reads
# `tick_marks` keeps working.
tick_marks = np.arange(len(class_names))

fig, ax = plt.subplots()

# create heatmap
# Draw on the explicit Axes and hand the class names to heatmap directly.
# Setting ticks with plt.xticks/plt.yticks *before* this call has no lasting
# effect, because heatmap installs its own ticks and tick labels.
sns.heatmap(
    pd.DataFrame(cnf_matrix),
    annot=True,
    cmap="YlGnBu",
    fmt='g',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')

# Run the layout pass last so it accounts for the title and axis labels.
fig.tight_layout()

In [None]:
# Performance measures on the held-out set, printed one per line.
scorers = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
)
for label, scorer in scorers:
    print(label, scorer(y_test, y_pred))

In [None]:
# ROC curve and AUC on the held-out set.
# Second column of predict_proba: presumably the probability of class 1, given
# the 0/1 target encoding - verify against logreg.classes_.
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

auc = metrics.roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)

curve_label = "data 1, auc=" + str(auc)
plt.plot(fpr, tpr, label=curve_label)
plt.legend(loc='lower right')  # same position as numeric location code 4
plt.show()

In [None]:
# Extracting Model Coefficients
# First row of coef_: one fitted coefficient per predictor column.
log_odds = model.coef_[0]

# One row per predictor, largest coefficient first; shown as the cell output.
coef_table = pd.DataFrame(data=log_odds, index=X.columns, columns=['coef'])
coef_table.sort_values(by='coef', ascending=False)