<img src="mmu_logo.png" style="height: 80px;" align=left> 

# Week 5.1: Learning Objectives

By the end of this lesson, you should be able to:
- implement logistic regression in Python


# Load Python Libraries

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
plt.rc("font", size=14)

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import seaborn as sns
sns.set(style="whitegrid", color_codes=True)
sns.set(rc={'figure.figsize':(11,6)})

from scipy.stats import spearmanr 

import missingno as msno

pd.set_option('display.max_columns', 500)

In [None]:
!pip install missingno

# Read banking.csv

In [None]:
# Load the data set from the working directory into `df`, the frame every
# later cell works on, and preview its first rows.
df = pd.read_csv('banking.csv')
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape


In [None]:
# Column labels available in the loaded frame.
df.columns

In [None]:
# Scatter plot with a fitted regression line of `duration` against `age` on a
# small sample of rows.
# NOTE(review): .loc slices by label and includes both ends, so this takes the
# rows labelled 1..100 and skips label 0 (assuming the default integer index) —
# confirm that the first row is meant to be left out.
df_tmp = df.loc[1:100,['age','duration']]
sns.regplot(x='age',y='duration', data= df_tmp)

# Correlation Check

In [None]:
# Spearman rank correlation between `age` and `duration`, computed on the same
# label slice (1..100) that the scatter plot above uses.
sample_rows = slice(1, 100)
age = df.loc[sample_rows, ['age']]
duration = df.loc[sample_rows, ['duration']]

# spearmanr returns the correlation coefficient together with its p-value.
spearmanr_coef, p_value = spearmanr(age, duration)

print(f"coefficient= {spearmanr_coef}")
print(f"p-value= {p_value}")

# Check missing values

In [None]:
# Number of missing values per column. A notebook only displays the *last*
# expression of a cell, so this summary has to be printed explicitly or it is
# silently discarded.
print(df.isnull().sum())

# Bar chart of the non-missing count per column.
msno.bar(df)

In [None]:
# Remove every row that contains at least one missing value.
# The former follow-up `df.fillna(0)` was removed: once dropna() has run there
# is nothing left to fill, so it was a no-op that only suggested a second,
# contradictory missing-value strategy.
df = df.dropna()

# Plot chart for counting


In [None]:
# How many rows fall into each distinct value of the target column `y`.
df.y.value_counts()

In [None]:
# Bar chart with one bar per distinct value of the target column `y`.
b = sns.countplot(x='y', data=df)

# Write each bar's height (its row count) above the bar: the label is anchored
# at the top centre of the bar and shifted 18 points upwards.
# (Indentation normalised to four spaces; the original mixed tabs and spaces.)
for p in b.patches:
    b.annotate(
        "%.0f" % p.get_height(),
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center', rotation=0,
        xytext=(0, 18), textcoords='offset points',
    )

In [None]:
# Structural overview of the cleaned frame.
# df.info() prints by itself; the intermediate expressions must be printed
# explicitly, because a notebook cell only displays its last expression.
df.info()
print(df.dtypes)
print(df.describe())
df.shape

# Basic Transformation

In [None]:
# Distinct values of the `education` column before the transformation exercise.
df['education'].unique()

In [None]:
# your answer here...



In [None]:
# Same check as before the exercise cell above, repeated to inspect the
# `education` values once that cell has been filled in and run.
df['education'].unique()

In [None]:
# Class balance of the target: rows with y == 0 (no subscription) versus rows
# with y == 1 (subscription), each as a share of the two classes combined.
count_no_sub = len(df[df['y'] == 0])
count_sub = len(df[df['y'] == 1])
n_both_classes = count_no_sub + count_sub

pct_of_no_sub = count_no_sub / n_both_classes
pct_of_sub = count_sub / n_both_classes

print("percentage of no subscription is", pct_of_no_sub * 100)
print("percentage of subscription", pct_of_sub * 100)

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Mean of every numeric column per value of the target `y`.
# numeric_only=True is required: the frame also holds text columns (job,
# marital, education, ...), and pandas >= 2.0 raises a TypeError when asked to
# average those instead of silently dropping them.
df.groupby('y').mean(numeric_only=True)

In [None]:
# Mean of every numeric column per job category.
# numeric_only=True is required: the frame also holds text columns (marital,
# education, ...), and pandas >= 2.0 raises a TypeError when asked to average
# those instead of silently dropping them.
df.groupby('job').mean(numeric_only=True)

# Simple Visualization

In [None]:
# Grouped bar chart: for every job, one bar per value of `y` showing how many
# rows fall into that (job, y) combination.
job_by_outcome = pd.crosstab(df.job, df.y)
a = job_by_outcome.plot(kind='bar')
a.set_title('Purchase Frequency for Job Title')
a.set_xlabel('Job')
a.set_ylabel('Frequency of Purchase')


In [None]:
# Stacked bar chart of the share of each `y` value within every marital status.
table = pd.crosstab(df.marital, df.y)

# Divide each row by its own total so that every bar sums to 1.
row_totals = table.sum(axis=1).astype(float)
marital_shares = table.div(row_totals, axis=0)

marital_shares.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Marital Status vs Purchase')
plt.xlabel('Marital Status')
plt.ylabel('Proportion of Customers')

In [None]:
# Stacked bar chart of the share of each `y` value within every education level.
table = pd.crosstab(df.education, df.y)

# Divide each row by its own total so that every bar sums to 1.
row_totals = table.sum(axis=1).astype(float)
education_shares = table.div(row_totals, axis=0)

education_shares.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Education vs Purchase')
plt.xlabel('Education')
plt.ylabel('Proportion of Customers')

In [None]:
# Distribution of customer ages as a histogram (pandas' default binning).
df['age'].hist()
plt.title('Histogram of Age')
plt.xlabel('Age')
plt.ylabel('Frequency')

# Create Dummy Variables

In [None]:
# Preview the frame before the categorical columns are turned into dummies.
df.head()

In [None]:
# Names of the categorical columns to be replaced by dummy (indicator) columns.
cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']

# Exercise placeholder. The next cell reads `to_keep`, which is not defined
# anywhere in the visible notebook, so the code written here has to create it.
# your answer here...



In [None]:
# Keep only the columns listed in `to_keep` and show the resulting column names.
# NOTE(review): `to_keep` is expected to come from the exercise cell above; on
# an unfilled notebook this line raises NameError.
df_final=df[to_keep]
df_final.columns.values

In [None]:
# Preview the frame that is used for modelling.
df_final.head()

# Over-sampling using SMOTE

In [None]:
from imblearn.over_sampling import SMOTE 

In [None]:
# Separate predictors from the target; both stay DataFrames (y is a one-column
# frame because it is selected with a boolean column mask).
is_target_column = df_final.columns == 'y'
X = df_final.loc[:, ~is_target_column]
y = df_final.loc[:, is_target_column]

# (Also imported in the cell above; kept so this cell can run on its own.)
from imblearn.over_sampling import SMOTE


# Exercise placeholder. The report below reads `os_data_X` and `os_data_y`,
# which must be created here (os_data_y needs a column named 'y').
# your answer here...


# we can Check the numbers of our data
n_oversampled = len(os_data_X)
n_no_sub = len(os_data_y[os_data_y['y'] == 0])
n_sub = len(os_data_y[os_data_y['y'] == 1])

print("length of oversampled data is ", n_oversampled)
print("Number of no subscription in oversampled data", n_no_sub)
print("Number of subscription", n_sub)
print("Proportion of no subscription data in oversampled data is ", n_no_sub / n_oversampled)
print("Proportion of subscription data in oversampled data is ", n_sub / n_oversampled)

In [None]:
# your answer here...



In [None]:
# NOTE(review): `os_data_` is not defined anywhere in the visible notebook and
# looks like a truncated name — presumably `os_data_X` or `os_data_y`; confirm
# and complete it, otherwise this cell raises NameError.
len(os_data_)

In [None]:
# Cast every predictor column to integer.
# NOTE(review): this also truncates any non-integer numeric columns in X —
# confirm that is intended for the continuous features.
X = X.astype(int)

In [None]:
# Cast the `loan_no` dummy column to integer.
# Bracket indexing is used for the assignment: attribute-style assignment
# (`X.loan_no = ...`) only works for columns that already exist and would
# otherwise set a plain Python attribute on the frame without any error.
X['loan_no'] = X['loan_no'].astype(int)


# Using statsmodels

In [None]:
import statsmodels.api as sm

# Fit a logistic regression of y on X with statsmodels and print its summary
# table. maxiter=200 raises the optimiser's iteration limit.
# NOTE(review): no constant column is added to X here, so the model has an
# intercept only if X already contains one (or a full set of dummies) — confirm.
logit_model=sm.Logit(y,X)
result=logit_model.fit(maxiter=200)
print(result.summary2())

# Using sklearn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# Hold out 30% of the rows as a test set; random_state=0 makes the split
# reproducible. y.values.ravel() flattens the one-column target frame into the
# 1-D array that sklearn expects.
X_train, X_test, y_train, y_test = train_test_split(X, y.values.ravel(), test_size=0.3, random_state=0)

# Exercise placeholder. Later cells read `logreg` (a fitted classifier), which
# is not defined anywhere in the visible notebook.
# your answer here...



In [None]:
# your answer here...


# Mean accuracy of the fitted classifier on the held-out test set, printed
# with two decimals.
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

In [None]:
# Display the test-set predictions.
# NOTE(review): `y_pred` is expected to come from the exercise cell above.
y_pred

In [None]:
# Import the function under an alias: binding the *result* to the name
# `confusion_matrix` (which later cells read) would otherwise overwrite the
# function itself, so that re-running any cell that calls it fails with
# "TypeError: 'numpy.ndarray' object is not callable".
from sklearn.metrics import confusion_matrix as sk_confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix = sk_confusion_matrix(y_test, y_pred)
print(confusion_matrix)

In [None]:
# Flatten the matrix row by row into its four cells.
# Assumes a binary problem (2x2 matrix with classes ordered 0, 1); with more
# classes the unpacking into four names fails.
tn, fp, fn, tp = confusion_matrix.ravel()

In [None]:
# Printed in the order TP, FP, FN, TN — note this differs from the order in
# which the values were unpacked (tn, fp, fn, tp).
print(tp, fp, fn, tn)

In [None]:
from sklearn.metrics import classification_report

# your answer here...


In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Predicted probability of the positive class (second column of predict_proba).
# The same scores feed both the AUC and the curve. Computing the AUC from the
# hard 0/1 output of predict() reduces it to a single operating point, so the
# area shown in the legend would not match the plotted curve.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Final look at the first rows of the modelling frame.
df_final.head()
