In [8]:
%matplotlib inline

from pathlib import Path
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.classification import accuracy_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pylab as plt
from utilities import printConfusionMatrix

# Load the Universal Bank data and reduce it to the three columns used below.
DATA = Path('.').resolve().parent / 'data'
bank_df = pd.read_csv(DATA / 'UniversalBank.csv')
bank_df = bank_df.drop(columns=[
    'ID', 'Age', 'Experience', 'Income', 'ZIP Code',
    'Family', 'CCAvg', 'Education', 'Mortgage',
    'Securities Account', 'CD Account',
])

# Move the target column to the last position, then rename the remaining
# columns positionally to short names.
columns = list(bank_df.columns)
columns.append(columns.pop(columns.index('Personal Loan')))
bank_df = bank_df[columns]
bank_df.columns = ['Online', 'CC', 'Loan']

# Seeded 60% sample for training; every row not sampled goes to validation.
trainData = bank_df.sample(frac=0.6, random_state=12345)
validData = bank_df.drop(index=trainData.index)

Question 1 (2 points)
Create a pivot table for the training data with Online as a column variable, CC as a row variable, and Loan as a secondary row variable. The values inside the cells should convey the count (number of records).

In [9]:
# Count records for every (CC, Loan) row combination, split by Online.
# crosstab counts rows directly; pivot_table had no value column left to
# aggregate here (all three columns are grouping keys), which makes the
# result of a custom aggfunc depend on the pandas version.
table = pd.crosstab(index=[trainData['CC'], trainData['Loan']],
                    columns=trainData['Online'])
table

Unnamed: 0_level_0,Online,0,1
CC,Loan,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,795,1147
0,1,77,118
1,0,302,476
1,1,35,50


Question 2 (2 points)
Consider the task of classifying a customer who owns a bank credit card and is actively using online banking services. Looking at the pivot table that you created, what is the probability that this customer will accept the loan offer?
P(Loan=1|CC=1, Online=1)

In [10]:
# P(Loan=1|CC=1, Online=1): share of loan acceptors among the training
# customers that have both CC == 1 and Online == 1. Computed from the data
# rather than from counts re-typed from the printed pivot table, so it stays
# correct if the data or the split changes.
cc_and_online = trainData[(trainData.CC == 1) & (trainData.Online == 1)]
p = float((cc_and_online.Loan == 1).mean())
print("Probability (Customer will accept the loan if CC=1, Online == 1) = %0.4f" %p)

Probability (Customer will accept the loan if CC=1, Online == 1) = 0.0951


Question 3 (6 points)
Create two separate pivot tables for the training data. One will have Loan (rows) as a function of Online (columns) and the other will have 
Loan (rows) as a function of CC (columns). Compute the probabilities below (report three decimals).
Note: P(A|B) means "the probability of A given B".
P(CC = 1|Loan = 1) = the proportion of credit card holders among the loan acceptors
P(Online = 1|Loan = 1)
P(Loan = 1) = the proportion of loan acceptors
P(CC = 1|Loan = 0)
P(Online = 1|Loan = 0)
P(Loan = 0)

In [11]:
# Two-way count tables: Loan (rows) by Online, and Loan (rows) by CC.
# crosstab counts rows directly; the former pivot_table calls had no value
# column left to aggregate once the third column was dropped, so their
# result depended on the pandas version.
table1 = pd.crosstab(trainData['Loan'], trainData['Online'])
table2 = pd.crosstab(trainData['Loan'], trainData['CC'])
print(table1)
print(table2)

Online     0     1
Loan              
0       1097  1623
1        112   168
CC       0    1
Loan           
0     1942  778
1      195   85


In [12]:
# All six quantities are derived from trainData instead of counts copied by
# hand from the printed tables, so they cannot drift from the data.
loan_yes = trainData[trainData.Loan == 1]   # loan acceptors
loan_no = trainData[trainData.Loan == 0]    # non-acceptors
n_train = len(trainData)

#P(CC = 1|Loan = 1) = the proportion of credit card holders among the loan acceptors
p1 = float((loan_yes.CC == 1).mean())
#P(Online = 1|Loan = 1)
p2 = float((loan_yes.Online == 1).mean())
#P(Loan = 1) = the proportion of loan acceptors
p3 = len(loan_yes) / n_train
#P(CC = 1|Loan = 0)
p4 = float((loan_no.CC == 1).mean())
#P(Online = 1|Loan = 0)
p5 = float((loan_no.Online == 1).mean())
#P(Loan = 0)
p6 = len(loan_no) / n_train
print("P(CC = 1|Loan = 1) = %0.3f\nP(Online = 1|Loan = 1) = %0.3f\nP(Loan = 1) = %0.3f\nP(CC = 1|Loan = 0) = %0.3f\n"
"P(Online = 1|Loan = 0) = %0.3f\nP(Loan = 0) = %0.3f" %(p1, p2, p3, p4, p5, p6)) 

P(CC = 1|Loan = 1) = 0.304
P(Online = 1|Loan = 1) = 0.600
P(Loan = 1) = 0.093
P(CC = 1|Loan = 0) = 0.286
P(Online = 1|Loan = 0) = 0.597
P(Loan = 0) = 0.907


Question 4 (2 points) 
Compute the naive Bayes probability P(Loan = 1|CC = 1, Online = 1). 
Note: Use the quantities that you computed in the previous question. 

In [13]:
#P(Loan = 1|CC = 1, Online = 1)
#  = P(Loan = 1) * P(CC = 1, Online = 1|Loan = 1) / P(CC = 1, Online = 1)
#Naive Bayes assumes CC and Online are independent *given Loan*, so
#  P(CC = 1, Online = 1|Loan = k) = P(CC = 1|Loan = k) * P(Online = 1|Loan = k)
#and the denominator is the sum of that product times P(Loan = k) over both
#classes. Using P(CC = 1) * P(Online = 1) instead would assume marginal
#independence, and the two class probabilities would no longer sum to 1.
score_loan1 = p3 * p1 * p2   # P(Loan = 1) * P(CC = 1|Loan = 1) * P(Online = 1|Loan = 1)
score_loan0 = p6 * p4 * p5   # P(Loan = 0) * P(CC = 1|Loan = 0) * P(Online = 1|Loan = 0)
p = score_loan1 / (score_loan1 + score_loan0)
print("the naive Bayes probability P(Loan = 1|CC = 1, Online = 1) = %0.4f" %p)
# Marginal shares of CC == 1 and Online == 1 in the training data.
pCC = trainData.CC.mean()
pOnline = trainData.Online.mean()

the naive Bayes probability P(Loan = 1|CC = 1, Online = 1) = 0.0990


Question 5 (2 points) 
Of the two values that you computed earlier (in Q2 and Q4), which is a more accurate estimate of P(Loan=1|CC=1, Online=1)? 

The value in Q2 is the more accurate estimate of P(Loan=1|CC=1, Online=1): it is computed directly from the observed counts, whereas the naive Bayes probability relies on the assumption that CC and Online are independent given Loan.

Question 6 (6 points) 
In Python, run naive Bayes on the training data, examine the output, and find the entries that are needed for computing P(Loan = 1|CC = 1, Online = 1). Compute this probability, and also report the model's predicted probability for P(Loan = 1|CC = 1, Online = 1).

In [14]:
# Show the dtype of every column in the training frame.
trainData.dtypes

Online    int64
CC        int64
Loan      int64
dtype: object

In [15]:
# One-hot encode the two predictors: cast them to category so get_dummies
# expands them into <name>_<level> indicator columns.
trainData = pd.get_dummies(
    trainData.astype({'Online': 'category', 'CC': 'category'}),
    prefix_sep='_',
)
trainData['Loan'] = trainData['Loan'].astype('category')

# Fit a multinomial naive Bayes model on the indicator columns.
loans_nb = MultinomialNB(alpha=0.01)
loans_nb.fit(trainData.drop(columns=['Loan']), trainData['Loan'])

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [16]:
# Show the column dtypes again after the category cast / get_dummies step.
trainData.dtypes

Loan        category
Online_0       uint8
Online_1       uint8
CC_0           uint8
CC_1           uint8
dtype: object

In [21]:
# Re-encode the target with a LabelEncoder and keep its class labels.
le = preprocessing.LabelEncoder()
trainData.loc[:, 'Loan'] = le.fit_transform(trainData['Loan'])
classes = le.classes_

# The model stores log priors; exponentiate once and reuse the values.
priors = [math.exp(log_prior) for log_prior in loans_nb.class_log_prior_]
prio0 = priors[0]
prio1 = priors[1]

print('A-priori probabilities')
print(['{}: {}'.format(cls, prior) for cls, prior in zip(classes, priors)])
print('P(Loan = 0) = ', prio0)
print('P(Loan = 1) = ', prio1)

A-priori probabilities
['0: 0.9066666666666672', '1: 0.09333333333333338']
P(Loan = 0) =  0.9066666666666672
P(Loan = 1) =  0.09333333333333338


In [42]:
# Print the fitted model's class priors and, per class, the exponentiated
# feature_log_prob_ entries as a table with one column per feature.
classes = [0, 1]
columns = list(trainData.columns)
columns.remove('Loan')
print('A-priori probabilities')
print(['{}: {}'.format(cls, math.exp(p)) for cls, p in zip(classes, loans_nb.class_log_prior_)])

print('\nConditional probabilities')

nvars = 5      # maximum number of feature columns per printed chunk
clsWidth = 1   # width of the class label
colWidth = max((len(column) for column in columns), default=0)
# Construct the format statements
fmt1 = '{{:>{}}}'.format(clsWidth + 1)

# Walk the feature columns in chunks of nvars. range() also visits a final
# chunk that holds a single column; the previous loop condition
# `i1 < len(columns) - 1` skipped such a chunk.
for i1 in range(0, len(columns), nvars):
    i2 = min(i1 + nvars, len(columns))
    # Header row: blank class cell followed by the column names.
    print(fmt1.format(''), end='')
    fmt2 = '{{:>{}}}'.format(colWidth + 1) * (i2 - i1)
    print(fmt2.format(*(columns[i1:i2])))
    # One row per class with the probabilities to four decimals.
    fmt2 = '{{:{}.4f}}'.format(colWidth + 1) * (i2 - i1)
    for i, cls in enumerate(classes):
        print(fmt1.format(cls), end='')
        row = [math.exp(p) for p in loans_nb.feature_log_prob_[i][i1:i2]]
        print(fmt2.format(*row))
    print()
# NOTE: in the printed table each class row sums to 1 across all four dummy
# columns, so each pair (Online_0/Online_1, CC_0/CC_1) sums to 0.5. To read a
# per-variable conditional probability, renormalise within the pair, i.e.
# multiply by 2: e.g. 2 * 0.3000 = 0.600 = P(Online = 1|Loan = 1).

A-priori probabilities
['0: 0.9066666666666672', '1: 0.09333333333333338']

Conditional probabilities
   Online_0 Online_1     CC_0     CC_1
 0   0.2017   0.2983   0.3570   0.1430
 1   0.2000   0.3000   0.3482   0.1518

0.15178571428571427 0.3 0.04666666666666667 0.14301470588235293 0.2983455882352941 0.4533333333333333


In [23]:
# Predicted class probabilities for every training row, attached to the
# frame as two extra columns (0 and 1) aligned on the row index.
predProb = loans_nb.predict_proba(trainData.drop(columns=['Loan']))
probFrame = pd.DataFrame(predProb, index=trainData.index)
predicted = pd.concat([trainData, probFrame], axis=1)

# Show the first rows where both Online_1 and CC_1 are set.
isOnlineWithCC = predicted['Online_1'].eq(1) & predicted['CC_1'].eq(1)
predicted[isOnlineWithCC].head()

Unnamed: 0,Loan,Online_0,Online_1,CC_0,CC_1,0,1
2282,0,0,1,0,1,0.901012,0.098988
4456,0,0,1,0,1,0.901012,0.098988
143,0,0,1,0,1,0.901012,0.098988
1946,0,0,1,0,1,0.901012,0.098988
29,1,0,1,0,1,0.901012,0.098988
