In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from mord import LogisticIT
import matplotlib.pylab as plt
import seaborn as sns
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score

In [3]:
bank_df = pd.read_csv("D:/dmba/UniversalBank.csv")
bank_df.drop(columns=['ID', 'ZIP Code'], inplace=True)
bank_df.columns = [c.replace(' ', '_') for c in bank_df.columns]
bank_df

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,3,0,0,0,0,1,0
4996,30,4,15,4,0.4,1,85,0,0,0,1,0
4997,63,39,24,2,0.3,3,0,0,0,0,0,0
4998,65,40,49,3,0.5,2,0,0,0,0,1,0


In [9]:
bank_df['Education'] = bank_df['Education'].astype('category')
bank_df.Education.cat.categories


0                   Undergrad
1                   Undergrad
2                   Undergrad
3                    Graduate
4                    Graduate
                ...          
4995    Advanced/Professional
4996                Undergrad
4997    Advanced/Professional
4998                 Graduate
4999                Undergrad
Name: Education, Length: 5000, dtype: category
Categories (3, object): ['Undergrad', 'Graduate', 'Advanced/Professional']

In [11]:
new_categories = {1:'Undergrad', 2:'Graduate', 3:'Advanced/Professional'}

bank_df.Education.cat.categories

Index([1, 2, 3], dtype='int64')

In [13]:
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True)
bank_df

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard,Education_2,Education_3
0,25,1,49,4,1.6,0,0,1,0,0,0,False,False
1,45,19,34,3,1.5,0,0,1,0,0,0,False,False
2,39,15,11,1,1.0,0,0,0,0,0,0,False,False
3,35,9,100,1,2.7,0,0,0,0,0,0,True,False
4,35,8,45,4,1.0,0,0,0,0,0,1,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,0,0,0,0,1,0,False,True
4996,30,4,15,4,0.4,85,0,0,0,1,0,False,False
4997,63,39,24,2,0.3,0,0,0,0,0,0,False,True
4998,65,40,49,3,0.5,0,0,0,0,1,0,True,False


In [14]:
y = bank_df['Personal_Loan']
X = bank_df.drop(columns=['Personal_Loan'])

In [15]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

In [18]:
logit_reg = LogisticRegression(penalty='l2', C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

print('intercept ', logit_reg.intercept_[0])
print(pd.DataFrame({'coeff':logit_reg.coef_[0]}, index=X.columns).transpose())

print('AIC', AIC_score(valid_y, logit_reg.predict(valid_X), df = len(train_X.columns) + 1))

intercept  -12.631626696562861
            Age  Experience    Income    Family     CCAvg  Mortgage  \
coeff -0.032052    0.033667  0.058825  0.614463  0.240512  0.001012   

       Securities_Account  CD_Account    Online  CreditCard  Education_2  \
coeff           -1.030099    3.653239 -0.677388   -0.957917     4.190596   

       Education_3  
coeff     4.340342  
AIC -709.1524769205962


In [None]:
logit_reg_pred = logit_reg.predict(valid_X)
logit_reg_proba = logit_reg.predict_proba(valid_X)
logit_result = pd.DataFrame({'actual':})