In [9]:
import numpy as np
import pandas as pd

In [10]:
# Part 1 Data Acquisition

df = pd.read_csv('PersonalLoan.csv')

# Examine if it is an individual-level data set
print(df.head(10))
print(df.columns.values)
print(df.shape)

   Age  Experience  Income  ZIPCode  Family  CCAvg  Education  Mortgage  \
0   25           1      49    91107       4    1.6          1         0   
1   45          19      34    90089       3    1.5          1         0   
2   39          15      11    94720       1    1.0          1         0   
3   35           9     100    94112       1    2.7          2         0   
4   35           8      45    91330       4    1.0          2         0   
5   37          13      29    92121       4    0.4          2       155   
6   53          27      72    91711       2    1.5          2         0   
7   50          24      22    93943       1    0.3          3         0   
8   35          10      81    90089       3    0.6          2       104   
9   34           9     180    93023       1    8.9          3         0   

  SecuritiesAccount CDAccount Online CreditCard PersonalLoan  
0               Yes        No     No         No           No  
1               Yes        No     No         No 

In [11]:
# Part 3 Missing Value Imputation

# Drop ZIP Code for now
rvar_list =['ZIPCode']
df_sample1 = df.drop(columns=rvar_list)

# One for numerical, the other for categorical 
cvar_list = ['Education', 'SecuritiesAccount', 'CDAccount', 'Online', 'CreditCard', 'PersonalLoan']
nvar_list = ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Mortgage']

# Check if there is any missing value
df_sample1.isnull().sum()


Age                  0
Experience           0
Income               0
Family               0
CCAvg                0
Education            0
Mortgage             0
SecuritiesAccount    0
CDAccount            0
Online               0
CreditCard           0
PersonalLoan         0
dtype: int64

In [12]:
# Part 4 Variable Transformation

# Standardize the numerical variables 
df_sample2 = df_sample1.copy()
df_sample2[nvar_list] = (df_sample1[nvar_list] - df_sample1[nvar_list].mean())/df_sample1[nvar_list].std()

df_sample3 = df_sample2.copy()
df_sample3[cvar_list] = df_sample2[cvar_list].astype('category')
df_sample3[nvar_list] = df_sample2[nvar_list].astype('float64')

# Convert the categorical variables into dummies
df_sample4 = df_sample3.copy()
df_sample4 = pd.get_dummies(df_sample3, prefix_sep='_')

# Remove the redundant dummies 
rdummies = ['Education_1', 'SecuritiesAccount_Yes', 'CDAccount_Yes', 'Online_Yes', 'CreditCard_Yes', 'PersonalLoan_No']
df_sample5 = df_sample4.copy()
df_sample5 = df_sample4.drop(columns=rdummies)

# Get the remaining variable list after the variable transformation
print(df_sample5.columns.values)

print(df_sample5)
print(df)

['Age' 'Experience' 'Income' 'Family' 'CCAvg' 'Mortgage' 'Education_2'
 'Education_3' 'SecuritiesAccount_No' 'CDAccount_No' 'Online_No'
 'CreditCard_No' 'PersonalLoan_Yes']
           Age  Experience    Income    Family     CCAvg  Mortgage  \
0    -1.774239   -1.665912 -0.538175  1.397274 -0.193366 -0.555468   
1    -0.029521   -0.096321 -0.864023  0.525938 -0.250586 -0.555468   
2    -0.552936   -0.445119 -1.363657 -1.216733 -0.536683 -0.555468   
3    -0.901880   -0.968316  0.569708 -1.216733  0.436047 -0.555468   
4    -0.901880   -1.055515 -0.625068  1.397274 -0.536683 -0.555468   
...        ...         ...       ...       ...       ...       ...   
4995 -1.425296   -1.491513 -0.733684 -1.216733 -0.021708 -0.555468   
4996 -1.338060   -1.404313 -1.276764  1.397274 -0.879999  0.280210   
4997  1.540726    1.647670 -1.081255 -0.345398 -0.937218 -0.555468   
4998  1.715198    1.734869 -0.538175  0.525938 -0.822780 -0.555468   
4999 -1.512532   -1.404313  0.200414  0.525938 -0.651121 

In [13]:
# Part 5 Data Partiton

from sklearn.model_selection import train_test_split

df4partition = df_sample5
testpart_size = 0.2

# random_state specifies the seed for random number generator. Random_state = 1 unless otherwised noted
df_nontestData, df_testData = train_test_split(df4partition, test_size=testpart_size, random_state=1)

print(df_nontestData)

           Age  Experience    Income    Family     CCAvg  Mortgage  \
1233  0.668367    0.775675 -1.124701 -0.345398 -0.879999 -0.555468   
1056 -0.814644   -1.229914 -1.059532 -1.216733 -0.725507 -0.555468   
1686  1.453490    1.560470 -0.755407  1.397274  0.149950 -0.555468   
187   0.057715    0.078078  1.851377  0.525938 -0.021708  2.541456   
3840  0.930075    0.950074 -0.842300  0.525938 -1.051657  0.565323   
...        ...         ...       ...       ...       ...       ...   
2895  1.279018    1.386071 -0.755407  1.397274 -0.365024  0.820943   
2763  0.842839    0.950074 -1.320210  1.397274 -0.708341 -0.555468   
905   0.057715    0.165278 -0.994362 -1.216733 -0.536683  0.270378   
3980  0.057715    0.165278  0.330753  1.397274 -0.307805 -0.555468   
235  -0.640172   -1.055515 -0.060265  1.397274 -0.078927 -0.555468   

      Education_2  Education_3  SecuritiesAccount_No  CDAccount_No  Online_No  \
1233            0            0                     1             1          0 

In [14]:
# Part 6 Logistic Regression with Penalty 

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Separate the predictor values and the DV values into X and y respectively
DV = 'PersonalLoan_Yes'
y = df_nontestData[DV]
X = df_nontestData.drop(columns=[DV])

# Run Logistic regression with pre-specified penalty level (alpha)
alpha = 10

# The Logistic regression results are put into a model object clf
clf = LogisticRegression(C=1/alpha, penalty='l1', solver='saga', max_iter=200, random_state=1).fit(X,y)

# to display the estimated coefficients of a model candidate obtained by the Logistic Regression analysis
def summary_coef(model_object):
  n_predictors = X.shape[1]
  model_coef = pd.DataFrame(model_object.coef_.reshape(1, n_predictors), columns=X.columns.values)
  model_coef['Intercept'] = model_object.intercept_
  return model_coef.transpose()

print(summary_coef(clf))

                             0
Age                   0.000000
Experience            0.000000
Income                2.275057
Family                0.586373
CCAvg                 0.192983
Mortgage              0.054617
Education_2           2.648378
Education_3           2.842589
SecuritiesAccount_No  0.000000
CDAccount_No         -1.684072
Online_No             0.117621
CreditCard_No         0.176838
Intercept            -4.386651


In [15]:
# Run Logistic regression with k-fold cross validation with k=5
kfolds = 5

# Specify within which range of the penalty levels we will search for the optimal penalty level, 
min_alpha = 0.001
max_alpha = 100

# Discretize the continuous alpha range [min_alpha, max_alpha] into n individual points of alpha
# We train n model candidates each of which corresponds to one individual alpha point
n_candidates = 1000

# Store the list of individual alpha points
alpha_list = list(np.linspace(min_alpha, max_alpha, num=n_candidates))

# C_list is the element-wise inverse of alpha_list. It is required as one of the paramater values for LogisticRegressionCV
C_list = list(1/np.linspace(min_alpha, max_alpha, num=n_candidates))

# Set n_jobs to be -1 to run LogisticRegressionCV on all CPU cores.
clf_optimal = LogisticRegressionCV(Cs=C_list, cv=kfolds, penalty='l1', solver='saga', max_iter=200, random_state=1, n_jobs=-1).fit(X,y)

# Display the estimated coefficients of the final selected model
print(summary_coef(clf_optimal))

# Display the optimal alpha that yields the final selected model (the best model candidate)
print(1/clf_optimal.C_)

# Calcuate the error rate over the test partition based on the final selected model

y_test_actual = df_testData[DV]
X_test = df_testData.drop(columns=[DV])

# Use predict method of the clf_optimal object to apply the model associated with clf_optimal to the test partition
y_test_predicted = clf_optimal.predict(X_test)

from sklearn import metrics

# Display the confusion matrix over the test partition
print(metrics.confusion_matrix(y_test_actual, y_test_predicted))

# Display the accuracy over the test partition
print(clf_optimal.score(X_test, y_test_actual))

                             0
Age                  -0.206939
Experience            0.283202
Income                2.860680
Family                0.705755
CCAvg                 0.299086
Mortgage              0.100955
Education_2           4.116630
Education_3           4.377923
SecuritiesAccount_No  0.918620
CDAccount_No         -3.661547
Online_No             0.673737
CreditCard_No         1.018429
Intercept            -6.024895
[0.001]
[[892   8]
 [ 32  68]]
0.96


In [18]:
# Calcuates the profit over a dataset 

def profit_calculation(model, x_value, y_value):
  
  # Specify the decision cut-off used in the decision rule
  d_cutoff = 1/11
  # The method predict_proba is to get the predicted probability
  decision = list(model.predict_proba(x_value)[:,1] > d_cutoff)

  y = list(y_value)
    
  n_obs = len(y)

  cum_profit = 0

  for i in range(n_obs): 
    if decision[i] == True and y[i] == 1: 
      profit = 10 # net profit
    elif decision[i] == True and y[i] == 0: 
      profit = -1 
    else:
      profit = 0 # For any other situation, the net profit is zero
    cum_profit = cum_profit + profit 
  
  average_net_profit = cum_profit / n_obs 
  return average_net_profit 

# Set n_jobs to be -1 to run LogisticRegressionCV on all CPU cores.
clf_optimal = LogisticRegressionCV(Cs=C_list, cv=kfolds, scoring=profit_calculation, penalty='l1', solver='saga', max_iter=200, random_state=1, n_jobs=-1).fit(X,y)

# Display the estimated coefficients of the final selected model
print(summary_coef(clf_optimal))

# Display the optimal alpha that yields the final selected model (the best model candidate)
print(1/clf_optimal.C_)

# Calcuate the average net profit over the test partition based on the final selected model

y_test_actual = df_testData[DV]

X_test = df_testData.drop(columns=[DV])

print(profit_calculation(clf_optimal, X_test, y_test_actual))

                             0
Age                   0.000000
Experience            0.000000
Income                1.900911
Family                0.520431
CCAvg                 0.136723
Mortgage              0.016677
Education_2           1.524499
Education_3           1.706426
SecuritiesAccount_No  0.000000
CDAccount_No         -0.830344
Online_No             0.000000
CreditCard_No         0.000000
Intercept            -3.807365
[25.02577477]
0.759


In [16]:
# Part7 Score the new data
df_newdata = pd.read_csv('PersonalLoan_NEWDATA.csv')

# Generate the categorical predictor list
Original_DV = 'PersonalLoan'
cpredictor_list = cvar_list.copy()
cpredictor_list.remove(Original_DV)

# Drop the redundant variable, e.g., ZIPCode as we did for the historical data
df_newdata_sample1 = df_newdata.drop(columns=rvar_list)

df_newdata_sample2 = df_newdata_sample1.copy()
df_newdata_sample2[cpredictor_list] = df_newdata_sample1[cpredictor_list].astype('category')
df_newdata_sample2[nvar_list] = df_newdata_sample1[nvar_list].astype('float64')

historical_sample_mean = df_sample1[nvar_list].mean()
historical_sample_std = df_sample1[nvar_list].std()

# Use the historical sample mean and historical sample standard deviation to standardize the new data
df_newdata_sample3 = df_newdata_sample2.copy()
df_newdata_sample3[nvar_list] = (df_newdata_sample2[nvar_list] - historical_sample_mean[nvar_list])/historical_sample_std[nvar_list]

df_newdata_sample4 = pd.get_dummies(df_newdata_sample3, prefix_sep='_')

# Fix the inconsistency between the predictors 
df_newdata_sample5 = df_newdata_sample4.copy()
df_newdata_sample5['Education_2'] = 0

print(df_newdata_sample5.columns.values)

['Age' 'Experience' 'Income' 'Family' 'CCAvg' 'Mortgage' 'Education_3'
 'SecuritiesAccount_No' 'CDAccount_No' 'Online_No' 'CreditCard_No'
 'Education_2']


In [17]:
# Score the new data using the model carried by the model object clf_optimal 
predicted_PersonalLoan = clf_optimal.predict_proba(df_newdata_sample5)[:,1]

# Print the predicted probablity for the new data observations
print(predicted_PersonalLoan)

[0.04208684]
