In [64]:
import numpy as np
import pandas as pd
import random
import statsmodels.api as sm
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [65]:
# Load the credit data set and split it into predictors and class label.
# NOTE(review): the column labels that appear later in this notebook
# ('11', '6', '1169', ..., '1.1') look like data values, which suggests the
# CSV has no header row and its first record is being consumed as the header.
# Confirm; if so, pass header=None here and update every label-based column
# lookup further down (e.g. data['1.1'] and the drop list).
CreditData = pd.read_table("MGMT635_GermanCreditData.csv", sep = ',')
data = CreditData

# First 20 columns are the predictors; the last column is the class label.
X = data.iloc[:, 0:20].copy()
y = data.iloc[:, -1].copy()
n = len(y)

# Recode the label in one vectorized step: 2 -> 1 and 1 -> 0 (any other value
# is left untouched). Both masks are computed BEFORE assigning so that the
# freshly written 1s are not flipped to 0 by the second assignment.
is_two = y == 2
is_one = y == 1
y[is_two] = 1
y[is_one] = 0

In [66]:
# statsmodels expects numeric (float) design matrix and response.
X = X.astype(float)
y = y.astype(float)

# Fit a logistic regression on all predictors.
# `family` is an argument of sm.GLM, not of sm.Logit (Logit is always a
# binomial model), so it was dropped; the fitted model is unchanged.
# NOTE(review): no explicit intercept is added via sm.add_constant(X) --
# confirm this is intended.
regr = sm.Logit(y, X).fit()
print(regr.summary2())

Optimization terminated successfully.
         Current function value: 0.484342
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.207     
Dependent Variable: 1.1              AIC:              1007.7153 
Date:               2020-04-22 14:52 BIC:              1105.8504 
No. Observations:   999              Log-Likelihood:   -483.86   
Df Model:           19               LL-Null:          -610.51   
Df Residuals:       979              LLR p-value:      6.6397e-43
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
-------------------------------------------------------------------
           Coef.    Std.Err.      z      P>|z|     [0.025    0.975]
-------------------------------------------------------------------
11        -0.5823     0.0697   -8.3501   0.0000   -0.7190   -0.4456
6          0.0259     0.0086    3.0119   0.0026    0.0090 

In [67]:
# With no argument, predict() returns the fitted probabilities for the data
# the model was estimated on.
prediction = regr.predict()

# Peek at the first 25 fitted probabilities only.
print(prediction[:25])

[0.56261154 0.02839812 0.4600665  0.57201153 0.14401504 0.06723001
 0.52450374 0.04202592 0.48533292 0.47273851 0.81619827 0.27342889
 0.32666377 0.54035701 0.60945418 0.03369232 0.83439804 0.63447983
 0.11019275 0.10626081 0.2325831  0.31450498 0.1390728  0.04127084
 0.27997469]


In [68]:
def contrast(variable):
    """Build an identity contrast table over the distinct levels of `variable`.

    Parameters
    ----------
    variable : iterable of hashable
        Observed values; duplicates are collapsed into distinct levels.

    Returns
    -------
    pandas.DataFrame
        An L x L float frame (L = number of distinct levels) whose index and
        columns are the levels, with 1.0 on the diagonal and 0.0 elsewhere.
        Empty input yields an empty 0 x 0 frame.
    """
    level = set(variable)
    try:
        # Sort so the row/column layout is reproducible; raw set iteration
        # order is an implementation detail and changes between runs for
        # string levels.
        level = sorted(level)
    except TypeError:
        # Levels that cannot be compared with each other keep set order.
        level = list(level)

    # np.eye gives the identity matrix directly (1 where i == j, else 0).
    return pd.DataFrame(np.eye(len(level)), index = level, columns = level)

In [69]:
# Show the distinct class labels found in the raw label column.
print(contrast(data['1.1']))

# Turn fitted probabilities into raw class labels: above 0.5 -> 2, otherwise 1.
result = [2 if prob > 0.5 else 1 for prob in prediction]

# Reference confusion matrix from scikit-learn (arguments are y_true, y_pred).
print(confusion_matrix(data['1.1'], result))

     1    2
1  1.0  0.0
2  0.0  1.0
[[621  78]
 [159 141]]


In [70]:
def conf_mat(y_true, y_pred): # function version
    """Count a confusion matrix with predicted labels on rows, true on columns.

    Parameters
    ----------
    y_true : iterable of hashable
        Actual class labels (a pandas Series, list, array, ...).
    y_pred : iterable of hashable
        Predicted class labels, aligned with `y_true` by position.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by every label that occurs in
        either input. Cell [p, t] is the number of positions predicted as `p`
        whose true label is `t`. For two classes that both occur in `y_true`
        this is the same 2 x 2 table as before; labels seen only in `y_pred`
        now get their own row/column instead of being folded into the second
        row, and any number of classes is supported.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    # Materialise both inputs so pairing is purely positional, independent of
    # any pandas index the caller's objects carry.
    actual = list(y_true)
    predicted = list(y_pred)
    if len(actual) != len(predicted):
        raise ValueError(
            "y_true and y_pred must have the same length "
            "(got %d and %d)" % (len(actual), len(predicted))
        )

    labels = set(actual) | set(predicted)
    try:
        # Sorted labels give a reproducible row/column layout.
        labels = sorted(labels)
    except TypeError:
        # Labels that cannot be compared with each other keep set order.
        labels = list(labels)

    slot = {label: k for k, label in enumerate(labels)}
    counts = np.zeros((len(labels), len(labels)))
    for truth, guess in zip(actual, predicted):
        counts[slot[guess], slot[truth]] += 1  # row = predicted, col = true

    return pd.DataFrame(counts, index = labels, columns = labels)

In [71]:
# In-sample confusion matrix from the hand-written helper
# (rows = predicted label, columns = true label).
conf = conf_mat(data['1.1'], result)
print(conf, '\n')

# Share of correctly classified rows: the two diagonal cells over the row count.
print((conf.iloc[0, 0] + conf.iloc[1, 1]) / len(y), '\n')

# Average fitted probability across all rows.
print(np.mean(prediction))

       1      2
1  621.0  159.0
2   78.0  141.0 

0.7627627627627628 

0.30026489266738937


The confusion matrix shows that 621 applicants were correctly identified as yes's (class 1) and 141 were correctly identified as no's (class 2).

In [72]:
# Dump the loaded frame for inspection.
print(data)

# Work on a separate frame with five predictor columns removed
# (drop returns a new frame, so `data` itself is left untouched).
df = pd.DataFrame(data).drop(columns=['43', '4.1', '152', '173', '192'])

     11   6  34  43  1169  65  75  4  93  101  ...  121  67  143  152  2  173  \
0    12  48  32  43  5951  61  73  2  92  101  ...  121  22  143  152  1  173   
1    14  12  34  46  2096  61  74  2  93  101  ...  121  49  143  152  1  172   
2    11  42  32  42  7882  61  74  2  93  103  ...  122  45  143  153  1  173   
3    11  24  33  40  4870  61  73  3  93  101  ...  124  53  143  153  2  173   
4    14  36  32  46  9055  65  73  2  93  101  ...  124  35  143  153  1  172   
..   ..  ..  ..  ..   ...  ..  .. ..  ..  ...  ...  ...  ..  ...  ... ..  ...   
994  14  12  32  42  1736  61  74  3  92  101  ...  121  31  143  152  1  172   
995  11  30  32  41  3857  61  73  4  91  101  ...  122  40  143  152  1  174   
996  14  12  32  43   804  61  75  4  93  101  ...  123  38  143  152  1  173   
997  11  45  32  43  1845  61  73  4  93  101  ...  124  23  143  153  1  173   
998  12  45  34  41  4576  62  71  3  93  101  ...  123  27  143  152  1  173   

     1  192  201  1.1  
0  

In [73]:
# Positional split: rows 0-978 train the model, rows 979-998 are held out.
train = df.iloc[0:979,:].copy()
test = df.iloc[979:999,:].copy()

# First 15 columns are the remaining predictors; the last column is the label.
X = train.iloc[:,0:15].copy()
y = train.iloc[:,-1].copy()

X_test = test.iloc[:,0:15].copy()
# The held-out label keeps its raw 1/2 coding; it is compared later against
# predictions expressed in the same 1/2 coding.
y_test = test.iloc[:,-1].copy()

n = len(y)

# Recode the training label in one vectorized step: 2 -> 1 and 1 -> 0 (other
# values untouched). Both masks are computed BEFORE assigning so the freshly
# written 1s are not flipped to 0 by the second assignment.
is_two = y == 2
is_one = y == 1
y[is_two] = 1
y[is_one] = 0

# statsmodels expects float inputs.
y = y.astype(float)
X = X.astype(float)
X_test = X_test.astype(float)


In [74]:
# Refit the logistic regression on the training rows with the reduced
# predictor set.
# `family` is an argument of sm.GLM, not of sm.Logit (Logit is always a
# binomial model), so it was dropped; the fitted model is unchanged.
regr = sm.Logit(y, X).fit()
print(regr.summary2())

Optimization terminated successfully.
         Current function value: 0.483625
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.211     
Dependent Variable: 1.1              AIC:              976.9369  
Date:               2020-04-22 14:52 BIC:              1050.2349 
No. Observations:   979              Log-Likelihood:   -473.47   
Df Model:           14               LL-Null:          -599.97   
Df Residuals:       964              LLR p-value:      6.8610e-46
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     6.0000                                       
-------------------------------------------------------------------
           Coef.    Std.Err.      z      P>|z|     [0.025    0.975]
-------------------------------------------------------------------
11        -0.5925     0.0700   -8.4595   0.0000   -0.7298   -0.4553
6          0.0261     0.0087    2.9973   0.0027    0.0090 

In [75]:
# Predicted probabilities for the held-out rows.
pred = regr.predict(X_test)

# Map probabilities back to the raw label coding: above 0.5 -> 2, otherwise 1.
result = [2 if prob > 0.5 else 1 for prob in pred]

# Confusion matrix on the hold-out set (rows = predicted, columns = true).
conf = conf_mat(y_test, result)
print(conf, '\n')

# Hold-out accuracy: the two diagonal cells over the total count.
print((conf.iloc[0, 0] + conf.iloc[1, 1]) / conf.values.sum())

      1    2
1  12.0  1.0
2   4.0  3.0 

0.75


This confusion matrix tells us that the model correctly predicted 12 goods and 3 bads, i.e. 15 of the 20 test samples (75%), and was incorrect on the remaining 5 (25%).

---



The goal of this project was to create code for a machine learning program that would be able to predict whether or not to lend money to an applicant. For this project we were given 1000 data points with 20 different variables. Some examples of these variables were age, sex, marital status, income, property, other debtors, etc., and we needed to decide which of them to use in our machine learning code to get the most accurate model possible. We used 980 users to train the model and 20 users to test it. The type of model I decided to use is a logistic regression model. We use this type of model when the dependent variable is dichotomous, otherwise known as binary. In this case the dependent variable is whether or not we want to lend money to an individual. To avoid overfitting the model, I screened the data for the 5 variables with the highest p-values and removed them. A high p-value indicates that a variable is not statistically significant, so we do not want it in the model. If I were to build another machine learning algorithm in the future, I could try getting rid of all statistically insignificant variables.


