## Preparation

### Preparation > Imports

In [1]:
''' IMPORTS '''
import numpy as np
import pandas as pd
import matplotlib as plot
import math 
import string

from enum import Enum    

# for plotting
import matplotlib.pyplot as plt
import scipy.stats as stats

from matplotlib import cm
from scipy.stats import multivariate_normal

### Preparation > Classes

In [2]:
'''CLASSES '''
class Cols(Enum):
        StatusOfExistingCheckingAccount = 0,
        DurationInMonth = 1, 
        CreditHistory = 2, 
        Purpose = 3, 
        CreditAmount = 4 , 
        SavingsAccountBonds =5,
        PresentEmploymentSince =6 ,
        InstallmentRateInPercentageOfDisposableIncome =7 ,
        PersonalStatusAndSex = 8,
        OtherDebtorsOrGuarantors =9 ,
        PresentResidenceSince =10 ,
        Property =11 ,
        AgeInYears = 12,
        OtherInstallmentPlans =13 ,
        Housing = 14,
        NumberOfExistingCreditsAtThisBank =15 ,
        Job = 16,
        NumberOfPeopleBeingLiableToProvideMaintenanceFor =17 ,
        Telephone = 18,
        ForeignWorker =19 ,
        ClassOutcome = 20
        
class Outcomes(Enum):
        Good = 1
        Bad = 2

### Preparation > Constances

In [3]:
'''CONSTANCES'''
# dict of all column names
COLS = {
        Cols.StatusOfExistingCheckingAccount : "Status of existing checking account" ,
        Cols.DurationInMonth:"Duration in month" , # numerical
        Cols.CreditHistory: "Credit history", 
        Cols.Purpose:"Purpose" , 
        Cols.CreditAmount:  "Credit amount",  # numerical
        
        Cols.SavingsAccountBonds: "Savings account/bonds" ,
        Cols.PresentEmploymentSince:"Present employment since" ,
        Cols.InstallmentRateInPercentageOfDisposableIncome: "Installment rate in percentage of disposable income", # numerical
        Cols.PersonalStatusAndSex: "Personal status and sex",
        Cols.OtherDebtorsOrGuarantors:"Other debtors / guarantors", 
        
        Cols.PresentResidenceSince: "Present residence since", # numerical
        Cols.Property: "Property", 
        Cols.AgeInYears: "Age in years", # numerical
        Cols.OtherInstallmentPlans:"Other installment plans", 
        Cols.Housing: "Housing", 
        
        Cols.NumberOfExistingCreditsAtThisBank:"Number of existing credits at this bank", # numerical
        Cols.Job: "Job",
        Cols.NumberOfPeopleBeingLiableToProvideMaintenanceFor:"Number of people being liable to provide maintenance for", 
        Cols.Telephone: "Telephone",
        Cols.ForeignWorker:"Foreign worker",
        
            Cols.ClassOutcome: "Class/Outcome"
}

# model parameter
PERC_TRAIN = 0.67
PERC_TEST = 1-PERC_TRAIN

# constants
PI = np.pi
numerical_attr = [
    COLS[Cols.DurationInMonth],
    COLS[Cols.CreditAmount],
    COLS[Cols.PresentResidenceSince],
    COLS[Cols.InstallmentRateInPercentageOfDisposableIncome], 
    COLS[Cols.AgeInYears], 
    COLS[Cols.NumberOfExistingCreditsAtThisBank],
]

### Model > Load Data

In [4]:
df_normal = pd.read_csv(
    "german.data", 
    sep=" ",
    names=[
        COLS[Cols.StatusOfExistingCheckingAccount],
        COLS[Cols.DurationInMonth],
        COLS[Cols.CreditHistory],
        COLS[Cols.Purpose],
        COLS[Cols.CreditAmount],
        COLS[Cols.SavingsAccountBonds],
        COLS[Cols.PresentEmploymentSince],
        COLS[Cols.InstallmentRateInPercentageOfDisposableIncome],
        COLS[Cols.PersonalStatusAndSex],
        COLS[Cols.OtherDebtorsOrGuarantors],
        COLS[Cols.PresentResidenceSince],
        COLS[Cols.Property],
        COLS[Cols.AgeInYears],
        COLS[Cols.OtherInstallmentPlans],
        COLS[Cols.Housing],
        COLS[Cols.NumberOfExistingCreditsAtThisBank],
        COLS[Cols.Job],
        COLS[Cols.NumberOfPeopleBeingLiableToProvideMaintenanceFor],
        COLS[Cols.Telephone],
        COLS[Cols.ForeignWorker],
        COLS[Cols.ClassOutcome],
    ])


df_numerical = pd.read_csv(
    "german.data-numeric", 
    delim_whitespace=True,
    header=None,
    names = [
        'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x',
           COLS[Cols.ClassOutcome]]
)

df_numerical.head(3)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,p,q,r,s,t,u,v,w,x,Class/Outcome
0,1,6,4,12,5,5,3,4,1,67,...,0,0,1,0,0,1,0,0,1,1
1,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2
2,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1


## Model

### Model > Functions > Classification

In [5]:
'''FUNCTIONS'''
    
# to calculate un-normalized posteriors and do classification with respect to g1 and g2results
def do_classification(df_test, attr1, attr2, prior_good, muv_good, cov_good, prior_bad, muv_bad, cov_bad):
    decision_list = []  # to store classification results to calculate accuracy later.
    
    # classify all test rows
    for index, row in df_test.iterrows():
        pred_good = discriminant_func(x=row, attr1=attr1, attr2=attr2, prior=prior_good, mus=muv_good, cov=cov_good)
        pred_bad = discriminant_func(x=row, attr1=attr1, attr2=attr2, prior=prior_bad, mus=muv_bad, cov=cov_bad)
        
        # take higher posterior (-> higher probabilty)
        classified_class = Outcomes.Good.value if pred_good>pred_bad else Outcomes.Bad.value
        decision_list.append(classified_class)
        
    return decision_list


def discriminant_func(x, attr1, attr2, prior, mus, cov):
    standard_dev = math.sqrt(abs(cov))
    
    if standard_dev == 0:
        return -900000000000
    
    res =  (
            -(math.log(2*PI)/2) 
            - math.log(standard_dev) 
            - ((x[attr1]-mus[0])**2)/(2*cov) 
            + np.log(prior)
        )+ (
            -(math.log(2*PI)/2) 
            - math.log(standard_dev) 
            - ((x[attr2]-mus[1])**2)/(2*cov) 
            + np.log(prior)
    )
    return res
    

# to find maximum of the discriminant function results
def find_max_likelihood(g1, g2):
    likelihood_list = [g1, g2]
    return max(likelihood_list)

### Model > Functions > Plot

In [6]:
def plot_gaussian(mu, sigma2):
    random_seed=404
    print(f'sigma2={sigma2}')

    plt.style.use('seaborn-dark')
    plt.rcParams['figure.figsize']=14,6
    fig = plt.figure()
    
    # Initializing the covariance matrix
    cov = np.array([
        [1, sigma2], 
        [sigma2, 1]
    ])
    
    # Generating a Gaussian bivariate distribution
    # with given mean and covariance matrix
    distr = multivariate_normal(cov = cov, mean = mu, seed = random_seed)
    
    
    # Generating a meshgrid complacent with
    # the 3-sigma boundary
    mean_1, mean_2 = mu[0], mu[1]
    sigma_1, sigma_2 = cov[0,0], cov[1,1]
     
    x = np.linspace(mean_1 -3*sigma_1, mean_1 + 3*sigma_1, num=100)
    y = np.linspace(mean_2 -3*sigma_2, mean_2 + 3*sigma_2, num=100)
    X, Y = np.meshgrid(x,y)
    
    # Generating the density function
    # for each point in the meshgrid
    pdf = np.zeros(X.shape)
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            pdf[i,j] = distr.pdf([X[i,j], Y[i,j]])
            
    # Plotting the density function values
    key = 131
    ax = fig.add_subplot(key, projection = '3d')
    ax.plot_surface(X, Y, pdf, cmap = 'viridis')
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(f'Covariance between x1 and x2 = {sigma2}')
    ax.axes.zaxis.set_ticks([])
    
    
    plt.tight_layout()
    plt.show()
    
def plot_contour(mu, sigma2, scale):
    cov = np.array([
        [1, sigma2], 
        [sigma2, 1]
    ])
    
    x = np.linspace(mu[0] -scale*cov[0,0], mu[0] + scale*cov[0,0], num=100)
    y = np.linspace(mu[1] -scale*cov[1,1], mu[0] + scale*cov[1,1], num=100)
    X, Y = np.meshgrid(x,y)

    # Pack X and Y into a single 3-dimensional array
    pos = np.empty(X.shape + (2,))
    pos[:, :, 0] = X
    pos[:, :, 1] = Y

    F = multivariate_normal(mu, cov)
    Z = F.pdf(pos)
    plt.xlabel(r"$X_1$")
    plt.ylabel(r"$X_2$")
    
    plt.title(f'$\mu$ = {mu}\n $\Sigma$ = {sigma2}')
    plt.contourf(X, Y, Z, cmap=cm.Greys)
    plt.colorbar()

## Model > Training (run  500x)

### Gradient Descent

In [9]:
def routine_gradient_descent(df, columns, iterations):
    max_avg_acc = 0
    best_attributes = []
    
    for i, attr1 in enumerate(columns):
        for attr2 in columns:
            if attr1 == attr2:
                continue

            max_avg = gradient_descent(df, attr1, attr2, iterations)

            if max_avg > max_avg_acc:
                max_avg_acc = max_avg
                best_attributes = [attr1,attr2]
        print(f"=============Progress: {round((i+1)/len(columns),2)}==============")


    print(f'Best Avg. Acc: {max_avg_acc} with features: <{best_attributes[0]}> & <{best_attributes[1]}>')

    

def gradient_descent(df, attr1, attr2, iterations):
    accuracy = 0  # total accuracy
    accuracies=[] # to store individual accuracies

    # program will run for 500 times to find average accuracy
    for i in range(0, iterations):    
        # split data train & test
        msk = np.random.rand(len(df)) < PERC_TRAIN
        df_train = df[msk]
        df_test = df[~msk]

        # separate in class 1 & 2
        df_good = df_train.loc[df_train[COLS[Cols.ClassOutcome]] == 1]    
        df_bad  = df_train.loc[df_train[COLS[Cols.ClassOutcome]] == 2]

        prior_good = len(df_good)/(len(df_good)+len(df_bad)) 
        prior_bad = len(df_bad)/(len(df_good)+len(df_bad)) 

        # calculate MUs and COVs(MLE parameters) of each class
        muv_good = [df_good[attr1].mean(), df_good[attr2].mean()]
        cov_good = df_good[attr1].cov(df_good[attr2])

        muv_bad = [df_bad[attr1].mean(), df_bad[attr2].mean()]
        cov_bad = df_bad[attr1].cov(df_bad[attr2])

        # classification_result is an array that stores the result of classification
        classification_result = do_classification(
            df_test=df_test, 
            attr1=attr1, attr2=attr2,
            prior_good=prior_good, muv_good=muv_good, cov_good=cov_good, 
            prior_bad=prior_bad, muv_bad=muv_bad, cov_bad=cov_bad)

        # compare classification results with y_test 
        col1_acc = "classification"
        col2_acc = "df_test_y"
        df_classified = pd.DataFrame({
            col1_acc : classification_result,
            col2_acc : df_test[COLS[Cols.ClassOutcome]]
        })
        
        # calculate accuracy 
        df_acc = df_classified.loc[(df_classified[col1_acc] == df_classified[col2_acc])] 
        individual_accuracy = df_acc.shape[0]/df_test.shape[0]    
        accuracy = accuracy + individual_accuracy
        accuracies.append(individual_accuracy)

    # print results + update best avg. accuracy
    print(f"<{attr1}> x <{attr2}>")
    max_avg = round((accuracy/iterations)*100,2)

    print(f"Acg Acc. = {max_avg}%")
    print(f"Max Acc. = {round(max(accuracies)*100,2)}%")
    print(f'mus={muv_good},cov={cov_good}')
    print("")   
    #plot_contour(mu=muv_good,sigma2=cov_good,scale=-5)
    return max_avg

### Dataframe = normal (only 7 numerical attributes / 7*7 combinations)

In [10]:
max_avg = routine_gradient_descent(df_normal, numerical_attr, 1)
                
#plot_gaussian(mu=muv_good, sigma2=cov_good)

<Duration in month> x <Credit amount>
Acg Acc. = 61.86%
Max Acc. = 61.86%
mus=[18.936034115138593, 2914.5671641791046],cov=16104.679519071311

<Duration in month> x <Present residence since>
Acg Acc. = 72.64%
Max Acc. = 72.64%
mus=[19.106609808102345, 2.818763326226013],cov=-0.2648251416908135

<Duration in month> x <Installment rate in percentage of disposable income>
Acg Acc. = 57.36%
Max Acc. = 57.36%
mus=[19.547916666666666, 2.8916666666666666],cov=1.2703375086986777

<Duration in month> x <Age in years>
Acg Acc. = 71.47%
Max Acc. = 71.47%
mus=[19.192560175054705, 36.5492341356674],cov=-7.2902030788130086

<Duration in month> x <Number of existing credits at this bank>
Acg Acc. = 70.15%
Max Acc. = 70.15%
mus=[19.059322033898304, 1.402542372881356],cov=-0.14707258267659795

<Credit amount> x <Duration in month>
Acg Acc. = 65.45%
Max Acc. = 65.45%
mus=[3091.12, 19.446315789473683],cov=19396.42734177216

<Credit amount> x <Present residence since>
Acg Acc. = 35.38%
Max Acc. = 35.38%
m

### Dataframe = numerical (24 numerical attributes / 24*24 combinations)

In [20]:
max_avg = routine_gradient_descent(df_numerical, df_numerical.columns, 10)

<a> x <b>
Acg Acc. = 25.42%
Max Acc. = 50.0%

<a> x <c>
Acg Acc. = 70.63%
Max Acc. = 78.57%

<a> x <d>
Acg Acc. = 74.36%
Max Acc. = 100.0%

<a> x <e>
Acg Acc. = 70.01%
Max Acc. = 88.89%

<a> x <f>
Acg Acc. = 33.66%
Max Acc. = 50.0%

<a> x <g>
Acg Acc. = 39.74%
Max Acc. = 75.0%

<a> x <h>
Acg Acc. = 75.07%
Max Acc. = 91.67%

<a> x <i>
Acg Acc. = 26.28%
Max Acc. = 50.0%

<a> x <j>
Acg Acc. = 26.77%
Max Acc. = 58.33%

<a> x <k>
Acg Acc. = 23.72%
Max Acc. = 45.45%

<a> x <l>
Acg Acc. = 49.67%
Max Acc. = 77.78%

<a> x <m>
Acg Acc. = 65.63%
Max Acc. = 83.33%

<a> x <n>
Acg Acc. = 63.25%
Max Acc. = 83.33%

<a> x <o>
Acg Acc. = 35.06%
Max Acc. = 50.0%

<a> x <p>
Acg Acc. = 30.67%
Max Acc. = 50.0%

<a> x <q>
Acg Acc. = 31.74%
Max Acc. = 63.64%

<a> x <r>
Acg Acc. = 55.27%
Max Acc. = 73.33%

<a> x <s>
Acg Acc. = 60.07%
Max Acc. = 100.0%

<a> x <t>
Acg Acc. = 31.94%
Max Acc. = 55.56%

<a> x <u>
Acg Acc. = 59.63%
Max Acc. = 75.0%

<a> x <v>
Acg Acc. = 41.82%
Max Acc. = 63.64%

<a> x <w>
Acg Acc. =

<h> x <b>
Acg Acc. = 38.85%
Max Acc. = 80.0%

<h> x <c>
Acg Acc. = 28.08%
Max Acc. = 66.67%

<h> x <d>
Acg Acc. = 57.19%
Max Acc. = 100.0%

<h> x <e>
Acg Acc. = 60.49%
Max Acc. = 87.5%

<h> x <f>
Acg Acc. = 72.31%
Max Acc. = 87.5%

<h> x <g>
Acg Acc. = 73.62%
Max Acc. = 100.0%

<h> x <i>
Acg Acc. = 27.71%
Max Acc. = 42.86%

<h> x <j>
Acg Acc. = 60.76%
Max Acc. = 87.5%

<h> x <k>
Acg Acc. = 33.92%
Max Acc. = 66.67%

<h> x <l>
Acg Acc. = 36.77%
Max Acc. = 62.5%

<h> x <m>
Acg Acc. = 34.63%
Max Acc. = 60.0%

<h> x <n>
Acg Acc. = 67.5%
Max Acc. = 100.0%

<h> x <o>
Acg Acc. = 37.08%
Max Acc. = 100.0%

<h> x <p>
Acg Acc. = 23.94%
Max Acc. = 42.86%

<h> x <q>
Acg Acc. = 38.14%
Max Acc. = 71.43%

<h> x <r>
Acg Acc. = 35.66%
Max Acc. = 62.5%

<h> x <s>
Acg Acc. = 36.13%
Max Acc. = 62.5%

<h> x <t>
Acg Acc. = 49.92%
Max Acc. = 69.23%

<h> x <u>
Acg Acc. = 66.63%
Max Acc. = 90.0%

<h> x <v>
Acg Acc. = 25.77%
Max Acc. = 40.0%

<h> x <w>
Acg Acc. = 29.74%
Max Acc. = 57.14%

<h> x <x>
Acg Acc. = 64.

<o> x <f>
Acg Acc. = 77.52%
Max Acc. = 100.0%

<o> x <g>
Acg Acc. = 35.97%
Max Acc. = 71.43%

<o> x <h>
Acg Acc. = 22.58%
Max Acc. = 35.71%

<o> x <i>
Acg Acc. = 34.36%
Max Acc. = 85.71%

<o> x <j>
Acg Acc. = 79.08%
Max Acc. = 100.0%

<o> x <k>
Acg Acc. = 22.69%
Max Acc. = 40.0%

<o> x <l>
Acg Acc. = 59.47%
Max Acc. = 80.0%

<o> x <m>
Acg Acc. = 32.5%
Max Acc. = 66.67%

<o> x <n>
Acg Acc. = 78.0%
Max Acc. = 90.91%

<o> x <p>
Acg Acc. = 71.2%
Max Acc. = 90.0%

<o> x <q>
Acg Acc. = 59.08%
Max Acc. = 77.78%

<o> x <r>
Acg Acc. = 29.93%
Max Acc. = 44.44%

<o> x <s>
Acg Acc. = 36.29%
Max Acc. = 80.0%

<o> x <t>
Acg Acc. = 65.87%
Max Acc. = 88.89%

<o> x <u>
Acg Acc. = 55.5%
Max Acc. = 100.0%

<o> x <v>
Acg Acc. = 33.21%
Max Acc. = 54.55%

<o> x <w>
Acg Acc. = 27.55%
Max Acc. = 54.55%

<o> x <x>
Acg Acc. = 34.49%
Max Acc. = 66.67%

<o> x <Class/Outcome>
Acg Acc. = 31.6%
Max Acc. = 58.33%

<p> x <a>
Acg Acc. = 36.42%
Max Acc. = 66.67%

<p> x <b>
Acg Acc. = 38.31%
Max Acc. = 70.0%

<p> x <c>
A

<v> x <j>
Acg Acc. = 34.27%
Max Acc. = 44.44%

<v> x <k>
Acg Acc. = 71.53%
Max Acc. = 100.0%

<v> x <l>
Acg Acc. = 47.69%
Max Acc. = 83.33%

<v> x <m>
Acg Acc. = 36.0%
Max Acc. = 77.27%

<v> x <n>
Acg Acc. = 68.87%
Max Acc. = 85.71%

<v> x <o>
Acg Acc. = 29.33%
Max Acc. = 62.5%

<v> x <p>
Acg Acc. = 65.09%
Max Acc. = 90.0%

<v> x <q>
Acg Acc. = 66.81%
Max Acc. = 75.0%

<v> x <r>
Acg Acc. = 30.76%
Max Acc. = 62.5%

<v> x <s>
Acg Acc. = 69.99%
Max Acc. = 88.89%

<v> x <t>
Acg Acc. = 63.87%
Max Acc. = 83.33%

<v> x <u>
Acg Acc. = 32.46%
Max Acc. = 50.0%

<v> x <w>
Acg Acc. = 64.31%
Max Acc. = 77.78%

<v> x <x>
Acg Acc. = 77.75%
Max Acc. = 90.91%

<v> x <Class/Outcome>
Acg Acc. = 28.88%
Max Acc. = 61.54%

<w> x <a>
Acg Acc. = 66.42%
Max Acc. = 92.31%

<w> x <b>
Acg Acc. = 46.58%
Max Acc. = 83.33%

<w> x <c>
Acg Acc. = 72.41%
Max Acc. = 100.0%

<w> x <d>
Acg Acc. = 44.41%
Max Acc. = 69.23%

<w> x <e>
Acg Acc. = 32.87%
Max Acc. = 57.14%

<w> x <f>
Acg Acc. = 60.78%
Max Acc. = 80.0%

<w> x <g

In [25]:
'''
Probleme:
(1) Ich weiß nicht ob ich 
    (a) irgendwo einen Fehler habe oder 
    (b) schlechte Parameter habe oder 
    (c) die Daten bei nur 2 Parametern mit MLE nur so viel Accuracy geben
    
(2) Plot ist schwierig
    --> 2 Ansätze
    (a) Contour Plot - Plotte flach aber mit Farbkreisen 
    (b) Plotte 3D 
--> brauch ich doch zwei Sigma Werte?

(3) Risk Analysis - woher die Formel?

'''

'\nProbleme:\n(1) Ich weiß nicht ob ich \n    (a) irgendwo einen Fehler habe oder \n    (b) schlechte Parameter habe oder \n    (c) die Daten bei nur 2 Parametern mit MLE nur so viel Accuracy geben\n    \n(2) Plot ist schwierig\n    --> 2 Ansätze\n    (a) Contour Plot - Plotte flach aber mit Farbkreisen \n    (b) Plotte 3D \n--> brauch ich doch zwei Sigma Werte?\n\n(3) Risk Analysis - woher die Formel?\n\n'

In [57]:
plot_contour(
    mu=[19, 3069],
    sigma2=17782,
    scale=-5
)

ValueError: the input matrix must be positive semidefinite

In [16]:
at1 = COLS[Cols.AgeInYears]
at2 = COLS[Cols.CreditHistory]

df_a = pd.DataFrame({
            at1 : df_normal[at1],
            at2 :  df_normal[at2],
            COLS[Cols.ClassOutcome] : df_normal[COLS[Cols.ClassOutcome]]
        })
df_a

Unnamed: 0,Age in years,Credit history,Class/Outcome
0,67,A34,1
1,22,A32,2
2,49,A34,1
3,45,A32,1
4,53,A33,2
...,...,...,...
995,31,A32,1
996,40,A32,1
997,38,A32,1
998,23,A32,2
