## Preparation

### Preparation > Imports

In [293]:
''' IMPORTS '''
import numpy as np
import pandas as pd
import matplotlib as plot
import math 
import string

from enum import Enum    

# for plotting
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import multivariate_normal

### Preparation > Classes

In [294]:
'''CLASSES '''
class Cols(Enum):
        StatusOfExistingCheckingAccount = 0,
        DurationInMonth = 1, 
        CreditHistory = 2, 
        Purpose = 3, 
        CreditAmount = 4 , 
        SavingsAccountBonds =5,
        PresentEmploymentSince =6 ,
        InstallmentRateInPercentageOfDisposableIncome =7 ,
        PersonalStatusAndSex = 8,
        OtherDebtorsOrGuarantors =9 ,
        PresentResidenceSince =10 ,
        Property =11 ,
        AgeInYears = 12,
        OtherInstallmentPlans =13 ,
        Housing = 14,
        NumberOfExistingCreditsAtThisBank =15 ,
        Job = 16,
        NumberOfPeopleBeingLiableToProvideMaintenanceFor =17 ,
        Telephone = 18,
        ForeignWorker =19 ,
        ClassOutcome = 20
        
class Outcomes(Enum):
        Good = 1
        Bad = 2

### Preparation > Constances

In [295]:
'''CONSTANCES'''
# dict of all column names
COLS = {
        Cols.StatusOfExistingCheckingAccount : "Status of existing checking account" ,
        Cols.DurationInMonth:"Duration in month" , # numerical
        Cols.CreditHistory: "Credit history", 
        Cols.Purpose:"Purpose" , 
        Cols.CreditAmount:  "Credit amount",  # numerical
        
        Cols.SavingsAccountBonds: "Savings account/bonds" ,
        Cols.PresentEmploymentSince:"Present employment since" ,
        Cols.InstallmentRateInPercentageOfDisposableIncome: "Installment rate in percentage of disposable income", # numerical
        Cols.PersonalStatusAndSex: "Personal status and sex",
        Cols.OtherDebtorsOrGuarantors:"Other debtors / guarantors", 
        
        Cols.PresentResidenceSince: "Present residence since", # numerical
        Cols.Property: "Property", 
        Cols.AgeInYears: "Age in years", # numerical
        Cols.OtherInstallmentPlans:"Other installment plans", 
        Cols.Housing: "Housing", 
        
        Cols.NumberOfExistingCreditsAtThisBank:"Number of existing credits at this bank", # numerical
        Cols.Job: "Job",
        Cols.NumberOfPeopleBeingLiableToProvideMaintenanceFor:"Number of people being liable to provide maintenance for", 
        Cols.Telephone: "Telephone",
        Cols.ForeignWorker:"Foreign worker",
        
            Cols.ClassOutcome: "Class/Outcome"
}

# model parameter
PERC_TRAIN = 0.67
PERC_TEST = 1-PERC_TRAIN

# constants
PI = np.pi
numerical_attr = [
    COLS[Cols.DurationInMonth],
    COLS[Cols.CreditAmount],
    COLS[Cols.PresentResidenceSince],
    COLS[Cols.InstallmentRateInPercentageOfDisposableIncome], 
    COLS[Cols.AgeInYears], 
    COLS[Cols.NumberOfExistingCreditsAtThisBank],
]

### Model > Load Data

In [296]:
df_normal = pd.read_csv(
    "german.data", 
    sep=" ",
    names=[
        COLS[Cols.StatusOfExistingCheckingAccount],
        COLS[Cols.DurationInMonth],
        COLS[Cols.CreditHistory],
        COLS[Cols.Purpose],
        COLS[Cols.CreditAmount],
        COLS[Cols.SavingsAccountBonds],
        COLS[Cols.PresentEmploymentSince],
        COLS[Cols.InstallmentRateInPercentageOfDisposableIncome],
        COLS[Cols.PersonalStatusAndSex],
        COLS[Cols.OtherDebtorsOrGuarantors],
        COLS[Cols.PresentResidenceSince],
        COLS[Cols.Property],
        COLS[Cols.AgeInYears],
        COLS[Cols.OtherInstallmentPlans],
        COLS[Cols.Housing],
        COLS[Cols.NumberOfExistingCreditsAtThisBank],
        COLS[Cols.Job],
        COLS[Cols.NumberOfPeopleBeingLiableToProvideMaintenanceFor],
        COLS[Cols.Telephone],
        COLS[Cols.ForeignWorker],
        COLS[Cols.ClassOutcome],
    ])


df_numerical = pd.read_csv(
    "german.data-numeric", 
    delim_whitespace=True,
    header=None,
    names = [
        'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x',
           COLS[Cols.ClassOutcome]]
)

df_numerical.head(10)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,p,q,r,s,t,u,v,w,x,Class/Outcome
0,1,6,4,12,5,5,3,4,1,67,...,0,0,1,0,0,1,0,0,1,1
1,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2
2,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1
3,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,0,1,1
4,1,24,3,49,1,3,3,4,4,53,...,1,0,1,0,0,0,0,0,1,2
5,4,36,2,91,5,3,3,4,4,35,...,0,0,1,0,0,0,0,1,0,1
6,4,24,2,28,3,5,3,4,2,53,...,0,0,1,0,0,1,0,0,1,1
7,2,36,2,69,1,3,3,2,3,35,...,0,1,1,0,1,0,0,0,0,1
8,4,12,2,31,4,4,1,4,1,61,...,0,0,1,0,0,1,0,1,0,1
9,2,30,4,52,1,1,4,2,3,28,...,1,0,1,0,0,1,0,0,0,2


## Model

### Model > Functions

In [297]:
'''FUNCTIONS'''
    
# to calculate un-normalized posteriors and do classification with respect to g1 and g2results
def do_classification(df_test, prior_good, muv_good, cov_good, prior_bad, muv_bad, cov_bad):
    decision_list = []  # to store classification results to calculate accuracy later.
    for index, row in df_test.iterrows():
        pred_good = discriminant_func(x=row, prior=prior_good, mus=muv_good, cov=cov_good)
        pred_bad = discriminant_func(x=row, prior=prior_bad, mus=muv_bad, cov=cov_bad)
        
        # take higher posterior (-> higher probabilty)
        classified_class = Outcomes.Good.value if pred_good>pred_bad else Outcomes.Bad.value
        decision_list.append(classified_class)
        
    return decision_list


def discriminant_func(x, prior, mus, cov):
    standard_dev = math.sqrt(abs(cov))

    if standard_dev == 0:
        return -900000000000
    
    res =  (
            -(math.log(2*PI)/2) 
            - math.log(standard_dev) 
            - ((x[attr1]-mus[0])**2)/(2*cov) 
            + np.log(prior)
        )+ (
            -(math.log(2*PI)/2) 
            - math.log(standard_dev) 
            - ((x[attr2]-mus[1])**2)/(2*cov) 
            + np.log(prior)
    )
    return res
    

def plot_gaussian(mu, sigma2):
    random_seed=404
    print(f'sigma2={sigma2}')

    plt.style.use('seaborn-dark')
    plt.rcParams['figure.figsize']=14,6
    fig = plt.figure()
    
    # Initializing the covariance matrix
    cov = np.array([
        [1, sigma2], 
        [sigma2, 1]
    ])
    
    # Generating a Gaussian bivariate distribution
    # with given mean and covariance matrix
    distr = multivariate_normal(cov = cov, mean = mu, seed = random_seed)
    
    
    # Generating a meshgrid complacent with
    # the 3-sigma boundary
    mean_1, mean_2 = mu[0], mu[1]
    sigma_1, sigma_2 = cov[0,0], cov[1,1]
     
    x = np.linspace(mean_1 -3*sigma_1, mean_1 + 3*sigma_1, num=100)
    y = np.linspace(mean_2 -3*sigma_2, mean_2 + 3*sigma_2, num=100)
    X, Y = np.meshgrid(x,y)
    
    # Generating the density function
    # for each point in the meshgrid
    pdf = np.zeros(X.shape)
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            pdf[i,j] = distr.pdf([X[i,j], Y[i,j]])
            
    # Plotting the density function values
    key = 131
    ax = fig.add_subplot(key, projection = '3d')
    ax.plot_surface(X, Y, pdf, cmap = 'viridis')
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(f'Covariance between x1 and x2 = {sigma2}')
    ax.axes.zaxis.set_ticks([])
    
    
    plt.tight_layout()
    plt.show()
    

# to find maximum of the discriminant function results
def find_max_likelihood(g1, g2):
    likelihood_list = [g1, g2]
    return max(likelihood_list)

## Model > Training (run  500x)

### Gradient Descent

In [303]:
def routine_gradient_descent(df, columns, iterations):
    max_avg_acc = 0
    best_attributes = []
    
    for i, attr1 in enumerate(columns):
        for attr2 in columns:
            if attr1 == attr2:
                continue

            max_avg = gradient_descent(df, attr1, attr2, iterations)

            if max_avg > max_avg_acc:
                max_avg_acc = max_avg
                best_attributes = [attr1,attr2]
        print(f"=============Progress: {round(i/len(columns),2)}==============")


    print(f'Best Avg. Acc: {max_avg_acc} with features: <{best_attributes[0]}> & <{best_attributes[1]}>')

    

def gradient_descent(df, attr1, attr2, iterations):
    accuracy = 0  # total accuracy
    accuracies=[] # to store individual accuracies

    # program will run for 500 times to find average accuracy
    for i in range(0, iterations):    
        # split data train & test
        msk = np.random.rand(len(df)) < PERC_TRAIN
        df_train = df[msk]
        df_test = df[~msk]

        # separate in class 1 & 2
        df_good = df_train.loc[df_train[COLS[Cols.ClassOutcome]] == 1]    
        df_bad  = df_train.loc[df_train[COLS[Cols.ClassOutcome]] == 2]

        prior_good = len(df_good)/(len(df_good)+len(df_bad)) 
        prior_bad = len(df_bad)/(len(df_good)+len(df_bad)) 

        # calculate MUs and COVs(MLE parameters) of each class
        muv_good = [df_good[attr1].mean(), df_good[attr2].mean()]
        cov_good = df_good[attr1].cov(df_good[attr2])

        muv_bad = [df_bad[attr1].mean(), df_bad[attr2].mean()]
        cov_bad = df_bad[attr1].cov(df_bad[attr2])

        # classification_result is an array that stores the result of classification
        classification_result = do_classification(
            df_test=df_test, 
            prior_good=prior_good, muv_good=muv_good, cov_good=cov_good, 
            prior_bad=prior_bad, muv_bad=muv_bad, cov_bad=cov_bad)

        # compare classification results with y_test 
        col1_acc = "classification"
        col2_acc = "df_test_y"
        df_classified = pd.DataFrame({
            col1_acc : classification_result,
            col2_acc : df_test[COLS[Cols.ClassOutcome]]
        })
        df_acc = df_classified.loc[(df_classified[col1_acc] == df_classified[col2_acc])] 
        individual_accuracy = df_acc.shape[0]/df_test.shape[0]    
        accuracy = accuracy + individual_accuracy
        accuracies.append(individual_accuracy)

    print(f"<{attr1}> x <{attr2}>")
    max_avg = round((accuracy/iterations)*100,2)

    print(f"Acg Acc. = {max_avg}%")
    print(f"Max Acc. = {round(max(accuracies)*100,2)}%")
    print("")
    return max_avg

### Dataframe = normal (only 7 numerical attributes / 7*7 combinations)

In [307]:
max_avg = routine_gradient_descent(df_normal, numerical_attr, 100)
                
#plot_gaussian(mu=muv_good, sigma2=cov_good)

<Duration in month> x <Credit amount>
Acg Acc. = 69.27%
Max Acc. = 73.81%

<Duration in month> x <Present residence since>
Acg Acc. = 50.56%
Max Acc. = 74.6%

<Duration in month> x <Installment rate in percentage of disposable income>
Acg Acc. = 54.63%
Max Acc. = 76.36%

<Duration in month> x <Age in years>
Acg Acc. = 60.97%
Max Acc. = 75.52%

<Duration in month> x <Number of existing credits at this bank>
Acg Acc. = 61.49%
Max Acc. = 76.31%

<Credit amount> x <Duration in month>
Acg Acc. = 68.27%
Max Acc. = 74.29%

<Credit amount> x <Present residence since>
Acg Acc. = 46.66%
Max Acc. = 75.76%

<Credit amount> x <Installment rate in percentage of disposable income>
Acg Acc. = 51.15%
Max Acc. = 73.99%

<Credit amount> x <Age in years>
Acg Acc. = 62.56%
Max Acc. = 75.56%

<Credit amount> x <Number of existing credits at this bank>
Acg Acc. = 50.03%
Max Acc. = 75.15%

<Present residence since> x <Duration in month>
Acg Acc. = 51.51%
Max Acc. = 74.86%

<Present residence since> x <Credit 

### Dataframe = numerical (24 numerical attributes / 24*24 combinations)

In [306]:
max_avg = routine_gradient_descent(df_numerical, df_numerical.columns, 10)

<a> x <b>
Acg Acc. = 34.36%
Max Acc. = 67.54%

<a> x <c>
Acg Acc. = 29.98%
Max Acc. = 33.43%

<a> x <d>
Acg Acc. = 66.7%
Max Acc. = 76.6%

<a> x <e>
Acg Acc. = 5.91%
Max Acc. = 31.55%

<a> x <f>
Acg Acc. = 30.08%
Max Acc. = 35.35%

<a> x <g>
Acg Acc. = 53.52%
Max Acc. = 100.0%

<a> x <h>
Acg Acc. = 61.13%
Max Acc. = 72.4%

<a> x <i>
Acg Acc. = 33.93%
Max Acc. = 70.61%

<a> x <j>
Acg Acc. = 41.16%
Max Acc. = 72.42%

<a> x <k>
Acg Acc. = 29.79%
Max Acc. = 32.93%

<a> x <l>
Acg Acc. = 29.98%
Max Acc. = 32.58%

<a> x <m>
Acg Acc. = 58.74%
Max Acc. = 75.54%

<a> x <n>
Acg Acc. = 30.0%
Max Acc. = 33.33%

<a> x <o>
Acg Acc. = 65.37%
Max Acc. = 100.0%

<a> x <p>
Acg Acc. = 75.19%
Max Acc. = 100.0%

<a> x <q>
Acg Acc. = 38.12%
Max Acc. = 69.42%

<a> x <r>
Acg Acc. = 44.91%
Max Acc. = 71.9%

<a> x <s>
Acg Acc. = 79.44%
Max Acc. = 100.0%

<a> x <t>
Acg Acc. = 57.86%
Max Acc. = 72.73%

<a> x <u>
Acg Acc. = 14.96%
Max Acc. = 33.23%

<a> x <v>
Acg Acc. = 65.52%
Max Acc. = 72.63%

<a> x <w>
Acg Acc. 

<h> x <a>
Acg Acc. = 46.3%
Max Acc. = 73.55%

<h> x <b>
Acg Acc. = 50.62%
Max Acc. = 73.58%

<h> x <c>
Acg Acc. = 41.16%
Max Acc. = 72.97%

<h> x <d>
Acg Acc. = 45.68%
Max Acc. = 71.22%

<h> x <e>
Acg Acc. = 45.95%
Max Acc. = 72.12%

<h> x <f>
Acg Acc. = 18.6%
Max Acc. = 32.79%

<h> x <g>
Acg Acc. = 46.92%
Max Acc. = 70.03%

<h> x <i>
Acg Acc. = 0.0%
Max Acc. = 0.0%

<h> x <j>
Acg Acc. = 65.85%
Max Acc. = 70.86%

<h> x <k>
Acg Acc. = 46.43%
Max Acc. = 74.93%

<h> x <l>
Acg Acc. = 27.84%
Max Acc. = 69.08%

<h> x <m>
Acg Acc. = 42.71%
Max Acc. = 72.49%

<h> x <n>
Acg Acc. = 49.83%
Max Acc. = 70.4%

<h> x <o>
Acg Acc. = 50.49%
Max Acc. = 71.02%

<h> x <p>
Acg Acc. = 42.29%
Max Acc. = 72.17%

<h> x <q>
Acg Acc. = 45.97%
Max Acc. = 72.26%

<h> x <r>
Acg Acc. = 47.45%
Max Acc. = 75.07%

<h> x <s>
Acg Acc. = 58.23%
Max Acc. = 72.45%

<h> x <t>
Acg Acc. = 30.11%
Max Acc. = 33.13%

<h> x <u>
Acg Acc. = 70.67%
Max Acc. = 74.84%

<h> x <v>
Acg Acc. = 38.48%
Max Acc. = 71.62%

<h> x <w>
Acg Acc. =

<o> x <a>
Acg Acc. = 63.81%
Max Acc. = 100.0%

<o> x <b>
Acg Acc. = 46.93%
Max Acc. = 71.8%

<o> x <c>
Acg Acc. = 45.93%
Max Acc. = 100.0%

<o> x <d>
Acg Acc. = 57.82%
Max Acc. = 71.69%

<o> x <e>
Acg Acc. = 52.94%
Max Acc. = 100.0%

<o> x <f>
Acg Acc. = 55.37%
Max Acc. = 74.15%

<o> x <g>
Acg Acc. = 30.39%
Max Acc. = 33.43%

<o> x <h>
Acg Acc. = 49.5%
Max Acc. = 70.69%

<o> x <i>
Acg Acc. = 30.78%
Max Acc. = 35.09%

<o> x <j>
Acg Acc. = 46.61%
Max Acc. = 72.59%

<o> x <k>
Acg Acc. = 37.47%
Max Acc. = 68.83%

<o> x <l>
Acg Acc. = 46.38%
Max Acc. = 70.86%

<o> x <m>
Acg Acc. = 30.32%
Max Acc. = 33.12%

<o> x <n>
Acg Acc. = 61.69%
Max Acc. = 73.26%

<o> x <p>
Acg Acc. = 66.62%
Max Acc. = 73.21%

<o> x <q>
Acg Acc. = 28.93%
Max Acc. = 31.27%

<o> x <r>
Acg Acc. = 67.05%
Max Acc. = 100.0%

<o> x <s>
Acg Acc. = 29.33%
Max Acc. = 32.06%

<o> x <t>
Acg Acc. = 58.31%
Max Acc. = 72.06%

<o> x <u>
Acg Acc. = 51.73%
Max Acc. = 72.84%

<o> x <v>
Acg Acc. = 29.45%
Max Acc. = 32.14%

<o> x <w>
Acg A

<v> x <a>
Acg Acc. = 59.52%
Max Acc. = 100.0%

<v> x <b>
Acg Acc. = 57.45%
Max Acc. = 74.07%

<v> x <c>
Acg Acc. = 39.18%
Max Acc. = 73.08%

<v> x <d>
Acg Acc. = 66.64%
Max Acc. = 72.52%

<v> x <e>
Acg Acc. = 48.61%
Max Acc. = 73.51%

<v> x <f>
Acg Acc. = 59.26%
Max Acc. = 74.05%

<v> x <g>
Acg Acc. = 53.9%
Max Acc. = 73.42%

<v> x <h>
Acg Acc. = 50.57%
Max Acc. = 73.33%

<v> x <i>
Acg Acc. = 43.02%
Max Acc. = 69.0%

<v> x <j>
Acg Acc. = 30.51%
Max Acc. = 33.33%

<v> x <k>
Acg Acc. = 59.3%
Max Acc. = 73.81%

<v> x <l>
Acg Acc. = 40.44%
Max Acc. = 69.75%

<v> x <m>
Acg Acc. = 65.46%
Max Acc. = 71.65%

<v> x <n>
Acg Acc. = 69.33%
Max Acc. = 100.0%

<v> x <o>
Acg Acc. = 34.08%
Max Acc. = 70.43%

<v> x <p>
Acg Acc. = 65.62%
Max Acc. = 72.19%

<v> x <q>
Acg Acc. = 44.77%
Max Acc. = 72.89%

<v> x <r>
Acg Acc. = 42.19%
Max Acc. = 73.39%

<v> x <s>
Acg Acc. = 49.93%
Max Acc. = 71.51%

<v> x <t>
Acg Acc. = 50.29%
Max Acc. = 71.88%

<v> x <u>
Acg Acc. = 61.49%
Max Acc. = 74.11%

<v> x <w>
Acg Ac

In [218]:
'''
Probleme:
(1) Ich weiß nicht ob ich 
    (a) irgendwo einen Fehler habe oder 
    (b) schlechte Parameter habe oder 
    (c) die Daten bei nur 2 Parametern mit MLE nur so viel Accuracy geben
    
(2) Plot ist schwierig
    --> 2 Ansätze
    (a) Contour Plot - Plotte flach aber mit Farbkreisen 
    (b) Plotte 3D 
--> brauch ich doch zwei Sigma Werte?

(3) Risk Analysis - woher die Formel?

'''

'\nProbleme:\n(1) Ich weiß nicht ob ich \n    (a) irgendwo einen Fehler habe oder \n    (b) schlechte Parameter habe oder \n    (c) die Daten bei nur 2 Parametern mit MLE nur so viel Accuracy geben\n\n'