## Preparation

### Preparation > Imports

In [266]:
''' IMPORTS '''
import numpy as np
import pandas as pd
import matplotlib as plot
import math 
import string

from enum import Enum    

# for plotting
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import multivariate_normal

### Preparation > Classes

In [237]:
'''CLASSES '''
class Cols(Enum):
        StatusOfExistingCheckingAccount = 0,
        DurationInMonth = 1, 
        CreditHistory = 2, 
        Purpose = 3, 
        CreditAmount = 4 , 
        SavingsAccountBonds =5,
        PresentEmploymentSince =6 ,
        InstallmentRateInPercentageOfDisposableIncome =7 ,
        PersonalStatusAndSex = 8,
        OtherDebtorsOrGuarantors =9 ,
        PresentResidenceSince =10 ,
        Property =11 ,
        AgeInYears = 12,
        OtherInstallmentPlans =13 ,
        Housing = 14,
        NumberOfExistingCreditsAtThisBank =15 ,
        Job = 16,
        NumberOfPeopleBeingLiableToProvideMaintenanceFor =17 ,
        Telephone = 18,
        ForeignWorker =19 ,
        ClassOutcome = 20
        
class Outcomes(Enum):
        Good = 1
        Bad = 2

### Preparation > Constances

In [238]:
'''CONSTANCES'''
# dict of all column names
COLS = {
        Cols.StatusOfExistingCheckingAccount : "Status of existing checking account" ,
        Cols.DurationInMonth:"Duration in month" , # numerical
        Cols.CreditHistory: "Credit history", 
        Cols.Purpose:"Purpose" , 
        Cols.CreditAmount:  "Credit amount",  # numerical
        
        Cols.SavingsAccountBonds: "Savings account/bonds" ,
        Cols.PresentEmploymentSince:"Present employment since" ,
        Cols.InstallmentRateInPercentageOfDisposableIncome: "Installment rate in percentage of disposable income", # numerical
        Cols.PersonalStatusAndSex: "Personal status and sex",
        Cols.OtherDebtorsOrGuarantors:"Other debtors / guarantors", 
        
        Cols.PresentResidenceSince: "Present residence since", # numerical
        Cols.Property: "Property", 
        Cols.AgeInYears: "Age in years", # numerical
        Cols.OtherInstallmentPlans:"Other installment plans", 
        Cols.Housing: "Housing", 
        
        Cols.NumberOfExistingCreditsAtThisBank:"Number of existing credits at this bank", # numerical
        Cols.Job: "Job",
        Cols.NumberOfPeopleBeingLiableToProvideMaintenanceFor:"Number of people being liable to provide maintenance for", 
        Cols.Telephone: "Telephone",
        Cols.ForeignWorker:"Foreign worker",
        
            Cols.ClassOutcome: "Class/Outcome"
}

# model parameter
PERC_TRAIN = 0.67
PERC_TEST = 1-PERC_TRAIN

# constants
PI = np.pi
numerical_attr = [
    COLS[Cols.DurationInMonth],
    COLS[Cols.CreditAmount],
    COLS[Cols.PresentResidenceSince],
    COLS[Cols.InstallmentRateInPercentageOfDisposableIncome], 
    COLS[Cols.AgeInYears], 
    COLS[Cols.NumberOfExistingCreditsAtThisBank],
]

### Model > Load Data

In [271]:
df_normal = pd.read_csv(
    "german.data", 
    sep=" ",
    names=[
        COLS[Cols.StatusOfExistingCheckingAccount],
        COLS[Cols.DurationInMonth],
        COLS[Cols.CreditHistory],
        COLS[Cols.Purpose],
        COLS[Cols.CreditAmount],
        COLS[Cols.SavingsAccountBonds],
        COLS[Cols.PresentEmploymentSince],
        COLS[Cols.InstallmentRateInPercentageOfDisposableIncome],
        COLS[Cols.PersonalStatusAndSex],
        COLS[Cols.OtherDebtorsOrGuarantors],
        COLS[Cols.PresentResidenceSince],
        COLS[Cols.Property],
        COLS[Cols.AgeInYears],
        COLS[Cols.OtherInstallmentPlans],
        COLS[Cols.Housing],
        COLS[Cols.NumberOfExistingCreditsAtThisBank],
        COLS[Cols.Job],
        COLS[Cols.NumberOfPeopleBeingLiableToProvideMaintenanceFor],
        COLS[Cols.Telephone],
        COLS[Cols.ForeignWorker],
        COLS[Cols.ClassOutcome],
    ])


df_numerical = pd.read_csv(
    "german.data-numeric", 
    delim_whitespace=True,
    header=None,
    names = [
        'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x',
           COLS[Cols.ClassOutcome]]
)

df_numerical.head(10)

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,p,q,r,s,t,u,v,w,x,Class/Outcome
0,1,6,4,12,5,5,3,4,1,67,...,0,0,1,0,0,1,0,0,1,1
1,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2
2,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1
3,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,0,1,1
4,1,24,3,49,1,3,3,4,4,53,...,1,0,1,0,0,0,0,0,1,2
5,4,36,2,91,5,3,3,4,4,35,...,0,0,1,0,0,0,0,1,0,1
6,4,24,2,28,3,5,3,4,2,53,...,0,0,1,0,0,1,0,0,1,1
7,2,36,2,69,1,3,3,2,3,35,...,0,1,1,0,1,0,0,0,0,1
8,4,12,2,31,4,4,1,4,1,61,...,0,0,1,0,0,1,0,1,0,1
9,2,30,4,52,1,1,4,2,3,28,...,1,0,1,0,0,1,0,0,0,2


## Model

### Model > Functions

In [277]:
'''FUNCTIONS'''
    
# to calculate un-normalized posteriors and do classification with respect to g1 and g2results
def do_classification(df_test, prior_good, muv_good, cov_good, prior_bad, muv_bad, cov_bad):
    decision_list = []  # to store classification results to calculate accuracy later.
    for index, row in df_test.iterrows():
        pred_good = discriminant_func(x=row, prior=prior_good, mus=muv_good, cov=cov_good)
        pred_bad = discriminant_func(x=row, prior=prior_bad, mus=muv_bad, cov=cov_bad)
        
        # take higher posterior (-> higher probabilty)
        classified_class = Outcomes.Good.value if pred_good>pred_bad else Outcomes.Bad.value
        decision_list.append(classified_class)
        
    return decision_list


def discriminant_func(x, prior, mus, cov):
    standard_dev = math.sqrt(abs(cov))

    if standard_dev == 0:
        return -900000000000
    
    res =  (
            -(math.log(2*PI)/2) 
            - math.log(standard_dev) 
            - ((x[attr1]-mus[0])**2)/(2*cov) 
            + np.log(prior)
        )+ (
            -(math.log(2*PI)/2) 
            - math.log(standard_dev) 
            - ((x[attr2]-mus[1])**2)/(2*cov) 
            + np.log(prior)
    )
    return res
    

def plot_gaussian(mu, sigma2):
    random_seed=404
    print(f'sigma2={sigma2}')

    plt.style.use('seaborn-dark')
    plt.rcParams['figure.figsize']=14,6
    fig = plt.figure()
    
    # Initializing the covariance matrix
    cov = np.array([
        [1, sigma2], 
        [sigma2, 1]
    ])
    
    # Generating a Gaussian bivariate distribution
    # with given mean and covariance matrix
    distr = multivariate_normal(cov = cov, mean = mu, seed = random_seed)
    
    
    # Generating a meshgrid complacent with
    # the 3-sigma boundary
    mean_1, mean_2 = mu[0], mu[1]
    sigma_1, sigma_2 = cov[0,0], cov[1,1]
     
    x = np.linspace(mean_1 -3*sigma_1, mean_1 + 3*sigma_1, num=100)
    y = np.linspace(mean_2 -3*sigma_2, mean_2 + 3*sigma_2, num=100)
    X, Y = np.meshgrid(x,y)
    
    # Generating the density function
    # for each point in the meshgrid
    pdf = np.zeros(X.shape)
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            pdf[i,j] = distr.pdf([X[i,j], Y[i,j]])
            
    # Plotting the density function values
    key = 131
    ax = fig.add_subplot(key, projection = '3d')
    ax.plot_surface(X, Y, pdf, cmap = 'viridis')
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title(f'Covariance between x1 and x2 = {sigma2}')
    ax.axes.zaxis.set_ticks([])
    
    
    plt.tight_layout()
    plt.show()
    

# to find maximum of the discriminant function results
def find_max_likelihood(g1, g2):
    likelihood_list = [g1, g2]
    return max(likelihood_list)

## Model > Training (run  500x)

### Gradient Descent

In [272]:
def gradient_descent(df, attr1, attr2):
    accuracy = 0  # total accuracy
    accuracies=[] # to store individual accuracies

    # program will run for 500 times to find average accuracy
    for i in range(0, ITERATIONS):    
        # split data train & test
        msk = np.random.rand(len(df)) < PERC_TRAIN
        df_train = df[msk]
        df_test = df[~msk]

        # separate in class 1 & 2
        df_good = df_train.loc[df_train[COLS[Cols.ClassOutcome]] == 1]    
        df_bad  = df_train.loc[df_train[COLS[Cols.ClassOutcome]] == 2]

        prior_good = len(df_good)/(len(df_good)+len(df_bad)) 
        prior_bad = len(df_bad)/(len(df_good)+len(df_bad)) 

        # calculate MUs and COVs(MLE parameters) of each class
        muv_good = [df_good[attr1].mean(), df_good[attr2].mean()]
        cov_good = df_good[attr1].cov(df_good[attr2])

        muv_bad = [df_bad[attr1].mean(), df_bad[attr2].mean()]
        cov_bad = df_bad[attr1].cov(df_bad[attr2])

        # classification_result is an array that stores the result of classification
        classification_result = do_classification(
            df_test=df_test, 
            prior_good=prior_good, muv_good=muv_good, cov_good=cov_good, 
            prior_bad=prior_bad, muv_bad=muv_bad, cov_bad=cov_bad)

        # compare classification results with y_test 
        col1_acc = "classification"
        col2_acc = "df_test_y"
        df_classified = pd.DataFrame({
            col1_acc : classification_result,
            col2_acc : df_test[COLS[Cols.ClassOutcome]]
        })
        df_acc = df_classified.loc[(df_classified[col1_acc] == df_classified[col2_acc])] 
        individual_accuracy = df_acc.shape[0]/df_test.shape[0]    
        accuracy = accuracy + individual_accuracy
        accuracies.append(individual_accuracy)

    print(f"<{attr1}> x <{attr2}>")
    max_avg = round((accuracy/ITERATIONS)*100,2)

    print(f"Acg Acc. = {max_avg}%")
    print(f"Max Acc. = {round(max(accuracies)*100,2)}%")
    print("")
    return max_avg

In [278]:
ITERATIONS = 10

# NORMAL DATAFRAME
max_avg_acc = 0
best_attributes = []

for attr1 in numerical_attr:
    for attr2 in numerical_attr:
        if attr1 == attr2:
            continue
        
        max_avg = gradient_descent(df_normal, attr1, attr2)
        
        if max_avg > max_avg_acc:
            max_avg_acc = max_avg
            best_attributes = [attr1,attr2]
    print("=========================================")


print(f'Best Avg. Acc: {max_avg_acc} with features: <{best_attributes[0]}> & <{best_attributes[1]}>')
                
#plot_gaussian(mu=muv_good, sigma2=cov_good)

<Duration in month> x <Credit amount>
Acg Acc. = 61.62%
Max Acc. = 65.24%

<Duration in month> x <Present residence since>
Acg Acc. = 59.1%
Max Acc. = 73.72%

<Duration in month> x <Installment rate in percentage of disposable income>
Acg Acc. = 51.77%
Max Acc. = 62.38%

<Duration in month> x <Age in years>
Acg Acc. = 70.27%
Max Acc. = 72.22%

<Duration in month> x <Number of existing credits at this bank>
Acg Acc. = 56.68%
Max Acc. = 69.97%

<Credit amount> x <Duration in month>
Acg Acc. = 61.99%
Max Acc. = 64.26%

<Credit amount> x <Present residence since>
Acg Acc. = 49.86%
Max Acc. = 60.31%

<Credit amount> x <Installment rate in percentage of disposable income>
Acg Acc. = 39.83%
Max Acc. = 48.84%

<Credit amount> x <Age in years>
Acg Acc. = 67.15%
Max Acc. = 73.9%

<Credit amount> x <Number of existing credits at this bank>
Acg Acc. = 56.44%
Max Acc. = 72.48%

<Present residence since> x <Duration in month>
Acg Acc. = 54.56%
Max Acc. = 71.83%

<Present residence since> x <Credit a

In [218]:
'''
Probleme:
(1) Ich weiß nicht ob ich 
    (a) irgendwo einen Fehler habe oder 
    (b) schlechte Parameter habe oder 
    (c) die Daten bei nur 2 Parametern mit MLE nur so viel Accuracy geben
    
(2) Plot ist schwierig
    --> 2 Ansätze
    (a) Contour Plot - Plotte flach aber mit Farbkreisen 
    (b) Plotte 3D 
--> brauch ich doch zwei Sigma Werte?

(3) Risk Analysis - woher die Formel?

'''

'\nProbleme:\n(1) Ich weiß nicht ob ich \n    (a) irgendwo einen Fehler habe oder \n    (b) schlechte Parameter habe oder \n    (c) die Daten bei nur 2 Parametern mit MLE nur so viel Accuracy geben\n\n'

In [None]:
ITERATIONS = 10

# NUMERICAL DATAFRAME
max_avg_acc = 0
best_attributes = []

for attr1 in df_numerical.columns:
    for attr2 in df_numerical.columns:
        if attr1 == attr2:
            continue
        
        max_avg = gradient_descent(df_numerical, attr1, attr2)
        
        if max_avg > max_avg_acc:
            max_avg_acc = max_avg
            best_attributes = [attr1,attr2]
    print("=========================================")


print(f'Best Avg. Acc: {max_avg_acc} with features: <{best_attributes[0]}> & <{best_attributes[1]}>')
                
#plot_gaussian(mu=muv_good, sigma2=cov_good)

<a> x <b>
Acg Acc. = 36.59%
Max Acc. = 64.76%

<a> x <c>
Acg Acc. = 63.16%
Max Acc. = 68.81%

<a> x <d>
Acg Acc. = 62.75%
Max Acc. = 72.52%

<a> x <e>
Acg Acc. = 67.63%
Max Acc. = 71.61%

<a> x <f>
Acg Acc. = 30.36%
Max Acc. = 33.03%

<a> x <g>
Acg Acc. = 44.8%
Max Acc. = 71.94%

<a> x <h>
Acg Acc. = 50.3%
Max Acc. = 74.28%

<a> x <i>
Acg Acc. = 31.37%
Max Acc. = 34.12%

<a> x <j>
Acg Acc. = 44.91%
Max Acc. = 71.6%

<a> x <k>
Acg Acc. = 29.15%
Max Acc. = 31.55%

<a> x <l>
Acg Acc. = 52.37%
Max Acc. = 68.16%

<a> x <m>
Acg Acc. = 58.56%
Max Acc. = 71.7%

<a> x <n>
Acg Acc. = 54.57%
Max Acc. = 70.91%

<a> x <o>
Acg Acc. = 35.68%
Max Acc. = 38.85%

<a> x <p>
Acg Acc. = 39.2%
Max Acc. = 69.36%

<a> x <q>
Acg Acc. = 33.68%
Max Acc. = 69.48%

<a> x <r>
Acg Acc. = 54.3%
Max Acc. = 67.94%

<a> x <s>
Acg Acc. = 56.35%
Max Acc. = 72.27%

<a> x <t>
Acg Acc. = 40.38%
Max Acc. = 70.36%

<a> x <u>
Acg Acc. = 61.95%
Max Acc. = 64.8%

<a> x <v>
Acg Acc. = 41.02%
Max Acc. = 71.26%

<a> x <w>
Acg Acc. =

<h> x <a>
Acg Acc. = 53.69%
Max Acc. = 72.04%

<h> x <b>
Acg Acc. = 52.12%
Max Acc. = 72.64%

<h> x <c>
Acg Acc. = 35.49%
Max Acc. = 63.19%

<h> x <d>
Acg Acc. = 58.18%
Max Acc. = 73.8%

<h> x <e>
Acg Acc. = 50.71%
Max Acc. = 75.74%

<h> x <f>
Acg Acc. = 65.75%
Max Acc. = 71.47%

<h> x <g>
Acg Acc. = 45.1%
Max Acc. = 71.21%

<h> x <i>
Acg Acc. = 37.16%
Max Acc. = 61.38%

<h> x <j>
Acg Acc. = 60.21%
Max Acc. = 68.81%

<h> x <k>
Acg Acc. = 31.19%
Max Acc. = 34.0%

<h> x <l>
Acg Acc. = 36.09%
Max Acc. = 48.21%

<h> x <m>
Acg Acc. = 44.04%
Max Acc. = 68.71%

<h> x <n>
Acg Acc. = 52.82%
Max Acc. = 70.93%

<h> x <o>
Acg Acc. = 39.77%
Max Acc. = 72.34%

<h> x <p>
Acg Acc. = 41.78%
Max Acc. = 71.57%

<h> x <q>
Acg Acc. = 44.89%
Max Acc. = 72.78%

<h> x <r>
Acg Acc. = 45.93%
Max Acc. = 72.73%

<h> x <s>
Acg Acc. = 45.3%
Max Acc. = 72.02%

<h> x <t>
Acg Acc. = 51.51%
Max Acc. = 68.77%

<h> x <u>
Acg Acc. = 70.47%
Max Acc. = 72.42%

<h> x <v>
Acg Acc. = 46.55%
Max Acc. = 71.17%

<h> x <w>
Acg Acc

0
