# Setup

## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np

## Global Configurations

In [2]:
# Locations of the raw download and of the processed CSV (relative to the notebook).
INPUT_DIR = '../data/source/'
INPUT_PATH = INPUT_DIR + 'german.data'

OUTPUT_DIR = '../data/input/'
OUTPUT_FILE = OUTPUT_DIR + 'german_data.csv'

# Set to False to skip writing OUTPUT_FILE in the final cell.
OUTPUT = True

In [3]:
# Make sure the raw-data directory exists before the download step.
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(INPUT_DIR, exist_ok=True)

In [4]:
# Make sure the directory for the processed CSV exists.
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Data

## Download Data

In [5]:
# Download german.data from the UCI archive unless ../data/source/german.data already exists.
# wget saves into the current working directory; the file is moved into place only if wget succeeds.
# NOTE: the target path is hard-coded here and must be kept in sync with INPUT_DIR / INPUT_PATH.
![ -f ../data/source/german.data ] && echo "File exist" || (wget https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data && mv german.data ../data/source/)

--2022-01-14 09:40:15--  https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 79793 (78K) [application/x-httpd-php]
Saving to: ‘german.data’


2022-01-14 09:40:17 (164 KB/s) - ‘german.data’ saved [79793/79793]



## Read Data

In [6]:
# Column names for the 20 attributes plus the target, in file order.
cols = [
    'existing_checking_account_status',
    'duration_mth',
    'credit_history',
    'purpose',
    'credit_amount',
    'saving_accounts_or_bonds',
    'present_employment_since',
    'installment_rate_percent',  # disposable income
    'personal_status_sex',
    'other_debtors_or_guarantors',
    'present_residence_since',
    'property',
    'age_years',
    'other_installment_plans',
    'housing',
    'no_of_existing_credits',  # in this bank
    'job',
    'no_of_ppl_liable',
    'telephone',
    'foreign_worker',
    'label',
]

# The raw file is space-delimited and has no header row.
df = pd.read_csv(INPUT_PATH, header=None, sep=' ')
df.columns = cols

# Recode the target as a binary flag: raw value 2 -> 1, anything else -> 0.
# (The UCI Statlog documentation codes the raw target as 1 = good, 2 = bad.)
# Plain column assignment is used instead of `df.loc[:, col] = ...`, whose
# replace-vs-in-place behaviour differs between pandas versions.
df['label'] = np.where(df['label'] == 2, 1, 0)

# Derived feature, algebraically equal to
#   credit_amount / duration_mth + credit_amount * installment_rate_percent / 100
# NOTE(review): this treats installment_rate_percent like a per-month simple
# interest rate; confirm that matches the intended meaning of the field.
df['monthly_affordability_amount'] = (
    df['credit_amount']
    * (1 + df['installment_rate_percent'] / 100 * df['duration_mth'])
    / df['duration_mth']
)

In [7]:
# Sanity check on dimensions: 21 named columns plus the derived monthly_affordability_amount.
df.shape

(1000, 22)

In [8]:
# Class balance of the binary target: absolute counts first, then proportions.
label_col = df['label']
display(label_col.value_counts(), label_col.value_counts(normalize=True))

0    700
1    300
Name: label, dtype: int64

0    0.7
1    0.3
Name: label, dtype: float64

In [9]:
# List the final column names (renamed raw fields plus the derived feature) before export.
df.columns

Index(['existing_checking_account_status', 'duration_mth', 'credit_history',
       'purpose', 'credit_amount', 'saving_accounts_or_bonds',
       'present_employment_since', 'installment_rate_percent',
       'personal_status_sex', 'other_debtors_or_guarantors',
       'present_residence_since', 'property', 'age_years',
       'other_installment_plans', 'housing', 'no_of_existing_credits', 'job',
       'no_of_ppl_liable', 'telephone', 'foreign_worker', 'label',
       'monthly_affordability_amount'],
      dtype='object')

# Output

In [10]:
# Write the processed frame to OUTPUT_FILE only when the OUTPUT flag is enabled.
if OUTPUT:
    # index=False keeps the DataFrame index out of the CSV.
    df.to_csv(OUTPUT_FILE, index=False)