# Experiments on the German Credit datasets
Old version: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
Corrected version: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29 / https://www.kaggle.com/c/south-german-credit-prediction/overview/data-overview

Some meanings of the discrete/ordinal feature values in the old version were wrong.
For example, for feature "checking status",

(inferred from the old dataset) a data object's value is '1' in the old dataset's encoding, meaning negative DM.
(according to the new dataset) However, it should have been value '4' in the old dataset's encoding, meaning no checking account.

(inferred from the old dataset) expressed in the new encoding, the data object's value is **'2' in the new dataset** *meaning negative DM*.
(according to the new dataset) However, it should have been value '1' in the new dataset, meaning no checking account.

Whether using the encoding scheme in the old or the new dataset, the feature value should be corrected according to the true meaning. The procedures in this experiment are:
1. Encode the dataset using the new dataset's meaning,
2. According to the *meanings of the old dataset*, **find the encoded number in the new dataset and modify the feature values**.
3. First train on the data points encoded by the *meanings of the old dataset* to get the base NN.
4. Gradually train on the data points encoded by the meanings of the new dataset to get the shifted NNs.

- Open question, independent of the experiments we are doing: this does not make much sense. If we spotted an error in the dataset, why don't we simply retrain the model on the corrected dataset?

In [1]:
# Python Standard Libraries
import time
import os, sys, pickle, json, math, time, multiprocessing, warnings, itertools, random, warnings, gc, ast, subprocess
import copy
from collections import defaultdict, Counter, namedtuple
from math import log
from itertools import product, combinations
from random import choice, choices, sample, seed
from datetime import datetime
from sklearn.metrics.pairwise import euclidean_distances

# Basics
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt

# Multi-processing
from joblib import Parallel, delayed
from tqdm import tqdm

# sci-kit learn
import sklearn
import sklearn.datasets as datasets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve

# Optional scikit-learn features: probe for them by importing instead of comparing
# version strings. A string comparison is lexicographic (e.g. '0.9' >= '0.20' is True),
# and CategoricalNB / jaccard_score are not available in every release that such a
# check would let through, so the import itself is the only reliable test.
try:
    from sklearn.naive_bayes import CategoricalNB
    from sklearn.metrics import jaccard_score, balanced_accuracy_score
except ImportError:
    # Keep the notebook running on installations that lack these names.
    print('WARNING! Old version of sklearn, can\'t load CategoricalNB.')

# Let pandas show up to 100 columns and 150 rows before truncating a frame's repr.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 150)

import warnings

# Install a blanket 'ignore' filter so warnings raised after this point are not shown.
warnings.filterwarnings(action='ignore')

### Process the old version dataset

In [20]:
# Names given, in file order, to the 21 columns of the header-less german.data file.
COLUMN_NAMES = [
    "status", "duration", "history", "purpose", "amount", "savings", "employment",
    "rate", "sex", "guarantors", "residence", "property", "age", "installment",
    "housing", "existing", "job", "liable", "phone", "foreign", "good-credit",
]

# Per-column recoding tables: raw "A.." category labels -> integer codes.
# 'good-credit' is recoded from the raw values 1/2 to 1/0.
CATEGORY_CODES = {
    'status': {'A14': 0, 'A11': 1, 'A12': 2, 'A13': 3},
    'history': {'A30': 0, 'A31': 1, 'A32': 2, 'A33': 3, 'A34': 4},
    'purpose': {'A40': 0, 'A41': 1, 'A42': 2, 'A43': 3, 'A44': 4, 'A45': 5,
                'A46': 6, 'A47': 7, 'A48': 8, 'A49': 9, 'A410': 10},
    'savings': {'A65': 0, 'A61': 1, 'A62': 2, 'A63': 3, 'A64': 4},
    'employment': {'A71': 0, 'A72': 1, 'A73': 2, 'A74': 3, 'A75': 4},
    'sex': {'A91': 0, 'A92': 1, 'A93': 2, 'A94': 3, 'A95': 4},
    'guarantors': {'A101': 0, 'A102': 1, 'A103': 2},
    'property': {'A121': 0, 'A122': 1, 'A123': 2, 'A124': 3},
    'installment': {'A141': 0, 'A142': 1, 'A143': 2},
    'housing': {'A151': 0, 'A152': 1, 'A153': 2},
    'job': {'A171': 0, 'A172': 1, 'A173': 2, 'A174': 3},
    'phone': {'A191': 0, 'A192': 1},
    'foreign': {'A201': 0, 'A202': 1},
    'good-credit': {1: 1, 2: 0},
}

# Read the comma-separated file without a header row and discard rows with missing values.
df = pd.read_csv("../datasets/credit/old/german.data", header=None, delimiter=',').dropna()
df.columns = COLUMN_NAMES

# Apply the recoding tables column by column; columns without a table are left unchanged.
df = df.replace(to_replace=CATEGORY_CODES)
print(df)

     status  duration  history  purpose  amount  savings  employment  rate  \
0         1         6        4        3    1169        0           4     4   
1         2        48        2        3    5951        1           2     2   
2         0        12        4        6    2096        1           3     2   
3         1        42        2        2    7882        1           3     2   
4         1        24        3        0    4870        1           2     3   
..      ...       ...      ...      ...     ...      ...         ...   ...   
995       0        12        2        2    1736        1           3     3   
996       1        30        2        1    3857        1           2     4   
997       0        12        2        3     804        1           4     4   
998       1        45        2        3    1845        1           2     4   
999       2        45        4        1    4576        2           0     3   

     sex  guarantors  residence  property  age  installment  ho

In [21]:
# Feature groups used by the later cells. For the two dicts the value is an integer
# per feature; it equals the number of codes in that column's recoding table.
ordinal_features = dict(status=4, history=5, savings=5, employment=5, guarantors=3, job=4)
discrete_features = dict(
    purpose=11, sex=5, property=4, installment=3, housing=3, phone=2, foreign=2,
)
# Remaining numeric columns, treated as continuous.
continuous_features = [
    "duration",
    "amount",
    "rate",
    "residence",
    "age",
    "existing",
    "liable",
]


In [29]:
# Print, for every column of df, its name and number of distinct values; for columns
# listed in ordinal_features or discrete_features also print the sorted distinct values.
# Iterating over df.columns avoids hard-coding the column count, and the value matrix
# and each column's distinct values are computed once instead of twice per column.
all_values = df.values
for i, column_name in enumerate(df.columns):
    distinct_values = np.unique(all_values[:, i])
    print(column_name, len(distinct_values))
    if column_name in ordinal_features or column_name in discrete_features:
        print(distinct_values)

status 4
[0 1 2 3]
duration 33
history 5
[0 1 2 3 4]
purpose 10
[ 0  1  2  3  4  5  6  8  9 10]
amount 921
savings 5
[0 1 2 3 4]
employment 5
[0 1 2 3 4]
rate 4
sex 4
[0 1 2 3]
guarantors 3
[0 1 2]
residence 4
property 4
[0 1 2 3]
age 53
installment 3
[0 1 2]
housing 3
[0 1 2]
existing 4
job 4
[0 1 2 3]
liable 2
phone 2
[0 1]
foreign 2
[0 1]
good-credit 2
