# Experiments on the German Credit datasets
Old version: https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
Corrected version: https://archive.ics.uci.edu/ml/datasets/South+German+Credit+%28UPDATE%29 / https://www.kaggle.com/c/south-german-credit-prediction/overview/data-overview

Some meanings of the discrete/ordinal feature values in the old version were wrong.
For example, for feature "checking status",

(inferred from the old dataset) a data object's value is '1' in the old dataset, meaning negative DM.
(according to the new dataset) However, it should have been value '4' in the old dataset, meaning no checking account.

(inferred from the old dataset) a data object's value is **'2' in the new dataset**, *meaning negative DM*.
(according to the new dataset) However, it should have been value '1' in the new dataset, meaning no checking account.

Whichever encoding scheme is used (the old or the new dataset's), the feature values should be corrected according to their true meaning. The steps of this experiment are:
1. Encode the dataset using the new dataset's meanings.
2. According to the *meanings of the old dataset*, **find the corresponding encoded number in the new dataset and modify the feature values**.
3. Train first on the data points encoded by the *meanings of the old dataset* to get the base NN.
4. Gradually train on the data points encoded by the meanings of the new dataset to get the shifted NNs.

In [1]:
# Python Standard Libraries
import time
import os, sys, pickle, json, math, time, multiprocessing, warnings, itertools, random, warnings, gc, ast, subprocess
import copy
from collections import defaultdict, Counter, namedtuple
from math import log
from itertools import product, combinations
from random import choice, choices, sample, seed
from datetime import datetime
from sklearn.metrics.pairwise import euclidean_distances
from importlib import reload
# Basics
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt

# Multi-processing
from joblib import Parallel, delayed
from tqdm import tqdm

# sci-kit learn
import sklearn
import sklearn.datasets as datasets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve

# Raise pandas' display limits so the wide (70+ column) frames used below
# are rendered without truncation.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 150)

import warnings

# Suppress every warning for the rest of the session; this keeps the outputs
# short but also hides deprecation/convergence warnings.
warnings.filterwarnings('ignore')
from expnns.utilcredit import *
from expnns.preprocessor import Preprocessor

### Process the old version dataset

In [2]:
# Load the old (original Statlog) German Credit file. `load_old` is not defined
# in this notebook; presumably it comes from the star import of
# expnns.utilcredit — confirm. The call yields four objects: the raw frame,
# df_old_mm (presumably a min-max scaled copy — confirm), the encoded frame,
# and the preprocessor whose `encode_one` is used further down.
OLD_DATA_PATH = "../datasets/credit/old/german.data"
df_old, df_old_mm, df_old_enc, preprocessor_old = load_old(OLD_DATA_PATH)

In [3]:
# Summarise the raw old-version frame: column names, non-null counts and dtypes.
df_old.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   checking-status  1000 non-null   int64
 1   duration         1000 non-null   int64
 2   credit-history   1000 non-null   int64
 3   purpose          1000 non-null   int64
 4   amount           1000 non-null   int64
 5   savings          1000 non-null   int64
 6   employment       1000 non-null   int64
 7   rate             1000 non-null   int64
 8   sex-status       1000 non-null   int64
 9   guarantors       1000 non-null   int64
 10  residence        1000 non-null   int64
 11  property         1000 non-null   int64
 12  age              1000 non-null   int64
 13  installment      1000 non-null   int64
 14  housing          1000 non-null   int64
 15  num-credits      1000 non-null   int64
 16  job              1000 non-null   int64
 17  liable           1000 non-null   int64
 18  phone    

In [4]:
# Summarise the encoded old-version frame; the recorded output lists 73 columns,
# with the discrete features expanded into one column per level
# (e.g. checking-status_0 ... checking-status_3).
df_old_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 73 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   checking-status_0  1000 non-null   float64
 1   checking-status_1  1000 non-null   float64
 2   checking-status_2  1000 non-null   float64
 3   checking-status_3  1000 non-null   float64
 4   duration           1000 non-null   float64
 5   credit-history_0   1000 non-null   float64
 6   credit-history_1   1000 non-null   float64
 7   credit-history_2   1000 non-null   float64
 8   credit-history_3   1000 non-null   float64
 9   credit-history_4   1000 non-null   float64
 10  purpose_0          1000 non-null   float64
 11  purpose_1          1000 non-null   float64
 12  purpose_2          1000 non-null   float64
 13  purpose_3          1000 non-null   float64
 14  purpose_4          1000 non-null   float64
 15  purpose_5          1000 non-null   float64
 16  purpose_6          1000 n

In [5]:
# Show one sample of the old-version data (positional row 15) in two forms:
# first as stored in df_old_mm, then after preprocessor_old.encode_one.
# `columns` is not defined in this notebook; presumably it is provided by the
# star import of expnns.utilcredit — confirm.
old_row_pos = 15
old_row_mm = df_old_mm.values[old_row_pos]
old_row_frame = pd.DataFrame(data=old_row_mm.reshape(1, -1), columns=columns)
display(old_row_frame)

old_row_encoded = preprocessor_old.encode_one(old_row_mm)
display(old_row_encoded)

Unnamed: 0,checking-status,duration,credit-history,purpose,amount,savings,employment,rate,sex-status,guarantors,residence,property,age,installment,housing,num-credits,job,liable,phone,foreign,good-credit
0,1.0,0.294118,3.0,4.0,0.056784,2.0,2.0,3.0,1.0,0.0,1.0,1.0,0.232143,2.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0


Unnamed: 0,checking-status_0,checking-status_1,checking-status_2,checking-status_3,duration,credit-history_0,credit-history_1,credit-history_2,credit-history_3,credit-history_4,purpose_0,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,amount,savings_0,savings_1,savings_2,savings_3,savings_4,employment_0,employment_1,employment_2,employment_3,employment_4,rate_0,rate_1,rate_2,rate_3,sex-status_0,sex-status_1,sex-status_2,sex-status_3,guarantors_0,guarantors_1,guarantors_2,residence_0,residence_1,residence_2,residence_3,property_0,property_1,property_2,property_3,age,installment_0,installment_1,installment_2,housing_0,housing_1,housing_2,num-credits_0,num-credits_1,num-credits_2,num-credits_3,job_0,job_1,job_2,job_3,liable_0,liable_1,phone_0,phone_1,foreign_0,foreign_1,good-credit
0,1.0,1.0,0.0,0.0,0.294118,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056784,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.232143,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


### Process the new version dataset


In [10]:
# Load the corrected (South German Credit) training file. `load_new` is not
# defined in this notebook; presumably it comes from the star import of
# expnns.utilcredit — confirm. The four results mirror the old-version load:
# raw frame, df_new_mm (presumably min-max scaled — confirm), encoded frame,
# and the preprocessor whose `encode_one` is used further down.
NEW_DATA_PATH = "../datasets/credit/new/train.csv"
df_new, df_new_mm, df_new_enc, preprocessor_new = load_new(NEW_DATA_PATH)

In [11]:
# Summarise the raw new-version frame: column names, non-null counts and dtypes.
# Per the recorded output its index runs from 1 to 800 (not 0-based).
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 800
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype
---  ------           --------------  -----
 0   checking-status  800 non-null    int32
 1   duration         800 non-null    int32
 2   credit-history   800 non-null    int32
 3   purpose          800 non-null    int32
 4   amount           800 non-null    int32
 5   savings          800 non-null    int32
 6   employment       800 non-null    int32
 7   rate             800 non-null    int32
 8   sex-status       800 non-null    int32
 9   guarantors       800 non-null    int32
 10  residence        800 non-null    int32
 11  property         800 non-null    int32
 12  age              800 non-null    int32
 13  installment      800 non-null    int32
 14  housing          800 non-null    int32
 15  num-credits      800 non-null    int32
 16  job              800 non-null    int32
 17  liable           800 non-null    int32
 18  phone     

In [12]:
# Summarise the encoded new-version frame (column names, non-null counts, dtypes).
# NOTE(review): the recorded output reports 801 entries (index 0 to 800) but only
# 800 non-null values per column, while df_new.info() reports 800 entries indexed
# 1 to 800. This looks like an index misalignment inside load_new that adds a row
# of NaNs (presumably at label 0) — confirm and fix there before training on it.
df_new_enc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 801 entries, 0 to 800
Data columns (total 73 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   checking-status_0  800 non-null    float64
 1   checking-status_1  800 non-null    float64
 2   checking-status_2  800 non-null    float64
 3   checking-status_3  800 non-null    float64
 4   duration           800 non-null    float64
 5   credit-history_0   800 non-null    float64
 6   credit-history_1   800 non-null    float64
 7   credit-history_2   800 non-null    float64
 8   credit-history_3   800 non-null    float64
 9   credit-history_4   800 non-null    float64
 10  purpose_0          800 non-null    float64
 11  purpose_1          800 non-null    float64
 12  purpose_2          800 non-null    float64
 13  purpose_3          800 non-null    float64
 14  purpose_4          800 non-null    float64
 15  purpose_5          800 non-null    float64
 16  purpose_6          800 non

In [13]:
# Show one sample of the new-version data (positional row 15) in two forms:
# first as stored in df_new_mm, then after preprocessor_new.encode_one.
# `columns` is not defined in this notebook; presumably it is provided by the
# star import of expnns.utilcredit — confirm.
new_row_pos = 15
new_row_mm = df_new_mm.values[new_row_pos]
new_row_frame = pd.DataFrame(data=new_row_mm.reshape(1, -1), columns=columns)
display(new_row_frame)

new_row_encoded = preprocessor_new.encode_one(new_row_mm)
display(new_row_encoded)


Unnamed: 0,checking-status,duration,credit-history,purpose,amount,savings,employment,rate,sex-status,guarantors,residence,property,age,installment,housing,num-credits,job,liable,phone,foreign,good-credit
0,1.0,0.205882,2.0,3.0,0.163035,2.0,1.0,0.0,3.0,0.0,2.0,0.0,0.107143,2.0,0.0,0.0,2.0,1.0,0.0,1.0,1.0


Unnamed: 0,checking-status_0,checking-status_1,checking-status_2,checking-status_3,duration,credit-history_0,credit-history_1,credit-history_2,credit-history_3,credit-history_4,purpose_0,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,amount,savings_0,savings_1,savings_2,savings_3,savings_4,employment_0,employment_1,employment_2,employment_3,employment_4,rate_0,rate_1,rate_2,rate_3,sex-status_0,sex-status_1,sex-status_2,sex-status_3,guarantors_0,guarantors_1,guarantors_2,residence_0,residence_1,residence_2,residence_3,property_0,property_1,property_2,property_3,age,installment_0,installment_1,installment_2,housing_0,housing_1,housing_2,num-credits_0,num-credits_1,num-credits_2,num-credits_3,job_0,job_1,job_2,job_3,liable_0,liable_1,phone_0,phone_1,foreign_0,foreign_1,good-credit
0,1.0,1.0,0.0,0.0,0.205882,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.163035,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.107143,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
