In [1]:
# Let's first load the data to see its structure and understand the problem we need to solve.
import pandas as pd

# Load the dataset
file_path = '../data/german.csv'
data = pd.read_csv(file_path, delimiter=';')

# Display the first few rows of the dataframe to understand its structure and the type of data it contains
data.head()

Unnamed: 0,Sex,Single,Unemployed,Age,Credit,LoanDuration,PurposeOfLoan,InstallmentRate,Housing,Label
0,1.0,1,0,67,1169,6,3,4,1.0,1
1,0.0,0,0,22,5951,48,3,2,1.0,0
2,1.0,1,0,49,2096,12,2,2,1.0,1
3,1.0,1,0,45,7882,42,4,2,3.0,1
4,1.0,1,0,53,4870,24,6,3,3.0,0


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Sex              1000 non-null   float64
 1   Single           1000 non-null   int64  
 2   Unemployed       1000 non-null   int64  
 3   Age              1000 non-null   int64  
 4   Credit           1000 non-null   int64  
 5   LoanDuration     1000 non-null   int64  
 6   PurposeOfLoan    1000 non-null   int64  
 7   InstallmentRate  1000 non-null   int64  
 8   Housing          1000 non-null   float64
 9   Label            1000 non-null   int64  
dtypes: float64(2), int64(8)
memory usage: 78.2 KB


In [3]:
data.describe()

Unnamed: 0,Sex,Single,Unemployed,Age,Credit,LoanDuration,PurposeOfLoan,InstallmentRate,Housing,Label
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.69,0.548,0.062,35.546,3271.258,20.903,4.596,2.973,1.395,0.7
std,0.462725,0.49794,0.241276,11.375469,2822.736876,12.058814,2.518954,1.118715,0.674856,0.458487
min,0.0,0.0,0.0,19.0,250.0,4.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,27.0,1365.5,12.0,3.0,2.0,1.0,0.0
50%,1.0,1.0,0.0,33.0,2319.5,18.0,4.0,3.0,1.0,1.0
75%,1.0,1.0,0.0,42.0,3972.25,24.0,6.0,4.0,2.0,1.0
max,1.0,1.0,1.0,75.0,18424.0,72.0,10.0,4.0,3.0,1.0


In [4]:
# The dataset contains 1000 rows and 21 columns. The columns are as follows:
data.columns

Index(['Sex', 'Single', 'Unemployed', 'Age', 'Credit', 'LoanDuration',
       'PurposeOfLoan', 'InstallmentRate', 'Housing', 'Label'],
      dtype='object')

In [5]:
import numpy as np

def train_test_val_split(num_instances, train_ratio=0.6, test_ratio=0.2, val_ratio=0.2, seed=42):
    """
    Splits the indexes of rows into training, testing, and validation sets.

    Parameters:
        num_instances (int): Total number of instances in the dataset.
        train_ratio (float): Ratio of instances for the training set (default: 0.6).
        test_ratio (float): Ratio of instances for the testing set (default: 0.2).
        val_ratio (float): Ratio of instances for the validation set (default: 0.2).
        seed (int): Random seed for reproducibility (default: 42).

    Returns:
        train_idxs (numpy.array): Indexes of rows for the training set.
        test_idxs (numpy.array): Indexes of rows for the testing set.
        val_idxs (numpy.array): Indexes of rows for the validation set.
    """
    assert train_ratio + test_ratio + val_ratio == 1, "Ratios should sum up to 1"
    np.random.seed(seed)

    # Generate random indexes
    all_idxs = np.arange(num_instances)
    np.random.shuffle(all_idxs)

    # Calculate split points
    train_end = int(num_instances * train_ratio)
    test_end = train_end + int(num_instances * test_ratio)

    # Split indexes
    train_idxs = all_idxs[:train_end]
    test_idxs = all_idxs[train_end:test_end]
    val_idxs = all_idxs[test_end:]

    return train_idxs, test_idxs, val_idxs


In [6]:
# Split the dataset into training, testing, and validation sets
num_instances = len(data)
train_idxs, test_idxs, val_idxs = train_test_val_split(num_instances)

In [7]:
# Let's now split the dataset into features and labels, and then into training, testing, and validation sets.
# Split the dataset into features and labels
X = data.drop('Label', axis=1)
y = data['Label']

# Split the features and labels into training, testing, and validation sets
X_train, X_test, X_val = X.iloc[train_idxs], X.iloc[test_idxs], X.iloc[val_idxs]
y_train, y_test, y_val = y.iloc[train_idxs], y.iloc[test_idxs], y.iloc[val_idxs]

# Display the shapes of the training, testing, and validation sets
X_train.shape, X_test.shape, X_val.shape

((600, 9), (200, 9), (200, 9))

In [8]:
# Save the training, testing, and validation sets to CSV files
X_train.to_csv('tra_tst_val/X_train.csv', index=False)
X_test.to_csv('tra_tst_val/X_test.csv', index=False)
X_val.to_csv('tra_tst_val/X_val.csv', index=False)
y_train.to_csv('tra_tst_val/y_train.csv', index=False)
y_test.to_csv('tra_tst_val/y_test.csv', index=False)
y_val.to_csv('tra_tst_val/y_val.csv', index=False)

In [9]:
# Normalizing numerical features (Age, Credit, LoanDuration) using StandardScaler
from sklearn.preprocessing import StandardScaler
import joblib

# Features to be scaled
features_to_scale = ['Age', 'Credit', 'LoanDuration']

scaler = StandardScaler()
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

joblib.dump(scaler, '../black_box/bb_scaler.joblib')

data.head()

Unnamed: 0,Sex,Single,Unemployed,Age,Credit,LoanDuration,PurposeOfLoan,InstallmentRate,Housing,Label
0,1.0,1,0,2.766456,-0.745131,-1.236478,3,4,1.0,1
1,0.0,0,0,-1.191404,0.949817,2.248194,3,2,1.0,0
2,1.0,1,0,1.183312,-0.416562,-0.738668,2,2,1.0,1
3,1.0,1,0,0.831502,1.634247,1.750384,4,2,3.0,1
4,1.0,1,0,1.535122,0.566664,0.256953,6,3,3.0,0


In [10]:
# Split the preprocessed dataset into features and labels
X = data.drop('Label', axis=1)
y = data['Label']

# Split the features and labels into training, testing, and validation sets
X_train, X_test, X_val = X.iloc[train_idxs], X.iloc[test_idxs], X.iloc[val_idxs]
y_train, y_test, y_val = y.iloc[train_idxs], y.iloc[test_idxs], y.iloc[val_idxs]

# Display the shapes of the training, testing, and validation sets
X_train.shape, X_test.shape, X_val.shape

((600, 9), (200, 9), (200, 9))

In [12]:
# Save the preprocessed training, testing, and validation sets to CSV files
X_train.to_csv('tra_tst_val/X_train_normalised.csv', index=False)
X_test.to_csv('tra_tst_val/X_test_normalised.csv', index=False)
X_val.to_csv('tra_tst_val/X_val_normalised.csv', index=False)
y_train.to_csv('tra_tst_val/y_train_normalised.csv', index=False)
y_test.to_csv('tra_tst_val/y_test_normalised.csv', index=False)
y_val.to_csv('tra_tst_val/y_val_normalised.csv', index=False)

In [13]:
# number of unique values in each column
data.nunique()

Sex                  2
Single               2
Unemployed           2
Age                 53
Credit             921
LoanDuration        33
PurposeOfLoan       10
InstallmentRate      4
Housing              3
Label                2
dtype: int64

In [14]:
# number of rows in each value for the Label column
data['Label'].value_counts()

Label
1    700
0    300
Name: count, dtype: int64

In [15]:
y_train.value_counts(), y_test.value_counts(), y_val.value_counts()

(Label
 1    415
 0    185
 Name: count, dtype: int64,
 Label
 1    145
 0     55
 Name: count, dtype: int64,
 Label
 1    140
 0     60
 Name: count, dtype: int64)