In [17]:
# Read the raw dataset from disk and preview it to get a feel for its columns.
import pandas as pd

# Location of the semicolon-separated source file, relative to the working directory
file_path = '../data/german.csv'

# `sep` is the canonical spelling of the `delimiter` alias in pandas.read_csv
data = pd.read_csv(file_path, sep=';')

# Cell output: the first five rows of the frame
data.head(5)

Unnamed: 0,Sex,Single,Unemployed,Age,Credit,LoanDuration,PurposeOfLoan,InstallmentRate,Housing,Label
0,1.0,1,0,67,1169,6,3,4,1.0,1
1,0.0,0,0,22,5951,48,3,2,1.0,0
2,1.0,1,0,49,2096,12,2,2,1.0,1
3,1.0,1,0,45,7882,42,4,2,3.0,1
4,1.0,1,0,53,4870,24,6,3,3.0,0


In [18]:
# Standardize the continuous columns (Age, Credit, LoanDuration) and persist the fitted scaler.
from sklearn.preprocessing import StandardScaler
import joblib

# Columns that receive zero-mean / unit-variance scaling
features_to_scale = ['Age', 'Credit', 'LoanDuration']

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/validation/test split that happens in a later cell, and `data` is
# overwritten in place. Re-running this cell without reloading `data` would
# fit (and save) a scaler on already-standardized values.
scaler = StandardScaler()
scaler.fit(data[features_to_scale])
data[features_to_scale] = scaler.transform(data[features_to_scale])

# Serialize the fitted scaler into the current working directory
joblib.dump(scaler, 'bb_scaler.joblib')

data.head()

Unnamed: 0,Sex,Single,Unemployed,Age,Credit,LoanDuration,PurposeOfLoan,InstallmentRate,Housing,Label
0,1.0,1,0,2.766456,-0.745131,-1.236478,3,4,1.0,1
1,0.0,0,0,-1.191404,0.949817,2.248194,3,2,1.0,0
2,1.0,1,0,1.183312,-0.416562,-0.738668,2,2,1.0,1
3,1.0,1,0,0.831502,1.634247,1.750384,4,2,3.0,1
4,1.0,1,0,1.535122,0.566664,0.256953,6,3,3.0,0


In [5]:
# Partition the data into train / validation / test subsets (60% / 20% / 20%).
from sklearn.model_selection import train_test_split

# Target column and the remaining predictor columns
y = data['Label']
X = data.drop(columns='Label')

# Stage 1: hold out 20% of all rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stage 2: a quarter of the remaining 80% becomes the validation set
# (0.25 x 0.8 = 0.2 of the total), leaving 60% for training
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

# Sanity check: shapes of the train, validation and test feature frames
tuple(part.shape for part in (X_train, X_val, X_test))


((600, 9), (200, 9), (200, 9))

In [6]:
# Persist the training, test and validation sets (features and labels) as CSV files.
split_outputs = [
    (X_train, '../data/tra_tst_val/X_train.csv'),
    (y_train, '../data/tra_tst_val/y_train.csv'),
    (X_test, '../data/tra_tst_val/X_test.csv'),
    (y_test, '../data/tra_tst_val/y_test.csv'),
    (X_val, '../data/tra_tst_val/X_val.csv'),
    (y_val, '../data/tra_tst_val/y_val.csv'),
]

# Default to_csv settings are used, so each file keeps the original row index
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path)

In [7]:
import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Defining a simple neural network function to test different hyperparameters
def build_model(input_dim, layers, activation, optimizer_lr):
    """Create and compile a fully connected binary classifier.

    Args:
        input_dim: Number of input features expected by the first layer.
        layers: Sequence of hidden-layer widths; it must hold at least one
            entry because the first width is read with ``layers[0]``.
        activation: Activation applied by every hidden layer.
        optimizer_lr: Learning rate handed to the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model, ending in a single sigmoid unit
        and trained with binary cross-entropy while tracking accuracy.
    """
    # The first hidden layer also declares the input dimensionality
    stack = [Dense(layers[0], input_dim=input_dim, activation=activation)]
    # Remaining hidden layers share the same activation
    stack.extend(Dense(units, activation=activation) for units in layers[1:])
    # Single sigmoid output for binary classification
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential(stack)
    adam = Adam(learning_rate=optimizer_lr)
    network.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Initial hyperparameters
input_dim = X_train.shape[1] # Number of features
layers = [64, 32] # Two layers with 64 and 32 neurons
activation = 'relu'
optimizer_lr = 0.001

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialization and batch shuffling (and therefore the reported accuracy and
# the saved model) are repeatable across kernel restarts.
set_random_seed(42)

# Building the initial model
model = build_model(input_dim, layers, activation, optimizer_lr)

# Training the model; validation metrics are recorded after every epoch
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=10, verbose=0)

# Best validation accuracy observed over all epochs. This is the per-epoch
# maximum, not necessarily the accuracy of the final weights kept in `model`.
val_accuracy = np.max(history.history['val_accuracy'])

val_accuracy



2024-02-16 15:16:42.642224: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-16 15:16:47.524246: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


0.7250000238418579

In [10]:
# Persist the trained network under 'bb_model' in the current working directory
model.save(filepath='bb_model')

INFO:tensorflow:Assets written to: bb_model/assets


In [11]:
# Reload the persisted network from disk and score it on the held-out test split.
from tensorflow.keras.models import load_model

# Location of the saved model, relative to the working directory
model_path = '../black_box/bb_model'

# Rebinds `model` to the copy deserialized from disk
model = load_model(model_path)

# evaluate() returns two values here, unpacked as the loss and the accuracy
test_metrics = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_accuracy = test_metrics

# Report both numbers
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

7/7 - 0s - loss: 0.6979 - accuracy: 0.6950 - 150ms/epoch - 21ms/step
Test Loss: 0.6978618502616882
Test Accuracy: 0.6949999928474426


In [15]:
# Export the model's raw sigmoid outputs for each split.
# NOTE(review): the prediction frames get a fresh 0..n-1 index, whereas the
# X_*/y_* CSVs written earlier keep the original row index, so the files line
# up by row position rather than by index value.

# Predict on training set
train_predictions = model.predict(X_train)
train_predictions_df = pd.DataFrame(train_predictions, columns=['pred_label'])
train_predictions_df.to_csv('../data/tra_tst_val/pred_train.csv')

# Predict on test set; the input must be X_test so that pred_test.csv
# corresponds to X_test.csv / y_test.csv
test_predictions = model.predict(X_test)
test_predictions_df = pd.DataFrame(test_predictions)
test_predictions_df.to_csv('../data/tra_tst_val/pred_test.csv')

# Predict on validation set
val_predictions = model.predict(X_val)
val_predictions_df = pd.DataFrame(val_predictions)
val_predictions_df.to_csv('../data/tra_tst_val/pred_val.csv')




In [23]:
# Score the complete dataset with the trained model and export the results
# alongside the original (unscaled) feature values.
from joblib import load

# Load the dataset
file_path = '../data/german.csv'
input_df = pd.read_csv(file_path, delimiter=';')
y = input_df.Label
# Non-mutating drop keeps the cell safe to re-run
input_df = input_df.drop(columns=['Label'])

# Keep an unscaled copy so the exported file shows the raw feature values
original_input = input_df.copy()

# Features to be scaled
features_to_scale = ['Age', 'Credit', 'LoanDuration']

# Load the scaler saved during preprocessing. The path is relative and uses the
# same directory as the saved model, so the notebook is not tied to one machine.
scaler_path = '../black_box/bb_scaler.joblib'
scaler = load(scaler_path)
# Apply the already-fitted scaler; re-fitting here would discard the statistics
# the model was trained with.
input_df[features_to_scale] = scaler.transform(input_df[features_to_scale])

# Make prediction (sigmoid output per row)
prediction_result = model.predict(input_df)

original_input['Label'] = y
original_input['Prediction'] = prediction_result
# Threshold the score at 0.5 to obtain the hard class label
original_input['BinaryPrediction'] = (original_input['Prediction'] >= 0.5).astype(int)

original_input.to_csv('../data/bb_results.csv', index=False)


