In [1]:
# notebooks/data_exploration.ipynb

# Import necessary libraries
import pandas as pd

# Load datasets
application_df = pd.read_csv('./data/raw/application_record.csv')
credit_df = pd.read_csv('./data/raw/credit_record.csv')

# Display first few rows of application data
print(application_df.head())

# Display first few rows of credit data
print(credit_df.head())

# Basic information about datasets
print(application_df.info())
print(credit_df.info())

# Check if 'TARGET' column is in the dataset (you may need to add or rename it)
print("Columns in application_df:", application_df.columns)
print("Columns in credit_df:", credit_df.columns)


        ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY  CNT_CHILDREN  \
0  5008804           M            Y               Y             0   
1  5008805           M            Y               Y             0   
2  5008806           M            Y               Y             0   
3  5008808           F            N               Y             0   
4  5008809           F            N               Y             0   

   AMT_INCOME_TOTAL      NAME_INCOME_TYPE            NAME_EDUCATION_TYPE  \
0          427500.0               Working               Higher education   
1          427500.0               Working               Higher education   
2          112500.0               Working  Secondary / secondary special   
3          270000.0  Commercial associate  Secondary / secondary special   
4          270000.0  Commercial associate  Secondary / secondary special   

     NAME_FAMILY_STATUS  NAME_HOUSING_TYPE  DAYS_BIRTH  DAYS_EMPLOYED  \
0        Civil marriage   Rented apartment      -12005 

In [2]:
# notebooks/data_preprocessing.ipynb

# Import necessary libraries
import pandas as pd

# Function to load data
def load_data():
    application_df = pd.read_csv('./data/raw/application_record.csv')
    credit_df = pd.read_csv('./data/raw/credit_record.csv')
    return application_df, credit_df

# Function to preprocess data
def preprocess_data(application_df, credit_df):
    # Handle missing values
    application_df.fillna(application_df.median(numeric_only=True), inplace=True)
    
    # Convert categorical columns to numeric using one-hot encoding
    categorical_cols = ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 
                        'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 
                        'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE']
    
    application_df = pd.get_dummies(application_df, columns=categorical_cols, drop_first=True)

    # Create 'TARGET' column: 1 if STATUS indicates overdue payments, else 0
    credit_df['TARGET'] = credit_df['STATUS'].apply(lambda x: 1 if x in ['2', '3', '4', '5'] else 0)
    
    # Aggregate 'TARGET' to have one row per ID
    credit_agg = credit_df.groupby('ID')['TARGET'].max().reset_index()
    
    # Merge datasets on ID column
    merged_data = pd.merge(application_df, credit_agg, on='ID', how='inner')
    
    return merged_data

# Load and preprocess data
application_df, credit_df = load_data()
merged_data = preprocess_data(application_df, credit_df)

# Save preprocessed data
merged_data.to_csv('./data/processed/merged_data.csv', index=False)

# Display first few rows of merged data
print(merged_data.head())

# Check the columns to ensure 'TARGET' is present
print("Columns in merged_data:", merged_data.columns)


        ID  CNT_CHILDREN  AMT_INCOME_TOTAL  DAYS_BIRTH  DAYS_EMPLOYED  \
0  5008804             0          427500.0      -12005          -4542   
1  5008805             0          427500.0      -12005          -4542   
2  5008806             0          112500.0      -21474          -1134   
3  5008808             0          270000.0      -19110          -3051   
4  5008809             0          270000.0      -19110          -3051   

   FLAG_MOBIL  FLAG_WORK_PHONE  FLAG_PHONE  FLAG_EMAIL  CNT_FAM_MEMBERS  ...  \
0           1                1           0           0              2.0  ...   
1           1                1           0           0              2.0  ...   
2           1                0           0           0              2.0  ...   
3           1                0           1           1              1.0  ...   
4           1                0           1           1              1.0  ...   

   OCCUPATION_TYPE_Low-skill Laborers  OCCUPATION_TYPE_Managers  \
0            

In [3]:
# notebooks/feature_engineering.ipynb

# Import necessary libraries
import pandas as pd

# Load preprocessed data
merged_data = pd.read_csv('./data/processed/merged_data.csv')

# Function for feature engineering
def feature_engineering(merged_data):
    # Feature engineering example
    merged_data['Income_Per_Family_Member'] = merged_data['AMT_INCOME_TOTAL'] / merged_data['CNT_FAM_MEMBERS']
    
    # Encode categorical variables (already done in preprocessing)
    
    return merged_data

# Apply feature engineering
feature_engineered_data = feature_engineering(merged_data)

# Save feature-engineered data
feature_engineered_data.to_csv('./data/processed/feature_engineered_data.csv', index=False)

# Display first few rows of feature engineered data
print(feature_engineered_data.head())

# Check the columns to ensure 'TARGET' is present
print("Columns in feature_engineered_data:", feature_engineered_data.columns)


        ID  CNT_CHILDREN  AMT_INCOME_TOTAL  DAYS_BIRTH  DAYS_EMPLOYED  \
0  5008804             0          427500.0      -12005          -4542   
1  5008805             0          427500.0      -12005          -4542   
2  5008806             0          112500.0      -21474          -1134   
3  5008808             0          270000.0      -19110          -3051   
4  5008809             0          270000.0      -19110          -3051   

   FLAG_MOBIL  FLAG_WORK_PHONE  FLAG_PHONE  FLAG_EMAIL  CNT_FAM_MEMBERS  ...  \
0           1                1           0           0              2.0  ...   
1           1                1           0           0              2.0  ...   
2           1                0           0           0              2.0  ...   
3           1                0           1           1              1.0  ...   
4           1                0           1           1              1.0  ...   

   OCCUPATION_TYPE_Managers  OCCUPATION_TYPE_Medicine staff  \
0                

In [4]:
# notebooks/model_training_rf.ipynb

# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load feature engineered data
feature_engineered_data = pd.read_csv('./data/processed/feature_engineered_data.csv')

# Ensure 'TARGET' column is present
if 'TARGET' not in feature_engineered_data.columns:
    raise KeyError("'TARGET' column not found in the dataset")

# Train model function
def train_model(feature_engineered_data):
    X = feature_engineered_data.drop('TARGET', axis=1)
    y = feature_engineered_data['TARGET']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    print("Random Forest Accuracy:", accuracy)
    print("Random Forest Classification Report:\n", classification_report(y_test, y_pred))
    
    # Save the model
    joblib.dump(model, './models/credit_approval_rf_model.pkl')

# Train the model
train_model(feature_engineered_data)


Random Forest Accuracy: 0.9787438288535382
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      7175
           1       0.29      0.22      0.25       117

    accuracy                           0.98      7292
   macro avg       0.64      0.61      0.62      7292
weighted avg       0.98      0.98      0.98      7292



In [5]:
# notebooks/model_training_nn.ipynb

# Import necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load feature engineered data
feature_engineered_data = pd.read_csv('./data/processed/feature_engineered_data.csv')

# Prepare the data
X = feature_engineered_data.drop('TARGET', axis=1).astype(np.float32)
y = feature_engineered_data['TARGET'].astype(np.float32)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a simple neural network model
def build_neural_network(input_dim):
    model = Sequential()
    model.add(Dense(64, input_dim=input_dim, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build and train the model
model = build_neural_network(X_train.shape[1])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
print("Neural Network Accuracy:", model.evaluate(X_test, y_test)[1])
print("Neural Network Classification Report:")
print(classification_report(y_test, y_pred_nn))


2024-08-13 18:45:56.887529: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 18:45:56.887605: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 18:45:56.887620: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-13 18:45:56.900302: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-13 18:46:00.541854: I te

Epoch 1/10


2024-08-13 18:46:04.201539: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-08-13 18:46:05.023794: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f41a805bc00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-08-13 18:46:05.023859: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Quadro T1000, Compute Capability 7.5
2024-08-13 18:46:05.036214: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-08-13 18:46:05.070898: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907
2024-08-13 18:46:05.192760: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.9839550256729126
Neural Network Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      7175
         1.0       0.00      0.00      0.00       117

    accuracy                           0.98      7292
   macro avg       0.49      0.50      0.50      7292
weighted avg       0.97      0.98      0.98      7292



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# notebooks/interpreting_results.ipynb

# Summary of the analysis
print("Interpreting the Results:")

# Random Forest Model Interpretation
# Here you can discuss feature importances and model performance.
print("Random Forest Model: The Random Forest model achieved a high accuracy, primarily because it correctly classified most 'good' clients. However, its ability to detect 'bad' clients is limited due to the class imbalance.")

# Neural Network Model Interpretation
# Here you can discuss how the neural network performed and any overfitting signs.
print("Neural Network Model: The Neural Network also achieved high accuracy, but similarly struggled with detecting 'bad' clients. Tuning the model and handling class imbalance could improve its performance.")

# Conclusion
print("Conclusion: Both models perform well in identifying 'good' clients, but struggle with 'bad' clients due to the imbalance in the dataset. Techniques like SMOTE and further tuning of the models may improve the classification of 'bad' clients.")


Interpreting the Results:
Random Forest Model: The Random Forest model achieved a high accuracy, primarily because it correctly classified most 'good' clients. However, its ability to detect 'bad' clients is limited due to the class imbalance.
Neural Network Model: The Neural Network also achieved high accuracy, but similarly struggled with detecting 'bad' clients. Tuning the model and handling class imbalance could improve its performance.
Conclusion: Both models perform well in identifying 'good' clients, but struggle with 'bad' clients due to the imbalance in the dataset. Techniques like SMOTE and further tuning of the models may improve the classification of 'bad' clients.
