In [1]:
# Importing necessary libraries
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.utils import shuffle

In [2]:
# Reading the CSV file into a pandas DataFrame.
# The data directory can be overridden with the CAPSTONE_DATA_DIR environment
# variable so the notebook also runs on other machines; the default keeps the
# original location.
DATA_DIR = Path(os.environ.get("CAPSTONE_DATA_DIR", "C:/Users/leopo/Desktop/Springboard-Capstone-2/Data"))
data = pd.read_csv(DATA_DIR / "data_cleaned.csv")

In [3]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29130 entries, 0 to 29129
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   LIMIT_BAL               29130 non-null  int64
 1   SEX                     29130 non-null  int64
 2   EDUCATION               29130 non-null  int64
 3   MARRIAGE                29130 non-null  int64
 4   AGE                     29130 non-null  int64
 5   repayment_status_sept   29130 non-null  int64
 6   repayment_status_aug    29130 non-null  int64
 7   repayment_status_july   29130 non-null  int64
 8   repayment_status_june   29130 non-null  int64
 9   repayment_status_may    29130 non-null  int64
 10  repayment_status_april  29130 non-null  int64
 11  bill_sept               29130 non-null  int64
 12  bill_aug                29130 non-null  int64
 13  bill_july               29130 non-null  int64
 14  bill_june               29130 non-null  int64
 15  bill_may           

In [4]:
# Give every column a capitalised name (first letter upper-case, rest
# lower-case) so that all later cells can use one naming convention.
capitalised_names = {
    # customer attributes
    'LIMIT_BAL': 'Limit_bal',
    'SEX': 'Sex',
    'EDUCATION': 'Education',
    'MARRIAGE': 'Marriage',
    'AGE': 'Age',
    # monthly repayment status
    'repayment_status_sept': 'Repayment_status_sept',
    'repayment_status_aug': 'Repayment_status_aug',
    'repayment_status_july': 'Repayment_status_july',
    'repayment_status_june': 'Repayment_status_june',
    'repayment_status_may': 'Repayment_status_may',
    'repayment_status_april': 'Repayment_status_april',
    # monthly bill amounts
    'bill_sept': 'Bill_sept',
    'bill_aug': 'Bill_aug',
    'bill_july': 'Bill_july',
    'bill_june': 'Bill_june',
    'bill_may': 'Bill_may',
    'bill_april': 'Bill_april',
    # monthly previous payments
    'previous_payment_sept': 'Previous_payment_sept',
    'previous_payment_aug': 'Previous_payment_aug',
    'previous_payment_july': 'Previous_payment_july',
    'previous_payment_june': 'Previous_payment_june',
    'previous_payment_may': 'Previous_payment_may',
    'previous_payment_april': 'Previous_payment_april',
    # target and outlier flag
    'default': 'Default',
    'is_outlier': 'Is_outlier',
}

data = data.rename(columns=capitalised_names)

In [5]:
# Cast the discrete-valued columns to the pandas 'category' dtype in one call.
categorical_columns = [
    'Education',
    'Marriage',
    'Repayment_status_sept',
    'Repayment_status_aug',
    'Repayment_status_july',
    'Repayment_status_june',
    'Repayment_status_may',
    'Repayment_status_april',
    'Default',
    'Is_outlier',
]
data = data.astype({name: 'category' for name in categorical_columns})

# Show the updated dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29130 entries, 0 to 29129
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Limit_bal               29130 non-null  int64   
 1   Sex                     29130 non-null  int64   
 2   Education               29130 non-null  category
 3   Marriage                29130 non-null  category
 4   Age                     29130 non-null  int64   
 5   Repayment_status_sept   29130 non-null  category
 6   Repayment_status_aug    29130 non-null  category
 7   Repayment_status_july   29130 non-null  category
 8   Repayment_status_june   29130 non-null  category
 9   Repayment_status_may    29130 non-null  category
 10  Repayment_status_april  29130 non-null  category
 11  Bill_sept               29130 non-null  int64   
 12  Bill_aug                29130 non-null  int64   
 13  Bill_july               29130 non-null  int64   
 14  Bill_june             

In [6]:
# Creating age bins and adding a new categorical column 'Age_range'.
# Bins are left-closed (right=False): [18, 30), [30, 40), ..., [70, 100).
# Ages outside [18, 100) fall into no bin and become NaN.
age_bins = [18, 30, 40, 50, 60, 70, 100]
# One label per bin. The last bin starts at 70, so it is labelled '70+'
# (it used to be labelled '60+', which overlapped the '60-69' bucket).
age_labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']
# pd.cut already returns an ordered categorical, so no extra cast is needed.
data['Age_range'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels, right=False)

In [7]:
# Integer-encode the ordinal categorical columns.
#
# The codes are taken from each column's categorical dtype instead of
# LabelEncoder: LabelEncoder is meant for target labels and numbers classes in
# sorted order, which for string labels is lexicographic (a label such as
# '60+' sorts before '60-69') and therefore not guaranteed to be the ordinal
# order. Categorical codes follow the category order: ascending for the
# numeric columns and bin order for 'Age_range'.

ordinal_categorical_columns = ['Education',
                               'Repayment_status_sept',
                               'Repayment_status_aug',
                               'Repayment_status_july',
                               'Repayment_status_june',
                               'Repayment_status_may',
                               'Repayment_status_april',
                               'Age_range']

for col in ordinal_categorical_columns:
    # Drop categories with no rows so the codes stay dense (0..n-1), as before.
    as_category = data[col].astype('category').cat.remove_unused_categories()
    # A missing value would be encoded as -1; cast to int64 to keep the
    # column dtype the notebook had previously.
    data[col] = as_category.cat.codes.astype('int64')


In [8]:
# One-hot encoding for nominal categorical columns: each listed column is
# replaced by one indicator column per observed value.
nominal_categorical_columns = ['Sex',
                               'Marriage',
                               'Is_outlier',]

onehot_encoder = OneHotEncoder()
data_encoded = onehot_encoder.fit_transform(data[nominal_categorical_columns])
data_encoded_df = pd.DataFrame(
    data_encoded.toarray(),  # the encoder returns a sparse matrix
    columns=onehot_encoder.get_feature_names_out(nominal_categorical_columns),
    # Reuse the source index: pd.concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign rows if `data` were ever
    # filtered or re-indexed before this cell.
    index=data.index,
)
data = data.drop(columns=nominal_categorical_columns)
data = pd.concat([data, data_encoded_df], axis=1)
data.head()

Unnamed: 0,Limit_bal,Education,Age,Repayment_status_sept,Repayment_status_aug,Repayment_status_july,Repayment_status_june,Repayment_status_may,Repayment_status_april,Bill_sept,...,Default,Age_range,Sex_1,Sex_2,Marriage_0,Marriage_1,Marriage_2,Marriage_3,Is_outlier_0,Is_outlier_1
0,20000,2,24,4,4,1,1,0,0,3913,...,1,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,120000,2,26,1,4,2,2,2,3,2682,...,1,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,90000,2,34,2,2,2,2,2,2,29239,...,0,1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,50000,2,37,2,2,2,2,2,2,46990,...,0,1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,50000,2,57,1,2,1,2,2,2,8617,...,0,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [9]:
# Print the schema again to check dtypes and column count at this point.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29130 entries, 0 to 29129
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Limit_bal               29130 non-null  int64   
 1   Education               29130 non-null  int64   
 2   Age                     29130 non-null  int64   
 3   Repayment_status_sept   29130 non-null  int64   
 4   Repayment_status_aug    29130 non-null  int64   
 5   Repayment_status_july   29130 non-null  int64   
 6   Repayment_status_june   29130 non-null  int64   
 7   Repayment_status_may    29130 non-null  int64   
 8   Repayment_status_april  29130 non-null  int64   
 9   Bill_sept               29130 non-null  int64   
 10  Bill_aug                29130 non-null  int64   
 11  Bill_july               29130 non-null  int64   
 12  Bill_june               29130 non-null  int64   
 13  Bill_may                29130 non-null  int64   
 14  Bill_april            

In [10]:
# Standardise the monetary columns (zero mean, unit variance) with StandardScaler.
# NOTE(review): the scaler is fitted on every row of `data`, i.e. before the
# train/test split performed later in the notebook.
col_to_norm = [
    'Bill_sept', 'Bill_aug', 'Bill_july',
    'Bill_june', 'Bill_may', 'Bill_april',
    'Previous_payment_sept', 'Previous_payment_aug', 'Previous_payment_july',
    'Previous_payment_june', 'Previous_payment_may', 'Previous_payment_april',
    'Limit_bal',
]

scaler = StandardScaler()
unscaled_values = data[col_to_norm]
# Learn per-column mean/std, then write the scaled values back in place.
scaler.fit(unscaled_values)
data[col_to_norm] = scaler.transform(unscaled_values)

In [11]:
# Display the first rows of the frame after scaling.
data.head()

Unnamed: 0,Limit_bal,Education,Age,Repayment_status_sept,Repayment_status_aug,Repayment_status_july,Repayment_status_june,Repayment_status_may,Repayment_status_april,Bill_sept,...,Default,Age_range,Sex_1,Sex_2,Marriage_0,Marriage_1,Marriage_2,Marriage_3,Is_outlier_0,Is_outlier_1
0,-1.125688,2,24,4,4,1,1,0,0,-0.658364,...,1,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.355457,2,26,1,4,2,2,2,3,-0.674958,...,1,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,-0.586526,2,34,2,2,2,2,2,2,-0.316971,...,0,1,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,-0.894619,2,37,2,2,2,2,2,2,-0.077688,...,0,1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-0.894619,2,57,1,2,1,2,2,2,-0.594955,...,0,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [12]:
# Separating features (X) and target variable (y), shuffling the data, and
# splitting into stratified train and test sets (80/20).
X = data.drop(columns='Default')
y = data['Default']
X, y = shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Avoid train/test leakage: the columns in `col_to_norm` were standardised
# earlier with the mean/std of the whole dataset, so test rows influenced the
# training features. Standardisation is unaffected by a prior affine
# rescaling, so refitting on the training rows alone gives the same values as
# fitting a scaler on the raw training data; the test rows are then
# transformed with those training-only statistics.
X_train = X_train.copy()
X_test = X_test.copy()
train_scaler = StandardScaler().fit(X_train[col_to_norm])
X_train[col_to_norm] = train_scaler.transform(X_train[col_to_norm])
X_test[col_to_norm] = train_scaler.transform(X_test[col_to_norm])

# Displaying the proportion of target values in the training and test sets
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

Default
0    0.783084
1    0.216916
Name: proportion, dtype: float64
Default
0    0.783042
1    0.216958
Name: proportion, dtype: float64


In [13]:
# Saving the train and test sets as CSV files.
# The output directory can be overridden with the CAPSTONE_DATA_DIR
# environment variable; the default keeps the original location.
OUTPUT_DIR = Path(os.environ.get("CAPSTONE_DATA_DIR", "C:/Users/leopo/Desktop/Springboard-Capstone-2/Data"))
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # to_csv does not create directories

split_outputs = {
    "train_features.csv": X_train,
    "test_features.csv": X_test,
    "train_target.csv": y_train,
    "test_target.csv": y_test,
}
for filename, split in split_outputs.items():
    # index=False: the shuffled row labels are not needed; features and
    # targets stay aligned by row position.
    split.to_csv(OUTPUT_DIR / filename, index=False)