In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
import sys
import os
# Put the notebook's parent directory on the import path (appended last, so
# already-importable modules keep precedence).
sys.path.append(os.path.abspath('..'))

In [2]:
# Read the raw training and test CSV files into DataFrames, in that order.
training_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/cs-training.csv", "../data/cs-test.csv")
)

In [3]:
# Preview the first five rows of the training frame.
training_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
1,2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0
2,3,,0.043275,59,0,0.687648,5083.0,12,0,1,0,2.0
3,4,,0.280308,38,1,0.925961,3200.0,7,0,2,0,0.0
4,5,,1.0,27,0,0.019917,3865.0,4,0,0,0,1.0


In [5]:
# Report (rows, columns) for the training frame, then the test frame, one per line.
print(training_data.shape, test_data.shape, sep="\n")

(150000, 12)
(101503, 12)


In [6]:
# Count missing values per column in the training frame.
training_data.isna().sum()

Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [7]:
# Count missing values per column in the test frame.
test_data.isna().sum()

Unnamed: 0                                   0
SeriousDlqin2yrs                        101503
RevolvingUtilizationOfUnsecuredLines         0
age                                          0
NumberOfTime30-59DaysPastDueNotWorse         0
DebtRatio                                    0
MonthlyIncome                            20103
NumberOfOpenCreditLinesAndLoans              0
NumberOfTimes90DaysLate                      0
NumberRealEstateLoansOrLines                 0
NumberOfTime60-89DaysPastDueNotWorse         0
NumberOfDependents                        2626
dtype: int64

In [8]:
# Drop the 'Unnamed: 0' column from the training frame.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
# NOTE(review): test_data is not touched here and still carries 'Unnamed: 0';
# confirm whether downstream code relies on it (e.g. as a row id).
training_data = training_data.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# Fill missing MonthlyIncome values with the median.
# The imputer is fitted on the training column only and the same fitted
# statistic is then applied to both the training and the test column.
income_imputer = SimpleImputer(strategy='median').fit(training_data[['MonthlyIncome']])
training_data['MonthlyIncome'] = income_imputer.transform(training_data[['MonthlyIncome']])
test_data['MonthlyIncome'] = income_imputer.transform(test_data[['MonthlyIncome']])

In [10]:
# Fill missing NumberOfDependents values with the most frequent value.
# As with the income imputer, the statistic is learned from the training
# column and reused unchanged for the test column.
dependents_imputer = SimpleImputer(strategy='most_frequent')
dependents_imputer.fit(training_data[['NumberOfDependents']])
training_data['NumberOfDependents'] = dependents_imputer.transform(
    training_data[['NumberOfDependents']]
)
test_data['NumberOfDependents'] = dependents_imputer.transform(
    test_data[['NumberOfDependents']]
)

In [11]:
# Remove rows with a missing target (SeriousDlqin2yrs) from the TRAINING set,
# where the label is required.
# This filter must not be applied to test_data: its SeriousDlqin2yrs column is
# null in every row (101503 nulls for 101503 rows in the checks above), so
# dropna on that column would delete the whole test set and an empty
# test_cleaned.csv would be written. test_data is therefore left untouched.
training_data = training_data.dropna(subset=['SeriousDlqin2yrs'])

In [12]:
# Report the number of fully duplicated rows: training frame first, test frame second.
# This only counts duplicates; nothing is removed here.
print(training_data.duplicated().sum(), test_data.duplicated().sum(), sep="\n")

767
0


In [13]:
# Write both cleaned frames to CSV; index=False keeps the row index out of the files.
training_data.to_csv(path_or_buf='../data/training_cleaned.csv', index=False)
test_data.to_csv(path_or_buf='../data/test_cleaned.csv', index=False)