In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw credit history table from the project's data directory.
credit_record = pd.read_csv(filepath_or_buffer='data/credit_record.csv')

In [3]:
# Preview the first five rows.
credit_record.head(n=5)

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [4]:
# Preview the last five rows.
credit_record.tail(n=5)

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
1048570,5150487,-25,C
1048571,5150487,-26,C
1048572,5150487,-27,C
1048573,5150487,-28,C
1048574,5150487,-29,C


In [5]:
# (row count, column count) of the loaded table.
credit_record.shape

(1048575, 3)

In [6]:
# Per-column dtype and non-null count, plus the frame's memory usage.
credit_record.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [7]:
# Number of missing values in each column.
credit_record.isna().sum()

ID                0
MONTHS_BALANCE    0
STATUS            0
dtype: int64

In [8]:
# Missing values per column, expressed as a percentage of all rows.
credit_record.isna().sum().div(len(credit_record)).mul(100)

ID                0.0
MONTHS_BALANCE    0.0
STATUS            0.0
dtype: float64

In [9]:
# The only STATUS codes this notebook accepts as valid.
valid_status_values = [
    'C', 'X',
    '0', '1', '2', '3', '4', '5',
]

# Keep the rows whose STATUS is outside that list; an empty frame means the column is clean.
garbage_status = credit_record.loc[
    ~credit_record['STATUS'].isin(valid_status_values)
]
print("Garbage values in STATUS column:\n", garbage_status)

Garbage values in STATUS column:
 Empty DataFrame
Columns: [ID, MONTHS_BALANCE, STATUS]
Index: []


In [10]:
# Count distinct non-null values per column (axis=0 and dropna=True are the pandas defaults).
unique_counts = credit_record.nunique(axis=0, dropna=True)
print(unique_counts)

ID                45985
MONTHS_BALANCE       61
STATUS                8
dtype: int64


In [11]:
# Every row whose ID occurs more than once (keep=False flags all occurrences, not just the repeats).
duplicates = credit_record.loc[credit_record['ID'].duplicated(keep=False)]
print(duplicates)

              ID  MONTHS_BALANCE STATUS
0        5001711               0      X
1        5001711              -1      0
2        5001711              -2      0
3        5001711              -3      0
4        5001712               0      C
...          ...             ...    ...
1048570  5150487             -25      C
1048571  5150487             -26      C
1048572  5150487             -27      C
1048573  5150487             -28      C
1048574  5150487             -29      C

[1048176 rows x 3 columns]


In [12]:
def aggregate_columns(column):
    """Reduce one column of a group to a single representative value.

    Label-like columns (object, string or categorical dtype) are reduced to
    their most frequent value; every other dtype is reduced to its mean.

    Parameters
    ----------
    column : pandas.Series
        Values of one column within one group.

    Returns
    -------
    scalar
        For label-like columns, the mode (ties go to the first entry of the
        sorted result returned by ``Series.mode``), or NaN when the column
        has no non-null values. For all other columns, ``column.mean()``.
    """
    dtype = column.dtype
    # Checking only `dtype == 'O'` misses string and categorical dtypes,
    # which would then reach .mean() and raise TypeError.
    is_label_like = (
        dtype == 'O'
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )
    if is_label_like:
        modes = column.mode()
        # mode() drops nulls, so an all-null group yields an empty result;
        # return NaN instead of letting .iloc[0] raise IndexError.
        return modes.iloc[0] if len(modes) else np.nan
    return column.mean()
# Collapse the table to one row per ID, reducing every other column with
# aggregate_columns, then turn ID back into an ordinary column.
credit_record = (
    credit_record
    .groupby(by='ID')
    .agg(aggregate_columns)
    .reset_index()
)

In [14]:
# Re-check for repeated IDs; an empty frame confirms each ID now appears exactly once.
duplicates = credit_record.loc[credit_record['ID'].duplicated(keep=False)]
print(duplicates)

Empty DataFrame
Columns: [ID, MONTHS_BALANCE, STATUS]
Index: []


In [99]:
# Plain-text dump of the current state of the table (pandas truncates the middle rows).
print(credit_record)

            ID  MONTHS_BALANCE STATUS
0      5001711            -1.5      0
1      5001712            -9.0      0
2      5001713           -10.5      X
3      5001714            -7.0      X
4      5001715           -29.5      X
...        ...             ...    ...
45980  5150482           -19.5      0
45981  5150483            -8.5      X
45982  5150484            -6.0      0
45983  5150485            -0.5      0
45984  5150487           -14.5      C

[45985 rows x 3 columns]


In [15]:
# Collapse the eight raw STATUS codes into a binary label (0 or 1).
# "Approved" / "Not Approved" is this notebook's own labelling of the two classes.
# NOTE(review): the business meaning of each raw code is not shown in this
# notebook -- confirm the grouping against the dataset's data dictionary.
status_mapping = {
    'C': 0,  # Approved
    'X': 0,  # Approved
    '0': 1,  # Not Approved
    '1': 1,  # Not Approved
    '2': 1,  # Not Approved
    '3': 1,  # Not Approved
    '4': 1,  # Not Approved
    '5': 1   # Not Approved
}

# Map into a temporary Series first: Series.map() turns every value that is
# not a key of the dict into NaN, so validate before overwriting the column.
mapped_status = credit_record['STATUS'].map(status_mapping)
unmapped_values = credit_record.loc[mapped_status.isna(), 'STATUS'].unique()
if len(unmapped_values) > 0:
    # Also triggers when this cell is run a second time: STATUS then already
    # holds the integers 0/1, which do not match the string keys above.
    raise ValueError(
        f"STATUS contains values with no entry in status_mapping: "
        f"{list(unmapped_values)}. If this cell was already executed, "
        f"reload credit_record before running it again."
    )

# Apply the mapping to the STATUS column
credit_record['STATUS'] = mapped_status

print(credit_record)


            ID  MONTHS_BALANCE  STATUS
0      5001711            -1.5       1
1      5001712            -9.0       1
2      5001713           -10.5       0
3      5001714            -7.0       0
4      5001715           -29.5       0
...        ...             ...     ...
45980  5150482           -19.5       1
45981  5150483            -8.5       0
45982  5150484            -6.0       1
45983  5150485            -0.5       1
45984  5150487           -14.5       0

[45985 rows x 3 columns]


In [16]:
# Remove the MONTHS_BALANCE column; errors='ignore' makes this a no-op when
# the column is already gone, so the cell can be re-run safely.
credit_record = credit_record.drop(
    labels='MONTHS_BALANCE',
    axis='columns',
    errors='ignore',
)

print(credit_record)

            ID  STATUS
0      5001711       1
1      5001712       1
2      5001713       0
3      5001714       0
4      5001715       0
...        ...     ...
45980  5150482       1
45981  5150483       0
45982  5150484       1
45983  5150485       1
45984  5150487       0

[45985 rows x 2 columns]


In [17]:
# Persist the cleaned table, leaving out the positional index.
credit_record.to_csv(
    path_or_buf='data/credit_record_cleaned.csv',
    index=False,
)