## Credit Card Default Dataset - Data Cleaning
1. Check missing values
2. Check undocumented labels in categorical variables: EDUCATION, MARRIAGE, PAY_n
3. Check outliers in numerical variables: a) weird negative values; b) extreme values

In [1]:
import pandas as pd
import numpy as np

### Load original datasets

In [2]:
# Folder that holds the CSV files -- change to your local file location.
# Keep the trailing separator: file names are appended by plain string concatenation.
# A forward slash is used because it is accepted on Windows, Linux and macOS alike
# (the previous '.\\' prefix only resolved correctly on Windows).
path = './'

# 'PAY_0' is renamed so the repayment-status columns are named consistently (PAY_1, PAY_2, ...);
# the long target column name is shortened to 'default'.
column_renames = {'PAY_0': 'PAY_1', 'default payment next month': 'default'}

# header=1: use the second line in the CSV file as the column names
df_train = pd.read_csv(path + 'CreditCard_train.csv', header=1).rename(columns=column_renames)
df_test = pd.read_csv(path + 'CreditCard_test.csv', header=1).rename(columns=column_renames)

In [3]:
# Preview the first rows of the training set
df_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [4]:
# Preview the first rows of the testing set
df_test.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,24001,50000,1,2,2,23,2,2,0,0,...,44116,21247,20066,8,2401,2254,2004,704,707,0
1,24002,60000,1,2,2,26,0,0,0,0,...,55736,26958,28847,2282,2324,2049,2000,3000,1120,1
2,24003,400000,1,2,2,27,0,0,0,0,...,10745,20737,9545,2501,10009,1437,1105,510,959,0
3,24004,20000,1,5,2,27,5,4,3,2,...,19709,20113,19840,0,0,0,900,0,0,0
4,24005,50000,1,3,2,27,0,0,-2,-2,...,0,70,120,0,100,0,70,200,100,0


### Check missing values

In [5]:
# List every training-set column with its non-null count and dtype to spot missing values
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         24000 non-null  int64
 1   LIMIT_BAL  24000 non-null  int64
 2   SEX        24000 non-null  int64
 3   EDUCATION  24000 non-null  int64
 4   MARRIAGE   24000 non-null  int64
 5   AGE        24000 non-null  int64
 6   PAY_1      24000 non-null  int64
 7   PAY_2      24000 non-null  int64
 8   PAY_3      24000 non-null  int64
 9   PAY_4      24000 non-null  int64
 10  PAY_5      24000 non-null  int64
 11  PAY_6      24000 non-null  int64
 12  BILL_AMT1  24000 non-null  int64
 13  BILL_AMT2  24000 non-null  int64
 14  BILL_AMT3  24000 non-null  int64
 15  BILL_AMT4  24000 non-null  int64
 16  BILL_AMT5  24000 non-null  int64
 17  BILL_AMT6  24000 non-null  int64
 18  PAY_AMT1   24000 non-null  int64
 19  PAY_AMT2   24000 non-null  int64
 20  PAY_AMT3   24000 non-null  int64
 21  PAY_AMT4   2

In [6]:
# List every testing-set column with its non-null count and dtype to spot missing values
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         6000 non-null   int64
 1   LIMIT_BAL  6000 non-null   int64
 2   SEX        6000 non-null   int64
 3   EDUCATION  6000 non-null   int64
 4   MARRIAGE   6000 non-null   int64
 5   AGE        6000 non-null   int64
 6   PAY_1      6000 non-null   int64
 7   PAY_2      6000 non-null   int64
 8   PAY_3      6000 non-null   int64
 9   PAY_4      6000 non-null   int64
 10  PAY_5      6000 non-null   int64
 11  PAY_6      6000 non-null   int64
 12  BILL_AMT1  6000 non-null   int64
 13  BILL_AMT2  6000 non-null   int64
 14  BILL_AMT3  6000 non-null   int64
 15  BILL_AMT4  6000 non-null   int64
 16  BILL_AMT5  6000 non-null   int64
 17  BILL_AMT6  6000 non-null   int64
 18  PAY_AMT1   6000 non-null   int64
 19  PAY_AMT2   6000 non-null   int64
 20  PAY_AMT3   6000 non-null   int64
 21  PAY_AMT4   600

No missing values. All variables have numerical values (int64).

### Check undocumented labels

In [7]:
# Print the sorted distinct values of each categorical column in the training set,
# so they can be compared against the documented labels
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE',
                       'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
                       'default']
for column in categorical_columns:
    print(column + ':', sorted(df_train[column].unique()))

SEX: [1, 2]
EDUCATION: [0, 1, 2, 3, 4, 5, 6]
MARRIAGE: [0, 1, 2, 3]
PAY_1: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_2: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_3: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_4: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_5: [-2, -1, 0, 2, 3, 4, 5, 6, 7, 8]
PAY_6: [-2, -1, 0, 2, 3, 4, 5, 6, 7, 8]
default: [0, 1]


In [8]:
# Print the sorted distinct values of each categorical column in the testing set,
# so they can be compared against the documented labels
categorical_columns = ['SEX', 'EDUCATION', 'MARRIAGE',
                       'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
                       'default']
for column in categorical_columns:
    print(column + ':', sorted(df_test[column].unique()))

SEX: [1, 2]
EDUCATION: [0, 1, 2, 3, 4, 5, 6]
MARRIAGE: [0, 1, 2, 3]
PAY_1: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_2: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7]
PAY_3: [-2, -1, 0, 2, 3, 4, 5, 6, 8]
PAY_4: [-2, -1, 0, 2, 3, 4, 5, 7]
PAY_5: [-2, -1, 0, 2, 3, 4, 5, 6, 7]
PAY_6: [-2, -1, 0, 2, 3, 4, 5, 6, 7]
default: [0, 1]


Undocumented categories exist in both the training and testing sets:
- **EDUCATION**: 0, 5, 6
- **MARRIAGE**: 0
- **PAY_1** to **PAY_6**: -2, 0

#### Count the number of rows for each undocumented label

In [20]:
# Empty summary table: one row per undocumented label, one column per dataset.
# Every cell starts out as NaN until the counts are written into it.
undocumented_labels = [
    'EDUCATION == 0',
    'EDUCATION == 5',
    'EDUCATION == 6',
    'MARRIAGE == 0',
    'PAY_n == -2',
    'PAY_n == 0',
]
counts = pd.DataFrame(columns=['Training', 'Testing'], index=undocumented_labels)
counts

Unnamed: 0,Training,Testing
EDUCATION == 0,,
EDUCATION == 5,,
EDUCATION == 6,,
MARRIAGE == 0,,
PAY_n == -2,,
PAY_n == 0,,


In [31]:
# Fill the summary table: for the training and the testing set, count the rows
# that carry each undocumented label.
pay_columns = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

for dataset_name, frame in [('Training', df_train), ('Testing', df_test)]:
    # EDUCATION: one count per undocumented code
    for code in (0, 5, 6):
        counts.loc['EDUCATION == {}'.format(code), dataset_name] = int((frame['EDUCATION'] == code).sum())

    # MARRIAGE: only code 0 is undocumented
    counts.loc['MARRIAGE == 0', dataset_name] = int((frame['MARRIAGE'] == 0).sum())

    # PAY_1 to PAY_6: a row is counted once if at least one of the six columns holds the code
    for code in (-2, 0):
        counts.loc['PAY_n == {}'.format(code), dataset_name] = int((frame[pay_columns] == code).any(axis=1).sum())

counts

Unnamed: 0,Training,Testing
EDUCATION == 0,11,3
EDUCATION == 5,207,73
EDUCATION == 6,37,14
MARRIAGE == 0,41,13
PAY_n == -2,5185,1376
PAY_n == 0,16929,4244


**Solution:**
- **EDUCATION**: categorise 0, 5, 6 as 4 (**others**)
<br> **others** may refer to education level either higher than graduate school or lower than high school.
- **MARRIAGE**: categorise 0 as 3 (**others**)
- **PAY_1** to **PAY_6**: categorise both -2 and 0 as -1 (**pay duly**)

In [32]:
# clean EDUCATION: recode the undocumented values 0, 5 and 6 to 4 in both datasets
for frame in (df_train, df_test):
    is_undocumented = frame['EDUCATION'].isin([0, 5, 6])
    frame.loc[is_undocumented, 'EDUCATION'] = 4

# confirm that only the values 1-4 remain
print('EDUCATION training:', sorted(df_train['EDUCATION'].unique()))
print('EDUCATION testing:', sorted(df_test['EDUCATION'].unique()))

EDUCATION training: [1, 2, 3, 4]
EDUCATION testing: [1, 2, 3, 4]


In [33]:
# clean MARRIAGE: recode the undocumented value 0 to 3 in both datasets
for frame in (df_train, df_test):
    is_undocumented = frame['MARRIAGE'] == 0
    frame.loc[is_undocumented, 'MARRIAGE'] = 3

# confirm that the value 0 no longer appears
print('MARRIAGE training:', sorted(df_train['MARRIAGE'].unique()))
print('MARRIAGE testing:', sorted(df_test['MARRIAGE'].unique()))

MARRIAGE training: [1, 2, 3]
MARRIAGE testing: [1, 2, 3]


In [34]:
# clean PAY_1 to PAY_6 in training set: both -2 and 0 are recoded to -1
pay_columns = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
for column in pay_columns:
    df_train.loc[df_train[column].isin([-2, 0]), column] = -1

# show the values that remain in each column after recoding
for column in pay_columns:
    print(column + ':', sorted(df_train[column].unique()))

PAY_1: [-1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_2: [-1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_3: [-1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_4: [-1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_5: [-1, 2, 3, 4, 5, 6, 7, 8]
PAY_6: [-1, 2, 3, 4, 5, 6, 7, 8]


In [35]:
# clean PAY_1 to PAY_6 in testing set: both -2 and 0 are recoded to -1
pay_columns = ['PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
for column in pay_columns:
    df_test.loc[df_test[column].isin([-2, 0]), column] = -1

# show the values that remain in each column after recoding
for column in pay_columns:
    print(column + ':', sorted(df_test[column].unique()))

PAY_1: [-1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_2: [-1, 1, 2, 3, 4, 5, 6, 7]
PAY_3: [-1, 2, 3, 4, 5, 6, 8]
PAY_4: [-1, 2, 3, 4, 5, 7]
PAY_5: [-1, 2, 3, 4, 5, 6, 7]
PAY_6: [-1, 2, 3, 4, 5, 6, 7]


### Check outliers in numerical variables

#### 1. Check weird negative values

In [18]:
# bill statement description (training set): summary statistics of BILL_AMT1..BILL_AMT6
bill_columns = ['BILL_AMT{}'.format(n) for n in range(1, 7)]
df_train[bill_columns].describe()
# alternative demo: histogram

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,50596.884708,48646.064125,46367.06,42368.188417,40000.682542,38563.710625
std,72649.374256,70364.600436,68193.9,63070.680934,60345.012766,59155.759799
min,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0
25%,3631.5,3098.5,2773.5,2340.0,1740.0,1234.75
50%,22330.0,21339.0,20039.0,18940.5,18107.5,17036.0
75%,65779.5,62761.25,59298.0,52188.5,49746.5,48796.25
max,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0


In [38]:
# bill statement description (testing set): summary statistics of BILL_AMT1..BILL_AMT6
bill_columns = ['BILL_AMT{}'.format(n) for n in range(1, 7)]
df_test[bill_columns].describe()
# alternative demo: histogram

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,53729.115667,51311.119333,49597.5435,46841.991167,41554.274667,40103.9595
std,77411.882245,74290.225276,73739.999695,69041.251707,62562.774782,61111.043232
min,-11545.0,-67526.0,-46127.0,-65167.0,-61372.0,-209051.0
25%,3213.0,2578.5,2487.75,2290.75,1900.0,1320.0
50%,22820.0,20814.5,20360.5,19657.0,18071.5,17203.5
75%,71456.5,68612.25,65252.75,61357.75,52099.0,50620.5
max,746814.0,671563.0,855086.0,706864.0,587067.0,514975.0


In [37]:
# Count the rows in which at least one of the six bill statements is negative
bill_columns = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
negative_rows_train = int((df_train[bill_columns] < 0).any(axis=1).sum())
negative_rows_test = int((df_test[bill_columns] < 0).any(axis=1).sum())

print('Negative BILL_AMT in training set:', negative_rows_train)
print('Negative BILL_AMT in testing set:', negative_rows_test)

Negative BILL_AMT in training set: 1496
Negative BILL_AMT in testing set: 434


There are negative values for bill statement in both training set (**1496** rows) and testing set (**434** rows).
<br>
**Solution**: convert all negatives to zero.

In [39]:
# clean BILL_AMT in training and testing set: every negative bill statement is set to 0
bill_columns = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
for frame in (df_train, df_test):
    for column in bill_columns:
        frame.loc[frame[column] < 0, column] = 0

In [40]:
# Re-check the training-set bill statements: the minimum of every column should now be 0
bill_columns = ['BILL_AMT{}'.format(n) for n in range(1, 7)]
df_train[bill_columns].describe()

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,50621.6645,48672.266917,46398.97,42402.834333,40034.232,38626.357083
std,72616.99812,70342.956984,68161.92,63033.233335,60316.86467,59058.854771
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,3631.5,3098.5,2773.5,2340.0,1740.0,1234.75
50%,22330.0,21339.0,20039.0,18940.5,18107.5,17036.0
75%,65779.5,62761.25,59298.0,52188.5,49746.5,48796.25
max,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0


In [41]:
# Re-check the testing-set bill statements: the minimum of every column should now be 0
bill_columns = ['BILL_AMT{}'.format(n) for n in range(1, 7)]
df_test[bill_columns].describe()

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,53743.5515,51343.5225,49629.981833,46886.922333,41598.5325,40205.9155
std,77401.37894,74261.749954,73714.327016,69001.028596,62522.784018,60958.507135
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,3213.0,2578.5,2487.75,2290.75,1900.0,1320.0
50%,22820.0,20814.5,20360.5,19657.0,18071.5,17203.5
75%,71456.5,68612.25,65252.75,61357.75,52099.0,50620.5
max,746814.0,671563.0,855086.0,706864.0,587067.0,514975.0


#### 2. Check extremely large or small values

In [17]:
# given credit description: summary statistics of LIMIT_BAL in the training set
# (the column is selected with a list so the result is displayed as a one-column table)
df_train.loc[:, ['LIMIT_BAL']].describe()
# alternative demo: histogram

Unnamed: 0,LIMIT_BAL
count,24000.0
mean,165495.986667
std,129128.744855
min,10000.0
25%,50000.0
50%,140000.0
75%,240000.0
max,1000000.0


In [26]:
# previous payment description: summary statistics of PAY_AMT1..PAY_AMT6 in the training set
pay_amt_columns = ['PAY_AMT{}'.format(n) for n in range(1, 7)]
(
    df_train[pay_amt_columns]
    .describe()
    # render every statistic as a fixed-point string to suppress scientific notation
    .apply(lambda column: column.map('{:f}'.format))
)
# alternative demo: histogram

Unnamed: 0,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,5542.912917,5815.336208,4969.266,4743.480042,4783.486042,5189.399042
std,15068.576072,20797.031923,16095.61434,14883.26999,15270.405279,17630.37199
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1000.0,800.0,379.0,279.75,244.0,60.75
50%,2100.0,2000.0,1702.5,1500.0,1500.0,1500.0
75%,5000.0,5000.0,4347.25,4000.0,4005.0,4000.0
max,505000.0,1684259.0,896040.0,497000.0,417990.0,528666.0


LIMIT_BAL, BILL_AMT and PAY_AMT all have a very broad range. Need to check if these are outliers.
<br>
**Method:** Check if the amount of previous payment (PAY_AMT) & bill statement (BILL_AMT) lie within the 'sensible' range of given credit (LIMIT_BAL). Choose LIMIT_BAL values **below 5th percentile** or **above 95th percentile**.

In [48]:
# investigate the percentiles for LIMIT_BAL, PAY_AMT and BILL_AMT
variables = (['LIMIT_BAL']
             + ['PAY_AMT{}'.format(n) for n in range(1, 7)]
             + ['BILL_AMT{}'.format(n) for n in range(1, 7)])

# percentile levels 5, 15, ..., 95
percentile_points = list(range(5, 100, 10))

# one row of percentile values per variable, computed on the training set
percentiles = [list(np.percentile(df_train[var], percentile_points)) for var in variables]

pd.DataFrame(data=percentiles, index=variables,
             columns=['{}th'.format(point) for point in percentile_points])

Unnamed: 0,5th,15th,25th,35th,45th,55th,65th,75th,85th,95th
LIMIT_BAL,20000.0,50000.0,50000.0,80000.0,120000.0,150000.0,200000.0,240000.0,300000.0,430000.0
PAY_AMT1,0.0,0.0,1000.0,1500.0,2000.0,2550.0,3510.4,5000.0,8000.0,18243.85
PAY_AMT2,0.0,0.0,800.0,1379.0,1885.55,2390.9,3300.0,5000.0,7784.15,19000.15
PAY_AMT3,0.0,0.0,379.0,1000.0,1424.1,2000.0,3000.0,4347.25,6800.0,16513.55
PAY_AMT4,0.0,0.0,279.75,764.65,1100.0,1920.0,2728.35,4000.0,6200.0,16000.0
PAY_AMT5,0.0,0.0,244.0,780.0,1170.0,2000.0,2832.7,4005.0,6251.15,15963.75
PAY_AMT6,0.0,0.0,60.75,690.0,1065.0,1911.9,2711.0,4000.0,6100.0,17398.1
BILL_AMT1,0.0,815.0,3631.5,9245.9,17706.2,28347.0,46009.85,65779.5,105939.05,197600.0
BILL_AMT2,0.0,495.0,3098.5,8760.3,17385.0,27619.45,43981.0,62761.25,101462.4,191542.1
BILL_AMT3,0.0,390.0,2773.5,8600.55,16985.55,26011.9,40181.05,59298.0,97224.8,184792.9


In [61]:
# Check outliers:
# 1. Select the rows where the LIMIT_BAL value is below 5th percentile or above 95th percentile.
# 2. Regard LIMIT_BAL as the credit base, check the rows where any BILL_AMT or PAY_AMT exceeds 2 * LIMIT_BAL

# compute both cut-offs once instead of once per comparison
lower_cut, upper_cut = np.percentile(df_train['LIMIT_BAL'], [5, 95])
in_tails = (df_train['LIMIT_BAL'] < lower_cut) | (df_train['LIMIT_BAL'] > upper_cut)
data = df_train.loc[in_tails, variables]

# the twelve amount columns that are compared against the credit base
amount_columns = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                  'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']

# vectorised row-wise test: flag a row when at least one amount is strictly greater than 2 * LIMIT_BAL
# (axis=0 aligns the per-row threshold with the row index)
tail_rows = df_train.loc[in_tails]
upper_limit = 2 * tail_rows['LIMIT_BAL']
exceeds_limit = tail_rows[amount_columns].gt(upper_limit, axis=0).any(axis=1)

# index labels of the flagged rows, in their original order
data_out_of_range = tail_rows.index[exceeds_limit].tolist()

print('Number of outliers:', len(data_out_of_range))
outliers = df_train.loc[data_out_of_range, variables].sort_values(by=['LIMIT_BAL'])
outliers

Number of outliers: 5


Unnamed: 0,LIMIT_BAL,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
5843,10000,1475,780,0,31250,0,0,885,1475,780,390,780,390
7508,10000,2134,1000,2500,122,0,0,53095,54562,5428,7928,6091,0
7688,10000,10400,1000,0,20200,200,0,219,9110,9797,0,9767,9067
16504,10000,1500,5000,4000,2000,22400,0,8525,5141,5239,7911,17890,10000
5296,500000,4366,1684259,121831,97670,379267,26759,125,0,1664089,121757,97115,377217


The check flagged 5 records. In each of them, at least one of the previous payments or one of the bill statements (or both) exceeded twice the given credit. However, a closer look at these records shows that the previous payments and the bill statements are relatively balanced overall, and most of their values lie within the given credit. Therefore, we cannot regard them as anomalies. 

### Save pre-processed datasets

In [62]:
# save pre-processed training & testing data (index=False: the row index is not written to the file)
processed_outputs = [
    (df_train, 'CreditCard_train_processed.csv'),
    (df_test, 'CreditCard_test_processed.csv'),
]
for frame, file_name in processed_outputs:
    frame.to_csv(path + file_name, index=False)