In [17]:
import pandas as pd
import numpy as np

### Load original datasets

In [18]:
path = '../data/' # change to your local file location

# header=1: the second line of the CSV file holds the column names
df_train = (
    pd.read_csv(path + 'CreditCard_train.csv', header=1)
    .rename(columns={
        'PAY_0': 'PAY_1',                         # make the inconsistent 'PAY_0' line up with PAY_2..PAY_6
        'default payment next month': 'default',  # shorten the name of the target column
    })
)

In [105]:
def rename_pay_default(df):
    """Return a renamed copy of ``df``.

    'PAY_0' becomes 'PAY_1' and 'default payment next month' becomes
    'default'. Columns that are not present are left alone and the input
    frame itself is not modified.
    """
    new_names = {
        'PAY_0': 'PAY_1',
        'default payment next month': 'default',
    }
    return df.rename(columns=new_names)

In [19]:
df_train.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


### Check missing values

In [20]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         24000 non-null  int64
 1   LIMIT_BAL  24000 non-null  int64
 2   SEX        24000 non-null  int64
 3   EDUCATION  24000 non-null  int64
 4   MARRIAGE   24000 non-null  int64
 5   AGE        24000 non-null  int64
 6   PAY_1      24000 non-null  int64
 7   PAY_2      24000 non-null  int64
 8   PAY_3      24000 non-null  int64
 9   PAY_4      24000 non-null  int64
 10  PAY_5      24000 non-null  int64
 11  PAY_6      24000 non-null  int64
 12  BILL_AMT1  24000 non-null  int64
 13  BILL_AMT2  24000 non-null  int64
 14  BILL_AMT3  24000 non-null  int64
 15  BILL_AMT4  24000 non-null  int64
 16  BILL_AMT5  24000 non-null  int64
 17  BILL_AMT6  24000 non-null  int64
 18  PAY_AMT1   24000 non-null  int64
 19  PAY_AMT2   24000 non-null  int64
 20  PAY_AMT3   24000 non-null  int64
 21  PAY_AMT4   2

No missing values. All variables have numerical values (int64).

### Check undocumented labels

In [21]:
# check the values of categorical variable in training set are well-documented
categorical_cols = ['SEX', 'EDUCATION', 'MARRIAGE'] + [f'PAY_{n}' for n in range(1, 7)] + ['default']

for col in categorical_cols:
    print(f'{col}:', sorted(df_train[col].unique()))

SEX: [1, 2]
EDUCATION: [0, 1, 2, 3, 4, 5, 6]
MARRIAGE: [0, 1, 2, 3]
PAY_1: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_2: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_3: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_4: [-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_5: [-2, -1, 0, 2, 3, 4, 5, 6, 7, 8]
PAY_6: [-2, -1, 0, 2, 3, 4, 5, 6, 7, 8]
default: [0, 1]


Undocumented categories exist in both the training and testing sets:
- **EDUCATION**: 0, 5, 6
- **MARRIAGE**: 0
- **PAY_1** to **PAY_6**: -2, 0

#### Count the number of rows for each undocumented label

In [22]:
# create a DataFrame to hold the results: one row per undocumented label, one (empty) 'Training' column
undocumented_labels = (
    [f'EDUCATION == {code}' for code in (0, 5, 6)]
    + ['MARRIAGE == 0', 'PAY_n == -2', 'PAY_n == 0']
)
counts = pd.DataFrame(index=undocumented_labels, columns=['Training'])
counts

Unnamed: 0,Training
EDUCATION == 0,
EDUCATION == 5,
EDUCATION == 6,
MARRIAGE == 0,
PAY_n == -2,
PAY_n == 0,


In [23]:
# EDUCATION and MARRIAGE, training set: number of rows holding each undocumented code
for col, code in [('EDUCATION', 0), ('EDUCATION', 5), ('EDUCATION', 6), ('MARRIAGE', 0)]:
    counts.loc[f'{col} == {code}', 'Training'] = df_train[df_train[col] == code].shape[0]

# PAY_1 to PAY_6, training set: number of rows where at least one PAY_n column holds the code
pay_cols = [f'PAY_{n}' for n in range(1, 7)]
for code in (-2, 0):
    has_code = (df_train[pay_cols] == code).any(axis=1)
    counts.loc[f'PAY_n == {code}', 'Training'] = df_train[has_code].shape[0]

counts

Unnamed: 0,Training
EDUCATION == 0,11
EDUCATION == 5,207
EDUCATION == 6,37
MARRIAGE == 0,41
PAY_n == -2,5185
PAY_n == 0,16929


**Solution:**
- **EDUCATION**: categorise 0, 5, 6 as 4 (**others**)
<br> **others** may refer to education level either higher than graduate school or lower than high school.
- **MARRIAGE**: categorise 0 as 3 (**others**)
- **PAY_1** to **PAY_6**: keep -2 and categorise 0 as -1 (**pay duly**)

In [24]:
# clean EDUCATION: fold the undocumented codes 0, 5 and 6 into 4
undocumented_edu = df_train['EDUCATION'].isin([0, 5, 6])
df_train.loc[undocumented_edu, 'EDUCATION'] = 4

print('EDUCATION training:', sorted(df_train['EDUCATION'].unique()))

EDUCATION training: [1, 2, 3, 4]


In [46]:
def clean_edu(df):
    """Recode EDUCATION values 0, 5 and 6 to 4.

    The frame is modified in place and the same object is returned so the
    function can be used as a pipeline step.
    """
    is_undocumented = df['EDUCATION'].isin([0, 5, 6])
    df.loc[is_undocumented, 'EDUCATION'] = 4
    return df

In [25]:
# clean MARRIAGE: fold the undocumented code 0 into 3
undocumented_marriage = df_train['MARRIAGE'].eq(0)
df_train.loc[undocumented_marriage, 'MARRIAGE'] = 3

print('MARRIAGE training:', sorted(df_train['MARRIAGE'].unique()))

MARRIAGE training: [1, 2, 3]


In [47]:
def clean_marriage(df):
    """Recode MARRIAGE value 0 to 3.

    The frame is modified in place and the same object is returned.
    """
    unknown_status = df['MARRIAGE'].eq(0)
    df.loc[unknown_status, 'MARRIAGE'] = 3
    return df

In [26]:
# clean PAY_1 to PAY_6 in training set: recode 0 as -1 in every PAY_n column
pay_status_cols = [f'PAY_{n}' for n in range(1, 7)]

for col in pay_status_cols:
    df_train.loc[df_train[col] == 0, col] = -1

for col in pay_status_cols:
    print(f'{col}:', sorted(df_train[col].unique()))

PAY_1: [-2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_2: [-2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_3: [-2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_4: [-2, -1, 1, 2, 3, 4, 5, 6, 7, 8]
PAY_5: [-2, -1, 2, 3, 4, 5, 6, 7, 8]
PAY_6: [-2, -1, 2, 3, 4, 5, 6, 7, 8]


In [49]:
def clean_pay(df):
    """Recode 0 to -1 in each of the columns PAY_1 .. PAY_6.

    The frame is modified in place and the same object is returned.
    """
    for col in (f'PAY_{month}' for month in range(1, 7)):
        df.loc[df[col].eq(0), col] = -1
    return df

### Check outliers in numerical variables

#### 1. Check weird negative values

In [27]:
# bill statement description
df_train[['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']].describe()
# alternative demo: histogram

Unnamed: 0,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,50596.884708,48646.064125,46367.06,42368.188417,40000.682542,38563.710625
std,72649.374256,70364.600436,68193.9,63070.680934,60345.012766,59155.759799
min,-165580.0,-69777.0,-157264.0,-170000.0,-81334.0,-339603.0
25%,3631.5,3098.5,2773.5,2340.0,1740.0,1234.75
50%,22330.0,21339.0,20039.0,18940.5,18107.5,17036.0
75%,65779.5,62761.25,59298.0,52188.5,49746.5,48796.25
max,964511.0,983931.0,1664089.0,891586.0,927171.0,961664.0


In [28]:
# rows where at least one of BILL_AMT1..BILL_AMT6 is negative
bill_cols = [f'BILL_AMT{n}' for n in range(1, 7)]
any_negative_bill = (df_train[bill_cols] < 0).any(axis=1)

print('Negative BILL_AMT in training set:', df_train[any_negative_bill].shape[0])

Negative BILL_AMT in training set: 1496


There are negative values for bill statements in the training set (**1496** rows).  
**We are not going to convert negatives to zero**

#### 2. Check extremely large or small values

In [29]:
# given credit description
df_train[['LIMIT_BAL']].describe()
# alternative demo: histogram

Unnamed: 0,LIMIT_BAL
count,24000.0
mean,165495.986667
std,129128.744855
min,10000.0
25%,50000.0
50%,140000.0
75%,240000.0
max,1000000.0


In [30]:
# previous payment description
df_train[['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']].describe()\
                                        .apply(lambda s: s.apply(lambda x: format(x, 'f'))) # suppress scientific notation
# alternative demo: histogram

Unnamed: 0,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
count,24000.0,24000.0,24000.0,24000.0,24000.0,24000.0
mean,5542.912917,5815.336208,4969.266,4743.480042,4783.486042,5189.399042
std,15068.576072,20797.031923,16095.61434,14883.26999,15270.405279,17630.37199
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1000.0,800.0,379.0,279.75,244.0,60.75
50%,2100.0,2000.0,1702.5,1500.0,1500.0,1500.0
75%,5000.0,5000.0,4347.25,4000.0,4005.0,4000.0
max,505000.0,1684259.0,896040.0,497000.0,417990.0,528666.0


LIMIT_BAL, BILL_AMT and PAY_AMT all have a very broad range. Need to check if these are outliers.
<br>
**Method:** Check whether the amounts of previous payment (PAY_AMT) and bill statement (BILL_AMT) lie within a 'sensible' range of the given credit (LIMIT_BAL). Only rows whose LIMIT_BAL is **below the 5th percentile** or **above the 95th percentile** are examined.

In [31]:
# investigate the percentiles for LIMIT_BAL, PAY_AMT and BILL_AMT
variables = (
    ['LIMIT_BAL']
    + [f'PAY_AMT{n}' for n in range(1, 7)]
    + [f'BILL_AMT{n}' for n in range(1, 7)]
)
percentile_points = range(5, 100, 10)  # 5th, 15th, ..., 95th

# one row of percentile values per variable
percentiles = [
    [np.percentile(df_train[var], q) for q in percentile_points]
    for var in variables
]

pd.DataFrame(data=percentiles, index=variables, columns=[f'{q}th' for q in percentile_points])

Unnamed: 0,5th,15th,25th,35th,45th,55th,65th,75th,85th,95th
LIMIT_BAL,20000.0,50000.0,50000.0,80000.0,120000.0,150000.0,200000.0,240000.0,300000.0,430000.0
PAY_AMT1,0.0,0.0,1000.0,1500.0,2000.0,2550.0,3510.4,5000.0,8000.0,18243.85
PAY_AMT2,0.0,0.0,800.0,1379.0,1885.55,2390.9,3300.0,5000.0,7784.15,19000.15
PAY_AMT3,0.0,0.0,379.0,1000.0,1424.1,2000.0,3000.0,4347.25,6800.0,16513.55
PAY_AMT4,0.0,0.0,279.75,764.65,1100.0,1920.0,2728.35,4000.0,6200.0,16000.0
PAY_AMT5,0.0,0.0,244.0,780.0,1170.0,2000.0,2832.7,4005.0,6251.15,15963.75
PAY_AMT6,0.0,0.0,60.75,690.0,1065.0,1911.9,2711.0,4000.0,6100.0,17398.1
BILL_AMT1,0.0,815.0,3631.5,9245.9,17706.2,28347.0,46009.85,65779.5,105939.05,197600.0
BILL_AMT2,0.0,495.0,3098.5,8760.3,17385.0,27619.45,43981.0,62761.25,101462.4,191542.1
BILL_AMT3,0.0,390.0,2773.5,8600.55,16985.55,26011.9,40181.05,59298.0,97224.8,184792.9


In [32]:
# Check outliers:
# 1. Select the rows where the LIMIT_BAL value is below 5th percentile or above 95th percentile.
# 2. Regard LIMIT_BAL as the credit base, check the rows where BILL_AMT or PAY_AMT exceed 2 * LIMIT_BAL
low_cut, high_cut = np.percentile(df_train['LIMIT_BAL'], [5, 95])
in_tails = (df_train['LIMIT_BAL'] < low_cut) | (df_train['LIMIT_BAL'] > high_cut)
data = df_train.loc[in_tails, variables]

# every entry of `variables` other than LIMIT_BAL is a BILL_AMT or PAY_AMT column
amount_cols = [col for col in variables if col != 'LIMIT_BAL']

# a row is out of range when any of its amounts is strictly greater than twice its own credit limit
exceeds_twice_limit = data[amount_cols].gt(2 * data['LIMIT_BAL'], axis=0).any(axis=1)
data_out_of_range = data.index[exceeds_twice_limit].tolist()

print('Number of outliers:', len(data_out_of_range))
outliers = df_train.loc[data_out_of_range, variables].sort_values(by=['LIMIT_BAL'])
outliers

Number of outliers: 5


Unnamed: 0,LIMIT_BAL,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6
5843,10000,1475,780,0,31250,0,0,885,1475,780,390,780,390
7508,10000,2134,1000,2500,122,0,0,53095,54562,5428,7928,6091,0
7688,10000,10400,1000,0,20200,200,0,219,9110,9797,-10433,9767,9067
16504,10000,1500,5000,4000,2000,22400,0,8525,5141,5239,7911,17890,10000
5296,500000,4366,1684259,121831,97670,379267,26759,125,-18088,1664089,121757,97115,377217


We filtered out 5 records. In each of these records, at least one previous payment or one bill statement (or both) exceeded twice the given credit. However, a closer look at these records shows that the previous payments and the bill statements are generally balanced against each other, and most of their values lie within the given credit. Therefore, we cannot regard them as anomalies. 

In [33]:
df_train

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,-1,-1,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,-1,-1,-1,-1,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,-1,-1,-1,-1,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,-1,-1,-1,...,20940,19146,19131,2000,36681,10000,9000,689,679,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,23996,80000,1,2,1,25,1,2,2,-1,...,80215,63296,49854,3800,6,3636,2646,2000,1830,0
23996,23997,20000,1,2,1,25,-1,-1,-1,-1,...,17322,17119,17350,1552,2659,1419,606,500,1000,0
23997,23998,10000,1,2,2,26,-1,-1,-1,-1,...,17506,16608,9176,1300,2200,1300,320,1820,1000,0
23998,23999,20000,1,4,2,26,-1,-1,-1,-1,...,39950,0,0,3055,1467,1096,1000,0,0,0


### **Feature processing**

In [35]:
def drop_id(df):
    """Return a copy of ``df`` without the 'ID' column (input is not modified)."""
    return df.drop(columns='ID')

In [36]:
def combine_gender_marital(df):
    """Replace SEX and MARRIAGE with a single combined column SEX_MAR.

    SEX_MAR is the element-wise product ``SEX * MARRIAGE``; it is appended as
    the last column and the two source columns are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'SEX' and 'MARRIAGE' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame. The input frame is left untouched (previously the
        SEX_MAR column was also written into the caller's frame).
    """
    # NOTE(review): the product is not a unique code per (SEX, MARRIAGE) pair;
    # SEX=1, MARRIAGE=2 and SEX=2, MARRIAGE=1 both give 2. Kept as-is so that
    # already-processed outputs stay comparable; confirm this is intended.
    sex_mar = df['SEX'] * df['MARRIAGE']
    return df.assign(SEX_MAR=sex_mar).drop(columns=['SEX', 'MARRIAGE'])

In [37]:
def make_age_bins(df):
    """Replace AGE with an ordinal decade bin column, AgeBin.

    Bins: 1 = under 30, 2 = 30-39, 3 = 40-49, 4 = 50-59, 5 = 60-69,
    6 = 70 and above. The top bin is open-ended so that ages above 80 are
    no longer left at the placeholder value 0, which would have ranked them
    below the youngest group. Missing ages keep the placeholder value 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'AGE' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'AGE' removed and 'AgeBin' appended as the last
        column. The input frame is not modified.
    """
    age = df['AGE']

    # Lower edges of bins 2..6. np.digitize counts how many edges are <= age,
    # so adding 1 yields the bin labels 1..6.
    decade_edges = [30, 40, 50, 60, 70]
    age_bin = np.digitize(age.to_numpy(dtype=float), decade_edges) + 1

    # NaN would otherwise be sorted past the last edge and land in bin 6.
    age_bin = np.where(age.isna().to_numpy(), 0, age_bin)

    return df.drop(columns=['AGE']).assign(AgeBin=age_bin)

In [38]:
def get_closeness(df):
    """Add Closeness_1 .. Closeness_6 = LIMIT_BAL - BILL_AMT{i}.

    The value is the absolute gap between the credit limit and each bill
    amount (it is not divided by LIMIT_BAL). The columns are written into
    ``df`` itself and the same frame is returned.
    """
    credit_limit = df['LIMIT_BAL']
    for month in range(1, 7):
        df[f'Closeness_{month}'] = credit_limit - df[f'BILL_AMT{month}']
    return df

In [39]:
def get_diff_of_totals(df):
    """Add a 'diff' column: total billed minus total paid over the six months.

    ``diff = sum(BILL_AMT1..6) - sum(PAY_AMT1..6)``, computed per row.

    The computation is vectorised. The previous row-by-row ``iterrows`` /
    ``.loc`` loop was very slow on tens of thousands of rows and produced
    wrong sums when the index contained duplicate labels, because
    ``.loc[label]`` then selects every row sharing that label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing BILL_AMT1..6 and PAY_AMT1..6.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with 'diff' added (or overwritten).
    """
    bill_cols = [f'BILL_AMT{i}' for i in range(1, 7)]
    pay_cols = [f'PAY_AMT{i}' for i in range(1, 7)]

    # skipna=False: a missing amount makes the row's total missing, as a plain
    # np.sum over the row values would, instead of being silently treated as 0.
    bill_total = df[bill_cols].sum(axis=1, skipna=False)
    pay_total = df[pay_cols].sum(axis=1, skipna=False)

    df['diff'] = bill_total - pay_total
    return df

In [40]:
def drop_bill_amt(df):
    """Return a copy of ``df`` without the columns BILL_AMT1 .. BILL_AMT6."""
    bill_cols = [f'BILL_AMT{month}' for month in range(1, 7)]
    return df.drop(columns=bill_cols)

In [41]:
def drop_pay_amt(df):
    """Return a copy of ``df`` without the columns PAY_AMT1 .. PAY_AMT6."""
    pay_cols = [f'PAY_AMT{month}' for month in range(1, 7)]
    return df.drop(columns=pay_cols)

In [42]:
def drop_age(df):
    """Return a copy of ``df`` without the 'AGE' column (input is not modified)."""
    return df.drop(columns='AGE')

**Normalisation or Standardization**

In [74]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

In [111]:
def scale_columns(df, cols=('LIMIT_BAL', 'BILL_AMT', 'PAY_AMT', 'Closeness', 'diff'), method='minmax_norm'):
    """Scale groups of numeric columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns of every group named in ``cols``.
    cols : sequence of str
        Column groups to scale. Recognised names:
        'LIMIT_BAL' (that column), 'BILL_AMT' (BILL_AMT1..6),
        'PAY_AMT' (PAY_AMT1..6), 'Closeness' (Closeness_1..6), 'diff'.
        Names that are not recognised are ignored.
    method : {'normalise', 'minmax_norm', 'standardise'}
        Selects Normalizer, MinMaxScaler or StandardScaler respectively.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the selected columns overwritten.

    Raises
    ------
    ValueError
        If ``method`` is not one of the three supported names (previously
        this surfaced later as an unbound-variable error).

    Notes
    -----
    The scaler is re-fit on ``df`` for every group, so each frame passed in
    is scaled using its own statistics.
    """
    # NOTE(review): Normalizer rescales each row rather than each column, which
    # on the single-column groups would collapse values - confirm before using
    # method='normalise'.
    if method == 'normalise':
        scaler = Normalizer()
    elif method == 'minmax_norm':
        scaler = MinMaxScaler()
    elif method == 'standardise':
        scaler = StandardScaler()
    else:
        raise ValueError(
            f"Unknown scaling method {method!r}; expected 'normalise', 'minmax_norm' or 'standardise'"
        )

    # group name -> concrete columns it expands to
    column_groups = {
        'LIMIT_BAL': ['LIMIT_BAL'],
        'BILL_AMT': [f'BILL_AMT{i}' for i in range(1, 7)],
        'PAY_AMT': [f'PAY_AMT{i}' for i in range(1, 7)],
        'Closeness': [f'Closeness_{i}' for i in range(1, 7)],
        'diff': ['diff'],
    }

    for group, group_cols in column_groups.items():
        if group in cols:
            df[group_cols] = scaler.fit_transform(df[group_cols])

    return df

## Pipeline

In [112]:
def processing_pipline(df):
    """Run the full preprocessing sequence on a raw credit-card frame.

    Three stages are applied in order, with a progress message printed after
    each one:

    1. Cleaning - rename_pay_default, clean_edu, clean_marriage, clean_pay.
    2. Feature engineering - drop_id, combine_gender_marital, make_age_bins,
       get_closeness, get_diff_of_totals. (drop_bill_amt, drop_pay_amt and
       drop_age are also available but are not part of the sequence.)
    3. Feature scaling - scale_columns with method='minmax_norm'. The columns
       to scale can be narrowed through its 'cols' parameter; the available
       methods are normalise, minmax_norm and standardise.

    Returns the processed frame.
    """
    # --- Cleaning ---
    cleaning_steps = [rename_pay_default, clean_edu, clean_marriage, clean_pay]
    for step in cleaning_steps:
        df = step(df)
    print("Cleaned Data")

    # --- Feature engineering ---
    feature_steps = [drop_id, combine_gender_marital, make_age_bins, get_closeness, get_diff_of_totals]
    for step in feature_steps:
        df = step(df)
    print("Transformed Features")

    # --- Feature scaling ---
    df = scale_columns(df, method='minmax_norm')
    print("scaled numerical values")

    return df

In [113]:
final_df = processing_pipline(df_train)

Cleaned Data
Transformed Features
scaled numerical values


In [114]:
final_df.columns

Index(['LIMIT_BAL', 'EDUCATION', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5',
       'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4',
       'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3',
       'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default', 'SEX_MAR', 'AgeBin',
       'Closeness_1', 'Closeness_2', 'Closeness_3', 'Closeness_4',
       'Closeness_5', 'Closeness_6', 'diff'],
      dtype='object')

In [115]:
# Reorder the columns: base features, payment status, bill and payment amounts,
# engineered features, and the target ('default') last.
train_column_order = (
    ['LIMIT_BAL', 'EDUCATION', 'SEX_MAR', 'AgeBin']
    + [f'PAY_{n}' for n in range(1, 7)]
    + [f'BILL_AMT{n}' for n in range(1, 7)]
    + [f'PAY_AMT{n}' for n in range(1, 7)]
    + [f'Closeness_{n}' for n in range(1, 7)]
    + ['diff', 'default']
)
final_df = final_df[train_column_order]

In [116]:
final_df

Unnamed: 0,LIMIT_BAL,EDUCATION,SEX_MAR,AgeBin,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,PAY_AMT5,PAY_AMT6,Closeness_1,Closeness_2,Closeness_3,Closeness_4,Closeness_5,Closeness_6,diff,default
0,0.010101,2,2,1,2,2,-1,-1,-2,-2,...,0.000000,0.000000,0.271688,0.291875,0.603690,0.271715,0.273931,0.235987,0.143959,1
1,0.111111,2,4,1,-1,2,-1,-1,-1,2,...,0.000000,0.003783,0.366492,0.384092,0.653686,0.362753,0.364027,0.331130,0.145013,1
2,0.080808,2,4,2,-1,-1,-1,-1,-1,-1,...,0.002392,0.009458,0.313526,0.345612,0.632833,0.324109,0.325306,0.289540,0.161379,0
3,0.040404,2,2,2,-1,-1,-1,-1,-1,-1,...,0.002557,0.001892,0.259441,0.278111,0.594200,0.273302,0.274903,0.236432,0.188944,0
4,0.040404,2,1,4,-1,-1,-1,-1,-1,-1,...,0.001648,0.001284,0.295378,0.316828,0.601064,0.280242,0.284060,0.246676,0.152974,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,0.070707,2,1,1,1,2,2,-1,-1,-1,...,0.004785,0.003462,0.255774,0.273967,0.593378,0.252689,0.270855,0.245965,0.230839,0
23996,0.010101,2,1,1,-1,-1,-1,-1,-1,-1,...,0.001196,0.001892,0.261823,0.280638,0.595082,0.255412,0.257956,0.218923,0.161563,0
23997,0.000000,2,2,1,-1,-1,-1,-1,-1,-1,...,0.004354,0.001892,0.257670,0.276565,0.593928,0.245827,0.249101,0.217127,0.155828,0
23998,0.010101,4,2,1,-1,-1,-1,-1,-1,-2,...,0.000000,0.000000,0.256094,0.276245,0.594148,0.234115,0.273931,0.235987,0.161992,0


In [117]:
# save pre-processed training
final_df.to_csv(path + 'CreditCard_train_processed.csv', index=False)

In [118]:
# Load the test set from the same configurable `path` as the training data,
# so changing `path` relocates every input and output of the notebook together.
# header=1: the second line of the CSV file holds the column names.
test = pd.read_csv(path + 'CreditCard_test.csv', header=1)
test

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,24001,50000,1,2,2,23,2,2,0,0,...,44116,21247,20066,8,2401,2254,2004,704,707,0
1,24002,60000,1,2,2,26,0,0,0,0,...,55736,26958,28847,2282,2324,2049,2000,3000,1120,1
2,24003,400000,1,2,2,27,0,0,0,0,...,10745,20737,9545,2501,10009,1437,1105,510,959,0
3,24004,20000,1,5,2,27,5,4,3,2,...,19709,20113,19840,0,0,0,900,0,0,0
4,24005,50000,1,3,2,27,0,0,-2,-2,...,0,70,120,0,100,0,70,200,100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,29996,220000,1,3,1,39,0,0,0,0,...,88004,31237,15980,8500,20000,5003,3047,5000,1000,0
5996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,8979,5190,0,1837,3526,8998,129,0,0,0
5997,29998,30000,1,2,2,37,4,3,2,-1,...,20878,20582,19357,0,0,22000,4200,2000,3100,1
5998,29999,80000,1,3,1,41,1,-1,0,0,...,52774,11855,48944,85900,3409,1178,1926,52964,1804,1


In [119]:
test_final = processing_pipline(test)

Cleaned Data
Transformed Features
scaled numerical values


In [120]:
test_final

Unnamed: 0,LIMIT_BAL,EDUCATION,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,...,default,SEX_MAR,AgeBin,Closeness_1,Closeness_2,Closeness_3,Closeness_4,Closeness_5,Closeness_6,diff
0,0.051948,2,2,2,-1,-1,-1,-1,0.082799,0.158687,...,0,2,1,0.296318,0.272401,0.512154,0.237237,0.250794,0.232837,0.486996
1,0.064935,2,-1,-1,-1,-1,-1,-1,0.091800,0.171246,...,1,2,1,0.299436,0.273128,0.512840,0.235595,0.255447,0.234194,0.494807
2,0.506494,2,-1,-1,-1,-1,-1,-1,0.035438,0.103035,...,0,2,1,0.675499,0.668738,0.767354,0.625767,0.631084,0.634201,0.458956
3,0.012987,4,5,4,3,2,2,2,0.043802,0.119846,...,0,2,1,0.295898,0.271091,0.510845,0.231568,0.219475,0.199690,0.469373
4,0.051948,3,-1,-1,-2,-2,-1,-1,0.058198,0.091229,...,0,2,1,0.314648,0.322922,0.544108,0.281946,0.273770,0.255042,0.454298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.272727,3,-1,-1,-1,-1,-1,-1,0.264377,0.352246,...,0,1,2,0.328052,0.299702,0.518808,0.365046,0.424399,0.426645,0.563615
5996,0.181818,3,-1,-1,-1,-1,-1,-1,0.017443,0.093837,...,0,2,3,0.443271,0.422299,0.607745,0.374192,0.376711,0.366505,0.450010
5997,0.025974,2,4,3,2,-1,-1,-1,0.019925,0.095905,...,1,2,2,0.323516,0.299154,0.529101,0.240518,0.229816,0.211360,0.455472
5998,0.090909,3,1,-1,-1,-1,-1,-1,0.013055,0.197412,...,1,1,3,0.377762,0.273798,0.513573,0.258866,0.293533,0.234086,0.468953


In [121]:
# Reorder the test columns: base features, payment status, bill and payment
# amounts, engineered features, and the target ('default') last.
test_column_order = (
    ['LIMIT_BAL', 'EDUCATION', 'SEX_MAR', 'AgeBin']
    + [f'PAY_{n}' for n in range(1, 7)]
    + [f'BILL_AMT{n}' for n in range(1, 7)]
    + [f'PAY_AMT{n}' for n in range(1, 7)]
    + [f'Closeness_{n}' for n in range(1, 7)]
    + ['diff', 'default']
)
test_final = test_final[test_column_order]

In [122]:
test_final.to_csv(path + 'CreditCard_test_processed.csv', index=False)