In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [9]:
def jps_df_pipeline(datafilecsv, scaler=None):
    '''
    Pipeline function for data processing of credit default data.

    Reads the raw CSV, renames the generic ``X1``..``X23`` columns, drops rows
    that contain missing or non-numeric values, one-hot encodes the categorical
    columns and returns the selected model features after scaling.

    Parameters
    ----------
    datafilecsv : str or path-like
        CSV file that contains the columns ``Unnamed: 0`` and ``X1``..``X23``.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler (for example the one fitted on the training
        data). When omitted, a new ``StandardScaler`` is fitted on the rows of
        this file, which is the historical behaviour of this function.
        NOTE(review): a model trained on features scaled with training-set
        statistics should normally be fed features scaled with those same
        statistics -- confirm how 'bestmodel.sav' was trained.

    Returns
    -------
    pandas.DataFrame
        One row per retained input row, in the order of the input file and
        labelled with the row labels assigned by ``pd.read_csv`` (rows removed
        by ``dropna`` leave gaps). Columns are the selected features, in the
        fixed order listed in ``selected`` below.

    Raises
    ------
    KeyError
        If one of the expected raw columns is missing from the file.
    '''
    months = ['09', '08', '07', '06', '05', '04']
    paystatus_cols = ['paystatus_' + m for m in months]
    billamt_cols = ['billamt_' + m for m in months]
    payamt_cols = ['payamt_' + m for m in months]

    # Raw column names and the descriptive names they map to, position by position.
    raw_cols = ['Unnamed: 0'] + ['X{}'.format(i) for i in range(1, 24)]
    new_names = (['id', 'given_credit', 'gender', 'education', 'mar_status', 'age']
                 + paystatus_cols + billamt_cols + payamt_cols)
    rnm_dict = dict(zip(raw_cols, new_names))

    df = pd.read_csv(datafilecsv)

    # Anything that is not a number becomes NaN and is removed by dropna below.
    df[raw_cols] = df[raw_cols].apply(pd.to_numeric, errors='coerce')
    df = df.rename(columns=rnm_dict).dropna(axis=0)

    # Integer dtypes give the dummy columns names such as 'paystatus_09_0'
    # rather than 'paystatus_09_0.0'.
    categorical = ['gender', 'education', 'mar_status'] + paystatus_cols
    df = df.astype({col: 'int' for col in ['id', 'age'] + categorical})

    # Row order is deliberately left untouched: reordering here would detach the
    # returned rows (and any predictions made from them) from the input file.

    # Education codes outside 1..4 are folded into 4; marital status codes
    # below 1 are folded into 3.
    df['education'] = df['education'].where(df['education'].between(1, 4), 4)
    df['mar_status'] = df['mar_status'].where(df['mar_status'] >= 1, 3)

    ### FEATURE ENGINEERING
    # Only the engineered feature that is part of the selected set is built;
    # the per-month and gross pay ratios never reached the returned frame.
    df['avg_amt_paid'] = df[payamt_cols].sum(axis=1) / 6

    # Encode every category (no drop_first): which level is "first" depends on
    # the values present in this particular file, so dropping it could remove
    # a column the model needs.
    df = pd.get_dummies(df, columns=categorical)

    ### feature selection was done using lasso.
    selected = ['given_credit',
                'paystatus_09_0',
                'avg_amt_paid',
                'gender_2',
                'mar_status_2',
                'paystatus_07_-1',
                'education_4',
                'paystatus_04_0',
                'payamt_09',
                'paystatus_05_3',
                'paystatus_08_6',
                'paystatus_09_4',
                'paystatus_08_3',
                'paystatus_04_3',
                'paystatus_06_2',
                'paystatus_05_2',
                'paystatus_09_1',
                'paystatus_04_2',
                'paystatus_07_2',
                'paystatus_08_2',
                'paystatus_09_3',
                'paystatus_09_2']

    # reindex guarantees the exact column set and order; a category that does
    # not occur in this file becomes an all-zero indicator instead of a KeyError.
    working_df = df.reindex(columns=selected, fill_value=0).astype(float)

    if scaler is None:
        scaler = StandardScaler().fit(working_df)

    scaled_df = pd.DataFrame(data=scaler.transform(working_df),
                             columns=working_df.columns,
                             index=working_df.index)

    return scaled_df
    


In [4]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# 'bestmodel.sav' must come from a trusted source.
# The context manager closes the file handle as soon as the model is loaded.
with open('bestmodel.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [10]:
# Build the scaled feature frame for the holdout file.
holdout_df = jps_df_pipeline(datafilecsv='holdout_data.csv')

In [11]:
# Predict with the unpickled model, one prediction per row of holdout_df.
y_preds = loaded_model.predict(holdout_df)

In [12]:
# Display the predictions (the array repr is truncated for long arrays).
y_preds

array([0, 0, 0, ..., 1, 1, 1])

In [13]:
# Mean of the predicted labels; for 0/1 labels this is the share predicted as class 1.
y_preds.mean()

0.26369817357685643

In [14]:
# Sanity check: number of predictions produced.
y_preds.shape

(7501,)

In [None]:
# Label every prediction with the row label of the feature row it was made from,
# so the file can be matched back to the feature frame; header=False is the
# documented way to write no header line.
pd.DataFrame(y_preds, index=holdout_df.index).to_csv('_default_preds_JV.csv', header=False)