In [174]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ORIG: loan origination data

In [175]:
# Read the origination column names, one per line, from headers_orig.txt.
# The `with` block closes the file handle as soon as the names are read
# (the handle was previously left open and then shadowed by the list).
with open('headers_orig.txt', 'r') as headers_file:
    headers_orig = headers_file.read().strip().split('\n')

# The data file is parsed with "|" as the separator and the column names loaded above;
# index_col=False tells pandas not to use the first column as the index.
orig = pd.read_csv("sample_orig_2018.txt", index_col=False, names=headers_orig, sep="|")
orig.head()

Unnamed: 0,Credit Score,First Payment Date,First Time Homebuyer Flag,Maturity Date,Metropolitan Statistical Area (MSA) Or Metropolitan Division,Mortgage Insurance Percentage (MI %),Number of Units,Occupancy Status,Original Combined Loan-to-Value (CLTV),Original Debt-to-Income (DTI) Ratio,Original UPB,Original Loan-to-Value (LTV),Original Interest Rate,Channel,Prepayment Penalty Mortgage (PPM) Flag,Amortization Type (Formerly Product Type),Property State,Property Type,Postal Code,Loan Sequence Number,Loan Purpose,Original Loan Term,Number of Borrowers,Seller Name,Servicer Name,Super Conforming Flag,Pre-HARP Loan Sequence Number,Program Indicator,HARP Indicator,Property Valuation Method,Interest Only (I/O) Indicator
0,784,201803,9,204802,49180.0,0,1,I,75,39,74000,75,4.625,R,N,FRM,NC,PU,27100,F18Q10000027,C,360,1,Other sellers,Other servicers,,,9,,2,N
1,693,201803,N,203302,24340.0,0,1,P,80,41,132000,80,3.25,R,N,FRM,MI,SF,49300,F18Q10000052,P,180,2,Other sellers,Other servicers,,,9,,1,N
2,731,201803,9,204802,,0,1,P,45,37,175000,45,3.75,R,N,FRM,MN,SF,56500,F18Q10000083,C,360,1,Other sellers,Other servicers,,,9,,2,N
3,763,201802,Y,204801,,30,1,P,95,34,27000,95,4.25,R,N,FRM,WI,SF,54500,F18Q10000116,P,360,1,Other sellers,Other servicers,,,9,,2,N
4,755,201803,9,204802,49700.0,0,1,P,80,33,168000,80,3.875,R,N,FRM,CA,PU,95900,F18Q10000139,C,360,2,Other sellers,Other servicers,,,9,,2,N


# SVCG: monthly loan servicing data

In [26]:
# Read the servicing column names, one per line, from headers_svcg.txt.
# The `with` block closes the file handle as soon as the names are read
# (the handle was previously left open and then shadowed by the list).
with open('headers_svcg.txt', 'r') as headers_file:
    headers_svcg = headers_file.read().strip().split('\n')

# The data file is parsed with "|" as the separator and the column names loaded above;
# index_col=False tells pandas not to use the first column as the index.
svcg = pd.read_csv("sample_svcg_2018.txt", index_col=False, names=headers_svcg, sep="|")
svcg.head()

Unnamed: 0,Loan Sequence Number,Monthly Reporting Period,Current Actual UPB,Current Loan Delinquency Status,Loan Age,Remaining Months to Legal Maturity,Repurchase Flag,Modification Flag,Zero Balance Code,Zero Balance Effective Date,...,Miscellaneous Expenses,Actual Loss Calculation,Modification Cost,Step Modification Flag,Deferred Payment Plan,Estimated Loan-to-Value (ELTV),Zero Balance Removal UPB,Delinquent Accrued Interest,Delinquency Due to Disaster,Borrower Assistance Status Code
0,F18Q10000027,201802,74000.0,0,0,360,,,,,...,,,,,,75.0,,,,
1,F18Q10000027,201803,74000.0,0,1,359,,,,,...,,,,,,73.0,,,,
2,F18Q10000027,201804,73000.0,0,2,358,,,,,...,,,,,,69.0,,,,
3,F18Q10000027,201805,73000.0,0,3,357,,,,,...,,,,,,69.0,,,,
4,F18Q10000027,201806,73000.0,0,4,356,,,,,...,,,,,,69.0,,,,


# Join SVCG+Orig -> new_sample.csv

In [24]:
# Left-join the first 50,000 servicing rows onto the origination columns,
# matching on "Loan Sequence Number", then write the result to new_sample.csv.
# NOTE(review): a later cell writes the same file again after the Default column
# has been added to svcg, so this export looks like an early draft -- confirm.
new_svcg = (
    svcg.head(50000)
    .join(orig.set_index("Loan Sequence Number"), on="Loan Sequence Number", how="left")
)
new_svcg.to_csv('new_sample.csv', index=False)

In [25]:
# NOTE: unused draft, kept for reference; the helper functions defined further down
# perform this recoding instead.
# zero_balance_codes = {1: 0, 2: 0, 3: 1, 6: 1, 9: 1, 15: 0, np.nan:0}
# svcg['Zero Balance Code'] = svcg['Zero Balance Code'].replace(zero_balance_codes)
# svcg['Current Loan Delinquency Status'] = svcg['Current Loan Delinquency Status'].replace({'R': 100, 'XX': 100}).astype(int)

In [6]:
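# NOTE: unused draft, kept for reference. `make_column_transformer` is not among the
# imports at the top of the notebook, so it would have to be imported before enabling this.
# bi_1 = Binarizer(threshold=2)
# ct_1 = make_column_transformer(
#     (bi_1, ['Current Loan Delinquency Status']),
#     remainder='passthrough'
# )

# new_svcg = pd.DataFrame(ct_1.fit_transform(svcg))
# new_svcg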

In [27]:
def current_loan_helper(val):
    """Turn a delinquency status value into a 0/1 flag.

    Args:
        val: Raw status value; anything accepted by ``int()`` (e.g. ``2`` or
            ``'12'``) or a non-numeric marker such as ``'R'`` or ``'XX'``.

    Returns:
        1 when the status converts to an integer >= 3, or when it cannot be
        converted to an integer at all (non-numeric text, None, NaN, infinity);
        0 otherwise.
    """
    try:
        status = int(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are treated as flagged. A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit while .apply() is running.
        return 1
    return 1 if status >= 3 else 0
    
def zero_balance_helper(val):
    """Return 1 when ``val`` equals one of the codes 3, 6 or 9, otherwise 0."""
    flagged_codes = (3, 6, 9)
    return 1 if val in flagged_codes else 0
    
def default_helper(val):
    """Collapse a numeric value to a flag: 1 if it is strictly positive, else 0."""
    return 1 if val > 0 else 0

In [28]:
# Recode 'Zero Balance Code' to a 0/1 flag with zero_balance_helper (1 for codes 3, 6, 9).
# NOTE: the column is overwritten in place. Re-running this cell on the already-recoded
# 0/1 values turns every row into 0, so reload svcg before running it again.
svcg['Zero Balance Code'] = svcg['Zero Balance Code'].apply(zero_balance_helper)

In [29]:
# Recode 'Current Loan Delinquency Status' to a 0/1 flag with current_loan_helper
# (1 when the status is an integer >= 3 or cannot be converted to an integer).
# NOTE: the column is overwritten in place. Re-running this cell on the already-recoded
# 0/1 values turns every row into 0, so reload svcg before running it again.
svcg['Current Loan Delinquency Status'] = svcg['Current Loan Delinquency Status'].apply(current_loan_helper)

In [30]:
# 'Default' is 1 when either recoded flag is set: the two 0/1 columns are summed
# and default_helper maps any positive sum to 1.
svcg = svcg.assign(
    Default=(
        svcg['Current Loan Delinquency Status'] + svcg['Zero Balance Code']
    ).apply(default_helper)
)

In [31]:
# Left-join the first 50,000 servicing rows onto the origination columns,
# matching on "Loan Sequence Number", then write the result to new_sample.csv.
new_svcg = (
    svcg.head(50000)
    .join(orig.set_index("Loan Sequence Number"), on="Loan Sequence Number", how="left")
)
new_svcg.to_csv('new_sample.csv', index=False)

# Feature Prep

In [176]:
# Keep only the first 16 columns of orig, selected by position.
# NOTE(review): this depends on the column order in headers_orig.txt. It also rebinds
# `orig` without 'Loan Sequence Number', so the join cells earlier in the notebook
# cannot be re-run against this trimmed frame without reloading orig first.
orig = orig.iloc[:, :16]
orig.head()

Unnamed: 0,Credit Score,First Payment Date,First Time Homebuyer Flag,Maturity Date,Metropolitan Statistical Area (MSA) Or Metropolitan Division,Mortgage Insurance Percentage (MI %),Number of Units,Occupancy Status,Original Combined Loan-to-Value (CLTV),Original Debt-to-Income (DTI) Ratio,Original UPB,Original Loan-to-Value (LTV),Original Interest Rate,Channel,Prepayment Penalty Mortgage (PPM) Flag,Amortization Type (Formerly Product Type)
0,784,201803,9,204802,49180.0,0,1,I,75,39,74000,75,4.625,R,N,FRM
1,693,201803,N,203302,24340.0,0,1,P,80,41,132000,80,3.25,R,N,FRM
2,731,201803,9,204802,,0,1,P,45,37,175000,45,3.75,R,N,FRM
3,763,201802,Y,204801,,30,1,P,95,34,27000,95,4.25,R,N,FRM
4,755,201803,9,204802,49700.0,0,1,P,80,33,168000,80,3.875,R,N,FRM


In [177]:
# Drop the date and MSA columns; they do not appear in the feature lists used for modelling.
drop_features = ['First Payment Date',
                 'Maturity Date',
                 'Metropolitan Statistical Area (MSA) Or Metropolitan Division']
orig = orig.drop(drop_features, axis=1)

# Recode placeholder / rare codes to concrete values.
# NOTE(review): 99, 999 and '9' look like "not available" markers, and 65 looks like a
# cap chosen for unknown DTI -- confirm against the dataset's data dictionary.
orig['Number of Units'] = orig['Number of Units'].replace({99: 1})
orig['Occupancy Status'] = orig['Occupancy Status'].replace({'9': 'P'})
orig['Original Debt-to-Income (DTI) Ratio'] = orig['Original Debt-to-Income (DTI) Ratio'].replace({999: 65})

# Fill the 999 placeholder in LTV with the (truncated) mean of the real observations.
# The mean is taken with the 999 rows excluded; including them would pull the
# fill value upward by the very placeholder being replaced.
ltv = orig['Original Loan-to-Value (LTV)']
ltv_known_mean = ltv[ltv != 999].mean()
if pd.notna(ltv_known_mean):
    avg = int(ltv_known_mean)
    orig['Original Loan-to-Value (LTV)'] = ltv.replace({999: avg})
# If no real LTV value exists there is nothing to impute from; the column is left as is.

# Fold the 'T' and '9' channel codes into 'R'.
orig['Channel'] = orig['Channel'].replace({'T': 'R', '9': 'R'})

# Encode the two-valued flags as 1/0.
orig['Prepayment Penalty Mortgage (PPM) Flag'] = orig['Prepayment Penalty Mortgage (PPM) Flag'].replace({'Y': 1, 'N': 0})
orig['Amortization Type (Formerly Product Type)'] = orig['Amortization Type (Formerly Product Type)'].replace({'FRM': 1, 'ARM': 0})

In [178]:
# Preview the first rows of orig after the column drops and recodes.
orig.head()

Unnamed: 0,Credit Score,First Time Homebuyer Flag,Mortgage Insurance Percentage (MI %),Number of Units,Occupancy Status,Original Combined Loan-to-Value (CLTV),Original Debt-to-Income (DTI) Ratio,Original UPB,Original Loan-to-Value (LTV),Original Interest Rate,Channel,Prepayment Penalty Mortgage (PPM) Flag,Amortization Type (Formerly Product Type)
0,784,9,0,1,I,75,39,74000,75,4.625,R,0,1
1,693,N,0,1,P,80,41,132000,80,3.25,R,0,1
2,731,9,0,1,P,45,37,175000,45,3.75,R,0,1
3,763,Y,30,1,P,95,34,27000,95,4.25,R,0,1
4,755,9,0,1,P,80,33,168000,80,3.875,R,0,1


In [179]:
# Columns that get standardised (zero mean, unit variance).
numerical_features = [
    'Credit Score',
    'Mortgage Insurance Percentage (MI %)',
    'Number of Units',
    'Original Combined Loan-to-Value (CLTV)',
    'Original Debt-to-Income (DTI) Ratio',
    'Original UPB',
    'Original Loan-to-Value (LTV)',
    'Original Interest Rate',
]

# Categorical columns that get one-hot encoded.
onehot_features = ['First Time Homebuyer Flag', 'Occupancy Status', 'Channel']

onehot_transformer = OneHotEncoder()
numerical_transformer = StandardScaler()

# One-hot block first, then the scaled numeric block; every column not listed
# above is appended unchanged because of remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_transformer, onehot_features),
        ('numerical', numerical_transformer, numerical_features),
    ],
    remainder='passthrough',
)

preprocessor.fit_transform(orig)

array([[1.0, 0.0, 0.0, ..., -0.23554714929288956, 0, 1],
       [0.0, 1.0, 0.0, ..., -3.1116445676191966, 0, 1],
       [1.0, 0.0, 0.0, ..., -2.0657909609550846, 0, 1],
       ...,
       [1.0, 0.0, 0.0, ..., -3.634571370951252, 0, 1],
       [1.0, 0.0, 0.0, ..., -3.4254006496184295, 0, 1],
       [1.0, 0.0, 0.0, ..., -3.634571370951252, 0, 1]], dtype=object)

# IGNORE

In [99]:
# Six-row toy frame with one missing 'Embarked' and one missing 'Age' value,
# used to try out imputers and encoders.
X = pd.DataFrame(dict(
    Fare=[7.25, 71.2833, 7.9250, 53.1, 8.05, 8.4583],
    Embarked=['S', 'C', 'S', 'S', 'S', np.nan],
    Sex=['male', 'female', 'female', 'female', 'male', 'male'],
    Age=[22, 38, 26, 35, 35, np.nan],
))

# Default-configured imputer and encoder instances.
imp = SimpleImputer()
ohe = OneHotEncoder()

In [107]:
# Column groups of the toy frame X.
categorical_features = ['Embarked', 'Sex']
numeric_features = ['Fare', 'Age']

# Numeric columns: fill missing values with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder()),
])

# Numeric block first, categorical block second.
preprocessor = ColumnTransformer([
    ('numerical', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
])

preprocessor.fit_transform(X)

array([[ 7.25  , 22.    ,  0.    ,  1.    ,  0.    ,  1.    ],
       [71.2833, 38.    ,  1.    ,  0.    ,  1.    ,  0.    ],
       [ 7.925 , 26.    ,  0.    ,  1.    ,  1.    ,  0.    ],
       [53.1   , 35.    ,  0.    ,  1.    ,  1.    ,  0.    ],
       [ 8.05  , 35.    ,  0.    ,  1.    ,  0.    ,  1.    ],
       [ 8.4583, 31.2   ,  0.    ,  1.    ,  0.    ,  1.    ]])

# IGNORE