In [1]:
import settings as st
import os
import pandas as pd
import numpy as np

In [3]:
'''
*************** STAGE 1: DATA GATHERING ***********************
Assemble all the different .txt files (Aquisition and Performance) into 2 files :
We’ll first need to define the headers for each file
https://loanperformancedata.fanniemae.com/lppub-docs/lppub_file_layout.pdf
'''

HEADERS = {
    "Acquisition": [
        "id", "channel", "seller", "interest_rate", "balance", "loan_term",
        "origination_date", "first_payment_date", "ltv", "cltv",
        "borrower_count", "dti", "borrower_credit_score",
        "first_time_homebuyer", "loan_purpose", "property_type", "unit_count",
        "occupancy_status", "property_state", "zip", "insurance_percentage",
        "product_type", "co_borrower_credit_score","mortgage_insurance_type","relocation_mortgage_indicator"
    ],
    "Performance": [
        "id", "reporting_period", "servicer_name", "interest_rate", "balance",
        "loan_age", "months_to_maturity", "maturity_date", "msa",
        "delinquency_status", "modification_flag", "zero_balance_code",
        "zero_balance_date", "last_paid_installment_date", "foreclosure_date",
        "disposition_date", "foreclosure_costs", "property_repair_costs",
        "recovery_costs", "misc_costs", "tax_costs", "sale_proceeds",
        "credit_enhancement_proceeds", "repurchase_proceeds",
        "other_foreclosure_proceeds", "non_interest_bearing_balance",
        "principal_forgiveness_balance","REPURCHASE_MAKE_WHOLE_PROCEEDS_FLAG",
        "FORECLOSURE_PRINCIPAL_WRITE_OFF_AMOUNT","SERVICING_ACTIVITY_INDICATOR"
    ]
}

# We will now define the columns we want to keep. Since all we’re measuring on an ongoing basis about the loan is
# whether or not it was ever foreclosed on, we can discard many of the columns in the performance data.
# We’ll need to keep all the columns in the acquisition data, though, because we want to maximize the
# information we have about when the loan was acquired

SELECT = {
    "Acquisition": HEADERS["Acquisition"],
    "Performance": ["id", "foreclosure_date"]
}


def concatenate(prefix="Acquisition"):
    dir_files = os.listdir(st.DATA_DIR)
    full_file = []
    # concatenate all the files in the directory

    for f in dir_files:
        if f.startswith(prefix):
            read_file = pd.read_csv(
                os.path.join(st.DATA_DIR, f),
                sep='|',
                header=None,
                names=HEADERS[prefix],
                index_col=False,
                error_bad_lines=False,
                low_memory=False)
            read_file = read_file[SELECT[prefix]]
            full_file.append(read_file)
        else:
            continue
    full_file = pd.concat(full_file, axis=0)

    test = full_file.head(5)
    # get shape of the file
    print(full_file.shape)

    # convert the processed files -
    full_file.to_csv(
        os.path.join(st.PROCESSED_DIR, "{}.csv".format(prefix)), index=False)

#concatenate()
if __name__ == "__main__":
    concatenate("Acquisition")
    concatenate("Performance")


(4579594, 25)


CParserError: Error tokenizing data. C error: out of memory

In [3]:
'''
******************** STAGE 2: DATA CLEANING AND TRANSFORMATION ***************
Function to counts number of performance rows for each Loan Id.
counts dictionary: get the count of id and their count of occurance
'''


def count_performance_rows():
    foreclosure_counts = {}
    # read data from files
    # NOTE: Opening a file handler to read rather than Pandas.
    # Reason : We want to re
    with open(os.path.join(st.PROCESSED_DIR, "Performance.csv"), "r") as f:
        for i, line in enumerate(f):
            if i == 0:
                continue
            loan_id, date = line.split(",")
            loan_id = int(loan_id)
            if loan_id not in foreclosure_counts:
                foreclosure_counts[loan_id] = {
                    "foreclosure_status": 0,
                    "performance_count": 0
                }  # initialize dict

            foreclosure_counts[loan_id]["performance_count"] += 1
            if len(date.strip()) > 0:
                foreclosure_counts[loan_id]["foreclosure_status"] = 1
    return foreclosure_counts


'''
Function to extract values from the dictionary if a loan_id and a key are passed in
This function will enable us to assign a foreclosure_status value and a performance_count value to each row in the Acquisition data.
'''


def get_summary_of_performance(loan_id, key, foreclosure_count_dictionary):
    #The get method on dictionaries returns a default value if a key isn’t found.
    summary_value = foreclosure_count_dictionary.get(
    loan_id, {
            "foreclosure_status": 0,
            "performance_count": 0
        })
    return summary_value[key]


'''
This step will involve:
A. data transformation -
- Converting all columns to numeric.
- Filling in any missing values.
- Assigning a performance_count and a foreclosure_status to each row.
- Removing any rows that don’t have a lot of performance history (where performance_count is low).
B. data cleaning - 
This will prepare training dataset that can be used in a machine learning algorithm.
There are few different category codes, like R, S, that will be converted to 1 , 2.
For columns that contain dates we will split them into 2 columns - Month , Year
'''


def data_transform(acquisition, counts):
    # add "foreclosure_status" column in acquisition dataframe by getting the values from the counts dictionary
    acquisition["foreclosure_status"] = acquisition["id"].apply(
    lambda x: get_summary_of_performance(x, "foreclosure_status", counts))

    # add "performance_count" column in acquisition dataframe by getting the values from the counts dictionary.
    acquisition["performance_count"] = acquisition["id"].apply(
    lambda x: get_summary_of_performance(x, "performance_count", counts))

    # convert following columns to int - These are category variables
    #["channel","seller","first_time_homebuyer","loan_purpose","property_type","occupancy_status","property_state","product_type"]
    string_columns = [
        "channel", "seller", "first_time_homebuyer", "loan_purpose",
        "property_type", "occupancy_status", "property_state", "product_type"
    ]
    for column in string_columns:
        acquisition[column] = acquisition[column].astype("category").cat.codes

    # convert date values - "first_payment_date" and "origination_date"
    for date in ["first_payment", "origination"]:
        cols = "{}_date".format(date)
        acquisition["{}_month".format(date)] = pd.to_numeric(
            acquisition[cols].str.split('/').str.get(0))
        acquisition["{}_year".format(date)] = pd.to_numeric(
            acquisition[cols].str.split('/').str.get(1))

    acquisition = acquisition.fillna(-1)
    acquisition = acquisition[
        acquisition["performance_count"] > st.MINIMUM_TRACKING_QUARTERS]
    return acquisition


'''
Read the Acquisition dataset
'''


def read():
    acquisition = pd.read_csv(
        os.path.join(st.PROCESSED_DIR, "Acquisition.csv"))
    return acquisition


'''
write the training dataset to train.csv
'''


def write(acquisition):
    acquisition.to_csv(
        os.path.join(st.PROCESSED_DIR, "train.csv"), index=False)


if __name__ == "__main__":
    acquisition = read()
    counts = count_performance_rows()
    acquisition = data_transform(acquisition, counts)
    write(acquisition)


In [5]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
'''
********************* STAGE 3: MAKING PREDICTION ************************
The training dataset is very unbalanced with very few loans foreclosed compared to the loans that
were not foreclosed. This would make a biased prediction.
Therefore to predict foreclosures correctly this imbalance needs to be accounted.
False Negative is more dangerous here than False Positive, as False Negative would indicate 
a risky loan being acquired.
Error Metric: FP / FN
Classifier : Logistic Regression (sklearn library) to classify the loan. The class needs to be
balanced as the dataset is biased to non-foreclosures. use : class_weight = balanced
OverFitting : Cross Validation (sklearn library) to counter OverFitting of model.
'''


def prediction_model(train):
    model = LogisticRegression(random_state=1, class_weight="balanced")

    predictors = train.columns.tolist()

    predictors = [p for p in predictors if p not in st.NON_PREDICTORS]

    predictions = cross_validation.cross_val_predict(
        model, train[predictors], train[st.TARGET], cv=st.CV_FOLDS)

    return predictions


def compute_error(target, predictions):
    return metrics.accuracy_score(target, predictions)


def compute_false_negatives(target, predictions):
    false_negatives = pd.DataFrame({
        "target": target,
        "predictions": predictions
    })
    neg_rate = false_negatives[(false_negatives["target"] == 1) & (
        false_negatives["predictions"] == 0)].shape[0] / (
            false_negatives[(false_negatives["target"] == 1)].shape[0] + 1)
    return neg_rate


def compute_false_positive(target, predictions):
    false_positives = pd.DataFrame({
        "target": target,
        "predictions": predictions
    })
    pos_rate = false_positives[(false_positives["target"] == 0) & (
        false_positives["predictions"] == 1)].shape[0] / (
            false_positives[(false_positives["target"] == 0)].shape[0] + 1)
    return pos_rate


def read():
    train = pd.read_csv(os.path.join(st.PROCESSED_DIR, "train.csv"))
    return train


if __name__ == "__main__":
    train = read()
    train = train.drop(["origination_date", "first_payment_date"], 1)
    predictions = prediction_model(train)
    print("Prediction model done")
    model_error = compute_error(train[st.TARGET], predictions)
    FN = compute_false_negatives(train[st.TARGET], predictions)
    FP = compute_false_positive(train[st.TARGET], predictions)
    print("Accuracy of the model:{}".format(model_error))
    print("False Negatives:{}".format(FN))
    print("False Positive:{}".format(FP))


Prediction model done
Accuracy of the model:0.779409198198
False Negatives:0
False Positive:0
