## Import Dependencies

In [1]:
import csv
from pathlib import Path
import pandas as pd
import glob
from datetime import datetime, timedelta
import numpy as np
# from sklearn import preprocessing
import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, RidgeCV
# from sklearn.metrics import mean_squared_error
import seaborn as sns
import pandas_profiling as pp
from settings import *
# from settings import DATA_DIR

In [2]:
download_dir = 'D:\Performance_All'
download_dir

'D:\\Performance_All'

In [3]:
DATA_DIR

'data'

In [4]:
outputFileName = 'FMPerfProcessedALL.csv'
outpath = "/".join([DATA_DIR,outputFileName])
outpath

'data/FMPerfProcessedALL.csv'

# Get the Single-Family Loan Performance Data Files from Fannie Mae

From the Fannie Mae [website:](https://loanperformancedata.fanniemae.com/lppub/index.html#Single-Family_Loan_Performance_Data_Files) we downloaded the Performance dataset (26GB zip file)

The [Performance file layout](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_File_layout.pdf) was as follows:
![Performance File Layout](images/PerformanceFileLayout.jpg)

In [5]:
# Create mapping for the header of the files based on above file format
allheaderline = ['id','rptPeriod','sellerName','currIntRate','currUPB','loanAge','monMatur','adjMonMatur',\
              'maturDate','mSA','deliqStatus','modFlag','zeroBalCode','zeroBalDate','lastPdInstDate',\
              'forecloDate','dispDate','forecloCost','propRepCost','recovCosts','miscCost','holdTaxCost',\
              'saleProceed','credEnhProceed','repurchProceed','otherForecloProceed','nonIntUPB',\
              'prinForgivBal','repurchMakeWholeProceedFlg','forecloPrinWriteOffAmnt','servActivIndicator']
colchoices = [0,9,10,12,16]
headerline = ['id','mSA','deliqStatus','zeroBalCode']

In [6]:
# Loop through each file from the unzipped download file - which gave a set of 76 individual files

#Create an empty array to hold dataframes to later concatonate
li = []

#Loop through every file in the directory and append into array
for in_path in Path(download_dir).glob('Performance*.txt'):
    df = pd.read_csv(in_path, sep="|", index_col=None, header=None, usecols=colchoices)
    df.columns=headerline
    df = df.set_index('id')
    # Isolate the deliqStatus column to speed up aggregation calculations
    deliq = df[['deliqStatus']].copy()
    # deliqStatus has one value of 'X' amongst integer values and NaN values
    # First replace 'X' values with NaN
    deliq['deliqStatus'] = deliq['deliqStatus'].replace('X',np.nan).astype(float)
    # Determine the number of zero (0) values representing successful payments for a loan
    deliqGood = deliq[deliq['deliqStatus'] == 0].groupby('id').count()
    # Determine the number of values > 0 representing delinquent payments for a loan
    deliqBad = deliq[deliq['deliqStatus'] > 0].groupby('id').count()
    # Determine the max() of deliqStatus representing highest numberdelinquent payments in a row for a loan
    deliqMax = deliq[deliq['deliqStatus'] > 0].groupby('id').max()
#     # Select only rows that have a zeroBalCode with a value ## Commented out to include ALL records
#     zbc_df = df[df['zeroBalCode'].notnull()]
    df = df.drop(columns=['deliqStatus'])
    #Merge the zbc_df with the deliqGood
    zbc_merge = pd.merge(df, deliqGood, on=['id'])
    #Merge the zbc_df with the deliqBad
    zbc_merge = pd.merge(zbc_merge, deliqBad, on=['id'])
    #Merge the zbc_df with the deliqMax
    zbc_merge = pd.merge(zbc_merge, deliqMax, on=['id'])
    # Rename all the deqliq* columns
    zbc_merge = zbc_merge.rename(columns={'deliqStatus_x':'deliqGood','deliqStatus_y':'deliqBad','deliqStatus':'deliqMax'})
    # Add the zbc_merge df to the li list to be able to concat them later
    li.append(zbc_merge)

In [7]:
len(li)

78

In [8]:
#Concatonate all the array elements into one giant dataframe
processed_df = pd.concat(li, axis=0)
#df.columns = allheaderline

In [9]:
processed_df.shape

(443356941, 5)

In [10]:
processed_df.head()

Unnamed: 0_level_0,mSA,zeroBalCode,deliqGood,deliqBad,deliqMax
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100021703104,35980,,33,1,1.0
100021703104,35980,,33,1,1.0
100021703104,35980,,33,1,1.0
100021703104,35980,,33,1,1.0
100021703104,35980,,33,1,1.0


In [11]:
#Review the columns for datatypes and how many non-null values 
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 443356941 entries, 100021703104 to 999888644871
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   mSA          int64  
 1   zeroBalCode  float64
 2   deliqGood    int64  
 3   deliqBad     int64  
 4   deliqMax     float64
dtypes: float64(2), int64(3)
memory usage: 19.8 GB


### Convert the origDate,firstPmtDate columns from strings to datetime datatypes
Since this will be such a large dataset I tested a number of methods to change datatypes

In [12]:
processed_df.shape

(443356941, 5)

In [14]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 443356941 entries, 100021703104 to 999888644871
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   mSA          int64  
 1   zeroBalCode  float64
 2   deliqGood    int64  
 3   deliqBad     int64  
 4   deliqMax     float64
dtypes: float64(2), int64(3)
memory usage: 19.8 GB


In [None]:
# Change all occurences of columns with 'Y' and 'N' to 0 and 1
processed_df.replace('Y',1, inplace=True)
processed_df.replace('N',0, inplace=True)

In [None]:
processed_df.info()

In [19]:
processed_df.drop_duplicates( keep='last', inplace=True)

In [20]:
processed_df.head()

Unnamed: 0_level_0,mSA,zeroBalCode,deliqGood,deliqBad,deliqMax
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100023274028,12060,,20,11,4.0
100023274028,12060,1.0,20,11,4.0
100057706656,17140,6.0,13,1,1.0
100118647578,38900,,35,21,2.0
100118647578,38900,1.0,35,21,2.0


# Write the result df to csv

In [21]:
processed_df.to_csv(outpath)