## Import Dependencies

In [10]:
import csv
from pathlib import Path
import pandas as pd
import glob
from datetime import datetime, timedelta
import numpy as np
# from sklearn import preprocessing
import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, RidgeCV
# from sklearn.metrics import mean_squared_error
import seaborn as sns
import pandas_profiling as pp
from settings import *
# from settings import DATA_DIR

In [11]:
download_dir = 'D:\Performance_All'
download_dir

'D:\\Performance_All'

In [12]:
DATA_DIR

'data'

In [13]:
outputFileName = 'FMPerfProcessedALL.csv'
outpath = "/".join([DATA_DIR,outputFileName])
outpath

'data/FMPerfProcessedALL.csv'

# Get the Single-Family Loan Performance Data Files from Fannie Mae

From the Fannie Mae [website:](https://loanperformancedata.fanniemae.com/lppub/index.html#Single-Family_Loan_Performance_Data_Files) we downloaded the Performance dataset (26GB zip file)

The [Performance file layout](https://loanperformancedata.fanniemae.com/lppub-docs/FNMA_SF_Loan_Performance_File_layout.pdf) was as follows:
![Performance File Layout](images/PerformanceFileLayout.jpg)

In [14]:
# Create mapping for the header of the files based on above file format
allheaderline = ['id','rptPeriod','sellerName','currIntRate','currUPB','loanAge','monMatur','adjMonMatur',\
              'maturDate','mSA','deliqStatus','modFlag','zeroBalCode','zeroBalDate','lastPdInstDate',\
              'forecloDate','dispDate','forecloCost','propRepCost','recovCosts','miscCost','holdTaxCost',\
              'saleProceed','credEnhProceed','repurchProceed','otherForecloProceed','nonIntUPB',\
              'prinForgivBal','repurchMakeWholeProceedFlg','forecloPrinWriteOffAmnt','servActivIndicator']
colchoices = [0,9,12,16]
headerline = ['id','mSA','zeroBalCode','forcloDate']

In [15]:
# Loop through each file from the unzipped download file - which gave a set of 76 individual files

#Create an empty array to hold dataframes to later concatonate
li = []

#Loop through every file in the directory and append into array
for in_path in Path(download_dir).glob('Performance*.txt'):
    df = pd.read_csv(in_path, sep="|", index_col=None, header=None, usecols=colchoices, low_memory=False)
    df.columns=headerline
    df.drop_duplicates(subset = 'id', keep='last', inplace=True)
    li.append(df)

In [16]:
len(li)

78

In [17]:
#Concatonate all the array elements into one giant dataframe
processed_df = pd.concat(li, axis=0)
#df.columns = allheaderline

In [18]:
processed_df.shape

(41294719, 4)

In [19]:
processed_df.head()

Unnamed: 0,id,mSA,zeroBalCode,forcloDate
39,100007365142,0,1.0,
55,100007386460,47900,1.0,
95,100011322040,41180,1.0,
153,100015192562,46660,1.0,
184,100015874399,45300,1.0,


In [20]:
#Review the columns for datatypes and how many non-null values 
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41294719 entries, 39 to 4260470
Data columns (total 4 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   mSA          int64  
 2   zeroBalCode  float64
 3   forcloDate   object 
dtypes: float64(1), int64(2), object(1)
memory usage: 1.5+ GB


In [21]:
processed_df.value_counts()

id            mSA    zeroBalCode  forcloDate
999997685687  40140  9.0          07/01/2014    1
400152441692  37620  9.0          10/01/2008    1
400148122696  41700  9.0          04/01/2012    1
400147529135  40140  3.0          01/01/2012    1
400145249111  14460  9.0          12/01/2010    1
                                               ..
700507706266  43900  2.0          09/01/2018    1
700506198897  19820  9.0          05/01/2012    1
700503637662  33100  2.0          09/01/2015    1
700503574657  33100  3.0          09/01/2011    1
100001031040  12580  9.0          08/01/2014    1
Length: 618401, dtype: int64

In [22]:
processed_df['zeroBalCode'].fillna(0, inplace=True)

In [23]:
processed_df['forcloDate'].fillna(0, inplace=True)

In [24]:
processed_df.loc[processed_df['forcloDate'] != 0, 'forcloDate'] =1

In [25]:
processed_df['zeroBalCode'] = processed_df['zeroBalCode'].astype(int)

In [26]:
processed_df['forcloDate'] = processed_df['forcloDate'].astype(int)

In [27]:
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41294719 entries, 39 to 4260470
Data columns (total 4 columns):
 #   Column       Dtype
---  ------       -----
 0   id           int64
 1   mSA          int64
 2   zeroBalCode  int32
 3   forcloDate   int32
dtypes: int32(2), int64(2)
memory usage: 1.2 GB


In [28]:
#problem = processed_df[processed_df['forcloDate'] == 0 & processed_df['zeroBalCode'] != 0]

# Write the result df to csv

In [29]:
processed_df.to_csv(outpath)

In [30]:
processed_df.forcloDate.value_counts()

0    40676318
1      618401
Name: forcloDate, dtype: int64

In [31]:
processed_df.zeroBalCode.value_counts()

1     29095940
0     11434064
9       440959
3       105884
6        71346
16       62088
2        54151
15       30287
Name: zeroBalCode, dtype: int64