In [1]:
import sys, os
path = os.path.realpath('./scripts/')
sys.path.append(path)
import pandas as pd
pd.options.display.max_columns = 100
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
import sklearn.ensemble as ske
from mung import DataSource



In [2]:
# Load the raw loan and institution CSVs and build the working DataFrame.
# The files are opened in a `with` block so both handles are closed once
# DataSource has consumed them (previously they stayed open for the whole
# kernel session).
with open('input_data/2012_to_2014_loans_data.csv') as loans_csv, \
        open('input_data/2012_to_2014_institutions_data.csv') as institutions_csv:
    source = DataSource(loans_csv=loans_csv, institutions_csv=institutions_csv)
    df = source.hmda_init()
    # Fetch the quality report while the files are still open, in case
    # DataSource reads from the handles lazily.
    quality_report = source.quality
# A bare `df.describe()` used to sit here; its result was neither stored nor
# displayed (only the last expression of a cell is rendered), so it was dropped.
quality_report

Data Quality Report
Total records:1321158


Unnamed: 0,Data Type,Present Values,Missing Values,Unique Values,Minimum Values,Maximum Values
Agency_Code,int64,1321158,0,6.0,,
Applicant_Income_000,float64,1203305,117853,2175.0,0.0,9999.0
As_of_Year,int64,1321158,0,3.0,,
Census_Tract_Number,object,1321158,0,3019.0,,
County_Code,float64,1320321,837,136.0,,
FFIEC_Median_Family_Income,float64,1319481,1677,71.0,45300.0,113400.0
Loan_Amount_000,int64,1321158,0,2944.0,1.0,99625.0
MSA_MD,object,1321158,0,30.0,,
Number_of_Owner_Occupied_Units,float64,1319284,1874,1795.0,4.0,3632.0
Respondent_ID,object,1321158,0,1384.0,,


In [3]:
# Drop some columns for better performance.
# NOTE: this cell rebinds `df`; re-running it on the already-trimmed frame
# will fail because the labels are no longer present.
UNUSED_COLUMNS = [
    'Agency_Code',
    'County_Code',
    'MSA_MD',
    'Sequence_Number',
    'MSA_MD_Description',
    'Agency_Code_Description',
    'Loan_Type_Description',
    'State',
    'County_Name',
    'Conventional_Status',
    'Conforming_Status',
    'Conventional_Conforming_Flag',
    'Respondent_Name_TS',
    'Loan_Amount_Groupby_Purpose',
    'Income_Group',
    'Loan_Amount_000',
    'Respondent_ID',
    'Lien_Status_Description',
]
df = df.drop(UNUSED_COLUMNS, axis=1)
df.columns.values

array(['Applicant_Income_000', 'As_of_Year', 'Census_Tract_Number',
       'FFIEC_Median_Family_Income', 'Number_of_Owner_Occupied_Units',
       'State_Code', 'Tract_to_MSA_MD_Income_Pct',
       'Loan_Purpose_Description', 'Conforming_Limit_000'], dtype=object)

In [4]:
# scikit-learn is not suited to inputs with missing values, so every row
# that contains at least one missing value is discarded here.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1200657 entries, 0 to 1320857
Data columns (total 9 columns):
Applicant_Income_000              1200657 non-null float64
As_of_Year                        1200657 non-null int64
Census_Tract_Number               1200657 non-null object
FFIEC_Median_Family_Income        1200657 non-null float64
Number_of_Owner_Occupied_Units    1200657 non-null float64
State_Code                        1200657 non-null int64
Tract_to_MSA_MD_Income_Pct        1200657 non-null float64
Loan_Purpose_Description          1200657 non-null object
Conforming_Limit_000              1200657 non-null float64
dtypes: float64(5), int64(2), object(2)
memory usage: 91.6+ MB


In [5]:
# Prepare the data for the machine-learning algorithms (categorize)
def preprocess_dataframe(dataframe):
    """Return a copy of ``dataframe`` with its two object columns label-encoded.

    ``Census_Tract_Number`` and ``Loan_Purpose_Description`` are replaced by
    the integer codes returned by ``LabelEncoder.fit_transform``. The frame
    passed in is copied first and is not modified.
    """
    encoded = dataframe.copy()
    encoder = preprocessing.LabelEncoder()
    # The same encoder object is re-fitted for each column; only the encoded
    # values are kept, the fitted encoder itself is discarded on return.
    for column in ('Census_Tract_Number', 'Loan_Purpose_Description'):
        encoded[column] = encoder.fit_transform(encoded[column])
    return encoded
# Process our loan data: `processed_df` is an encoded copy, `df` itself is
# left unchanged because preprocess_dataframe works on a copy.
processed_df = preprocess_dataframe(df)

In [6]:
# Feature matrix: every column except the target. Target vector: the
# label-encoded loan purpose.
x = processed_df.drop(['Loan_Purpose_Description'], axis=1).values
y = processed_df['Loan_Purpose_Description'].values
# Hold out 20% of the rows for testing. random_state is fixed (same seed as
# the ShuffleSplit validator) so the split, and every score computed on it,
# is reproducible after a kernel restart; previously it changed on each run.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(
    x, y, test_size=0.2, random_state=0)

In [7]:
# Cross-validation scheme shared by all classifiers: 20 random splits, each
# holding out 20% of the rows, seeded so the splits are reproducible.
shuffle_validator = cross_validation.ShuffleSplit(len(x), n_iter=20, test_size=0.2, random_state=0)


def test_classifier(clf):
    """Cross-validate ``clf`` on the full ``x``/``y`` data and print the result.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``cross_val_score`` together with the
        module-level ``shuffle_validator``.

    Returns
    -------
    None
        The mean and standard deviation of the per-split scores are printed.
    """
    scores = cross_validation.cross_val_score(clf, x, y, cv=shuffle_validator)
    # The spread is printed with four decimals: with two decimals it was
    # rendered as an uninformative "+/- 0.00".
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std()))

In [None]:
# Decision Tree, depth capped at 12, scored with the shared cross-validation
# helper. random_state is fixed so repeated runs build the same tree.
clf_dt = tree.DecisionTreeClassifier(max_depth=12, random_state=0)
test_classifier(clf_dt)

In [9]:
# Random Forest with 50 trees. Unlike the other models it is scored on the
# single hold-out split (x_test / y_test) rather than through
# test_classifier. random_state is fixed so the forest, and therefore the
# reported score, is reproducible.
clf_rf = ske.RandomForestClassifier(n_estimators=50, random_state=0)
clf_rf.fit(x_train, y_train)
clf_rf.score(x_test, y_test)

0.60042809787949958

In [10]:
# Gradient Boosting with 50 boosting stages, scored with the shared
# cross-validation helper. random_state is fixed for reproducible fits.
clf_gb = ske.GradientBoostingClassifier(n_estimators=50, random_state=0)
test_classifier(clf_gb)

Accuracy: 0.6530 (+/- 0.00)


In [None]:
# Voting: hard-voting ensemble of the decision tree, random forest and
# gradient boosting classifiers, scored with the shared cross-validation helper.
voting_members = [('dt', clf_dt), ('rf', clf_rf), ('gb', clf_gb)]
eclf = ske.VotingClassifier(estimators=voting_members, voting='hard')
test_classifier(eclf)