In [0]:
# loading data with inferred schema

from pyspark.sql.functions import input_file_name, explode, split

# Single source of truth for the input path: the reader below loads exactly this file.
filename = "/FileStore/tables/loan_top250k.csv"

# Read the CSV with a header row and let Spark infer column types from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(filename)
)

# Register the DataFrame as a temporary view so it can be queried with Spark SQL as "loanData".
df.createOrReplaceTempView("loanData")


In [0]:
# Count rows on the cluster; collecting every row to the driver just to take len() is wasteful
# and can exhaust driver memory on a wide table.
df.count()

In [0]:
# Counting the rows of the loanData temp view through Spark SQL (sanity check on the load)

sqlDF = spark.sql("select count(*) from loanData ")
display(sqlDF)

count(1)
250000


In [0]:
# inferSchema was enabled at load time, so inspect which column types Spark actually inferred

df.printSchema()

In [0]:
# Class balance of the raw target: number of loans per loan_status, statuses in descending order.

(
    df.groupBy("loan_status")
    .count()
    .orderBy("loan_status", ascending=False)
    .show()
)

In [0]:
# Derive a binary target: 1 when loan_status is 'Charged Off', 0 for everything else
from pyspark.sql.functions import when

chargedOffFlag = when(df["loan_status"] == 'Charged Off', 1).otherwise(0)
df = df.withColumn("isChargeOff", chargedOffFlag)

# Check the resulting class counts, charged-off class first.
(
    df.groupBy("isChargeOff")
    .count()
    .orderBy("isChargeOff", ascending=False)
    .show()
)

In [0]:
# Replace missing values in numeric columns with 0 (a numeric fill value leaves string columns untouched)
df = df.fillna(0)

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
categoricalColumns = ["verification_status", "term", "grade", "sub_grade"]
stages = []  # ordered Pipeline stages; filled here and extended by the following cells
for catName in categoricalColumns:
    # Map each distinct category string to a numeric index in "<column>Index".
    indexerStage = StringIndexer(inputCol=catName, outputCol=catName + "Index")
    # One-hot encode that index into a sparse vector column "<column>classVec".
    encoderStage = OneHotEncoderEstimator(
        inputCols=[indexerStage.getOutputCol()],
        outputCols=[catName + "classVec"],
    )
    # Stages are only registered here; nothing runs until the Pipeline is fitted.
    stages.extend([indexerStage, encoderStage])

In [0]:
# Produce the numeric "label" column used by the classifiers from isChargeOff.
# NOTE(review): StringIndexer orders indices by descending frequency by default, so 0.0 goes to
# the most common isChargeOff value — confirm that this maps 0 -> 0.0 and 1 -> 1.0 on this data.
label_stringIdx = StringIndexer(inputCol="isChargeOff", outputCol="label")
stages.append(label_stringIdx)

In [0]:
# Combine the one-hot encoded categorical vectors and the raw numeric columns
# into the single "features" vector column.
numericCols = ["loan_amnt", "funded_amnt", "funded_amnt_inv", "annual_inc"]
assemblerInputs = [name + "classVec" for name in categoricalColumns]
assemblerInputs.extend(numericCols)
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [0]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [0]:
# Recursive Feature Elimination (RFE) with a linear SVM, used to rank candidate predictors of
# isChargeOff. scikit-learn cannot consume a Spark DataFrame, so only the columns of interest
# are pulled to the driver as a pandas frame first.
rfeColumns = categoricalColumns + numericCols
rfePdf = df.select(rfeColumns + ["isChargeOff"]).toPandas()

# One-hot encode the string columns so every feature handed to the SVM is numeric.
rfeFeatures = pd.get_dummies(rfePdf[rfeColumns], columns=categoricalColumns)
# Flatten the target into the 1-D array scikit-learn expects.
rfeTarget = np.ravel(rfePdf["isChargeOff"], order='C')

svm = LinearSVC()
# Keep the 5 strongest features; passed by keyword because newer scikit-learn rejects it positionally.
recurFS = RFE(svm, n_features_to_select=5)
recurFS = recurFS.fit(rfeFeatures, rfeTarget)
print(recurFS.support_)  # boolean mask over rfeFeatures columns: True = feature was kept
print(recurFS.ranking_)  # rank per column: 1 = selected, larger = eliminated earlier
print(list(rfeFeatures.columns[recurFS.support_]))  # names of the selected features

In [0]:
from pyspark.ml.classification import LogisticRegression
  
# Bundle the collected stages into one Pipeline, fit it on the full DataFrame,
# then apply it to add the index / one-hot / label / features columns.
partialPipeline = Pipeline(stages=stages)
pipelineModel = partialPipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)

In [0]:
### 70/30 random train/test split; the fixed seed keeps the split reproducible
trainingData, testData = preppedDataDF.randomSplit(weights=[0.7, 0.3], seed=100)
# Report the size of each split.
print(trainingData.count())
print(testData.count())

In [0]:
from pyspark.ml.classification import LogisticRegression

# Configure the logistic regression: reads "features", predicts "label", at most 50 iterations
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=50,
)

# Fit the model on the training split
lrModel = lr.fit(trainingData)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out test split, then compute area under the ROC curve
# (the evaluator's default metric, spelled out here for the reader).
predictions = lrModel.transform(testData)
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(predictions)

In [0]:
# Render the scored test rows: every input column plus rawPrediction, probability and prediction
display(predictions)

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardshi
p_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,isChargeOff,verification_statusIndex,verification_statusclassVec,termIndex,termclassVec,gradeIndex,gradeclassVec,sub_gradeIndex,sub_gradeclassVec,label,features,rawPrediction,probability,prediction
,,1000,1000,750.0,36 months,10.33,32.43,B,B1,Resource teacher,2 years,MORTGAGE,54000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,871xx,NM,4.37,2,Feb-1998,0,11.0,0.0,5,0,6283,16.8,15,f,952.15,714.11,64.29,48.22,47.85,16.44,0.0,0.0,0.0,Feb-2019,32.43,Mar-2019,Feb-2019,1,12.0,1,Individual,0.0,0.0,,0,0,154754,0,0,0,0,74.0,0,0.0,0,0,4854,17.0,37500,0,0,0,1,30951.0,31217.0,16.8,0,0,74.0,250,41,18,2,41.0,0.0,0.0,0.0,1,3,3,4,9,1,4,11,3,5,0.0,0,1,0,86.7,0.0,0,0,197500,6283,37500,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",8.0,"List(0, 34, List(8), List(1.0))",0.0,"List(0, 47, List(1, 2, 4, 17, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 750.0, 54000.0))","List(1, 2, List(), List(6.339828685192818, -6.339828685192818))","List(1, 2, List(), List(0.9982385038302689, 0.001761496169731182))",0.0
,,1000,1000,750.0,36 months,12.98,33.69,B,B5,teacher,10+ years,RENT,95000.0,Not Verified,Dec-2018,Current,n,,,vacation,Vacation,606xx,IL,9.1,1,May-2004,1,17.0,0.0,10,0,2968,10.1,18,f,954.01,715.5,65.58,49.19,45.99,19.59,0.0,0.0,0.0,Feb-2019,33.69,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,930,19910,2,2,1,2,4.0,16942,56.0,2,2,1987,33.0,29500,2,0,1,4,1991.0,19058.0,10.1,0,0,175.0,97,5,4,0,35.0,0.0,4.0,17.0,0,3,6,5,6,7,8,11,6,10,0.0,0,0,3,94.1,0.0,0,0,59533,19910,21200,30033,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",6.0,"List(0, 34, List(6), List(1.0))",0.0,"List(0, 47, List(0, 2, 4, 15, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 750.0, 95000.0))","List(1, 2, List(), List(6.575847004058515, -6.575847004058515))","List(1, 2, List(), List(0.9986083144754672, 0.0013916855245327889))",0.0
,,1000,1000,975.0,36 months,6.46,30.64,A,A1,,,RENT,13400.0,Not Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,481xx,MI,4.48,0,Jan-2008,1,0.0,71.0,5,1,991,7.5,7,f,949.35,925.62,60.92,59.4,50.65,10.27,0.0,0.0,0.0,Feb-2019,30.64,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,991,1,0,0,0,100.0,0,0.0,2,2,539,8.0,13200,0,0,1,2,330.0,12209.0,7.5,0,0,100.0,131,5,5,0,5.0,0.0,5.0,0.0,0,2,2,4,4,1,5,6,2,5,0.0,0,0,2,100.0,0.0,1,0,13200,991,13200,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,DirectPay,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",0.0,"List(0, 34, List(0), List(1.0))",0.0,"List(0, 47, List(0, 2, 3, 9, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 13400.0))","List(1, 2, List(), List(8.192554155605697, -8.192554155605697))","List(1, 2, List(), List(0.9997233703150301, 2.766296849698963E-4))",0.0
,,1000,1000,975.0,36 months,6.46,30.64,A,A1,Director of Strategic Accounts,10+ years,RENT,100000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,335xx,FL,7.37,0,Dec-1995,0,0.0,0.0,9,0,21778,27.9,31,f,949.35,925.62,60.92,59.4,50.65,10.27,0.0,0.0,0.0,Feb-2019,30.64,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,41880,0,1,1,2,8.0,20102,67.0,1,2,9899,39.0,78000,0,5,0,4,4653.0,33154.0,38.0,0,0,276.0,163,7,7,3,20.0,0.0,0.0,0.0,0,5,6,5,7,12,7,16,6,9,0.0,0,0,2,100.0,20.0,0,0,108250,41880,53500,30250,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",0.0,"List(0, 34, List(0), List(1.0))",0.0,"List(0, 47, List(0, 2, 3, 9, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 100000.0))","List(1, 2, List(), List(8.309904983913553, -8.309904983913553))","List(1, 2, List(), List(0.9997539931116661, 2.4600688833391966E-4))",0.0
,,1000,1000,975.0,36 months,7.56,31.14,A,A3,,,RENT,9500.0,Not Verified,Dec-2018,Current,n,,,other,Other,295xx,SC,12.26,0,Mar-2004,1,0.0,0.0,7,0,1914,11.5,10,f,950.16,926.41,61.86,60.31,49.84,12.02,0.0,0.0,0.0,Feb-2019,31.14,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,1914,1,0,1,1,9.0,0,0.0,2,2,950,12.0,16600,1,0,2,3,319.0,4859.0,25.2,0,0,9.0,177,1,1,0,8.0,0.0,2.0,0.0,0,2,4,2,4,1,7,9,4,7,0.0,0,0,3,100.0,0.0,0,0,16600,1914,6500,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",9.0,"List(0, 34, List(9), List(1.0))",0.0,"List(0, 47, List(0, 2, 3, 18, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 9500.0))","List(1, 2, List(), List(9.480923823554324, -9.480923823554324))","List(1, 2, List(), List(0.9999237123973148, 7.62876026850737E-5))",0.0
,,1000,1000,975.0,36 months,8.19,31.43,A,A4,,,OWN,38400.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,368xx,AL,7.06,0,Dec-1982,0,55.0,0.0,13,0,1659,1.8,26,f,950.62,926.86,62.4,60.84,49.38,13.02,0.0,0.0,0.0,Feb-2019,31.43,Mar-2019,Feb-2019,0,55.0,1,Individual,0.0,0.0,,0,0,1659,0,0,0,2,21.0,0,0.0,0,2,518,2.0,91000,0,3,0,4,128.0,62802.0,2.3,0,0,142.0,432,13,13,1,13.0,55.0,13.0,55.0,0,7,8,8,10,6,13,19,8,13,0.0,0,0,0,92.3,0.0,0,0,91000,1659,64300,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",1.0,"List(0, 34, List(1), List(1.0))",0.0,"List(0, 47, List(0, 2, 3, 10, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 38400.0))","List(1, 2, List(), List(6.90984324658556, -6.90984324658556))","List(1, 2, List(), List(0.9990030806293223, 9.969193706776984E-4))",0.0
,,1000,1000,975.0,36 months,8.19,31.43,A,A4,,,RENT,10824.0,Source Verified,Dec-2018,Current,n,,,medical,Medical expenses,321xx,FL,5.99,0,Jul-2004,1,0.0,0.0,5,0,203,1.5,32,f,950.62,926.86,62.18,60.63,49.38,12.8,0.0,0.0,0.0,Feb-2019,31.43,Mar-2019,Feb-2019,0,0.0,1,Joint App,23124.0,2.8,Source Verified,0,0,203,2,0,0,0,163.0,0,0.0,2,4,172,2.0,14000,0,0,1,4,51.0,2828.0,5.7,0,0,173.0,99,5,5,0,13.0,0.0,5.0,0.0,0,1,2,2,4,25,5,7,2,5,0.0,0,0,2,100.0,0.0,0,0,14000,203,3000,0,203.0,Aug-2005,0.0,0.0,4.0,1.7,0.0,11.0,0.0,1.0,4.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",1.0,"List(0, 34, List(1), List(1.0))",0.0,"List(0, 47, List(1, 2, 3, 10, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 10824.0))","List(1, 2, List(), List(6.6182903840861, -6.6182903840861))","List(1, 2, List(), List(0.9986660691979573, 0.0013339308020427361))",0.0
,,1000,1000,975.0,36 months,8.19,31.43,A,A4,Police Officer,3 years,MORTGAGE,76000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,618xx,IL,14.31,0,Aug-2010,1,0.0,0.0,12,0,842,4.6,14,f,950.62,926.86,62.4,60.84,49.38,13.02,0.0,0.0,0.0,Feb-2019,31.43,Mar-2019,Feb-2019,0,0.0,1,Joint App,118000.0,24.24,Source Verified,0,0,185233,1,9,0,0,25.0,43055,69.0,1,2,421,42.0,18500,0,1,2,3,15436.0,17658.0,4.6,0,0,100.0,42,1,1,1,1.0,0.0,1.0,0.0,0,2,2,2,2,10,2,3,2,12,0.0,0,0,2,100.0,0.0,0,0,224445,43897,18500,63445,28930.0,Jun-2004,1.0,0.0,17.0,45.4,4.0,25.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",1.0,"List(0, 34, List(1), List(1.0))",0.0,"List(0, 47, List(1, 2, 3, 10, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 76000.0))","List(1, 2, List(), List(6.706609755746524, -6.706609755746524))","List(1, 2, List(), List(0.9987786907152147, 0.0012213092847852833))",0.0
,,1000,1000,975.0,36 months,12.98,33.69,B,B5,Medical Assistant,3 years,OWN,30000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,341xx,FL,12.69,0,May-1983,3,89.0,0.0,7,0,6304,48.5,18,f,954.01,930.16,66.66,64.99,45.99,20.67,0.0,0.0,0.0,Feb-2019,33.69,Mar-2019,Feb-2019,0,89.0,1,Individual,0.0,0.0,,0,672,10483,2,1,1,1,5.0,4179,93.0,2,2,5155,60.0,13000,1,0,3,3,1498.0,345.0,93.7,0,0,139.0,427,1,1,1,138.0,89.0,1.0,89.0,1,1,4,1,6,2,6,15,4,7,0.0,0,0,3,94.4,100.0,0,0,17500,10483,5500,4500,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",6.0,"List(0, 34, List(6), List(1.0))",0.0,"List(0, 47, List(1, 2, 4, 15, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 30000.0))","List(1, 2, List(), List(6.231517214370196, -6.231517214370196))","List(1, 2, List(), List(0.9980373933179176, 0.0019626066820825055))",0.0
,,1000,1000,975.0,36 months,12.98,33.69,B,B5,Team Leader,10+ years,MORTGAGE,75000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,612xx,IL,18.34,0,May-2007,0,0.0,0.0,8,0,10662,86.0,20,f,954.01,930.16,66.66,64.99,45.99,20.67,0.0,0.0,0.0,Feb-2019,33.69,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,117377,0,2,0,0,25.0,16446,44.0,0,0,7330,54.0,12400,0,8,0,0,14672.0,1222.0,87.7,0,0,109.0,138,49,25,2,49.0,0.0,0.0,0.0,0,3,5,3,4,9,5,9,5,8,0.0,0,0,0,100.0,66.7,0,0,153930,27108,9900,37500,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",6.0,"List(0, 34, List(6), List(1.0))",0.0,"List(0, 47, List(0, 2, 4, 15, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 75000.0))","List(1, 2, List(), List(6.546681171932428, -6.546681171932428))","List(1, 2, List(), List(0.9985671861104606, 0.0014328138895394358))",0.0


In [0]:
# Ground-truth charge-off flags for the scored rows, unpacked from Row objects into plain values
# so that scikit-learn metrics receive a flat list.
y_true = [row["isChargeOff"] for row in predictions.select("isChargeOff").collect()]
# Preview only the first few values instead of dumping the whole list into the cell output.
y_true[:10]

In [0]:
from sklearn.metrics import classification_report, confusion_matrix

# Collect truth and prediction in ONE pass so the two lists are guaranteed to be row-aligned;
# separate collect() calls give no ordering guarantee between them.
labelAndPrediction = predictions.select("isChargeOff", "prediction").collect()
y_true = [int(row["isChargeOff"]) for row in labelAndPrediction]
# Spark emits predictions as doubles (0.0 / 1.0); cast so both lists share the same label type.
y_pred = [int(row["prediction"]) for row in labelAndPrediction]
print(classification_report(y_true, y_pred))

In [0]:
# Render the training split, including the engineered index / classVec / label / features columns
display(trainingData)

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardshi
p_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,isChargeOff,verification_statusIndex,verification_statusclassVec,termIndex,termclassVec,gradeIndex,gradeclassVec,sub_gradeIndex,sub_gradeclassVec,label,features
,,1000,1000,725.0,36 months,10.72,32.61,B,B2,Server,2 years,RENT,48000.0,Not Verified,Dec-2018,Current,n,,,other,Other,060xx,CT,6.2,0,Dec-2012,0,0.0,0.0,10,0,9866,24.0,12,f,952.44,690.52,64.62,46.85,47.56,17.06,0.0,0.0,0.0,Feb-2019,32.61,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,19363,1,2,1,1,9.0,9497,99.0,3,5,6890,38.0,41100,1,0,2,6,1936.0,27846.0,24.1,0,0,62.0,72,3,3,0,3.0,0.0,9.0,0.0,0,5,6,6,7,3,8,9,6,10,0.0,0,0,4,100.0,0.0,0,0,50650,19363,36700,9550,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",4.0,"List(0, 34, List(4), List(1.0))",0.0,"List(0, 47, List(0, 2, 4, 13, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 725.0, 48000.0))"
,,1000,1000,725.0,36 months,11.31,32.89,B,B3,,,RENT,24000.0,Not Verified,Dec-2018,Current,n,,,other,Other,148xx,NY,2.05,0,Feb-1999,0,0.0,0.0,1,0,4147,23.7,4,f,952.85,690.82,65.15,47.23,47.15,18.0,0.0,0.0,0.0,Feb-2019,32.89,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,4147,0,0,0,1,21.0,0,0.0,0,0,4147,24.0,17500,0,0,0,1,4147.0,13353.0,23.7,0,0,21.0,238,25,21,0,25.0,0.0,0.0,0.0,0,1,1,1,2,1,1,3,1,1,0.0,0,0,0,100.0,0.0,0,0,17500,4147,17500,0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",13.0,"List(0, 34, List(13), List(1.0))",0.0,"List(0, 47, List(0, 2, 4, 22, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 725.0, 24000.0))"
,,1000,1000,725.0,36 months,12.98,33.69,B,B5,Supervisor,10+ years,MORTGAGE,80000.0,Not Verified,Dec-2018,Current,n,,,major_purchase,Major purchase,923xx,CA,17.75,2,Aug-2001,0,18.0,0.0,15,0,15310,85.5,51,f,954.01,691.65,65.94,47.81,45.99,19.95,0.0,0.0,0.0,Feb-2019,33.69,Mar-2019,Feb-2019,0,41.0,1,Individual,0.0,0.0,,0,6099,174715,0,3,0,0,28.0,22614,79.0,0,4,3449,82.0,17900,0,0,0,4,12480.0,1700.0,86.8,0,0,166.0,208,22,22,5,22.0,18.0,22.0,18.0,12,6,7,9,27,7,11,39,7,15,0.0,0,0,0,70.6,62.5,0,0,206500,37924,12900,28600,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",6.0,"List(0, 34, List(6), List(1.0))",0.0,"List(0, 47, List(0, 2, 4, 15, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 725.0, 80000.0))"
,,1000,1000,750.0,36 months,10.33,32.43,B,B1,Pumper,1 year,MORTGAGE,60000.0,Source Verified,Dec-2018,Current,n,,,other,Other,810xx,CO,21.14,0,Jul-2008,0,0.0,0.0,5,0,3289,23.5,10,f,952.15,714.11,63.71,47.78,47.85,15.86,0.0,0.0,0.0,Feb-2019,32.43,Mar-2019,Feb-2019,0,0.0,1,Joint App,94500.0,19.97,Source Verified,0,0,118206,0,2,0,0,47.0,23481,35.0,0,2,3289,31.0,14000,0,2,0,2,23641.0,211.0,94.0,0,0,125.0,100,18,18,2,100.0,0.0,0.0,0.0,0,1,1,1,1,5,2,3,1,5,0.0,0,0,0,100.0,100.0,0,0,169586,26770,3500,60191,18948.0,Jun-2008,0.0,2.0,14.0,100.3,12.0,6.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",8.0,"List(0, 34, List(8), List(1.0))",0.0,"List(0, 47, List(1, 2, 4, 17, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 750.0, 60000.0))"
,,1000,1000,750.0,36 months,11.8,33.12,B,B4,,,RENT,40000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,787xx,TX,15.75,0,Apr-2013,0,62.0,0.0,5,0,1187,65.9,7,f,953.2,714.9,64.93,48.7,46.8,18.13,0.0,0.0,0.0,Feb-2019,33.12,Mar-2019,Feb-2019,0,62.0,1,Individual,0.0,0.0,,0,0,4795,0,2,1,1,9.0,3608,27.0,0,0,618,31.0,1800,1,0,1,1,959.0,613.0,65.9,0,0,42.0,68,47,9,0,47.0,0.0,9.0,0.0,1,3,3,3,3,3,3,3,3,5,0.0,0,0,1,85.7,33.3,0,0,15268,4795,1800,13468,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",2.0,"List(0, 34, List(2), List(1.0))",0.0,"List(0, 47, List(0, 2, 4, 11, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 750.0, 40000.0))"
,,1000,1000,750.0,36 months,11.8,33.12,B,B4,Buyer,8 years,MORTGAGE,108000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,923xx,CA,12.51,0,Oct-2004,0,0.0,0.0,8,0,5109,53.8,16,f,953.2,714.9,64.6,48.45,46.8,17.8,0.0,0.0,0.0,Feb-2019,33.12,Mar-2019,Feb-2019,1,0.0,1,Individual,0.0,0.0,,0,748,307155,2,3,2,3,3.0,13065,52.0,0,1,3374,52.0,9500,2,0,3,4,38394.0,2526.0,57.2,0,0,161.0,55,18,3,5,55.0,0.0,8.0,0.0,0,1,2,1,1,8,3,3,2,8,0.0,0,0,2,100.0,0.0,0,0,388355,18174,5900,25197,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",1.0,"List(0, 6, List(1), List(1.0))",2.0,"List(0, 34, List(2), List(1.0))",0.0,"List(0, 47, List(0, 2, 4, 11, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 750.0, 108000.0))"
,,1000,1000,975.0,36 months,6.46,30.64,A,A1,Pepsi Ice Center Assistant Manager,10+ years,OWN,56000.0,Source Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,616xx,IL,8.59,0,Apr-2001,0,0.0,0.0,7,0,7570,25.2,21,f,949.35,925.62,60.92,59.4,50.65,10.27,0.0,0.0,0.0,Feb-2019,30.64,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,10968,0,2,0,2,13.0,3398,57.0,0,1,3706,31.0,30000,1,0,0,3,1567.0,19008.0,28.3,0,0,66.0,212,16,13,1,16.0,0.0,13.0,0.0,0,4,5,4,6,12,5,8,5,7,0.0,0,0,0,100.0,0.0,0,0,36000,10968,26500,6000,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",0.0,"List(0, 34, List(0), List(1.0))",0.0,"List(0, 47, List(1, 2, 3, 9, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 56000.0))"
,,1000,1000,975.0,36 months,6.46,30.64,A,A1,Program Manager,5 years,RENT,90000.0,Source Verified,Dec-2018,Fully Paid,n,,,other,Other,325xx,FL,28.06,0,Jul-2008,0,0.0,0.0,18,0,9322,6.9,31,f,0.0,0.0,1001.2538888884,976.22,1000.0,1.25,0.0,0.0,0.0,Jan-2019,1001.97,,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,79188,0,11,1,2,8.0,69866,62.0,0,1,6578,47.0,47000,1,1,1,3,4399.0,26422.0,0.0,0,0,110.0,125,13,8,0,54.0,0.0,8.0,0.0,0,1,2,4,7,21,7,10,2,18,0.0,0,0,1,100.0,25.0,0,0,160598,79188,33000,113598,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",0.0,"List(0, 34, List(0), List(1.0))",0.0,"List(0, 47, List(1, 2, 3, 9, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 90000.0))"
,,1000,1000,975.0,36 months,6.46,30.64,A,A1,Service Consultant,3 years,MORTGAGE,49989.0,Not Verified,Dec-2018,Current,n,,,credit_card,Credit card refinancing,280xx,NC,15.43,0,Jul-2002,0,39.0,0.0,4,0,3257,88.0,12,f,949.35,925.62,60.74,59.22,50.65,10.09,0.0,0.0,0.0,Feb-2019,30.64,Mar-2019,Feb-2019,0,49.0,1,Individual,0.0,0.0,,0,0,88884,1,1,0,0,75.0,2252,7.0,1,2,2941,14.0,3700,0,1,0,2,22221.0,59.0,98.0,0,0,75.0,197,2,2,2,16.0,58.0,16.0,45.0,4,1,2,1,4,1,2,9,2,4,0.0,0,0,1,33.3,100.0,0,0,160996,5509,3000,34806,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,DirectPay,N,,,,0.0,0.0,0.0,0,0.0,"List(0, 2, List(0), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",0.0,"List(0, 34, List(0), List(1.0))",0.0,"List(0, 47, List(0, 2, 3, 9, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 49989.0))"
,,1000,1000,975.0,36 months,7.02,30.89,A,A2,,< 1 year,OWN,82000.0,Source Verified,Dec-2018,Current,n,,,renewable_energy,Green loan,416xx,KY,17.46,0,Oct-1990,0,0.0,93.0,11,1,3597,11.8,31,f,949.77,926.03,61.39,59.86,50.23,11.16,0.0,0.0,0.0,Feb-2019,30.89,Mar-2019,Feb-2019,0,0.0,1,Individual,0.0,0.0,,0,0,61834,1,4,0,2,17.0,45093,21.0,2,2,1062,17.0,30600,2,3,0,5,5621.0,13980.0,9.8,0,0,141.0,338,2,2,1,2.0,0.0,16.0,0.0,0,2,5,3,5,7,6,22,5,11,0.0,0,0,2,100.0,0.0,1,0,124875,48690,15500,76724,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N,,,,0.0,0.0,,,,0.0,0.0,,0.0,0.0,0.0,Cash,N,,,,0.0,0.0,0.0,0,1.0,"List(0, 2, List(1), List(1.0))",0.0,"List(0, 1, List(0), List(1.0))",0.0,"List(0, 6, List(0), List(1.0))",11.0,"List(0, 34, List(11), List(1.0))",0.0,"List(0, 47, List(1, 2, 3, 20, 43, 44, 45, 46), List(1.0, 1.0, 1.0, 1.0, 1000.0, 1000.0, 975.0, 82000.0))"


In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the decision tree: reads "features", predicts "label", depth capped at 30
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxDepth=30,
)

# Fit the tree on the training split
dtModel = dt.fit(trainingData)

In [0]:
# Report the size of the fitted tree: total node count and the depth actually reached.
for caption, value in (("numNodes = ", dtModel.numNodes), ("depth = ", dtModel.depth)):
    print(caption, value)

In [0]:
# Score the held-out test split with the decision tree.
# NOTE: this rebinds `predictions`, replacing the logistic-regression output assigned earlier.
predictions = dtModel.transform(testData)

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve for the current `predictions`; the arguments below are the
# evaluator's defaults, written out so the metric and columns are visible.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator.evaluate(predictions)