In [1]:
import pandas as pd
from IPython.display import display
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf
from pyspark.sql import Window
from pyspark.sql.types import StringType

In [2]:
# Create (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SparklingWaterApp")
    .getOrCreate()
)

24/04/20 10:17:37 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Show the SparkContext behind the session as this cell's output.
spark.sparkContext

In [4]:
# Prerequisite (install once, outside this notebook): pip3 install h2o_pysparkling_3.3

In [5]:
# Import only the two names this notebook uses from pysparkling, so it is
# clear where H2OConf and H2OContext come from (no wildcard import).
from pysparkling import H2OConf, H2OContext
import h2o

# Configure H2O logging and start (or attach to) the H2O context that is
# tied to the current Spark session.
conf = H2OConf().setLogLevel("ERROR")  # alternative level: "WARN"
hc = H2OContext.getOrCreate(conf)

24/04/20 10:17:46 WARN InternalH2OBackend: Increasing 'spark.locality.wait' to value 0 (Infinitive) as we need to ensure we run on the nodes with H2O
Connecting to H2O server at http://623587162df7:54323 ... successful.


0,1
H2O_cluster_uptime:,08 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 6 days
H2O_cluster_name:,sparkling-water-NBuser_local-1713608254074
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4



Sparkling Water Context:
 * Sparkling Water Version: 3.46.0.1-1-3.3
 * H2O name: sparkling-water-NBuser_local-1713608254074
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (0,172.17.0.2,54321)
  ------------------------

  Open H2O Flow in browser: http://623587162df7:54323 (CMD + click in Mac OSX)

    


In [6]:
# Parse loan.csv (path relative to the working directory) into an H2OFrame,
# convert it to a Spark DataFrame, and show the row count as the cell output.
frame = h2o.import_file("loan.csv")
sparkDF = hc.asSparkFrame(frame)
sparkDF.count()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


                                                                                

999

In [7]:
# Number the rows 1..N. The window is ordered by a constant literal, so the
# numbering carries no meaning beyond giving every row a distinct id.
# NOTE(review): an unpartitioned window is presumably evaluated on a single
# partition - fine for this small file, confirm before using on large data.
window = Window.orderBy(sf.lit('X'))
row_id = sf.row_number().over(window).alias("id")

# Put the id first, keep every original column, then remove addr_state.
sparkDF = sparkDF.select(row_id, "*").drop("addr_state")

display(sparkDF.limit(10).toPandas())

Unnamed: 0,id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
0,1,5000,36 months,10.65,10,RENT,24000.0,credit_card,27.65,0,83.7,9,0,26,verified
1,2,2500,60 months,15.27,0,RENT,30000.0,car,1.0,0,9.4,4,1,12,verified
2,3,2400,36 months,15.96,10,RENT,12252.0,small_business,8.72,0,98.5,10,0,10,not verified
3,4,10000,36 months,13.49,10,RENT,49200.0,other,20.0,0,21.0,37,0,15,verified
4,5,5000,36 months,7.9,3,RENT,36000.0,wedding,11.2,0,28.3,12,0,7,verified
5,6,3000,36 months,18.64,9,RENT,48000.0,car,5.35,0,87.5,4,0,4,verified
6,7,5600,60 months,21.28,4,OWN,40000.0,small_business,5.55,0,32.6,13,1,7,verified
7,8,5375,60 months,12.69,0,RENT,15000.0,other,18.08,0,36.5,3,1,7,verified
8,9,6500,60 months,14.65,5,OWN,72000.0,debt_consolidation,16.12,0,20.6,23,0,13,not verified
9,10,12000,36 months,12.69,10,OWN,75000.0,debt_consolidation,10.78,0,67.1,34,0,22,verified


In [8]:
# Splitting the DataFrame into training and testing sets with stratified sampling:
# every distinct value of the strata column is split 70/30 on its own and the
# per-value pieces are unioned into the final train/test DataFrames.
strata = "bad_loan"

# Distinct strata values, collected to the driver.
strata_list = [row[strata] for row in sparkDF.select(strata).distinct().collect()]

# Empty DataFrames with the same schema, used as the starting point of the unions.
train_df = sparkDF.limit(0)
test_df = sparkDF.limit(0)

for s in strata_list:
    # eqNullSafe instead of ==: a plain equality test against a null strata
    # value matches nothing, which would silently drop those rows from both
    # the training and the testing set.
    stratified_df = sparkDF.filter(sparkDF[strata].eqNullSafe(s))
    train_strata, test_strata = stratified_df.randomSplit([0.7, 0.3], seed = 42)
    train_df = train_df.union(train_strata)
    test_df = test_df.union(test_strata)

In [9]:
# Sanity check on the split: recombine both parts, print the total row count,
# and list every id that occurs more than once across train and test.
all_df = train_df.union(test_df)
print(all_df.count())

df = all_df.groupby("id").count().where("count > 1")

display(df.toPandas())

# Remove the id column from both splits once the check has been displayed.
train_df, test_df = train_df.drop("id"), test_df.drop("id")

999


Unnamed: 0,id,count


In [10]:
target = "bad_loan"


def _target_summary(spark_df, dataset_label):
    """Return a one-row pandas frame with count, sum and mean of `target`.

    spark_df: Spark DataFrame containing the `target` column.
    dataset_label: value stored in the added 'dataset' column.
    """
    target_stats = [
        sf.count(target).alias('count'),
        sf.sum(target).alias('sum'),
        sf.mean(target).alias('mean'),
    ]
    summary = spark_df.agg(*target_stats).toPandas()
    summary['dataset'] = dataset_label
    return summary


agg_tab1 = _target_summary(train_df, 'train')
agg_tab2 = _target_summary(test_df, 'test')

# Stack both summaries and put the dataset label in the first column.
agg_tab = pd.concat([agg_tab1, agg_tab2], ignore_index=True)[['dataset', 'count', 'sum', 'mean']]

display(agg_tab)

Unnamed: 0,dataset,count,sum,mean
0,train,735,139,0.189116
1,test,264,55,0.208333


In [11]:
# Partition the training columns by Spark type (string vs. everything else)
# and preview the first five rows of each group.
schema_fields = train_df.schema.fields

string_cols = [field.name for field in schema_fields if isinstance(field.dataType, StringType)]
nonstring_cols = [field.name for field in schema_fields if not isinstance(field.dataType, StringType)]

for column_group in (string_cols, nonstring_cols):
    display(train_df.select(column_group).limit(5).toPandas())

Unnamed: 0,term,home_ownership,purpose,verification_status
0,60 months,RENT,car,verified
1,60 months,OWN,small_business,verified
2,36 months,RENT,debt_consolidation,verified
3,36 months,RENT,other,verified
4,36 months,RENT,debt_consolidation,verified


Unnamed: 0,loan_amnt,int_rate,emp_length,annual_inc,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length
0,2500,15.27,0,30000.0,1.0,0,9.4,4,1,12
1,5600,21.28,4,40000.0,5.55,0,32.6,13,1,7
2,9000,13.49,0,30000.0,10.08,0,91.7,9,1,7
3,10000,10.65,3,100000.0,7.06,0,55.5,29,1,20
4,21000,12.42,10,105000.0,13.22,0,90.3,38,1,28


In [12]:
target = "bad_loan"

string_cols = [f.name for f in train_df.schema.fields if isinstance(f.dataType, StringType)]

train_h2o = hc.asH2OFrame(train_df)
test_h2o = hc.asH2OFrame(test_df)

# For every string column, compute the mean of the target per category on the
# TRAINING frame only. Each lookup frame has two columns: the category column
# itself and 'rate_<column>'.
replacement_frame = []

for col in string_cols:
    col_grouped = train_h2o.group_by(col).mean(target, na="all").get_frame()
    col_grouped = col_grouped.set_names([col, 'rate_' + col])
    display(col_grouped)
    replacement_frame.append(col_grouped)


def _replace_with_rates(h2o_frame, cols, rate_frames):
    """Swap each string column in `cols` for its 'rate_<column>' value.

    h2o_frame: H2OFrame that contains every column listed in `cols`.
    cols: names of the string columns to replace.
    rate_frames: lookup frames, one per entry of `cols`, in the same order.

    Returns a new H2OFrame. The merge keeps every row of `h2o_frame`
    (all_x=True), so a category that is absent from the lookup frame gets a
    missing rate.
    """
    for col_name, col_grouped in zip(cols, rate_frames):
        h2o_frame = h2o.H2OFrame.merge(h2o_frame, col_grouped, by_x = [col_name], by_y=[col_name], all_x = True, all_y = False)
        # H2OFrame.drop is not in-place: it returns a new frame, so the result
        # has to be assigned or the string column stays in the frame.
        h2o_frame = h2o_frame.drop(col_name)
    return h2o_frame


# The same training-derived rates are applied to both splits.
train_h2o = _replace_with_rates(train_h2o, string_cols, replacement_frame)
test_h2o = _replace_with_rates(test_h2o, string_cols, replacement_frame)

train_rate_df = hc.asSparkFrame(train_h2o)
test_rate_df = hc.asSparkFrame(test_h2o)

term,rate_term
36 months,0.146865
60 months,0.387597


home_ownership,rate_home_ownership
MORTGAGE,0.160338
OWN,0.196429
RENT,0.20362


purpose,rate_purpose
car,0.285714
credit_card,0.13198
debt_consolidation,0.195251
home_improvement,0.285714
house,0.4
major_purchase,0.0666667
medical,0.2
moving,0.333333
other,0.276596
renewable_energy,0.0


verification_status,rate_verification_status
not verified,0.181495
verified,0.193833


In [13]:
from pysparkling.ml import H2OGLM

target = "bad_loan"

# Predictors: every non-string column of the rate-encoded training frame,
# minus the label itself.
non_string_fields = [
    field for field in train_rate_df.schema.fields
    if not isinstance(field.dataType, StringType)
]
predictors = [field.name for field in non_string_fields]
predictors.remove(target)

# Binomial GLM with p-value computation requested.
glm_estimator = H2OGLM(
    family = "binomial",
    labelCol = target,
    featuresCols = predictors,
    computePValues = True,
)

model = glm_estimator.fit(train_rate_df)

|██████████████████████████████████████████████████| 100%
Model Details
H2OGLM
Model Key: GLM_b97deb3c5395

Model summary
Family: binomial
Link: logit
Regularization: Elastic Net (alpha = 0.5, lambda = 1.953E-4 )
Number of Predictors Total: 13
Number of Active Predictors: 13
Number of Iterations: 4
Training Frame: frame_rdd_171560122259

Training metrics
PRAUC: 0.4170437928031774
Nobs: 735.0
Logloss: 0.42473298522934055
Gini: 0.4850683211819806
RMSE: 0.36659863408167936
ResidualDeviance: 624.3574882871306
NullDeviance: 712.8590052906441
ScoringTime: 1.713608350396E12
Loglikelihood: 0.0
MSE: 0.13439455851055304
R2: 0.12361425846937013
NullDegreesOfFreedom: 734.0
MeanPerClassError: 0.33665684901743037
AUC: 0.7425341605909903
AIC: 652.3574882871306
ResidualDegreesOfFreedom: 721.0

More info available using methods like:
getFeatureImportances(), getScoringHistory(), getCrossValidationScoringHistory()


In [14]:
# Show the coefficient table of the model currently bound to `model`
# (the GLM when the notebook is run top to bottom).
display(model.getCoefficients().toPandas())

Unnamed: 0,names,Coefficients,Std. Error,z value,p value,Standardized Coefficients
0,Intercept,-2.092006,3.707993,-0.564188,0.572626,-1.71077
1,loan_amnt,1.1e-05,1.8e-05,0.5908,0.554654,0.076684
2,int_rate,0.130755,0.037571,3.480224,0.000501,0.488427
3,emp_length,0.002386,0.030768,0.077557,0.938181,0.008184
4,annual_inc,-7e-06,4e-06,-1.550538,0.121012,-0.226723
5,dti,0.020612,0.01875,1.099317,0.27163,0.128267
6,delinq_2yrs,-0.708424,0.481632,-1.470882,0.141323,-0.213575
7,revol_util,0.006255,0.005155,1.213341,0.224999,0.151612
8,total_acc,-0.003624,0.011954,-0.303147,0.761778,-0.036758
9,longest_credit_length,0.015172,0.019109,0.794003,0.427193,0.091163


In [15]:
# Show the feature-importance table of the model currently bound to `model`
# (the GLM when the notebook is run top to bottom).
display(model.getFeatureImportances().toPandas())

Unnamed: 0,Variable,Relative Importance,Scaled Importance,Percentage
0,rate_purpose,0.505175,1.0,0.198907
1,int_rate,0.488427,0.966847,0.192312
2,rate_term,0.326868,0.64704,0.1287
3,annual_inc,0.226723,0.4488,0.089269
4,delinq_2yrs,0.213575,0.422774,0.084092
5,rate_verification_status,0.165643,0.327893,0.06522
6,revol_util,0.151612,0.300118,0.059695
7,dti,0.128267,0.253906,0.050504
8,rate_home_ownership,0.120682,0.23889,0.047517
9,longest_credit_length,0.091163,0.180459,0.035894


In [16]:
# Score the rate-encoded test split and cross-tabulate the actual target
# against the "prediction" column.
display(model.transform(test_rate_df).crosstab(target, "prediction").toPandas())

Unnamed: 0,bad_loan_prediction,0,1
0,1,35,20
1,0,170,39


In [17]:
# Import through the public pysparkling.ml package, like the other model
# cells in this notebook, instead of the internal ai.h2o.sparkling path.
from pysparkling.ml import H2OXGBoostClassifier

target = "bad_loan"

# 5% of the training row count, passed to the classifier as minRows.
min_rows = int(train_df.count() * 0.05)

# predictionCol (not detailedPredictionCol) names the column that holds the
# predicted label. Giving the detailed-prediction column the name "prediction"
# makes it share a name with the label column, so the detailed output is lost.
xgb_classifier = H2OXGBoostClassifier(labelCol = target,
                                 booster = "gbtree",
                                 ntrees = 250,
                                 minRows = min_rows,
                                 predictionCol = "prediction")

model = xgb_classifier.fit(train_df)

|__________________________________________________| 0%



|██████████████████████████████████████████████████| 100%
Model Details
H2OXGBoost
Model Key: XGBoost_f3d39419f7fd

Model summary
Number of Trees: 250

Training metrics
PRAUC: 0.404733462211068
Nobs: 735.0
Logloss: 0.4241422787109303
Gini: 0.5145454106513447
RMSE: 0.36602782468888795
ScoringTime: 1.713608417341E12
Loglikelihood: NaN
MSE: 0.13397636844647928
R2: 0.12634127222249947
MeanPerClassError: 0.295615856308242
AUC: 0.7572727053256724
AIC: NaN

More info available using methods like:
getFeatureImportances(), getScoringHistory(), getCrossValidationScoringHistory()


In [18]:
# Show the feature-importance table of the model currently bound to `model`
# (the XGBoost classifier when the notebook is run top to bottom).
display(model.getFeatureImportances().toPandas())

Unnamed: 0,Variable,Relative Importance,Scaled Importance,Percentage
0,int_rate,70.024689,1.0,0.439647
1,dti,19.789263,0.282604,0.124246
2,revol_util,16.188408,0.231181,0.101638
3,annual_inc,12.767155,0.182324,0.080158
4,loan_amnt,11.429572,0.163222,0.07176
5,total_acc,10.467224,0.149479,0.065718
6,emp_length,6.949555,0.099244,0.043633
7,purpose.credit_card,5.196002,0.074202,0.032623
8,longest_credit_length,3.361048,0.047998,0.021102
9,verification_status.not verified,1.88993,0.026989,0.011866


In [19]:
# Score the test split and cross-tabulate the actual target against the
# "prediction" column.
display(model.transform(test_df).crosstab(target, "prediction").toPandas())

Unnamed: 0,bad_loan_prediction,0,1
0,1,26,29
1,0,132,77


In [20]:
from pysparkling.ml import H2ODeepLearning

target = "bad_loan"

# String-typed columns are treated as categorical features.
cat_columns = [f.name for f in train_df.schema.fields if isinstance(f.dataType, StringType)]

print("Category Columns: ", cat_columns)

# Every other column except the label is a numeric feature.
num_columns = [f.name for f in train_df.schema.fields if not isinstance(f.dataType, StringType)]
num_columns.remove(target)

print("Numeric Columns: ", num_columns)

all_features = cat_columns + num_columns

# predictionCol (not detailedPredictionCol) names the column that holds the
# predicted label. Giving the detailed-prediction column the name "prediction"
# makes it share a name with the label column, so the detailed output is lost.
estimator = H2ODeepLearning(
                distribution = "bernoulli",
                hidden = [100, 50, 20],
                epochs = 1000,
                columnsToCategorical = cat_columns,
                featuresCols = all_features,
                labelCol = target,
                predictionCol = "prediction")

model = estimator.fit(train_df)

Category Columns:  ['term', 'home_ownership', 'purpose', 'verification_status']
Numeric Columns:  ['loan_amnt', 'int_rate', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length']
|██████████████████████████████████████████████████| 100%
Model Details
H2ODeepLearning
Model Key: DeepLearning_a3543e945958

Model summary
Layer: 1
Units: 33
Type: Input
Dropout: 0.0
L1: null
L2: null
Mean Rate: null
Rate RMS: null
Momentum: null
Mean Weight: null
Weight RMS: null
Mean Bias: null
Bias RMS: null

Layer: 2
Units: 100
Type: Rectifier
Dropout: 0.0
L1: 0.0
L2: 0.0
Mean Rate: 0.3744432093059312
Rate RMS: 0.3864401578903198
Momentum: 0.0
Mean Weight: -0.003917267900782959
Weight RMS: 0.15217208862304688
Mean Bias: 0.44108943178581833
Bias RMS: 0.08271560072898865

Layer: 3
Units: 50
Type: Rectifier
Dropout: 0.0
L1: 0.0
L2: 0.0
Mean Rate: 0.47667539708049733
Rate RMS: 0.3742619752883911
Momentum: 0.0
Mean Weight: -0.01731320270421129
Weight RMS: 0.145883

In [21]:
# Show the feature-importance table of the model currently bound to `model`
# (the deep learning model when the notebook is run top to bottom).
display(model.getFeatureImportances().toPandas())

Unnamed: 0,Variable,Relative Importance,Scaled Importance,Percentage
0,int_rate,1.0,1.0,0.047236
1,longest_credit_length,0.911846,0.911846,0.043072
2,annual_inc,0.856981,0.856981,0.040481
3,revol_util,0.840451,0.840451,0.0397
4,dti,0.821047,0.821047,0.038783
5,loan_amnt,0.812523,0.812523,0.038381
6,verification_status.verified,0.804269,0.804269,0.037991
7,emp_length,0.793648,0.793648,0.037489
8,purpose.other,0.790809,0.790809,0.037355
9,total_acc,0.788386,0.788386,0.037241


In [22]:
# Score the test split and cross-tabulate the actual target against the
# "prediction" column.
display(model.transform(test_df).crosstab(target, "prediction").toPandas())

Unnamed: 0,bad_loan_prediction,0,1
0,1,44,11
1,0,183,26


In [23]:
# Stop the Spark session now that the analysis is finished.
spark.stop()