# Predict Credit Approval
## Using dataset from https://archive.ics.uci.edu/ml/datasets/Credit+Approval
## Test accuracy of predictions using Random Forest and Logistic Regression
## Both perform similarly on this dataset


In [115]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('credit_approval').getOrCreate()

In [116]:
#Set Schema 
from pyspark.sql.types import (StructField,
                               StringType,
                               IntegerType,
                               StructType,
                               DoubleType)

data_schema = [
                StructField('c0', StringType(),True), 
                StructField('c1', DoubleType(),True),
                StructField('c2', DoubleType(),True),
                StructField('c3', StringType(),True),
                StructField('c4', StringType(),True),
                StructField('c5', StringType(),True),
                StructField('c6', StringType(),True),
                StructField('c7', DoubleType(),True),
                StructField('c8', StringType(),True),
                StructField('c9', StringType(),True),
                StructField('c10', DoubleType(),True),
                StructField('c11', StringType(),True),
                StructField('c12', StringType(),True),
                StructField('c13', DoubleType(),True),
                StructField('c14', DoubleType(),True),
                StructField('c15', StringType(),True)] 

final_struct = StructType(fields=data_schema)

#Read in data
dataset = spark.read.csv('crx.data', header=False,schema=final_struct)

In [117]:
#Check Schema
dataset.printSchema()

root
 |-- c0: string (nullable = true)
 |-- c1: double (nullable = true)
 |-- c2: double (nullable = true)
 |-- c3: string (nullable = true)
 |-- c4: string (nullable = true)
 |-- c5: string (nullable = true)
 |-- c6: string (nullable = true)
 |-- c7: double (nullable = true)
 |-- c8: string (nullable = true)
 |-- c9: string (nullable = true)
 |-- c10: double (nullable = true)
 |-- c11: string (nullable = true)
 |-- c12: string (nullable = true)
 |-- c13: double (nullable = true)
 |-- c14: double (nullable = true)
 |-- c15: string (nullable = true)



In [149]:
#Create indexed, OneHotEncoded categorical values
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

indexers = [StringIndexer(inputCol=column, outputCol=column+"_index").fit(dataset)
            for column in 
            list(set(dataset.columns)-set(['c1', 'c2', 'c7', 'c10', 'c13', 'c14'])) ]

indexer_pipeline = Pipeline(stages=indexers)

dataset_indexed = indexer_pipeline.fit(dataset).transform(dataset)

encoders = [OneHotEncoder(inputCol=column, outputCol=column+'_encoded')
           for column in
           ['c0_index', 'c3_index', 'c4_index', 'c5_index', 'c6_index', 'c8_index', 'c9_index',
            'c11_index', 'c12_index']]

encode_pipeline = Pipeline(stages=encoders)

dataset_encoded = encode_pipeline.fit(dataset_indexed).transform(dataset_indexed)

#Check columns in encoded dataset 
dataset_encoded.columns

['c0',
 'c1',
 'c2',
 'c3',
 'c4',
 'c5',
 'c6',
 'c7',
 'c8',
 'c9',
 'c10',
 'c11',
 'c12',
 'c13',
 'c14',
 'c15',
 'c8_index',
 'c3_index',
 'c6_index',
 'c12_index',
 'c0_index',
 'c9_index',
 'c5_index',
 'c4_index',
 'c15_index',
 'c11_index',
 'c0_index_encoded',
 'c3_index_encoded',
 'c4_index_encoded',
 'c5_index_encoded',
 'c6_index_encoded',
 'c8_index_encoded',
 'c9_index_encoded',
 'c11_index_encoded',
 'c12_index_encoded']

In [119]:
#convert to features column
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    inputCols=[
        'c0_index_encoded',
        'c1',
        'c2',
        'c3_index_encoded',
        'c4_index_encoded',
        'c5_index_encoded',
        'c6_index_encoded',
        'c7',
        'c8_index_encoded',
        'c9_index_encoded',
        'c10',
        'c11_index_encoded',
        'c12_index_encoded',
        'c13',
        'c14'    
    ], 
    outputCol='features')

compiled_data = assembler.transform(dataset_encoded)

In [120]:
#Get just features and labels columns
final_data = compiled_data.select('features', compiled_data['c15_index'].alias('label'))

In [121]:
final_data.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(37,[0,1,3,5,9,20...|  1.0|
|(37,[1,2,3,5,8,21...|  1.0|
|(37,[1,2,3,5,8,21...|  1.0|
|(37,[0,1,2,3,5,9,...|  1.0|
|(37,[0,1,2,3,5,9,...|  1.0|
|(37,[0,1,2,3,5,15...|  1.0|
|(37,[0,1,2,3,5,21...|  1.0|
|(37,[1,2,3,5,14,2...|  1.0|
|(37,[0,1,2,4,6,13...|  1.0|
|(37,[0,1,2,4,6,9,...|  1.0|
|(37,[0,1,2,3,5,7,...|  1.0|
|(37,[0,1,2,3,5,7,...|  1.0|
|(37,[1,2,3,5,13,2...|  1.0|
|(37,[0,1,2,3,5,13...|  1.0|
|(37,[1,2,3,5,8,20...|  1.0|
|(37,[0,1,2,4,6,13...|  1.0|
|(37,[0,1,2,3,5,15...|  1.0|
|(37,[1,2,3,5,8,20...|  1.0|
|(37,[0,1,2,3,5,17...|  1.0|
|(37,[1,2,3,5,14,2...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [125]:
#Create Train and Test sets
train, test = final_data.randomSplit([0.7, 0.3])

In [127]:
#Random Forest Classification
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=100)
rfc_model = rfc.fit(train)
rfc_pred = rfc_model.transform(test)

In [145]:
#Evaluate accuracy of RF classification
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
print('RFC Accuracy')
print(acc_eval.evaluate(rfc_pred))

Accuracy
0.888235294117647


In [146]:
#Logistic Regression Classification
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression()
lr_model = lr.fit(train)
lr_predict = lr_model.evaluate(test)

In [148]:
#Evaluate Logistic Regression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
eval_bin = BinaryClassificationEvaluator()
print('Logistic Regression Accuracy')
print(eval_bin.evaluate(lr_predict.predictions))

Logistic Regression Accuracy
0.9240701754385956
