## 1. Import Libraries

In [1]:
# Feature_Engineering
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import *
from pyspark.sql.functions import col, count, sum
# Modeling
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import linear_model, datasets
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
import numpy as np
from pyspark.ml.feature import OneHotEncoder

## 2. Load Data and Convert to Spark Data Frame

In [2]:
# read files
sc.textFile("loan.csv").take(1)

[u'id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_il_6m,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m']

In [3]:
# load data as dataframe
loan_df=spark.read.csv("loan.csv",header=True)

In [4]:
loan_df.rdd.getNumPartitions()
type(loan_df)

pyspark.sql.dataframe.DataFrame

## 3. Create response variable and features
### 3.1 Remove some columns based on EDA results

In [5]:
# loan_df1 = loan_df.drop('desc','mths_since_last_delinq','mths_since_last_record','next_pymnt_d',
#                         'mths_since_last_major_derog','annual_inc_joint','dti_joint','verification_status_joint',
#                         'open_acc_6m','open_il_6m','open_il_12m','open_il_24m','mths_since_rcnt_il','total_bal_il',
#                         'il_util','open_rv_12m','open_rv_24m','max_bal_bc','all_util','inq_fi','total_cu_tl',
#                         'inq_last_12m', # with a lot NA
#                         'id','member_id','collection_recovery_fee','last_pymnt_amnt','last_pymnt_d','out_prncp','out_prncp_inv',
#                         'pymnt_plan','recoveries','term','title','total_pymnt','total_pymnt_inv','total_rec_int',
#                         'total_rec_late_fee','total_rec_prncp','url','verification_status', 'initial_list_status', 
#                         'last_credit_pull_d','policy_code','emp_title','last_credit_pull_d' # domain knowledge
#                        )

#is that intentionally not leaving out drops?

In [6]:
# print loan_df1.head(1)
# print len(loan_df1.columns)

In [7]:
loan_df2=loan_df.select(
    loan_df.loan_amnt.cast("float"),
    loan_df.funded_amnt.cast("float"),
    loan_df.funded_amnt_inv.cast("float"),
    loan_df.int_rate.cast("float"),  #convert to float
    loan_df.installment.cast("float"), #convert to float
    'grade',
    'sub_grade',   # NEED TO BE DUMMIED
    'emp_length',
    'home_ownership', #group & make it dummy, done 
    loan_df.annual_inc.cast("float"),
    # 'issue_d',  #why comment out?
    'loan_status', 
    # response variable (pending for change after group discussion)
    'purpose',  #make it dummy
#     'zip_code', #interesting, worth dummying, might run into problems tho
#     'addr_state', #dummy
    'verification_status',
#     'initial_list_status',
    loan_df.dti.cast("float"),#float done
    loan_df.delinq_2yrs.cast("integer"),  
    #take out NAs, 2? 1, 3, mostly 0
    # 'earliest_cr_line',
    loan_df.inq_last_6mths.cast("integer"), #similar to delingq_2yr
    loan_df.open_acc.cast("integer"), 
    loan_df.pub_rec.cast("integer"), #what is it? 0,1
    loan_df.revol_bal.cast("float"), #done
    loan_df.revol_util.cast("float"),#done
    loan_df.total_acc.cast("integer"),
    #loan_df.last_credit_pull_d.cast("integer"), #date
    loan_df.acc_now_delinq.cast("integer"),#ok
    loan_df.tot_coll_amt.cast("float"), #done
    loan_df.tot_cur_bal.cast("float"), #done
    loan_df.total_rev_hi_lim.cast("integer") #what is this
)

### 3.2 Create response variable and remove rows with no valid response variable 

In [8]:
# needs to fix that, encoding doesn't seem to be right

def whetherpaid(x):
    if x in ['Default', 'Charged Off', 'Does not meet the credit policy. Status:Charged Off']:
        return 1
    elif x in ['Does not meet the credit policy. Status:Fully Paid', 'Fully Paid']:
        return 0
    else:
        return -1

In [9]:
paidflag = udf(lambda x: whetherpaid(x))

In [10]:
#subject to change
loan_df3 = loan_df2.withColumn(
    'paid_flag',
    paidflag('loan_status').cast("integer")
        ).where("paid_flag != -1").drop('loan_status')

In [11]:
loan_df3.printSchema()


root
 |-- loan_amnt: float (nullable = true)
 |-- funded_amnt: float (nullable = true)
 |-- funded_amnt_inv: float (nullable = true)
 |-- int_rate: float (nullable = true)
 |-- installment: float (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- purpose: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- dti: float (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- inq_last_6mths: integer (nullable = true)
 |-- open_acc: integer (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- revol_bal: float (nullable = true)
 |-- revol_util: float (nullable = true)
 |-- total_acc: integer (nullable = true)
 |-- acc_now_delinq: integer (nullable = true)
 |-- tot_coll_amt: float (nullable = true)
 |-- tot_cur_bal: float (nullable = true)
 |-- total_rev_hi_lim: integ

### 3.3 Add more features
#### 3.3.1 Create a numeric feature for "emp_length"

In [12]:
import re
def convert_to_int(s):
    s = re.sub('\\D', '', s)  #remove any non-digital character
    #\d matches any digital, #\D matches any non-digital
    try:
        return s
    except ValueError:
        return 'NaN'

emp_to_num = udf(convert_to_int)
loan_df4 = loan_df3.withColumn('emp_len',emp_to_num('emp_length').cast('integer')).drop('emp_length')

#### 3.3.2 Create numeric variable for grade 

In [13]:
# count how many nulls in each column
def count_null(c):
    return sum(col(c).isNull().cast("integer")).alias(c)

exprs = [count_null(c) for c in loan_df4.columns[0:9]]
loan_df4.agg(*exprs).show()

+---------+-----------+---------------+--------+-----------+-----+---------+--------------+----------+
|loan_amnt|funded_amnt|funded_amnt_inv|int_rate|installment|grade|sub_grade|home_ownership|annual_inc|
+---------+-----------+---------------+--------+-----------+-----+---------+--------------+----------+
|        0|          0|              0|       0|          0|    0|        0|             0|         4|
+---------+-----------+---------------+--------+-----------+-----+---------+--------------+----------+



#### 3.3.3 Create numeric variable for grade 

In [14]:
def home_ownership_func(x):
    if x in ['ANY','OTHER','NONE']:
        return 'Other'
    else: 
        return x

home_ownership = udf(home_ownership_func)  #fixed a function
loan_df5 = loan_df4.withColumn('home_ownership',home_ownership('home_ownership'))

In [15]:
loan_df5.select('home_ownership').groupBy('home_ownership').count().show()

+--------------+------+
|home_ownership| count|
+--------------+------+
|           OWN| 22282|
|         Other|   228|
|          RENT|107831|
|      MORTGAGE|126598|
+--------------+------+



#### 3.3.4 Add loan_inc_ratio feature 

In [16]:
def calculate_ratio(a, b):
    try:
        return a/float(b)
    except TypeError:
        return None
    except ZeroDivisionError:
        return None

ratio = udf(calculate_ratio) #define a udf_function ratio

In [17]:
loan_df6 = loan_df5.withColumn(
    'loan_inc_ratio',ratio('loan_amnt','annual_inc'
                          ).cast('float'))

In [18]:
loan_df7 = loan_df6.withColumn(
    'instal_inc_ratio',
    ratio('installment','annual_inc').cast('float'))

#### 3.3.5 Add feature instal_inc_ratio


In [19]:
def calculate_monthly_ratio(a, b):
    try:
        return a/(float(b)/12)
    except TypeError:
        return None
    except ZeroDivisionError:
        return None
    
monthly_ratio = udf(calculate_monthly_ratio)

In [20]:
loan_df8 = loan_df7.withColumn(
    'instal_inc_ratio',
    ratio('installment','annual_inc').cast('float'))

#### 3.3.6 Create dummy variables
* Use StringIndexer to encode a string column of labels to a column of label indices, and most frequent label gets index 0.

* Use OneHotEncoder to convert indices into dummy vectors

In [21]:
convert_list = [      
               'purpose',
               'grade',
               'sub_grade',
               'verification_status',
               'home_ownership',
                ]

for item in convert_list:
    print item
    indexer = StringIndexer(inputCol=item, outputCol=item + 'Index')    
    loan_df8 = indexer.fit(loan_df8).transform(loan_df8).drop(item)
    onehotenc = OneHotEncoder(inputCol=item + 'Index', outputCol=item+"-onehot", dropLast=False)
    # we can experiment with True
    loan_df8 = onehotenc.transform(loan_df8).drop(item+'Index')
    print item

purpose
purpose
grade
grade
sub_grade
sub_grade
verification_status
verification_status
home_ownership
home_ownership


In [22]:
loan_df8 = loan_df8.fillna(0.0, ['tot_coll_amt','tot_cur_bal', 'total_rev_hi_lim'])
loan_df9 = loan_df8.dropna()

In [23]:
loan_df9.columns

['loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'total_rev_hi_lim',
 'paid_flag',
 'emp_len',
 'loan_inc_ratio',
 'instal_inc_ratio',
 'purpose-onehot',
 'grade-onehot',
 'sub_grade-onehot',
 'verification_status-onehot',
 'home_ownership-onehot']

In [24]:
loan_df_final = loan_df9.select(
    'paid_flag',
    'loan_amnt',
    'funded_amnt',
    'funded_amnt_inv',
    'int_rate',
    'installment',
    'annual_inc',
    'dti',
    'delinq_2yrs',
    'inq_last_6mths',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'revol_util',
    'total_acc',
    'acc_now_delinq',
    'tot_coll_amt',
    'tot_cur_bal',
    'total_rev_hi_lim',
    'emp_len',
    'loan_inc_ratio',
    'instal_inc_ratio',
    'purpose-onehot',
    'grade-onehot',
    'sub_grade-onehot',
    'verification_status-onehot',
    'home_ownership-onehot'
)

In [25]:
loan_df_final.take(1)
loan_df_final.printSchema()

root
 |-- paid_flag: integer (nullable = true)
 |-- loan_amnt: float (nullable = true)
 |-- funded_amnt: float (nullable = true)
 |-- funded_amnt_inv: float (nullable = true)
 |-- int_rate: float (nullable = true)
 |-- installment: float (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- dti: float (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- inq_last_6mths: integer (nullable = true)
 |-- open_acc: integer (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- revol_bal: float (nullable = true)
 |-- revol_util: float (nullable = true)
 |-- total_acc: integer (nullable = true)
 |-- acc_now_delinq: integer (nullable = true)
 |-- tot_coll_amt: float (nullable = false)
 |-- tot_cur_bal: float (nullable = false)
 |-- total_rev_hi_lim: integer (nullable = true)
 |-- emp_len: integer (nullable = true)
 |-- loan_inc_ratio: float (nullable = true)
 |-- instal_inc_ratio: float (nullable = true)
 |-- purpose-onehot: vector (nullable = true)
 |-- grade-on

In [26]:
# count how many nulls in each column
# def count_null(c):
#     return sum(col(c).isNull().cast("integer")).alias(c)

# exprs = [count_null(c) for c in loan_df9.columns]
# loan_df9.agg(*exprs).toPandas().T

## 4. Modeling

### 4.1 Splitting and Formatting Data

In [27]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [28]:
 # Split the data into training and test sets (30% held out for testing)
loan_df=loan_df_final.withColumnRenamed('paid_flag','label')
loan_df.head(2)

[Row(label=0, loan_amnt=5000.0, funded_amnt=5000.0, funded_amnt_inv=4975.0, int_rate=10.649999618530273, installment=162.8699951171875, annual_inc=24000.0, dti=27.649999618530273, delinq_2yrs=0, inq_last_6mths=1, open_acc=3, pub_rec=0, revol_bal=13648.0, revol_util=83.69999694824219, total_acc=9, acc_now_delinq=0, tot_coll_amt=0.0, tot_cur_bal=0.0, total_rev_hi_lim=0, emp_len=10, loan_inc_ratio=0.2083333283662796, instal_inc_ratio=0.006786249577999115, purpose-onehot=SparseVector(265, {1: 1.0}), grade-onehot=SparseVector(7, {0: 1.0}), sub_grade-onehot=SparseVector(35, {3: 1.0}), verification_status-onehot=SparseVector(3, {0: 1.0}), home_ownership-onehot=SparseVector(4, {1: 1.0})),
 Row(label=1, loan_amnt=2500.0, funded_amnt=2500.0, funded_amnt_inv=2500.0, int_rate=15.270000457763672, installment=59.83000183105469, annual_inc=30000.0, dti=1.0, delinq_2yrs=0, inq_last_6mths=5, open_acc=3, pub_rec=0, revol_bal=1687.0, revol_util=9.399999618530273, total_acc=4, acc_now_delinq=0, tot_coll_a

In [29]:
# va=VectorAssembler(outputCol='features',inputCols=loan_df.columns[:])
va=VectorAssembler(outputCol='features',inputCols=loan_df.columns[1:])
data=va.transform(loan_df).select('features','label')

In [30]:
data.head(3)

[Row(features=SparseVector(335, {0: 5000.0, 1: 5000.0, 2: 4975.0, 3: 10.65, 4: 162.87, 5: 24000.0, 6: 27.65, 8: 1.0, 9: 3.0, 11: 13648.0, 12: 83.7, 13: 9.0, 18: 10.0, 19: 0.2083, 20: 0.0068, 22: 1.0, 286: 1.0, 296: 1.0, 328: 1.0, 332: 1.0}), label=0),
 Row(features=SparseVector(335, {0: 2500.0, 1: 2500.0, 2: 2500.0, 3: 15.27, 4: 59.83, 5: 30000.0, 6: 1.0, 8: 5.0, 9: 3.0, 11: 1687.0, 12: 9.4, 13: 4.0, 18: 1.0, 19: 0.0833, 20: 0.002, 27: 1.0, 287: 1.0, 300: 1.0, 330: 1.0, 332: 1.0}), label=1),
 Row(features=SparseVector(335, {0: 2400.0, 1: 2400.0, 2: 2400.0, 3: 15.96, 4: 84.33, 5: 12252.0, 6: 8.72, 8: 2.0, 9: 2.0, 11: 2956.0, 12: 98.5, 13: 10.0, 18: 10.0, 19: 0.1959, 20: 0.0069, 26: 1.0, 287: 1.0, 303: 1.0, 329: 1.0, 332: 1.0}), label=0)]

In [31]:
data.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(335,[0,1,2,3,4,5...|    0|
|(335,[0,1,2,3,4,5...|    1|
|(335,[0,1,2,3,4,5...|    0|
+--------------------+-----+
only showing top 3 rows



In [32]:
# Creating Training and Test Data
data_sets=data.randomSplit([0.8,0.2])
data_test=data_sets[1].cache()
data_train=data_sets[0].cache()

### 4.2 Random Forest

In [33]:
# bootstrap oversample to balance training data

data_train_paid = data_train.filter(data_train.label == 0)
data_train_not_paid = data_train.filter(data_train.label == 1)

num_samples_train = data_train.count()
half_num_samples_train = num_samples_train / 2

data_train_paid = data_train_paid.sample(True, float(half_num_samples_train) / data_train_paid.count())
data_train_not_paid = data_train_not_paid.sample(True, float(half_num_samples_train) / data_train_not_paid.count())

In [34]:
# continued
data_train = data_train_paid.union(data_train_not_paid)
data_train.count()

197211

In [35]:
print data_train.where(data_train.label == 0).count()
print data_train.where(data_train.label == 1).count()

98545
98666


In [36]:
print data_train_paid.count()
print data_train_not_paid.count()

98545
98666


In [37]:
# Fit random forest
rf = RandomForestClassifier(labelCol="label", featuresCol="features",maxDepth=15,numTrees=11)
rfmodel=rf.fit(data_train)

In [38]:
# Make predictions.
predictions = rfmodel.transform(data_test)

In [39]:
predictions.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(335,[0,1,2,3,4,5...|    0|[4.23621091248352...|[0.38511008295304...|       1.0|
|(335,[0,1,2,3,4,5...|    0|[7.59350564903186...|[0.69031869536653...|       0.0|
|(335,[0,1,2,3,4,5...|    1|[4.75292114150630...|[0.43208374013693...|       1.0|
|(335,[0,1,2,3,4,5...|    0|[4.37522674210312...|[0.39774788564573...|       1.0|
|(335,[0,1,2,3,4,5...|    0|[2.93301389328822...|[0.26663762666256...|       1.0|
|(335,[0,1,2,3,4,5...|    0|[6.28410521533159...|[0.57128229230287...|       0.0|
|(335,[0,1,2,3,4,5...|    1|[4.22569336205473...|[0.38415394200497...|       1.0|
|(335,[0,1,2,3,4,5...|    1|[5.95586569244037...|[0.54144233567639...|       0.0|
|(335,[0,1,2,3,4,5...|    0|[6.26982754587984...|[0.56998432235271...|       0.0|
|(335,[0,1,2,3,4

In [40]:
## This does not give correct error rate

# Select (prediction, true label) and compute test error
# evaluator = MulticlassClassificationEvaluator(
#     labelCol="label", predictionCol="prediction")
# accuracy = evaluator.evaluate(predictions)
# print("Test Error = %g" % (1.0 - accuracy))


In [41]:
prediction_list = predictions.collect()

In [42]:
predictionAndLabels = []
for item in prediction_list:
    predictionAndLabels.append((float(item[4]), float(item[1])))

In [43]:
metrics = MulticlassMetrics(sc.parallelize(predictionAndLabels))
print metrics.confusionMatrix().toArray()
print metrics.precision(1)
print metrics.recall(1)
print metrics.accuracy

[[ 25418.  15163.]
 [  2973.   5790.]]
0.276332744714
0.660732625813
0.632457846952


In [44]:
prediction_list

[Row(features=SparseVector(335, {0: 18500.0, 1: 18500.0, 2: 18500.0, 3: 24.5, 4: 730.68, 5: 110000.0, 6: 10.6, 7: 1.0, 8: 2.0, 9: 14.0, 10: 1.0, 11: 11988.0, 12: 46.1, 13: 22.0, 14: 1.0, 16: 297516.0, 17: 26000.0, 18: 1.0, 19: 0.1682, 20: 0.0066, 24: 1.0, 291: 1.0, 320: 1.0, 328: 1.0, 331: 1.0}), label=0, rawPrediction=DenseVector([4.2362, 6.7638]), probability=DenseVector([0.3851, 0.6149]), prediction=1.0),
 Row(features=SparseVector(335, {0: 4800.0, 1: 4800.0, 2: 4800.0, 3: 13.53, 4: 162.96, 5: 55000.0, 6: 12.79, 7: 1.0, 8: 1.0, 9: 11.0, 10: 1.0, 11: 3255.0, 12: 32.9, 13: 41.0, 15: 734.0, 16: 21704.0, 17: 9900.0, 18: 5.0, 19: 0.0873, 20: 0.003, 21: 1.0, 286: 1.0, 298: 1.0, 330: 1.0, 332: 1.0}), label=0, rawPrediction=DenseVector([7.5935, 3.4065]), probability=DenseVector([0.6903, 0.3097]), prediction=0.0),
 Row(features=SparseVector(335, {0: 9000.0, 1: 9000.0, 2: 9000.0, 3: 19.2, 4: 330.82, 5: 105000.0, 6: 22.87, 7: 1.0, 8: 3.0, 9: 14.0, 10: 1.0, 11: 15953.0, 12: 62.9, 13: 24.0, 15: 

In [45]:
rfmodel.featureImportances

SparseVector(335, {0: 0.0166, 1: 0.0219, 2: 0.0215, 3: 0.219, 4: 0.0203, 5: 0.0383, 6: 0.037, 7: 0.0074, 8: 0.0163, 9: 0.0168, 10: 0.005, 11: 0.0192, 12: 0.0348, 13: 0.0209, 14: 0.0005, 15: 0.0045, 16: 0.0228, 17: 0.0178, 18: 0.0117, 19: 0.0457, 20: 0.0587, 21: 0.0033, 22: 0.0046, 23: 0.0024, 24: 0.0027, 25: 0.0017, 26: 0.0033, 27: 0.0013, 28: 0.0015, 29: 0.0007, 30: 0.0009, 31: 0.0008, 32: 0.0008, 33: 0.0003, 34: 0.0003, 286: 0.0295, 287: 0.0155, 288: 0.0732, 289: 0.0183, 290: 0.0354, 291: 0.0214, 292: 0.0063, 293: 0.0032, 294: 0.0023, 295: 0.0018, 296: 0.0019, 297: 0.0028, 298: 0.0019, 299: 0.0015, 300: 0.0016, 301: 0.0022, 302: 0.0075, 303: 0.0024, 304: 0.0066, 305: 0.002, 306: 0.0015, 307: 0.0025, 308: 0.0036, 309: 0.0103, 310: 0.0012, 311: 0.0058, 312: 0.0144, 313: 0.0009, 314: 0.0015, 315: 0.0013, 316: 0.0013, 317: 0.0023, 318: 0.0005, 319: 0.0009, 320: 0.0016, 321: 0.0007, 322: 0.0012, 323: 0.0004, 324: 0.0004, 325: 0.0002, 326: 0.0001, 327: 0.0006, 328: 0.006, 329: 0.0053, 330:

In [46]:
importance_list = list(enumerate(rfmodel.featureImportances))
importance_list.sort(key=lambda tup: -tup[1])
top_importances = importance_list[:15]
# good_features_idx = [tup[0] for tup in top_importances]
# good_features = [data_train.columns[i] for i in good_features_idx]
top_importances


[(3, 0.21900603321130874),
 (288, 0.073177085708144782),
 (20, 0.058678702461364425),
 (19, 0.045695087257406224),
 (5, 0.038271722043343753),
 (6, 0.036996874412630831),
 (290, 0.035405248444631297),
 (12, 0.034809739748216793),
 (286, 0.02946471470879657),
 (16, 0.022817876715103123),
 (1, 0.021869881864084843),
 (2, 0.021520267494626648),
 (291, 0.021418239943219248),
 (13, 0.02085374587209405),
 (4, 0.020332153325839996)]

### 4.3 Logistic Regression

In [47]:
from pyspark.ml.classification import LogisticRegression

data_test=data_sets[1].cache()
data_train=data_sets[0].cache()

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlrModel = mlr.fit(data_train)

# Print the coefficients and intercepts for logistic regression with multinomial family
# print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
# print("Multinomial intercepts: " + str(mlrModel.interceptVector))

predictions = mlrModel.transform(data_test)

In [48]:
prediction_list = predictions.collect()

In [49]:
predictionAndLabels = []
for item in prediction_list:
    predictionAndLabels.append((float(item[4]), float(item[1])))

In [50]:
metrics = MulticlassMetrics(sc.parallelize(predictionAndLabels))
print metrics.confusionMatrix().toArray()
print metrics.precision(1)
print metrics.recall(1)
print metrics.accuracy

[[ 40581.      0.]
 [  8763.      0.]]
0.0
0.0
0.822410019455


In [51]:
predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(335,[0,1,2,3,4,5...|    0|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    0|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    1|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    0|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    0|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    0|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    1|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    1|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4,5...|    0|[0.75333096200490...|[0.81856597718063...|       0.0|
|(335,[0,1,2,3,4