"""
https://www.guru99.com/pyspark-tutorial.html

Following are the steps to build a Machine Learning program with PySpark:

Step 1) Basic operation with PySpark  
Step 2) Data preprocessing  
Step 3) Build a data processing pipeline  
Step 4) Build the classifier: logistic  
Step 5) Train and evaluate the model  
Step 6) Tune the hyperparameter  

"""

In [2]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark import SparkFiles

In [3]:

sc= SparkContext()

# STEP 1:
url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
sqlcontext = SQLContext(sc)

22/03/22 10:07:31 WARN Utils: Your hostname, iamhimanshu0 resolves to a loopback address: 127.0.1.1; using 192.168.43.239 instead (on interface wlo1)
22/03/22 10:07:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/22 10:07:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


"""
read csv file ->

You use inferSchema set to True to tell Spark to guess automatically the type of data. By default, it is turn to False.
"""

In [4]:
df = sqlcontext.read.csv(SparkFiles.get('/home/himanshu/Desktop/sparkLearn/getStarted/data/adult_data.csv'),
                            header=True, inferSchema=True)
df.printSchema()



root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



                                                                                

In [5]:

df.show(5, truncate=False)

+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education   |educational-num|marital-status    |occupation       |relationship|race |gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+
|25 |Private  |226802|11th        |7              |Never-married     |Machine-op-inspct|Own-child   |Black|Male  |0           |0           |40            |United-States |<=50K |
|38 |Private  |89814 |HS-grad     |9              |Married-civ-spouse|Farming-fishing  |Husband     |White|Male  |0           |0           |50            |United-States |<=50K |
|28 |Local-gov|336951|Assoc-acdm  |12             |Married-civ-spouse|Protective-serv  |Husband     |White|Mal

In [6]:
# Selecting columns
df.select('age','fnlwgt').show(5)

+---+------+
|age|fnlwgt|
+---+------+
| 25|226802|
| 38| 89814|
| 28|336951|
| 44|160323|
| 18|103497|
+---+------+
only showing top 5 rows



In [7]:

# count by group (groupby(), count())
"""
together. In the PySpark example below, you count the number of rows by the education level.
"""
df.groupBy("education").count().sort("count", ascending=True).show()



+------------+-----+
|   education|count|
+------------+-----+
|   Preschool|   83|
|     1st-4th|  247|
|     5th-6th|  509|
|   Doctorate|  594|
|        12th|  657|
|         9th|  756|
| Prof-school|  834|
|     7th-8th|  955|
|        10th| 1389|
|  Assoc-acdm| 1601|
|        11th| 1812|
|   Assoc-voc| 2061|
|     Masters| 2657|
|   Bachelors| 8025|
|Some-college|10878|
|     HS-grad|15784|
+------------+-----+



                                                                                

In [8]:
"""
Describe the data
To get a summary statistics, of the data, you can use describe(). It will compute the :

    count
    mean
    standarddeviation
    min
max
"""

df.describe().show()



+-------+------------------+-----------+------------------+------------+------------------+--------------+----------------+------------+------------------+------+------------------+------------------+------------------+--------------+------+
|summary|               age|  workclass|            fnlwgt|   education|   educational-num|marital-status|      occupation|relationship|              race|gender|      capital-gain|      capital-loss|    hours-per-week|native-country|income|
+-------+------------------+-----------+------------------+------------+------------------+--------------+----------------+------------+------------------+------+------------------+------------------+------------------+--------------+------+
|  count|             48842|      48842|             48842|       48842|             48842|         48842|           48842|       48842|             48842| 48842|             48842|             48842|             48842|         48842| 48842|
|   mean| 38.64358543876172|    

                                                                                

In [9]:
# summary statistic of only one column
df.describe('capital-gain').show()

+-------+------------------+
|summary|      capital-gain|
+-------+------------------+
|  count|             48842|
|   mean|1079.0676262233324|
| stddev| 7452.019057655416|
|    min|                 0|
|    max|             99999|
+-------+------------------+



In [21]:
df.columns

['age',
 'age_square',
 'workclass',
 'fnlwgt',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [22]:
"""
Crosstab computation

In some occasion, it can be interesting to see the descriptive statistics between two pairwise columns. 
For instance, you can count the number of people with income below or above 50k by education level. 
This operation is called a crosstab.
"""
df.crosstab('age', 'income').sort("age_income").show()


+----------+-----+----+
|age_income|<=50K|>50K|
+----------+-----+----+
|        17|  595|   0|
|        18|  862|   0|
|        19| 1050|   3|
|        20| 1112|   1|
|        21| 1090|   6|
|        22| 1161|  17|
|        23| 1307|  22|
|        24| 1162|  44|
|        25| 1119|  76|
|        26| 1068|  85|
|        27| 1117| 115|
|        28| 1101| 179|
|        29| 1025| 198|
|        30| 1031| 247|
|        31| 1050| 275|
|        32|  957| 296|
|        33| 1045| 290|
|        34|  949| 354|
|        35|  997| 340|
|        36|  948| 400|
+----------+-----+----+
only showing top 20 rows



In [23]:
df.select('educational-num').show(5)

+---------------+
|educational-num|
+---------------+
|              7|
|              9|
|             12|
|             10|
|             10|
+---------------+
only showing top 5 rows



In [13]:
"""
Drop column
There are two intuitive API to drop columns:

drop(): Drop a column
dropna(): Drop NA’s
"""
df.drop('educational-num').columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [14]:
"""
Filter data
You can use filter() to apply descriptive statistics in a subset of data. 
For instance, you can count the number of people above 40 year old
"""

df.filter(df.age >40).count()

20211

In [15]:
"""
Descriptive statistics by group
Finally, you can group data by group and compute statistical operations like the mean.
"""
df.groupby('marital-status').agg({'capital-gain':'mean'}).show()

+--------------------+------------------+
|      marital-status| avg(capital-gain)|
+--------------------+------------------+
|           Separated| 581.8424836601307|
|       Never-married|  384.382639449029|
|Married-spouse-ab...| 629.0047770700637|
|            Divorced| 793.6755615860094|
|             Widowed| 603.6442687747035|
|   Married-AF-spouse|2971.6216216216217|
|  Married-civ-spouse|1739.7006121810625|
+--------------------+------------------+



In [24]:
"""
STEP 2:- Data Preprocessing
"""

"""
For instance, you know that age is not a linear function with the income. 
When people are young, their income is usually lower than mid-age. 
After retirement, a household uses their saving, meaning a decrease in income. 
To capture this pattern, you can add a square to the age feature

Add age square

To add a new feature, you need to:

Select the column
Apply the transformation and add it to the DataFrame
"""
from pyspark.sql.functions import *

# 1 select the column
age_square = df.select(col('age')**2)

# Apply the transformation and add it to the DataFrame
df = df.withColumn("age_square", col('age')**2)

df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- age_square: double (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [17]:
"""
You can see that age_square has been successfully added to the data frame. 
You can change the order of the variables with select. 
Below, you bring age_square right after age.
"""
COLUMNS = ['age',
            'age_square',
            'workclass',
            'fnlwgt',
            'education',
            'educational-num',
            'marital-status',
            'occupation',
            'relationship',
            'race',
            'gender',
            'capital-gain',
            'capital-loss',
            'hours-per-week',
            'native-country',
            'income']
df = df.select(COLUMNS)
df.first()

Row(age=25, age_square=625.0, workclass='Private', fnlwgt=226802, education='11th', educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K')

In [18]:
# Exclude Holand-Netherlands
"""
When a group within a feature has only one observation, it brings no information to the model. On the contrary, it can lead to an error during the cross-validation.

Let’s check the origin of the household
"""
df.filter(df['native-country'] == "Holand-Netherlands").count()
df.groupby('native-country').agg({'native-country': 'count'}).sort(asc("count(native-country)")).show()



+--------------------+---------------------+
|      native-country|count(native-country)|
+--------------------+---------------------+
|  Holand-Netherlands|                    1|
|             Hungary|                   19|
|            Honduras|                   20|
|            Scotland|                   21|
|          Yugoslavia|                   23|
|Outlying-US(Guam-...|                   23|
|                Laos|                   23|
|     Trinadad&Tobago|                   27|
|            Cambodia|                   28|
|                Hong|                   30|
|            Thailand|                   30|
|             Ireland|                   37|
|              France|                   38|
|             Ecuador|                   45|
|                Peru|                   46|
|              Greece|                   49|
|           Nicaragua|                   49|
|                Iran|                   59|
|              Taiwan|                   65|
|         

                                                                                

In [19]:
# The feature native_country has only one household coming from Netherland. You exclude it.
df_remove = df.filter(df['native-country'] != 'Holand-Netherlands')

In [30]:
df.toPandas()['workclass'].value_counts()

                                                                                

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
?                    2799
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workclass, dtype: int64

In [38]:
"""
    Step 3) Build a data processing pipeline

Similar to scikit-learn, Pyspark has a pipeline API.

A pipeline is very convenient to maintain the structure of the data. You push the data into the pipeline. Inside the pipeline, various operations are done, the output is used to feed the algorithm.

For instance, one universal transformation in machine learning consists of converting a string to one hot encoder, i.e., one column by a group. One hot encoder is usually a matrix full of zeroes.

The steps to transform the data are very similar to scikit-learn. You need to:

Index the string to numeric
Create the one hot encoder
Transform the data
Two APIs do the job: StringIndexer, OneHotEncoder

First of all, you select the string column to index. The inputCol is the name of the column in the dataset. outputCol is the new name given to the transformed column.
"""
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

stringIndexer = StringIndexer(inputCol='workclass',
                             outputCol="workclass_encoded")
# fit the data and transform it
model = stringIndexer.fit(df)
indexed = model.transform(df)

In [41]:
# one hot encoder
encoder = OneHotEncoder(dropLast=False,  
                        inputCol='workclass_encoded',outputCol='workclass_vec')
ohe = encoder.fit(indexed)
encoded = ohe.transform(indexed)
encoded.show(2)

+---+----------+---------+------+---------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+-----------------+-------------+
|age|age_square|workclass|fnlwgt|education|educational-num|    marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|workclass_encoded|workclass_vec|
+---+----------+---------+------+---------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+-----------------+-------------+
| 25|     625.0|  Private|226802|     11th|              7|     Never-married|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|              0.0|(9,[0],[1.0])|
| 38|    1444.0|  Private| 89814|  HS-grad|              9|Married-civ-spouse|  Farming-fishing|     Husband|White|  Male|      

In [43]:
df.columns

['age',
 'age_square',
 'workclass',
 'fnlwgt',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

In [53]:
"""
Build the pipeline

You will build a pipeline to convert all the precise features and add them to the final dataset. The pipeline will have four operations, but feel free to add as many operations as you want.

Encode the categorical data
Index the label feature
Add continuous variable
Assemble the steps.

Each step is stored in a list named stages. This list will tell the VectorAssembler what operation to perform inside the pipeline.

"""
from pyspark.ml import Pipeline

# 1 Encoded the categorical Data
CATE_FEATURES  = ['workclass','education','marital-status',
                    'occupation','relationship','race','gender','native-country'
                    ]

# stages in our pipeline
stages = []

for categoricalCol in CATE_FEATURES:
    stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+ "Index")
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol+ "classVec"])

    stages += [stringIndexer, encoder]

In [54]:
"""
2. Index the label feature

Spark, like many other libraries, does not accept string values for the label. You convert the label feature with StringIndexer and add it to the list stages
"""

# convert lable into label indices using the stringIndexer
label_stringIdx = StringIndexer(inputCol='income', outputCol='newlabel')
stages += [label_stringIdx]


In [55]:
"""
3. Add continuous variable

The inputCols of the VectorAssembler is a list of columns. 
You can create a new list containing all the new columns. 
The code below popluate the list with encoded categorical features and the continuous features.

"""
assemblerInputs = [c + "classVec" for c in CATE_FEATURES]


In [56]:
"""
4. Assemble the steps.

Finally, you pass all the steps in the VectorAssembler
"""
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [57]:
# Create a Pipeline.
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df_remove)
model = pipelineModel.transform(df_remove)

In [59]:
model.take(1)

22/03/22 11:13:52 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Row(age=25, age_square=625.0, workclass='Private', fnlwgt=226802, education='11th', educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K', workclassIndex=0.0, workclassclassVec=SparseVector(8, {0: 1.0}), educationIndex=5.0, educationclassVec=SparseVector(15, {5: 1.0}), marital-statusIndex=1.0, marital-statusclassVec=SparseVector(6, {1: 1.0}), occupationIndex=6.0, occupationclassVec=SparseVector(14, {6: 1.0}), relationshipIndex=2.0, relationshipclassVec=SparseVector(5, {2: 1.0}), raceIndex=1.0, raceclassVec=SparseVector(4, {1: 1.0}), genderIndex=0.0, genderclassVec=SparseVector(1, {0: 1.0}), native-countryIndex=0.0, native-countryclassVec=SparseVector(40, {0: 1.0}), newlabel=0.0, features=SparseVector(93, {0: 1.0, 13: 1.0, 24: 1.0, 35: 1.0, 45: 1.0, 49: 1.0, 52: 1.0, 53: 1.0}))]

In [60]:
"""
Step 4) Build the classifier: logistic
To make the computation faster, you convert model to a DataFrame.

You need to select newlabel and features from model using map.
"""

from pyspark.ml.linalg import DenseVector

input_data = model.rdd.map(lambda x:(x['newlabel'],
                             DenseVector(x['features'])))

In [64]:
# You are ready to create the train data as a DataFrame. You use the sqlContext
df_train = sqlcontext.createDataFrame(input_data,['label','features'])

df_train.show(2)

[Stage 108:>                                                        (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[1.0,0.0,0.0,0.0,...|
|  0.0|[1.0,0.0,0.0,0.0,...|
+-----+--------------------+
only showing top 2 rows



                                                                                

In [67]:
# create a train/test set
# split dataset 80/20 with randomSplit.
train_data, test_data = df_train.randomSplit([.8,.2], seed=42)

In [68]:
# Let’s count how many people with income below/above 50k in both training and test set

train_data.groupby('label').agg({'label':'count'}).show()

                                                                                

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|       29770|
|  1.0|        9356|
+-----+------------+



                                                                                

In [69]:
test_data.groupby('label').agg({'label': 'count'}).show()

                                                                                

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|        7384|
|  1.0|        2331|
+-----+------------+



In [70]:
"""
Build the logistic regressor

Last but not least, you can build the classifier.
Pyspark has an API called LogisticRegression to perform logistic regression.

You initialize lr by indicating the label column and feature columns. 
You set a maximum of 10 iterations and add a regularization parameter with a value of 0.3. 
Note that in the next section, you will use cross-validation with a parameter grid to tune the model

"""

from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='label',
                        featuresCol='features',
                        maxIter=10,regParam=0.3
                        )
# fit the data to the model
linearModel = lr.fit(train_data)

22/03/22 11:32:12 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/03/22 11:32:12 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [71]:
# You can see the coefficients from the regression
# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(linearModel.coefficients))
print("Intercept: " + str(linearModel.intercept))


Coefficients: [-0.04296911520106022,-0.07500127692241143,-0.0018052188166324114,-0.1712193928916316,-0.09840359491358433,0.3188967776523329,0.2102870094499536,-0.15243862482715165,-0.19326577321309107,-0.07222815655246972,0.2956264829946448,0.5020601619510865,0.00024723220936940686,-0.32551332499712776,0.04783030156331672,-0.3678095657554955,-0.43457431543403924,0.7668002846892464,-0.4005381782107246,-0.2602929778654579,0.7763942021471119,-0.3704534552506413,-0.39844404039569686,0.412175785397178,-0.37484164704664835,-0.16434868933406427,-0.20346383237253668,-0.13878961180776977,-0.12882960068627802,0.2637329120333813,-0.06734394120898034,0.3714931588935748,-0.10248858004997531,0.07856522927514797,-0.3085258590808245,-0.21099796127327966,-0.17210281398684163,-0.09781700963612573,-0.3054132726682894,-0.3243710081848079,0.13623794178868157,0.12623615276978367,-0.2835044953640656,0.34920283454896434,-0.19000762565995316,-0.3345752906713306,-0.22690587392182446,0.4455049468677994,0.0322111

In [72]:
"""Step 5) Train and evaluate the model
To generate prediction for your test set,

You can use linearModel with transform() on test_data"""
# Make predictions on test data using the transform() method.
predictions = linearModel.transform(test_data)

In [73]:
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [74]:
# You are interested by the label, prediction and the probability
selected = predictions.select("label", "prediction", "probability")
selected.show(20)

[Stage 142:>                                                        (0 + 1) / 1]

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.92544916834958...|
|  0.0|       0.0|[0.85366831655152...|
|  0.0|       0.0|[0.80525594630596...|
|  0.0|       0.0|[0.93276775644190...|
|  0.0|       0.0|[0.61829114864347...|
|  0.0|       1.0|[0.28274797784679...|
|  0.0|       0.0|[0.63601188663092...|
|  0.0|       0.0|[0.76497605256408...|
|  0.0|       0.0|[0.92584683038370...|
|  0.0|       0.0|[0.69738282711455...|
|  0.0|       0.0|[0.91074917233796...|
|  0.0|       0.0|[0.86850524368325...|
|  0.0|       0.0|[0.89342913179989...|
|  0.0|       0.0|[0.84917020272080...|
|  0.0|       0.0|[0.84698302681294...|
|  0.0|       0.0|[0.81784879456327...|
|  0.0|       0.0|[0.87875595092525...|
|  0.0|       1.0|[0.46965723138919...|
|  0.0|       0.0|[0.82547651996387...|
|  0.0|       0.0|[0.87470154497818...|
+-----+----------+--------------------+
only showing top 20 rows



                                                                                

In [75]:
"""
Evaluate the model

You need to look at the accuracy metric to see how well (or bad) the model performs. 
Currently, there is no API to compute the accuracy measure in Spark. 
The default value is the ROC, receiver operating characteristic curve. 
It is a different metrics that take into account the false positive rate.

Before you look at the ROC, let’s construct the accuracy measure. 
You are more familiar with this metric. The accuracy measure is 
the sum of the correct prediction over the total number of observations.

You create a DataFrame with the label and the `prediction.
"""
cm = predictions.select('label','prediction')
# you can check the number of class in the label and the prediction
cm.groupby('label').agg({'label':'count'}).show()

                                                                                

+-----+------------+
|label|count(label)|
+-----+------------+
|  0.0|        7384|
|  1.0|        2331|
+-----+------------+



In [76]:
cm.groupby('prediction').agg({'prediction': 'count'}).show()

                                                                                

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|             8833|
|       1.0|              882|
+----------+-----------------+



In [77]:
# You can compute the accuracy by computing the count when the label are correctly classified over the total number of rows.
cm.filter(cm.label==cm.prediction).count()/cm.count()

                                                                                

0.8115285640761709

In [78]:
def accuracy_m(model): 
    predictions = model.transform(test_data)
    cm = predictions.select("label", "prediction")
    acc = cm.filter(cm.label == cm.prediction).count() / cm.count()
    print("Model accuracy: %.3f%%" % (acc * 100)) 
    
accuracy_m(model = linearModel)



Model accuracy: 81.153%


                                                                                

In [79]:
"""
ROC metrics

The module BinaryClassificationEvaluator includes the ROC measures. 
The Receiver Operating Characteristic curve is another common tool 
used with binary classification. It is very similar to the precision/recall 
curve, but instead of plotting precision versus recall, the ROC curve shows 
the true positive rate (i.e. recall) against the false positive rate. 
The false positive rate is the ratio of negative instances that are incorrectly classified as positive. It is equal to one minus the true negative rate. The true negative rate is also called specificity. Hence the ROC curve plots sensitivity (recall) versus 1 – specificity
"""

# USE ROC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
print(evaluator.evaluate(predictions))
print(evaluator.getMetricName())

                                                                                

0.8704129663636716
areaUnderROC


In [80]:
print(evaluator.evaluate(predictions))

                                                                                

0.8704147964711368


In [81]:
"""
Step 6) Tune the hyperparameter

Last but not least, you can tune the hyperparameters. Similar to scikit learn you create a parameter grid, and you add the parameters you want to tune.

To reduce the time of the computation, you only tune the regularization parameter with only two values.
"""
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder().addGrid(lr.regParam,[0.01,0.5]).build())


In [82]:
from time import *
start_time = time()

# create 5-fold crossValidator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                        evaluator=evaluator, numFolds =5
                    )

# Run corss validations
cvModel = cv.fit(train_data)
# likely take a fair amount of time
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

                                                                                

Time to train model: 377.045 seconds


In [83]:
accuracy_m(model=cvModel)



Model accuracy: 82.954%


                                                                                

In [84]:
# You can exctract the recommended parameter by chaining cvModel.bestModel with extractParamMap()
bestModel = cvModel.bestModel
bestModel.extractParamMap()

{Param(parent='LogisticRegression_ecaeab9a0ab7', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LogisticRegression_ecaeab9a0ab7', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LogisticRegression_ecaeab9a0ab7', name='family', doc='The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial'): 'auto',
 Param(parent='LogisticRegression_ecaeab9a0ab7', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LogisticRegression_ecaeab9a0ab7', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LogisticRegression_ecaeab9a0ab7', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LogisticRegression_ecaeab9a0ab7', name='maxBlockSizeInMB', doc='maximum memory in MB for s