In [1]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

In [2]:
# Obtain the Spark session for this notebook. getOrCreate() reuses an
# already-active session if there is one; otherwise it creates a new one
# with the master and application name configured here.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('first_spark_application')
    .getOrCreate()
)

In [3]:
# Load the training data. The CSV's first row is used as the header and
# Spark is asked to infer the column types instead of reading all strings.
health = spark.read.csv(
    './data/train.csv',
    inferSchema=True,
    header=True,
)

In [4]:
# Index the categorical (string) columns. Each StringIndexer reads one
# column and writes its numeric index to the matching '*_idx' column.
indexer1, indexer2, indexer3 = (
    StringIndexer(inputCol=source_col, outputCol=indexed_col)
    for source_col, indexed_col in (
        ('Vehicle_Age', 'Vehicle_Age_idx'),
        ('Gender', 'Gender_idx'),
        ('Vehicle_Damage', 'Vehicle_Damage_idx'),
    )
)

# pyspark.ml wants all of the features combined into a single vector
# column, so list every input column (numeric ones first, then the indexed
# categorical ones) and assemble them into 'features'.
feature_columns = [
    'Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
    'Vehicle_Age_idx',
    'Gender_idx',
    'Vehicle_Damage_idx',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# The indexers must run before the assembler because the assembler consumes
# the '*_idx' columns they produce.
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, assembler])

In [5]:
# Fit the preprocessing pipeline on the loaded data, then apply the fitted
# pipeline to the same data to add the '*_idx' and 'features' columns.
fitted_pipeline = pipeline.fit(health)
healthe = fitted_pipeline.transform(health)

In [6]:
# Peek at the first two transformed rows to check the new columns.
healthe.show(n=2)

+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
| id|Gender|Age|Driving_License|Region_Code|Previously_Insured|Vehicle_Age|Vehicle_Damage|Annual_Premium|Policy_Sales_Channel|Vintage|Response|Vehicle_Age_idx|Gender_idx|Vehicle_Damage_idx|            features|
+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
|  1|  Male| 44|              1|       28.0|                 0|  > 2 Years|           Yes|       40454.0|                26.0|    217|       1|            2.0|       0.0|               0.0|[44.0,1.0,28.0,0....|
|  2|  Male| 76|              1|        3.0|                 0|   1-2 Year|            No|       33536.0|                26.0|    183|       0|            0

# Preparing data for model fitting

- We select the features column and the response column to get the data needed for predictions.

- We need to rename the response column to `label` for the models to work correctly in Spark.


In [7]:
# Keep only the assembled feature vector and the target, then rename the
# target column from 'Response' to 'label' for the model-fitting cells.
features_and_response = healthe.select('features', 'Response')
health_ml = features_and_response.withColumnRenamed(existing='Response', new='label')

In [8]:
# Confirm that only the 'features' and 'label' columns remain.
health_ml.show(n=2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[44.0,1.0,28.0,0....|    1|
|[76.0,1.0,3.0,0.0...|    0|
+--------------------+-----+
only showing top 2 rows



In [9]:
# Split the data into train and test sets with weights 0.8 and 0.2.
# A fixed seed is specified for reproducibility.
train_test_parts = health_ml.randomSplit(weights=[0.8, 0.2], seed=23)
health_train, health_test = train_test_parts

In [10]:
# Report the number of rows in each split (train first, then test).
train_rows = health_train.count()
test_rows = health_test.count()
print(train_rows, test_rows)

304788 76321


In [11]:
from pyspark.ml.classification import LogisticRegression

In [12]:
# Logistic-regression estimator (not yet fitted), pointed explicitly at the
# 'features' and 'label' columns prepared above.
model = LogisticRegression(
    labelCol='label',
    featuresCol='features',
)

In [14]:
# Fit the estimator on the training split only; `logistic` is the fitted model.
logistic = model.fit(dataset=health_train)

In [15]:
# Apply the fitted model to the held-out test split to get its predictions.
predection = logistic.transform(dataset=health_test)

In [17]:
# Inspect the first two scored rows, including the columns added by the model.
predection.show(n=2)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(10,[0,1,4,5,6],[...|    0|[0.99842955888785...|[0.73074969914917...|       0.0|
|(10,[0,1,4,5,6],[...|    0|[0.72326996663944...|[0.67332667918291...|       0.0|
+--------------------+-----+--------------------+--------------------+----------+
only showing top 2 rows



In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Build the evaluator with its default settings, then request the
# 'weightedPrecision' metric for this single evaluate() call through a
# param-map override, leaving the evaluator's own configuration untouched.
evaluator = MulticlassClassificationEvaluator()
metric_override = {evaluator.metricName: 'weightedPrecision'}
evaluator.evaluate(predection, metric_override)

0.7703586309135694