# steps to do

- read file in memory
- find categorical columns and convert them to numeric indices
    - Gender
    - Vehicle_Age
    - Vehicle_Damage
- combine all the features into a single feature list

In [1]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

ModuleNotFoundError: No module named 'pyspark'

In [2]:
# Build a Spark session against the 'local[*]' master.
# getOrCreate() reuses the active session when one exists, otherwise it creates one.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('first_spark_application')
    .getOrCreate()
)

In [3]:
# Load the training CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
health = spark.read.csv(
    './data/train.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the column names and the types Spark inferred while reading the CSV.
health.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Driving_License: integer (nullable = true)
 |-- Region_Code: double (nullable = true)
 |-- Previously_Insured: integer (nullable = true)
 |-- Vehicle_Age: string (nullable = true)
 |-- Vehicle_Damage: string (nullable = true)
 |-- Annual_Premium: double (nullable = true)
 |-- Policy_Sales_Channel: double (nullable = true)
 |-- Vintage: integer (nullable = true)
 |-- Response: integer (nullable = true)



In [5]:
# Preview the first 5 rows of the raw data.
health.show(5)

+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+
| id|Gender|Age|Driving_License|Region_Code|Previously_Insured|Vehicle_Age|Vehicle_Damage|Annual_Premium|Policy_Sales_Channel|Vintage|Response|
+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+
|  1|  Male| 44|              1|       28.0|                 0|  > 2 Years|           Yes|       40454.0|                26.0|    217|       1|
|  2|  Male| 76|              1|        3.0|                 0|   1-2 Year|            No|       33536.0|                26.0|    183|       0|
|  3|  Male| 47|              1|       28.0|                 0|  > 2 Years|           Yes|       38294.0|                26.0|     27|       1|
|  4|  Male| 21|              1|       11.0|                 1|   < 1 Year|            No|       28619.0|               152.0|    203|  

In [6]:
# The 'id' column is not used in any later step, so remove it from the frame.
health = health.drop('id')

In [7]:
# Index each categorical (string) column into a numeric '<column>_idx' column.
#
# One loop replaces three copy-pasted stanzas. The guard skips columns that
# have already been indexed: StringIndexer refuses to write to an output
# column that already exists, so without the guard re-running this cell on the
# already-transformed `health` frame would raise instead of being a no-op.
for categorical_col in ('Vehicle_Age', 'Gender', 'Vehicle_Damage'):
    indexed_col = categorical_col + '_idx'
    if indexed_col in health.columns:
        continue
    indexer = StringIndexer(inputCol=categorical_col, outputCol=indexed_col)
    # fit() learns the label -> index mapping; transform() appends the index column.
    health = indexer.fit(health).transform(health)


In [8]:
# Preview the first 5 rows, now including the three '*_idx' columns.
health.show(5)

+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+
|Gender|Age|Driving_License|Region_Code|Previously_Insured|Vehicle_Age|Vehicle_Damage|Annual_Premium|Policy_Sales_Channel|Vintage|Response|Vehicle_Age_idx|Gender_idx|Vehicle_Damage_idx|
+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+
|  Male| 44|              1|       28.0|                 0|  > 2 Years|           Yes|       40454.0|                26.0|    217|       1|            2.0|       0.0|               0.0|
|  Male| 76|              1|        3.0|                 0|   1-2 Year|            No|       33536.0|                26.0|    183|       0|            0.0|       0.0|               1.0|
|  Male| 47|              1|       28.0|                 0|  > 2 Years

In [11]:
# List (column, type) pairs to confirm the new '*_idx' columns are numeric.
health.dtypes

[('Gender', 'string'),
 ('Age', 'int'),
 ('Driving_License', 'int'),
 ('Region_Code', 'double'),
 ('Previously_Insured', 'int'),
 ('Vehicle_Age', 'string'),
 ('Vehicle_Damage', 'string'),
 ('Annual_Premium', 'double'),
 ('Policy_Sales_Channel', 'double'),
 ('Vintage', 'int'),
 ('Response', 'int'),
 ('Vehicle_Age_idx', 'double'),
 ('Gender_idx', 'double'),
 ('Vehicle_Damage_idx', 'double')]

In [16]:
# pyspark.ml wants all the features packed together in a single column,
# so collect the numeric and indexed columns into one 'features' vector.
feature_columns = [
    'Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
    'Vehicle_Age_idx',
    'Gender_idx',
    'Vehicle_Damage_idx',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
health = assembler.transform(health)

In [17]:
# List (column, type) pairs to confirm the assembled 'features' column was added.
health.dtypes

[('Gender', 'string'),
 ('Age', 'int'),
 ('Driving_License', 'int'),
 ('Region_Code', 'double'),
 ('Previously_Insured', 'int'),
 ('Vehicle_Age', 'string'),
 ('Vehicle_Damage', 'string'),
 ('Annual_Premium', 'double'),
 ('Policy_Sales_Channel', 'double'),
 ('Vintage', 'int'),
 ('Response', 'int'),
 ('Vehicle_Age_idx', 'double'),
 ('Gender_idx', 'double'),
 ('Vehicle_Damage_idx', 'double'),
 ('features', 'vector')]

In [18]:
# Show a single row to inspect the assembled 'features' column.
health.show(1)

+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
|Gender|Age|Driving_License|Region_Code|Previously_Insured|Vehicle_Age|Vehicle_Damage|Annual_Premium|Policy_Sales_Channel|Vintage|Response|Vehicle_Age_idx|Gender_idx|Vehicle_Damage_idx|            features|
+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
|  Male| 44|              1|       28.0|                 0|  > 2 Years|           Yes|       40454.0|                26.0|    217|       1|            2.0|       0.0|               0.0|[44.0,1.0,28.0,0....|
+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------

# Creating a pipeline for all the above steps

In [21]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

In [2]:
# Obtain the Spark session for the pipeline section.
# getOrCreate() hands back the active session if there is one, else starts a new one.
session_builder = SparkSession.builder.master('local[*]')
session_builder = session_builder.appName('first_spark_application')
spark = session_builder.getOrCreate()

In [27]:
# Re-read the raw training CSV so the pipeline starts from untransformed data.
# The header row provides column names; column types are inferred.
health = spark.read.csv(
    './data/train.csv',
    header=True,
    inferSchema=True,
)

In [28]:
# Stages 1-3: index each categorical (string) column into a numeric '*_idx' column.
indexer1 = StringIndexer(inputCol='Vehicle_Age', outputCol='Vehicle_Age_idx')
indexer2 = StringIndexer(inputCol='Gender', outputCol='Gender_idx')
indexer3 = StringIndexer(inputCol='Vehicle_Damage', outputCol='Vehicle_Damage_idx')

# Stage 4: pyspark.ml wants all the features packed into a single vector column.
# 'Response' is intentionally NOT part of the inputs: it appears to be the target
# column, and putting it in the feature vector would leak the label into the
# predictors. This list now matches the step-by-step assembler earlier in the
# notebook, which this pipeline is meant to reproduce.
assembler = VectorAssembler(
    inputCols=[
        'Age', 'Driving_License',
        'Region_Code', 'Previously_Insured',
        'Annual_Premium',
        'Policy_Sales_Channel', 'Vintage',
        'Vehicle_Age_idx', 'Gender_idx', 'Vehicle_Damage_idx',
    ],
    outputCol='features',
)

# Chain the stages; they run in list order, so the indexers must come before the assembler
# that consumes their '*_idx' output columns.
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, assembler])

In [29]:
# Fit all pipeline stages on the data, then apply the fitted stages to that same data.
pipeline_model = pipeline.fit(health)
healthe = pipeline_model.transform(health)

In [30]:
# Preview the first 5 rows of the pipeline output, including the 'features' column.
healthe.show(5)

+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
| id|Gender|Age|Driving_License|Region_Code|Previously_Insured|Vehicle_Age|Vehicle_Damage|Annual_Premium|Policy_Sales_Channel|Vintage|Response|Vehicle_Age_idx|Gender_idx|Vehicle_Damage_idx|            features|
+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
|  1|  Male| 44|              1|       28.0|                 0|  > 2 Years|           Yes|       40454.0|                26.0|    217|       1|            2.0|       0.0|               0.0|[44.0,1.0,28.0,0....|
|  2|  Male| 76|              1|        3.0|                 0|   1-2 Year|            No|       33536.0|                26.0|    183|       0|            0