In [1]:
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

In [2]:
spark = SparkSession.builder \
        .master('local[*]') \
        .appName('first_spark_application') \
        .getOrCreate() #if there is a active session it will get or create one

In [3]:
# cars = spark.read.csv("cars.csv", header=True, inferSchema=True, nullValue='NA')

health = spark.read.csv('./data/train.csv', header=True, inferSchema=True)

In [4]:
# Indexing categorical data
indexer1 = StringIndexer(inputCol='Vehicle_Age',
outputCol='Vehicle_Age_idx')

indexer2 = StringIndexer(inputCol='Gender',
outputCol='Gender_idx')

indexer3 = StringIndexer(inputCol='Vehicle_Damage',
outputCol='Vehicle_Damage_idx')


# now we need to combine all the features in a single list
# we are doing this because pyspark.ml what all the features in a list

assembler = VectorAssembler(inputCols=['Age', 'Driving_License', 
                                       'Region_Code', 'Previously_Insured', 
                                       'Annual_Premium', 
                                       'Policy_Sales_Channel', 'Vintage', 
                                       'Vehicle_Age_idx', 'Gender_idx','Vehicle_Damage_idx',
                                      'Response'],
                            outputCol='features')

pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, assembler])

In [5]:
healthe = pipeline.fit(health).transform(health)

In [6]:
healthe.show(2)

+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
| id|Gender|Age|Driving_License|Region_Code|Previously_Insured|Vehicle_Age|Vehicle_Damage|Annual_Premium|Policy_Sales_Channel|Vintage|Response|Vehicle_Age_idx|Gender_idx|Vehicle_Damage_idx|            features|
+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
|  1|  Male| 44|              1|       28.0|                 0|  > 2 Years|           Yes|       40454.0|                26.0|    217|       1|            2.0|       0.0|               0.0|[44.0,1.0,28.0,0....|
|  2|  Male| 76|              1|        3.0|                 0|   1-2 Year|            No|       33536.0|                26.0|    183|       0|            0

# preparing data for model fitting

In [7]:
# Specify a seed for reproducibility
health_train, health_test = healthe.randomSplit([0.8, 0.2], seed= 23 )

In [8]:
health_test.show(2)

+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
| id|Gender|Age|Driving_License|Region_Code|Previously_Insured|Vehicle_Age|Vehicle_Damage|Annual_Premium|Policy_Sales_Channel|Vintage|Response|Vehicle_Age_idx|Gender_idx|Vehicle_Damage_idx|            features|
+---+------+---+---------------+-----------+------------------+-----------+--------------+--------------+--------------------+-------+--------+---------------+----------+------------------+--------------------+
|  1|  Male| 44|              1|       28.0|                 0|  > 2 Years|           Yes|       40454.0|                26.0|    217|       1|            2.0|       0.0|               0.0|[44.0,1.0,28.0,0....|
|  8|Female| 56|              1|       28.0|                 0|   1-2 Year|           Yes|       32031.0|                26.0|     72|       1|            0

In [9]:
from pyspark.ml.regression import LinearRegression

In [10]:
regression = LinearRegression(labelCol='Response')

In [11]:
regression.fit(health_train)

LinearRegressionModel: uid=LinearRegression_bd567bd47635, numFeatures=11

In [12]:
predictions = regression.transform(health_test)

AttributeError: 'LinearRegression' object has no attribute 'transform'

In [None]:
# prediction = logistic.transform(cars_test)


In [None]:
health_train.show(2)

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Create a Decision Tree classier.
tree = DecisionTreeClassifier()
# Learn from the training data.
tree_model = tree.fit(health_train)

In [None]:
# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier

# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(health_train)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(health_test)
prediction.select('label', 'prediction', 'probability').show(5, False)