In [1]:
# Import the PySpark module
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [2]:
##Create SparkContext
sc = SparkContext.getOrCreate()

# Create SparkSession object
spark = SparkSession.builder.master('local[*]').appName('oneHot').getOrCreate()

In [3]:
## Is not the best choose for large data sets
flights = spark.read.csv('flights.csv',sep=',',header=True,inferSchema=True,nullValue='NA')

In [4]:
flights = flights.withColumnRenamed('mile','km')

In [5]:
flights.head()

Row(mon=11, dom=20, dow=6, carrier='US', flight=19, org='JFK', km=2153, depart=9.48, duration=351, delay=None)

### Pipeline stages

In [6]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Convert categorical strings to index values
indexer = StringIndexer(inputCol='org', outputCol='org_idx')

# One-hot encode index values
onehot = OneHotEncoderEstimator(
    inputCols=['org_idx','dow'],
    outputCols=['org_dummy','dow_dummy']
)

# Assemble predictors into a single column
assembler = VectorAssembler(inputCols=['km','org_dummy','dow_dummy'], outputCol='features')

# A linear regression object
regression = LinearRegression(labelCol='duration')

In [7]:
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)

### Pipeline model

In [8]:
# Import class for creating a pipeline
from pyspark.ml import Pipeline 

# Construct a pipeline
pipeline = Pipeline(stages =[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)