In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3,application_1490086759585_0007,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


## Basic example on Transformer and Estimator

In [None]:
# Prepare training data from a list of (label, features) tuples.
# Dense Vectors are just NumPy arrays

training = spark.createDataFrame([
    (1, Vectors.dense([0.0, 1.1, 0.1])),
    (0, Vectors.dense([2.0, 1.0, -1.0])),
    (0, Vectors.dense([2.0, 1.3, 1.0])),
    (1, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
# Create a LogisticRegression instance. This instance is an Estimator.
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Print out the parameters, documentation, and any default values.
# print is written as a function call so the cell runs on both Python 2
# and Python 3 kernels (the bare print statement is a SyntaxError on Python 3).
print(lr.explainParams())

In [None]:
# Learn a LogisticRegression model. This uses the parameters stored in lr.
model1 = lr.fit(training)

# model1 is a Model (i.e., a transformer produced by an Estimator).
# The message is built as a single string and passed to print() as a function
# call, so the output is the same on Python 2 and Python 3 kernels.
print("Model 1's trained coefficients: " + str(model1.coefficients))

In [None]:
# We may alternatively specify parameters using a Python dictionary as a paramMap
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30  # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55})  # Specify multiple Params.

# paramMaps are plain Python dictionaries, so more entries can be added directly.
paramMap[lr.probabilityCol] = "myProbability"  # Change output column name

# Now learn a new model using the parameters collected in paramMap.
# Values passed in paramMap take precedence over the parameters stored in lr.
model2 = lr.fit(training, paramMap)
# Single-string print() call so the cell runs on both Python 2 and Python 3 kernels.
print("Model 2's trained coefficients: " + str(model2.coefficients))

In [None]:
# Three unlabeled rows, each an (id, features) pair, to score with both models.
test_rows = [
    (1, Vectors.dense([-1.0, 1.5, 1.3])),
    (2, Vectors.dense([3.0, 2.0, -0.1])),
    (3, Vectors.dense([0.0, 2.2, -1.5])),
]
test = spark.createDataFrame(test_rows, ["id", "features"])

# Score the test rows with each fitted model through Transformer.transform().
# LogisticRegression.transform will only use the 'features' column.
# model2 was fitted with lr.probabilityCol renamed, so its output carries a
# "myProbability" column where model1's output has the usual 'probability' column.
for fitted_model in (model1, model2):
    fitted_model.transform(test).show()

## Pipeline example

In [None]:
# Labeled training documents, one (id, text, label) tuple per row.
training_docs = [
    (0, "a b c d spark spark", 1),
    (1, "b d", 0),
    (2, "spark f g h", 1),
    (3, "hadoop mapreduce", 0),
]
training = spark.createDataFrame(training_docs, ["id", "text", "label"])

In [None]:
# Tokenizer lowercases the "text" column and splits it on whitespace,
# writing the resulting list of tokens to a new "words" column.
tokenizer = Tokenizer(outputCol="words", inputCol="text")

# Preview the tokenized training documents.
words_preview = tokenizer.transform(training)
words_preview.show()

In [None]:
# The DataFrame API can build the same "words" column for this (already
# lowercase) data by splitting the text on single spaces.
# To use this step inside a Pipeline it would still need to be wrapped as a Transformer.
training.withColumn('words', split(col('text'), ' ')).show()

In [None]:
# Maps a sequence of terms to their term frequencies using the hashing trick.
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

# Tokenize the training documents, then hash the tokens into a "features" vector.
a = hashingTF.transform(tokenizer.transform(training))
a.show(truncate=False)

# Inspect the feature vector of the first row.
# print is called as a function so the cell runs on both Python 2 and Python 3 kernels.
print(a.select('features').first())

In [None]:
# LogisticRegression is the Estimator that forms the last stage of the pipeline.
lr = LogisticRegression(regParam=0.001, maxIter=10)

# Assemble the pipeline: tokenize, hash to term frequencies, then classify.
pipeline_stages = [tokenizer, hashingTF, lr]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage of the pipeline on the labeled training documents.
model = pipeline.fit(training)

# Unlabeled test documents, one (id, text) tuple per row.
test_docs = [
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop"),
]
test = spark.createDataFrame(test_docs, ["id", "text"])

# Run the test documents through the fitted pipeline and display the result.
predictions = model.transform(test)
predictions.show()

In [None]:
# Example of a pipeline whose stages form a DAG instead of a simple chain.

tokenizer = Tokenizer(inputCol="text", outputCol="words")
words_col = tokenizer.getOutputCol()

# Two HashingTF stages read the same tokenized column but produce vectors of
# different sizes: hashingTF1 keeps the default numFeatures, hashingTF2 uses 1 << 10.
hashingTF1 = HashingTF(inputCol=words_col, outputCol="feature1")
hashingTF2 = HashingTF(inputCol=words_col, outputCol="feature2",
                       numFeatures=1 << 10)

# VectorAssembler is a Transformer that combines the two vectors into one.
combineFeature = VectorAssembler(outputCol="features",
                                 inputCols=["feature1", "feature2"])

lr = LogisticRegression(regParam=0.001, maxIter=10)

# Stages must be listed in topological order: each stage comes after the
# stages that produce its input columns.
dag_stages = [tokenizer, hashingTF1, hashingTF2, combineFeature, lr]
pipeline = Pipeline(stages=dag_stages)

# Fit the pipeline to the training documents.
model = pipeline.fit(training)

# Run the test documents through the fitted pipeline and display the result.
model.transform(test).show()

--------

## Example: Analyzing food inspection data using logistic regression

In [None]:
# Load the food inspection CSV from blob storage, letting Spark infer column types.
inspections_path = 'wasb://cluster@msbd.blob.core.windows.net/HdiSamples/HdiSamples/FoodInspectionData/Food_Inspections1.csv'
inspections = spark.read.csv(inspections_path, inferSchema=True)

Let's take a look at its schema:

In [None]:
# Print the column names and inferred types of the raw inspections DataFrame.
inspections.printSchema()

In [None]:
# Display a sample of rows from the raw inspections DataFrame.
inspections.show()

We now have the CSV file as a DataFrame. It has some columns we will not use. Dropping them can save memory when caching the DataFrame. Also, we should give these columns meaningful names.

In [None]:
# Keep only the columns we need and give them meaningful names.

# Mapping from column position in the CSV to the name we want to use.
columnNames = {0: "id", 1: "name", 12: "results", 13: "violations"}

# Alias each selected column (originally named '_c{position}') to its new name.
# Iterating over the sorted positions keeps the output column order
# deterministic instead of depending on dictionary ordering.
cols = [inspections[i].alias(columnNames[i]) for i in sorted(columnNames)]

# Select the renamed columns and discard rows that have no violations text.
df = inspections.select(cols).where(col('violations').isNotNull())

# Cache the trimmed DataFrame since the following cells query it repeatedly.
df.cache()
df.show()

In [None]:
# Fetch the first row of the trimmed DataFrame to inspect its fields.
df.take(1)

The output of the above cell gives us an idea of the schema of the input file; the file includes the name of every establishment, the type of establishment, the address, the date of the inspections, and the location, among other things.

Let's start to get a sense of what our dataset contains. For example, what are the different values in the `results` column?

In [None]:
# List the distinct values that appear in the 'results' column.
df.select('results').distinct().show()

In [None]:
# Count how many inspections fall under each 'results' value.
df.groupBy('results').count().show()

Let us develop a model that can guess the outcome of a food inspection, given the violations. Since logistic regression is a binary classification method, it makes sense to group our data into two categories: **Fail** and **Pass**. A "Pass w/ Conditions" is still a Pass, so when we train the model, we will consider the two results equivalent. Data with the other results ("Business Not Located", "Out of Business") are not useful so we will remove them from our training set. This should be okay since these two categories make up a very small percentage of the results anyway.

Let us go ahead and convert our existing dataframe(`df`) into a new dataframe where each inspection is represented as a label-violations pair. In our case, a label of `0.0` represents a failure, a label of `1.0` represents a success, and a label of `-1.0` represents some results besides those two. We will filter those other results out when computing the new data frame.

In [None]:
def cleanData(df):
    """Convert inspection results into (label, violations) rows.

    The 'results' column is mapped to a numeric 'label':
      * 'Fail'                        -> 0
      * 'Pass', 'Pass w/ Conditions'  -> 1
      * anything else                 -> -1
    Rows with any other result (label -1) are filtered out, so the returned
    DataFrame only contains labels 0 and 1.

    :param df: DataFrame with 'results' and 'violations' columns.
    :return: DataFrame with columns 'label' and 'violations'.
    """
    labeled = df.select(when(df.results == 'Fail', 0)
                        .when(df.results == 'Pass', 1)
                        .when(df.results == 'Pass w/ Conditions', 1)
                        .otherwise(-1)
                        .alias('label'),
                        'violations')
    # Keep only the rows that received a Fail/Pass label.
    return labeled.where('label >= 0')


labeledData = cleanData(df)

labeledData.show()

Train a logistic regression model from the input dataframe

In [None]:
# Split the labeled data 80/20 into training and test sets.
# A fixed seed makes the split (and therefore the reported accuracy)
# repeatable across runs of the notebook.
trainingData, testData = labeledData.randomSplit([0.8, 0.2], seed=42)

# Pipeline: tokenize the violations text, hash the tokens into a
# term-frequency vector, then fit a logistic regression on those features.
tokenizer = Tokenizer(inputCol="violations", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.01)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit all pipeline stages on the training split.
model = pipeline.fit(trainingData)

# Score the held-out test split and display the predictions.
predictionsDf = model.transform(testData)
predictionsDf.show()

In [None]:
# Count the scored inspections and how many of them were predicted correctly.
numInspections = predictionsDf.count()
numSuccesses = predictionsDf.filter('label == prediction').count()

# Report the raw counts, then the share of correct predictions as a percentage.
countsMessage = "There were %d inspections and there were %d successful predictions" % (numInspections, numSuccesses)
print(countsMessage)

successPercent = float(numSuccesses) / float(numInspections) * 100
print("This is a %d%% success rate" % successPercent)

## Cross-Validation

CrossValidator begins by splitting the dataset into a set of folds which are used as separate training and test datasets. E.g., with k=3 folds, CrossValidator will generate 3 (training, test) dataset pairs, each of which uses 2/3 of the data for training and 1/3 for testing. To evaluate a particular ParamMap, CrossValidator computes the average evaluation metric for the 3 Models produced by fitting the Estimator on the 3 different (training, test) dataset pairs.

After identifying the best ParamMap, CrossValidator finally re-fits the Estimator using the best ParamMap and the entire dataset.

In [None]:
# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.

paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

# A fixed seed makes the assignment of rows to folds repeatable, so the
# selected parameters do not change between runs of the notebook.
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=3,
                          seed=42)

# Run cross-validation, and choose the best set of parameters.
cvModel = crossval.fit(trainingData)

# Score the held-out test split with the best model found.
predictionsDf = cvModel.transform(testData)

# Count the scored inspections and how many of them were predicted correctly.
numSuccesses = predictionsDf.where('label == prediction').count()
numInspections = predictionsDf.count()

print("There were %d inspections and there were %d successful predictions" % (numInspections, numSuccesses))
print("This is a %d%% success rate" % (float(numSuccesses) / float(numInspections) * 100))

In [None]:
# Show the parameters of the fitted cross-validation model, with their documentation.
cvModel.explainParams()