In [None]:
from pyspark.sql import SparkSession as SS
# Notebook-wide Spark entry point: reuse the active session if there is one,
# otherwise start a new one.
spark = (
    SS.builder
    .getOrCreate()
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
# Labeled training corpus as (text, label) pairs, listed in row-id order:
# a document's id is simply its position in this list.
labeled_docs = [
    ("a b c d e spark", 1.0),
    ("b d", 0.0),
    ("spark f g h", 1.0),
    ("hadoop mapreduce", 0.0),
    ("b spark who", 1.0),
    ("g d a y", 0.0),
    ("spark fly", 1.0),
    ("was mapreduce", 0.0),
    ("e spark program", 1.0),
    ("a e c l", 0.0),
    ("spark compile", 1.0),
    ("hadoop software", 0.0),
    ("hansraj j", 3.0),
    ("hridhaaan j", 4.0),
]
training_data = [
    (doc_id, text, label)
    for doc_id, (text, label) in enumerate(labeled_docs)
]

training = spark.createDataFrame(training_data, schema=["id", "text", "label"])
training.show()

# Unlabeled documents to be scored by the fitted model: (id, text) only.
test_data = [
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "mapreduce spark"),
    (7, "apache hadoop"),
    (15, "priyanka j"),
]
test = spark.createDataFrame(test_data, schema=["id", "text"])
test.show()

In [12]:
# Seed for the cross-validation fold split, so that model selection gives the
# same result every time the notebook is re-run.
CV_SEED = 42

# Pipeline stages: raw text -> word tokens -> hashed term-frequency vector
# -> logistic regression classifier.
tokens = Tokenizer(inputCol="text", outputCol="textTokens")
hashingTF = HashingTF(inputCol=tokens.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)

pipeline = Pipeline(stages=[tokens, hashingTF, lr])

# Hyper-parameter grid explored by the cross-validator:
# 3 feature-vector sizes x 2 regularization strengths = 6 candidate pipelines.
params = (
    ParamGridBuilder()
    .addGrid(hashingTF.numFeatures, [10, 100, 1000])
    .addGrid(lr.regParam, [0.1, 0.01])
    .build()
)

# The training labels take more than two values (0.0, 1.0, 3.0, 4.0), so the
# candidates are ranked with a multiclass metric. A binary, ROC-based evaluator
# is not a meaningful selection criterion for this label set.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=params,
    evaluator=MulticlassClassificationEvaluator(metricName="f1"),
    numFolds=2,
    seed=CV_SEED,
)

# Fit on the labeled data: evaluates every grid point with 2-fold CV and keeps
# the best-scoring pipeline.
cvmodel = crossval.fit(training)

# Score the unlabeled test documents with the selected model.
predicted = cvmodel.transform(test)
predicted.show()



+---+---------------+------------------+--------------------+--------------------+--------------------+----------+
| id|           text|        textTokens|            features|       rawPrediction|         probability|prediction|
+---+---------------+------------------+--------------------+--------------------+--------------------+----------+
|  4|    spark i j k|  [spark, i, j, k]|(10,[5,6,9],[1.0,...|[-4.7173593050707...|[3.42025018211054...|       3.0|
|  5|          l m n|         [l, m, n]|(10,[5,6,8],[1.0,...|[0.48878749430331...|[0.04190855768144...|       1.0|
|  6|mapreduce spark|[mapreduce, spark]|(10,[3,5],[1.0,1.0])|[1.00233194766876...|[0.06379700497879...|       1.0|
|  7|  apache hadoop|  [apache, hadoop]|(10,[1,5],[1.0,1.0])|[0.73870457222680...|[0.02301841639741...|       1.0|
| 15|     priyanka j|     [priyanka, j]|(10,[4,9],[1.0,1.0])|[0.27060785504197...|[0.14290516243719...|       3.0|
+---+---------------+------------------+--------------------+-------------------