# Model Tuning Quiz
Use this Jupyter notebook to find the answer to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [14]:
from pyspark.sql import SparkSession

from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, IDF, StringIndexer
from pyspark.ml.feature import VectorAssembler, Normalizer, StandardScaler, MinMaxScaler
from pyspark.sql.functions import udf, sum as Fsum, pow as Fpow, col, sqrt as Fsqrt, concat, lit
from pyspark.sql.types import IntegerType, FloatType
from pyspark.ml import Pipeline
import numpy as np
import re
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


In [10]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Creating Features") \
    .getOrCreate()

In [11]:
stack_overflow_data = 'Train_onetag_small.json'

In [15]:
df = spark.read.json(stack_overflow_data)
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

# Question
What is the accuracy of the best model trained with the parameter grid described above (and keeping all other parameters at their default value computed on the 10% untouched data?

### Step 1. Train Test Split
As a first step break your data set into 90% of training data and set aside 10%. Set random seed to `42`.

In [16]:
df = df.withColumn("Desc", concat(col("Title"), lit(' '), col("Body")))

In [17]:
train, test = df.randomSplit([0.9, 0.1], seed=42)

### Step 2. Build Pipeline

In [None]:
# On the first 90% of the data let's find the most accurate logistic regression model using 3-fold cross-validation with the following parameter grid:

# CountVectorizer vocabulary size: [1000, 5000]
# LogisticRegression regularization parameter: [0.0, 0.1]
# LogisticRegression max Iteration number: [10]
# Set the random seeds of all stages of the pipeline to 42.

In [18]:
Tokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
cv = CountVectorizer(inputCol="words", outputCol="TF", vocabSize=10000)
idf = IDF(inputCol="TF", outputCol="features") # this is a vector no need for extra vector assembler
indexer = StringIndexer(inputCol="oneTag", outputCol="label")
log_reg = LogisticRegression(maxIter=10, regParam=0.0, elasticNetParam=0)

pipeline_ = Pipeline(stages=[Tokenizer, cv, idf, indexer, log_reg])

In [19]:
pl_model_ = pipeline_.fit(train)
results_ = pl_model_.transform(test)

In [29]:
#results_.head(1) #label, raw prediction, probability (softmax), prediction

In [34]:
print(results_.filter(results_.label == results_.prediction).count())
print(results_.count())
print('Accuracy', results_.filter(results_.label == results_.prediction).count()/results_.count())

3874
9919
Accuracy 0.3905635648754915


In [36]:
evaluator = MulticlassClassificationEvaluator()
#data_test_ = test.select(col("label").alias("label"), col("DescGroup").alias("prediction"))) 

In [38]:
print(evaluator.evaluate(results_))   # defaults evaluator is f1 F1 SCORE without crossvaliation

0.3746224080568351


In [39]:
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(results_)
print('Accuracy', accuracy)

Accuracy 0.3905635648754915


In [40]:
evaluator = MulticlassClassificationEvaluator(metricName="f1")
f1 = evaluator.evaluate(results_)
print('f1_score', f1)

f1_score 0.3746224080568351


In [41]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(results_)
print('weightedPrecision', weightedPrecision)

weightedPrecision 0.40999515806826886


In [42]:
evaluator = MulticlassClassificationEvaluator(metricName="weightedRecall")
weightedRecall = evaluator.evaluate(results_)
print('weightedRecall', weightedRecall)

weightedRecall 0.3905635648754911


### Step 3. Tune Model
On the first 90% of the data let's find the most accurate logistic regression model using 3-fold cross-validation with the following parameter grid:

- CountVectorizer vocabulary size: `[1000, 5000]`
- LogisticRegression regularization parameter: `[0.0, 0.1]`
- LogisticRegression max Iteration number: `[10]`

In [47]:
paramgrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [1000, 5000]) \
        .addGrid(log_reg.regParam, [0.0, 0.1]) \
        .addGrid(log_reg.maxIter, [10]).build()

crossval = CrossValidator(estimator=pipeline_,
                          estimatorParamMaps=paramgrid,
                          evaluator=MulticlassClassificationEvaluator(), #metricName
                          numFolds=3,
                          seed=42) #can set the metrics here in multiclass, default f1 without cross valiation, 
                                   # seem to be accuracy with cross validation

### Step 4: Compute Accuracy of Best Model

In [48]:
model_cross_val = crossval.fit(train)


Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 105, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
AttributeError: 'MulticlassClassificationEvaluator' object has no attribute '_java_obj'


In [49]:
model_cross_val.avgMetrics # f1 for train

[0.3041127737507769,
 0.23193062521929064,
 0.3695808987443635,
 0.2846052258245492]

In [52]:
results_2 = model_cross_val.transform(test)

In [53]:
evaluator.evaluate(results_2) # test, look like accuracy! expected to be f1

0.3923782639378968

In [54]:
print(results_2.filter(results_2.label == results_2.prediction).count())
print(results_2.count())
print('Accuracy', results_2.filter(results_2.label == results_2.prediction).count()/results_2.count())

3892
9919
Accuracy 0.392378263937897


In [None]:
# only slightly better  cross validation defaults metrics is acccuracy ????!!!!!

In [55]:
params = [{p.name: v for p, v in m.items()} for m in model_cross_val.getEstimatorParamMaps()]

In [56]:
print(params)

[{'vocabSize': 1000, 'regParam': 0.0, 'maxIter': 10}, {'vocabSize': 1000, 'regParam': 0.1, 'maxIter': 10}, {'vocabSize': 5000, 'regParam': 0.0, 'maxIter': 10}, {'vocabSize': 5000, 'regParam': 0.1, 'maxIter': 10}]


In [59]:
model_cross_val.getEvaluator().getMetricName()

'f1'