# Creating Features Quiz
Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [1]:
from pyspark.sql import SparkSession

# TODOS: 
# 1) import any other libraries you might need
# 2) run the cells below to read dataset and build body length feature
# 3) write code to answer the quiz questions 
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, \
    IDF, StringIndexer, VectorAssembler, Normalizer, StandardScaler, MinMaxScaler
from pyspark.sql.functions import udf
from pyspark.sql.functions import avg
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import concat
from pyspark.sql import functions as f

import re

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Creating Features") \
    .getOrCreate()

### Read Dataset

In [3]:
stack_overflow_data = 'Train_onetag_small.json'

In [4]:
df = spark.read.json(stack_overflow_data)
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

### Build Body Length Feature

In [5]:
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
df = regexTokenizer.transform(df)

In [6]:
body_length = udf(lambda x: len(x), IntegerType())
df = df.withColumn("BodyLength", body_length(df.words))

In [7]:
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

# Question 1
Select the question with Id = 1112. How many words does its body contain (check the BodyLength column)?

In [8]:
# TODO: write your code to answer question 1
df.select('BodyLength').where(df.Id == '5123').show()

+----------+
|BodyLength|
+----------+
|       132|
+----------+



# Question 2
Create a new column that concatenates the question title and body. Apply the same functions we used before to compute the number of words in this combined column. What's the value in this new column for Id = 5123?

In [9]:
# TODO: write your code to answer question 2
df_1 = df.withColumn('body_title',concat(f.col('Body'), f.lit('_'), f.col('Title')))
df_1 = df_1.drop('Body', 'Title', 'words')
regexTokenizer = RegexTokenizer(inputCol="body_title", outputCol="words1", pattern="\\W")
df_1= regexTokenizer.transform(df_1)
df_1= df_1.withColumn("BodyTitleLength", body_length(df_1.words1))
df_1.select('BodyTitleLength').where(df_1.Id == '5123').show()

+---------------+
|BodyTitleLength|
+---------------+
|            135|
+---------------+



# Create a Vector
Create a vector from the combined Title + Body length column. In the next few questions, you'll try different normalizer/scaler methods on this new column.

In [10]:
# TODO: write your code to create this vector
assembler = VectorAssembler(inputCols=["BodyTitleLength"], outputCol="NumFeatures")
df_1 = assembler.transform(df_1)

# Question 3
Using the Normalizer method what's the normalized value for question Id = 512?

In [11]:
# TODO: write your code to answer question 3
scaler = Normalizer(inputCol = "NumFeatures", outputCol = "ScalNumFeatures_1")
df_1 = scaler.transform(df_1)
df_1.select('ScalNumFeatures_1').where(df_1.Id == 512).show()

+-----------------+
|ScalNumFeatures_1|
+-----------------+
|            [1.0]|
+-----------------+



# Question 4
Using the StandardScaler method (scaling both the mean and the standard deviation) what's the normalized value for question Id = 512?

In [12]:
# TODO: write your code to answer question 4
scaler2 = StandardScaler(inputCol="NumFeatures", outputCol="ScaledNumFeatures_3", withMean = True, withStd=True)
scalerModel = scaler2.fit(df_1)
df_1 = scalerModel.transform(df_1)
df_1.select('ScaledNumFeatures_3').where(df_1.Id == 512).show()

+--------------------+
| ScaledNumFeatures_3|
+--------------------+
|[-0.6417775027593...|
+--------------------+



# Question 5
Using the MinMAxScaler method what's the normalized value for question Id = 512?

In [13]:
# TODO: write your code to answer question 5
scaler3 = MinMaxScaler(inputCol="NumFeatures", outputCol="ScaledNumFeatures_MM")
MinMaxscalerModel = scaler3.fit(df_1)
df_1 = MinMaxscalerModel.transform(df_1)
df_1.select('ScaledNumFeatures_MM').where(df_1.Id == 512).show()

+--------------------+
|ScaledNumFeatures_MM|
+--------------------+
|[0.00624833820792...|
+--------------------+



Linear Regression

In [14]:
num_of_tags = udf(lambda x: len(x.split(" ")), IntegerType())
df_1 = df_1.withColumn("NumTags", num_of_tags(df_1.Tags))

In [15]:
df_1.head()

Row(Id=1, Tags='php image-processing file-upload upload mime-types', oneTag='php', BodyLength=83, body_title="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n_How to check if an uploaded file is an image without mime type?", words1=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'ma

In [16]:
df_1.groupby('NumTags').count().orderBy('NumTags').collect()

[Row(NumTags=1, count=13858),
 Row(NumTags=2, count=26540),
 Row(NumTags=3, count=28769),
 Row(NumTags=4, count=19108),
 Row(NumTags=5, count=11725)]

In [17]:
df_1.groupBy('NumTags').agg(avg(col('BodyTitleLength'))).orderBy('NumTags').show()

+-------+--------------------+
|NumTags|avg(BodyTitleLength)|
+-------+--------------------+
|      1|  143.69656516091788|
|      2|  162.16220798794274|
|      3|  181.26890055267822|
|      4|  201.47456562696252|
|      5|  227.65304904051172|
+-------+--------------------+



In [18]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = ['BodyTitleLength'], outputCol = 'LengthFeatures')
df_1 = assembler.transform(df_1)

In [19]:
from pyspark.ml.regression import LinearRegression
lr =LinearRegression(maxIter = 5, regParam = 0.0, fitIntercept = False, solver = 'normal')
data = df_1.select(col('NumTags').alias('label'), col('LengthFeatures').alias('features'))

In [20]:
lr_model = lr.fit(data)

In [21]:
lr_model.summary.r2

0.44553646396469937

In [22]:
lr =LinearRegression(maxIter = 5, regParam = 0.0, fitIntercept = False, solver = 'normal')
data = df_1.select(col('NumTags').alias('label'), col('BodyTitleLength').alias('features'))
lr_model = lr.fit(data)

IllegalArgumentException: 'requirement failed: Column features must be of type struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually int.'

In [23]:
df_1.createOrReplaceTempView("df_1_table")

In [24]:
longest_words = spark.sql("SELECT MAX(BodyTitleLength) FROM df_1_table").show()

+--------------------+
|max(BodyTitleLength)|
+--------------------+
|                7532|
+--------------------+



In [25]:
shortest_words = spark.sql("SELECT MIN(BodyTitleLength) FROM df_1_table").show()

+--------------------+
|min(BodyTitleLength)|
+--------------------+
|                  10|
+--------------------+



In [26]:
mean_words = spark.sql("SELECT AVG(BodyTitleLength) FROM df_1_table").show()

+--------------------+
|avg(BodyTitleLength)|
+--------------------+
|           180.29065|
+--------------------+



In [27]:
from pyspark.sql.functions import stddev

stddev_words = spark.sql("SELECT stddev(BodyTitleLength) FROM df_1_table").show()

+--------------------------------------------+
|stddev_samp(CAST(BodyTitleLength AS DOUBLE))|
+--------------------------------------------+
|                          192.10808959478612|
+--------------------------------------------+



In [28]:
from pyspark.ml.clustering import KMeans
kmeans = KMeans().setParams(k = 5, seed = 42)
data2 = df_1.select(col('LengthFeatures').alias('features'))
model = kmeans.fit(data2)
centers = model.clusterCenters()
print("Cluster Centers: ", centers)

Cluster Centers:  [array([ 97.09557962]), array([ 1077.93843448]), array([ 502.40120937]), array([ 2731.09467456]), array([ 242.32514245])]


Model Tuning

In [29]:
train, test = df_1.randomSplit([0.9, 0.1], seed = 42)

In [32]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from itertools import product
from pyspark.ml.classification import LogisticRegression 
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

regexTokenizer = RegexTokenizer(inputCol = 'body_title', outputCol = 'words', pattern = '\\W')
cv = CountVectorizer(inputCol = 'words', outputCol = 'TF', vocabSize = 1000)
idf = IDF(inputCol = 'TF', outputCol = 'features')
indexer = StringIndexer(inputCol = 'oneTag', outputCol = 'label')
lr = LogisticRegression(maxIter = 10, regParam = 0.0, elasticNetParam = 0)
pipeline = Pipeline(stages = [regexTokenizer, cv, idf, indexer, lr])
pmodel = pipeline.fit(train)
results = pmodel.transform(test)

paramGrid = ParamGridBuilder().addGrid(cv.vocabSize, [1000, 5000]).addGrid(lr.maxIter, [10]).addGrid(lr.regParam, [0.0, 0.1]).build()
crossval = CrossValidator(estimator = pipeline, estimatorParamMaps = paramGrid, evaluator = MulticlassClassificationEvaluator(), numFolds = 3)
cvModel = crossval.fit(train)
cvModel.avgMetrics


[0.326529285016418,
 0.25279077628064905,
 0.41084712724831995,
 0.33523971479076553]