# Creating Features Quiz
Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [1]:
# spark modules
# Docs: https://spark.apache.org/docs/latest/api/python/pyspark.ml.html
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql import functions as sf
from pyspark.sql.types import IntegerType, StringType
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, \
Normalizer, StandardScaler, MinMaxScaler

import re

# TODOS: 
# 1) import any other libraries you might need
# 2) run the cells below to read dataset and build body length feature
# 3) write code to answer the quiz questions 

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Creating Features") \
    .getOrCreate()

### Read Dataset

In [3]:
stack_overflow_data = 'Train_onetag_small.json'

In [4]:
df = spark.read.json(stack_overflow_data)
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

### Build Body Length Feature

In [5]:
regexTokenizer = RegexTokenizer(inputCol="Body", outputCol="words", pattern="\\W")
df = regexTokenizer.transform(df)

In [6]:
body_length = udf(lambda x: len(x), IntegerType())
df = df.withColumn("BodyLength", body_length(df.words))

In [7]:
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

# Question 1
Select the question with Id = 1112. How many words does its body contain (check the BodyLength column)?

In [8]:
# TODO: write your code to answer question 1
df.createOrReplaceTempView("user_data")
spark.sql("SELECT BodyLength FROM user_data WHERE Id=1112 LIMIT 2").show()

+----------+
|BodyLength|
+----------+
|        63|
+----------+



# Question 2
Create a new column that concatenates the question title and body. Apply the same functions we used before to compute the number of words in this combined column. What's the value in this new column for Id = 5123?

In [9]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string, words: array<string>, BodyLength: int]>

In [10]:
# TODO: write your code to answer question 2
df = df.withColumn('joined_column', sf.concat(sf.col('Title'), sf.lit('_'), sf.col('Body')))

In [11]:
df.head(1) # check it works

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [12]:
# split the body text into separate words
df = RegexTokenizer(inputCol="joined_column", outputCol="joined_column_words", pattern="\\W").transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [13]:
# count the number of words in each body tag
body_length = udf(lambda x: len(x), IntegerType()) # define knew user defined function
# IntergerType() defines data type
df = df.withColumn("joined_column_words_length", body_length(df.joined_column_words)) # create the new column

In [14]:
df.createOrReplaceTempView("user_data")
spark.sql("SELECT joined_column_words_length FROM user_data WHERE Id=5123 LIMIT 2").show()

+--------------------------+
|joined_column_words_length|
+--------------------------+
|                       135|
+--------------------------+



# Create a Vector
Create a vector from the combined Title + Body length column. In the next few questions, you'll try different normalizer/scaler methods on this new column.

In [15]:
# TODO: write your code to create this vector
# this vector is created in prep for ml
assembler = VectorAssembler(inputCols=["joined_column_words_length"],
                            outputCol="NumFeatures")
df = assembler.transform(df)


In [16]:
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

# Question 3
Using the Normalizer method what's the normalized value for question Id = 512?

In [17]:
# TODO: write your code to answer question 3
scaler = Normalizer(inputCol="NumFeatures", outputCol="Scaled_Normalizer")
df = scaler.transform(df) # adds up to one

In [18]:
df.createOrReplaceTempView("user_data")
spark.sql("SELECT Scaled_Normalizer FROM user_data WHERE Id=512 LIMIT 2").show()

+-----------------+
|Scaled_Normalizer|
+-----------------+
|            [1.0]|
+-----------------+



# Question 4
Using the StandardScaler method (scaling both the mean and the standard deviation) what's the normalized value for question Id = 512?

In [19]:
# TODO: write your code to answer question 4
scaler2 = StandardScaler(inputCol="NumFeatures", outputCol="Scaled_StandardScalar",
                         withStd=True,
                         withMean=True)
scalerModel = scaler2.fit(df)  # 0 mean
df = scalerModel.transform(df)

In [20]:
df.createOrReplaceTempView("user_data")
spark.sql("SELECT Scaled_StandardScalar FROM user_data WHERE Id=512 LIMIT 2").show()

+---------------------+
|Scaled_StandardScalar|
+---------------------+
| [-0.6383651503965...|
+---------------------+



# Question 5
Using the MinMAxScaler method what's the normalized value for question Id = 512?

In [21]:
# TODO: write your code to answer question 5
mmscaler = MinMaxScaler(inputCol="NumFeatures", outputCol="Scaled_MinMaxScalar")
scalerModel = mmscaler.fit(df)
df = scalerModel.transform(df)

In [22]:
df.createOrReplaceTempView("user_data")
spark.sql("SELECT Scaled_MinMaxScalar FROM user_data WHERE Id=512 LIMIT 2").show()

+--------------------+
| Scaled_MinMaxScalar|
+--------------------+
|[0.00624916899348...|
+--------------------+

