# Screencast Code

The following code is the same as that used in the "Text Processing" screencast. Run each code cell to see how each step transforms the data.

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, \
    IDF, StringIndexer
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import re

In [2]:
# Build (or fetch the already-running) SparkSession.
# Note: this step was left out of the screencast.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

# Read in the Data Set

In [3]:
# path of the Stack Overflow sample file (JSON) used throughout this notebook
stack_overflow_data = "Train_onetag_small.json"

In [4]:
# load the JSON file into a Spark DataFrame
df = spark.read.json(stack_overflow_data)

In [5]:
# peek at the first record; head() with no argument returns a single Row
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php')

In [6]:
# print a tabular preview of the DataFrame (long values are displayed truncated)
df.show()

+--------------------+---+--------------------+--------------------+----------------+
|                Body| Id|                Tags|               Title|          oneTag|
+--------------------+---+--------------------+--------------------+----------------+
|<p>I'd like to ch...|  1|php image-process...|How to check if a...|             php|
|<p>In my favorite...|  2|             firefox|How can I prevent...|         firefox|
|<p>I am import ma...|  3|r matlab machine-...|R Error Invalid t...|               r|
|<p>This is probab...|  4|     c# url encoding|How do I replace ...|              c#|
|<pre><code>functi...|  5|php api file-get-...|How to modify who...|             php|
|<p>I am using a m...|  6|proxy active-dire...|setting proxy in ...|active-directory|
|<p>My image is ca...|  7|           core-plot|How to draw barpl...|           other|
|<p>I've decided t...|  8|c# asp.net window...|How to fetch an X...|              c#|
|<p>Do you know of...|  9|.net javascript c...|.NET li

# Tokenization

Tokenization splits strings into separate words. Spark has a [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer) class as well as RegexTokenizer, which allows for more control over the tokenization process.

In [7]:
# Split the Body text into separate words, stored in a new "words" column.
# The pattern "\\W" is a regex matching one non-word character; it acts as the
# separator between tokens.
regexTokenizer = RegexTokenizer(
    inputCol="Body",
    outputCol="words",
    pattern="\\W",
)
df = regexTokenizer.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

# CountVectorizer

In [8]:
# Find the term frequencies of the words and store them in a new "TF" column.
#
# How to read the TF vector in the output:
# - The vocabulary is capped at 1000 terms and ordered by frequency across all
#   documents: index 0 is the most frequent term, index 999 the 1000th.
# - Each row's TF is a sparse mapping {vocabulary index: count in that document},
#   so entries follow vocabulary rank, not the order of the words in the text.
# - Example from the first row: 0: 4.0 means 'p' (index 0) occurs 4 times in that
#   Body, and 'jpg' (index 490) occurs once.
cv = CountVectorizer(
    inputCol="words",
    outputCol="TF",
    vocabSize=1000,
)
cvmodel = cv.fit(df)
df = cvmodel.transform(df)
df.take(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [13]:
# show the first 20 vocabulary terms, i.e. the most frequent ones
# (the vocabulary is ordered by descending term frequency)
cvmodel.vocabulary[0:20]

['p',
 'the',
 'i',
 'to',
 'code',
 'a',
 'gt',
 'lt',
 'is',
 'and',
 'pre',
 'in',
 'this',
 'of',
 'it',
 'that',
 'for',
 '0',
 '1',
 'have']

In [10]:
# look up the term stored at vocabulary index 832
cvmodel.vocabulary[832]

'upload'

In [14]:
# show the last 10 terms in the vocabulary (the lowest-ranked terms that were kept)
cvmodel.vocabulary[-10:]

['customer',
 'desktop',
 'buttons',
 'previous',
 'math',
 'master',
 '000',
 'blog',
 'comes',
 'wordpress']

# Inverse Document Frequency (IDF)

In [15]:
# Reweight the raw term counts in "TF" by inverse document frequency and write
# the result to a new "TFIDF" column.
idf = IDF(
    inputCol="TF",
    outputCol="TFIDF",
)
idfModel = idf.fit(df)
df = idfModel.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [16]:
# confirm which term vocabulary index 490 refers to
cvmodel.vocabulary[490]

'jpg'

# StringIndexer

In [17]:
# Encode each oneTag string as a numeric "label" column.
# With the indexer's frequency-descending ordering, 0.0 goes to the most frequent
# tag ('other' in this data), so 'php' receiving label 3.0 means it is the fourth
# most frequent tag, not the most frequent one.
indexer = StringIndexer(
    inputCol="oneTag",
    outputCol="label",
)
df = indexer.fit(df).transform(df)

In [18]:
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [27]:
# inspect the Param descriptor (its name and doc, not its current value) for label ordering
indexer.stringOrderType

Param(parent='StringIndexer_42268f1dc8b9e4cc7bc0', name='stringOrderType', doc='How to order labels of string column. The first label after ordering is assigned an index of 0. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc.')

In [29]:
# read the label ordering currently in effect for this indexer
indexer.getStringOrderType()

'frequencyDesc'

In [30]:
# list all Params defined on the indexer
indexer.params

[Param(parent='StringIndexer_42268f1dc8b9e4cc7bc0', name='handleInvalid', doc="how to handle invalid data (unseen or NULL values) in features and label column of string type. Options are 'skip' (filter out rows with invalid data), error (throw an error), or 'keep' (put invalid data in a special additional bucket, at index numLabels)."),
 Param(parent='StringIndexer_42268f1dc8b9e4cc7bc0', name='inputCol', doc='input column name.'),
 Param(parent='StringIndexer_42268f1dc8b9e4cc7bc0', name='outputCol', doc='output column name.'),
 Param(parent='StringIndexer_42268f1dc8b9e4cc7bc0', name='stringOrderType', doc='How to order labels of string column. The first label after ordering is assigned an index of 0. Supported options: frequencyDesc, frequencyAsc, alphabetDesc, alphabetAsc.')]

In [36]:
# c# is the second most frequent oneTag value (it is indexed as 1.0)
# head(1) returns a one-element list holding the first Row
df.head(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [46]:
# distinct (oneTag, label) pairs, highest label index first
(
    df.select(["oneTag", "label"])
    .dropDuplicates()
    .sort("label", ascending=False)
    .show()
)

+------------------+-----+
|            oneTag|label|
+------------------+-----+
|       filesystems|300.0|
|        encryption|299.0|
|      trigonometry|298.0|
|     jquery-mobile|297.0|
|          printing|296.0|
|               cpu|295.0|
|          integral|294.0|
|          vbscript|293.0|
|     documentation|292.0|
|       mathematica|291.0|
|            mobile|290.0|
|                f#|289.0|
|project-management|288.0|
|        algorithms|287.0|
|        formatting|286.0|
|     soft-question|285.0|
|           msbuild|284.0|
|       asp-classic|283.0|
|       linq-to-sql|282.0|
|        coldfusion|281.0|
+------------------+-----+
only showing top 20 rows



In [47]:
# distinct (oneTag, label) pairs, lowest label index first
(
    df.select(["oneTag", "label"])
    .dropDuplicates()
    .sort("label", ascending=True)
    .show()
)

+-------------+-----+
|       oneTag|label|
+-------------+-----+
|        other|  0.0|
|           c#|  1.0|
|         java|  2.0|
|          php|  3.0|
|   javascript|  4.0|
|      android|  5.0|
|          c++|  6.0|
|       iphone|  7.0|
|       python|  8.0|
|       jquery|  9.0|
|        linux| 10.0|
|ruby-on-rails| 11.0|
|      asp.net| 12.0|
|        mysql| 13.0|
|          sql| 14.0|
|          ios| 15.0|
|         html| 16.0|
|      windows| 17.0|
|            c| 18.0|
|         .net| 19.0|
+-------------+-----+
only showing top 20 rows

