In [1]:
import findspark 
from pyspark.sql import SparkSession 
from pyspark.sql.functions import rand 

In [2]:
spark = SparkSession.builder.appName("Feature Extraction and Transformation using Spark").getOrCreate()

# TOKENIZER 

### A tokenizer is used to break a sentence into words

In [3]:
from pyspark.ml.feature import Tokenizer 

In [4]:
sentenceDataFrame = spark.createDataFrame([
    (1, "Spark is a distributed computing system."),
    (2, "It provides interfaces for multiple languages"),
    (3, "Spark is built on top of Hadoop")
], ["id", "sentence"])

In [5]:
sentenceDataFrame.show(truncate = False)

+---+---------------------------------------------+
|id |sentence                                     |
+---+---------------------------------------------+
|1  |Spark is a distributed computing system.     |
|2  |It provides interfaces for multiple languages|
|3  |Spark is built on top of Hadoop              |
+---+---------------------------------------------+



In [6]:
tokenizer = Tokenizer(inputCol = 'sentence', outputCol = 'wods')

In [7]:
token_df = tokenizer.transform(sentenceDataFrame)

In [10]:
token_df.show(truncate=False)

+---+---------------------------------------------+----------------------------------------------------+
|id |sentence                                     |wods                                                |
+---+---------------------------------------------+----------------------------------------------------+
|1  |Spark is a distributed computing system.     |[spark, is, a, distributed, computing, system.]     |
|2  |It provides interfaces for multiple languages|[it, provides, interfaces, for, multiple, languages]|
|3  |Spark is built on top of Hadoop              |[spark, is, built, on, top, of, hadoop]             |
+---+---------------------------------------------+----------------------------------------------------+



# CountVectorizer

### countVectorizer is used to convert text into numerical format. It gives the count of each word in a given document

In [11]:
from pyspark.ml.feature import CountVectorizer

In [12]:
textdata = [(1, "I love Spark Spark provides Python API ".split()),
            (2, "I love Python Spark supports Python".split()),
            (3, "Spark solves the big problem of big data".split())]

textdata = spark.createDataFrame(textdata, ["id", "words"])

textdata.show(truncate=False)

+---+-------------------------------------------------+
|id |words                                            |
+---+-------------------------------------------------+
|1  |[I, love, Spark, Spark, provides, Python, API]   |
|2  |[I, love, Python, Spark, supports, Python]       |
|3  |[Spark, solves, the, big, problem, of, big, data]|
+---+-------------------------------------------------+



In [13]:
cv = CountVectorizer(inputCol = 'words', outputCol = 'features')

In [14]:
model = cv.fit(textdata)

In [15]:
result = model.transform(textdata)

In [16]:
result.show(truncate=False)

+---+-------------------------------------------------+----------------------------------------------------+
|id |words                                            |features                                            |
+---+-------------------------------------------------+----------------------------------------------------+
|1  |[I, love, Spark, Spark, provides, Python, API]   |(13,[0,1,2,3,5,7],[2.0,1.0,1.0,1.0,1.0,1.0])        |
|2  |[I, love, Python, Spark, supports, Python]       |(13,[0,1,2,3,12],[1.0,2.0,1.0,1.0,1.0])             |
|3  |[Spark, solves, the, big, problem, of, big, data]|(13,[0,4,6,8,9,10,11],[1.0,2.0,1.0,1.0,1.0,1.0,1.0])|
+---+-------------------------------------------------+----------------------------------------------------+



# TF-IDF

### Term Frequency - Inverse Document Frequency is used to quantify the importante of a word in a document. 

### TF-IDF is computed by multiplying the number of times a word occurs in a document by the inverse document frequency of the word.

In [17]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [18]:
sentenceData = spark.createDataFrame([
        (1, "Spark supports python"),
        (2, "Spark is fast"),
        (3, "Spark is easy")
    ], ["id", "sentence"])

sentenceData.show(truncate=False)

+---+---------------------+
|id |sentence             |
+---+---------------------+
|1  |Spark supports python|
|2  |Spark is fast        |
|3  |Spark is easy        |
+---+---------------------+



In [19]:
tokenizer = Tokenizer(inputCol = 'sentence', outputCol = 'words')
wordsData = tokenizer.transform(sentenceData)
wordsData.show(truncate = False)

+---+---------------------+-------------------------+
|id |sentence             |words                    |
+---+---------------------+-------------------------+
|1  |Spark supports python|[spark, supports, python]|
|2  |Spark is fast        |[spark, is, fast]        |
|3  |Spark is easy        |[spark, is, easy]        |
+---+---------------------+-------------------------+



In [20]:
hashingTF = HashingTF(inputCol = 'words', outputCol = 'rawFeatures', numFeatures = 10)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show(truncate=False) 

+---+---------------------+-------------------------+--------------------------+
|id |sentence             |words                    |rawFeatures               |
+---+---------------------+-------------------------+--------------------------+
|1  |Spark supports python|[spark, supports, python]|(10,[4,6,9],[1.0,1.0,1.0])|
|2  |Spark is fast        |[spark, is, fast]        |(10,[3,6,9],[1.0,1.0,1.0])|
|3  |Spark is easy        |[spark, is, easy]        |(10,[0,6,9],[1.0,1.0,1.0])|
+---+---------------------+-------------------------+--------------------------+



In [22]:
# create an IDF object 
# mention the rawFeatures column as input 
# mention the features column as output

idf = IDF(inputCol = 'rawFeatures', outputCol = 'features')
idfMdodel = idf.fit(featurizedData)
tfidData = idfMdodel.transform(featurizedData)


In [23]:
tfidData.select("sentence", "features").show(truncate = False)

+---------------------+-----------------------------------------+
|sentence             |features                                 |
+---------------------+-----------------------------------------+
|Spark supports python|(10,[4,6,9],[0.6931471805599453,0.0,0.0])|
|Spark is fast        |(10,[3,6,9],[0.6931471805599453,0.0,0.0])|
|Spark is easy        |(10,[0,6,9],[0.6931471805599453,0.0,0.0])|
+---------------------+-----------------------------------------+



# StopWordsRemover

### StopWordsRemover is a transformer that filters out stop words like 'an', 'a', 'the'.

In [24]:
from pyspark.ml.feature import StopWordsRemover 


In [25]:
textData = spark.createDataFrame([
    (1, ['Spark', 'is', 'an', 'open-source', 'distributed', 'computing', 'system']),
    (2, ['IT', 'has', 'interfaces', 'for', 'multiple', 'languages']),
    (3, ['It', 'has', 'a', 'wide', 'range', 'of', 'libraries', 'and', 'APIs'])
], ["id", "sentence"])

In [26]:
textData.show(truncate = False)

+---+------------------------------------------------------------+
|id |sentence                                                    |
+---+------------------------------------------------------------+
|1  |[Spark, is, an, open-source, distributed, computing, system]|
|2  |[IT, has, interfaces, for, multiple, languages]             |
|3  |[It, has, a, wide, range, of, libraries, and, APIs]         |
+---+------------------------------------------------------------+



In [27]:
remover = StopWordsRemover(inputCol = 'sentence', outputCol = 'filtered_sentence')
textData = remover.transform(textData)

In [28]:
textData.show(truncate = False)

+---+------------------------------------------------------------+----------------------------------------------------+
|id |sentence                                                    |filtered_sentence                                   |
+---+------------------------------------------------------------+----------------------------------------------------+
|1  |[Spark, is, an, open-source, distributed, computing, system]|[Spark, open-source, distributed, computing, system]|
|2  |[IT, has, interfaces, for, multiple, languages]             |[interfaces, multiple, languages]                   |
|3  |[It, has, a, wide, range, of, libraries, and, APIs]         |[wide, range, libraries, APIs]                      |
+---+------------------------------------------------------------+----------------------------------------------------+



# StringIndexer

### StringIndexer converts a column of strings into a column of integers

In [29]:
from pyspark.ml.feature import StringIndexer

In [30]:
colors = spark.createDataFrame(
    [(0, "red"), (1, "red"), (2, "blue"), (3, "yellow" ), (4, "yellow"), (5, "yellow")],
    ["id", "color"])

In [31]:
colors.show()

+---+------+
| id| color|
+---+------+
|  0|   red|
|  1|   red|
|  2|  blue|
|  3|yellow|
|  4|yellow|
|  5|yellow|
+---+------+



In [33]:
indexer = StringIndexer(inputCol = 'color', outputCol= 'colorIndex')
indexed = indexer.fit(colors).transform(colors)

In [35]:
indexed.show()

+---+------+----------+
| id| color|colorIndex|
+---+------+----------+
|  0|   red|       1.0|
|  1|   red|       1.0|
|  2|  blue|       2.0|
|  3|yellow|       0.0|
|  4|yellow|       0.0|
|  5|yellow|       0.0|
+---+------+----------+



# StandardScaler

### StandardScaler transformed the data so that it has a mean of 0 and a standard deviation of 1

In [36]:
from pyspark.ml.feature import StandardScaler

In [37]:
from pyspark.ml.linalg import Vectors
data = [(1, Vectors.dense([70, 170, 17])),
        (2, Vectors.dense([80, 165, 25])),
        (3, Vectors.dense([65, 150, 135]))]
df = spark.createDataFrame(data, ["id", "features"])

df.show()

+---+------------------+
| id|          features|
+---+------------------+
|  1| [70.0,170.0,17.0]|
|  2| [80.0,165.0,25.0]|
|  3|[65.0,150.0,135.0]|
+---+------------------+



In [39]:
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures', withStd = True, withMean = True)
scalerModel = scaler.fit(df)
scaledData = scalerModel.transform(df)
scaledData.show(truncate= False)

+---+------------------+-----------------------------------------------------------+
|id |features          |scaledFeatures                                             |
+---+------------------+-----------------------------------------------------------+
|1  |[70.0,170.0,17.0] |[-0.218217890235993,0.8006407690254367,-0.6369487984517485]|
|2  |[80.0,165.0,25.0] |[1.0910894511799611,0.3202563076101752,-0.5156252177942725]|
|3  |[65.0,150.0,135.0]|[-0.8728715609439701,-1.120897076635609,1.152574016246021] |
+---+------------------+-----------------------------------------------------------+



In [40]:
spark.stop()

# Exercises

In [41]:
spark = SparkSession.builder.appName("Exercise - Feature Extraction and Transformation using Spark").getOrCreate()

In [42]:
textData = spark.read.csv('proverbs.csv', header = True, inferSchema=True)

In [43]:
textData.show(truncate = False)

+---+-----------------------------------------------------------+
|id |text                                                       |
+---+-----------------------------------------------------------+
|1  |When in Rome do as the Romans do.                          |
|2  |Do not judge a book by its cover.                          |
|3  |Actions speak louder than words.                           |
|4  |A picture is worth a thousand words.                       |
|5  |If at first you do not succeed try try again.              |
|6  |Practice makes perfect.                                    |
|7  |An apple a day keeps the doctor away.                      |
|8  |When the going gets tough the tough get going.             |
|9  |All is fair in love and war.                               |
|10 |Too many cooks spoil the broth.                            |
|11 |You can not make an omelette without breaking eggs.        |
|12 |The early bird catches the worm.                           |
|13 |Bette

In [44]:
mpg_data = spark.read.csv("mpg.csv", header = True, inferSchema=True)

In [46]:
mpg_data.show(5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [50]:
tokenizer = Tokenizer(inputCol = 'text', outputCol = 'words')
textData = tokenizer.transform(textData)
textData.show(5, truncate = False)

+---+---------------------------------------------+--------------------------------------------------------+
|id |text                                         |words                                                   |
+---+---------------------------------------------+--------------------------------------------------------+
|1  |When in Rome do as the Romans do.            |[when, in, rome, do, as, the, romans, do.]              |
|2  |Do not judge a book by its cover.            |[do, not, judge, a, book, by, its, cover.]              |
|3  |Actions speak louder than words.             |[actions, speak, louder, than, words.]                  |
|4  |A picture is worth a thousand words.         |[a, picture, is, worth, a, thousand, words.]            |
|5  |If at first you do not succeed try try again.|[if, at, first, you, do, not, succeed, try, try, again.]|
+---+---------------------------------------------+--------------------------------------------------------+
only showing top 5 

In [52]:
countVector = CountVectorizer(inputCol = 'words', outputCol = 'features')
textData = countVector.fit(textData).transform(textData)
textData.show(5, truncate=False)

+---+---------------------------------------------+--------------------------------------------------------+---------------------------------------------------------------------+
|id |text                                         |words                                                   |features                                                             |
+---+---------------------------------------------+--------------------------------------------------------+---------------------------------------------------------------------+
|1  |When in Rome do as the Romans do.            |[when, in, rome, do, as, the, romans, do.]              |(99,[0,4,5,11,12,41,69,93],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])        |
|2  |Do not judge a book by its cover.            |[do, not, judge, a, book, by, its, cover.]              |(99,[1,3,4,19,20,31,44,54],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])        |
|3  |Actions speak louder than words.             |[actions, speak, louder, than, words.]                

In [54]:
stringIndex = StringIndexer(inputCol = 'Origin', outputCol = 'OriginIndex')
mpg_data = stringIndex.fit(mpg_data).transform(mpg_data)
mpg_data.show(5, truncate = False)

+----+---------+-----------+----------+------+----------+----+--------+-----------+
|MPG |Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|Origin  |OriginIndex|
+----+---------+-----------+----------+------+----------+----+--------+-----------+
|15.0|8        |390.0      |190       |3850  |8.5       |70  |American|0.0        |
|21.0|6        |199.0      |90        |2648  |15.0      |70  |American|0.0        |
|18.0|6        |199.0      |97        |2774  |15.5      |70  |American|0.0        |
|16.0|8        |304.0      |150       |3433  |12.0      |70  |American|0.0        |
|14.0|8        |455.0      |225       |3086  |10.0      |70  |American|0.0        |
+----+---------+-----------+----------+------+----------+----+--------+-----------+
only showing top 5 rows



In [55]:
mpg_data.orderBy('OriginIndex').show()

+----+---------+-----------+----------+------+----------+----+--------+-----------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|OriginIndex|
+----+---------+-----------+----------+------+----------+----+--------+-----------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|        0.0|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|        0.0|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|        0.0|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|        0.0|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|        0.0|
|15.0|        8|      350.0|       165|  3693|      11.5|  70|American|        0.0|
|18.0|        8|      307.0|       130|  3504|      12.0|  70|American|        0.0|
|14.0|        8|      454.0|       220|  4354|       9.0|  70|American|        0.0|
|15.0|        8|      400.0|       150|  3761|       9.5|  70|American|     

In [57]:
from pyspark.ml.feature import VectorAssembler 

assembler = VectorAssembler(inputCols = ['Cylinders', 'Engine Disp', 'Horsepower', 'Weight'], outputCol = 'features')
mpg_transformed_data = assembler.transform(mpg_data)
mpg_transformed_data.show(5, truncate = False)

+----+---------+-----------+----------+------+----------+----+--------+-----------+------------------------+
|MPG |Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|Origin  |OriginIndex|features                |
+----+---------+-----------+----------+------+----------+----+--------+-----------+------------------------+
|15.0|8        |390.0      |190       |3850  |8.5       |70  |American|0.0        |[8.0,390.0,190.0,3850.0]|
|21.0|6        |199.0      |90        |2648  |15.0      |70  |American|0.0        |[6.0,199.0,90.0,2648.0] |
|18.0|6        |199.0      |97        |2774  |15.5      |70  |American|0.0        |[6.0,199.0,97.0,2774.0] |
|16.0|8        |304.0      |150       |3433  |12.0      |70  |American|0.0        |[8.0,304.0,150.0,3433.0]|
|14.0|8        |455.0      |225       |3086  |10.0      |70  |American|0.0        |[8.0,455.0,225.0,3086.0]|
+----+---------+-----------+----------+------+----------+----+--------+-----------+------------------------+
only showing top 5 

In [59]:
from pyspark.ml.feature import StandardScaler

In [61]:
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeature', withStd = True, withMean=True)
scalerModel = scaler.fit(mpg_transformed_data)
scaledData = scalerModel.transform(mpg_transformed_data)

In [62]:
scaledData.show(5, truncate=False)

+----+---------+-----------+----------+------+----------+----+--------+-----------+------------------------+-----------------------------------------------------------------------------------+
|MPG |Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|Origin  |OriginIndex|features                |scaledFeature                                                                      |
+----+---------+-----------+----------+------+----------+----+--------+-----------+------------------------+-----------------------------------------------------------------------------------+
|15.0|8        |390.0      |190       |3850  |8.5       |70  |American|0.0        |[8.0,390.0,190.0,3850.0]|[1.48205302652896,1.869079955831451,2.222084561602166,1.027093462353608]           |
|21.0|6        |199.0      |90        |2648  |15.0      |70  |American|0.0        |[6.0,199.0,90.0,2648.0] |[0.3095711165403583,0.043843985634147174,-0.37591456792553746,-0.38801882543985255]|
|18.0|6        |199.0      |97     

In [63]:
spark.stop()