In [1]:
def warn(*args, **kwargs):
    """Accept any positional or keyword arguments and do nothing.

    This function is assigned to ``warnings.warn`` later in this cell, so it
    must tolerate whatever arguments callers pass. It always returns ``None``.
    """
    return None

import warnings
# Replace warnings.warn with the no-op `warn` defined above, so every later
# call to warnings.warn(...) in this process does nothing.
warnings.warn = warn
# Also install an 'ignore' filter on the warnings module.
warnings.filterwarnings('ignore')

In [2]:
# findspark is initialised here, before the first pyspark import in the next cell.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create (or reuse) the Spark session used by the transformer examples that follow.
spark = (
    SparkSession.builder
    .appName("Feature Extraction and Transformation using Spark")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/02 17:13:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Three example sentences, one per row, keyed by an integer id.
sentence_dataframe = spark.createDataFrame(
    [
        (1, "Spark is a distributed computing system"),
        (2, "It provides interfaces for multiple languages"),
        (3, "Spark is built on top of Hadoop"),
    ],
    schema=["id", "sentences"],
)

# truncate=False prints the full sentence instead of cutting long cells short.
sentence_dataframe.show(truncate=False)

                                                                                

+---+---------------------------------------------+
|id |sentences                                    |
+---+---------------------------------------------+
|1  |Spark is a distributed computing system      |
|2  |It provides interfaces for multiple languages|
|3  |Spark is built on top of Hadoop              |
+---+---------------------------------------------+



In [6]:
from pyspark.ml.feature import Tokenizer

In [7]:
# Tokenizer that reads the "sentences" column and writes its result to a new "words" column.
tokenizer = Tokenizer(inputCol="sentences", outputCol="words")

In [8]:
# Apply the tokenizer; the output below shows each sentence as a list of
# lower-cased words in the "words" column.
token_df = tokenizer.transform(sentence_dataframe)
token_df.show(truncate=False)

+---+---------------------------------------------+----------------------------------------------------+
|id |sentences                                    |words                                               |
+---+---------------------------------------------+----------------------------------------------------+
|1  |Spark is a distributed computing system      |[spark, is, a, distributed, computing, system]      |
|2  |It provides interfaces for multiple languages|[it, provides, interfaces, for, multiple, languages]|
|3  |Spark is built on top of Hadoop              |[spark, is, built, on, top, of, hadoop]             |
+---+---------------------------------------------+----------------------------------------------------+



In [9]:
from pyspark.ml.feature import CountVectorizer

In [10]:
# Small corpus of pre-split word lists: each sentence is split on whitespace
# with str.split() before being handed to Spark.
textdata = spark.createDataFrame(
    [
        (1, "I love Spark Spark provides Python API".split()),
        (2, "I love Python Spark supports Python".split()),
        (3, "Spark solves the big problem of big data".split()),
    ],
    schema=["id", "words"],
)

textdata.show(truncate=False)

+---+-------------------------------------------------+
|id |words                                            |
+---+-------------------------------------------------+
|1  |[I, love, Spark, Spark, provides, Python, API]   |
|2  |[I, love, Python, Spark, supports, Python]       |
|3  |[Spark, solves, the, big, problem, of, big, data]|
+---+-------------------------------------------------+



In [11]:
# CountVectorizer reading the "words" column and writing to "features";
# fit() on textdata produces the model used in the next cell.
cv = CountVectorizer(inputCol="words", outputCol="features")
model = cv.fit(textdata)

                                                                                

In [13]:
# Add the "features" column to textdata. In the output below each row is a
# sparse vector of size 13; e.g. row 1 has value 2.0 at index 0 and contains
# "Spark" twice.
result = model.transform(textdata)
result.show(truncate=False)

+---+-------------------------------------------------+---------------------------------------------------+
|id |words                                            |features                                           |
+---+-------------------------------------------------+---------------------------------------------------+
|1  |[I, love, Spark, Spark, provides, Python, API]   |(13,[0,1,2,3,8,10],[2.0,1.0,1.0,1.0,1.0,1.0])      |
|2  |[I, love, Python, Spark, supports, Python]       |(13,[0,1,2,3,11],[1.0,2.0,1.0,1.0,1.0])            |
|3  |[Spark, solves, the, big, problem, of, big, data]|(13,[0,4,5,6,7,9,12],[1.0,2.0,1.0,1.0,1.0,1.0,1.0])|
+---+-------------------------------------------------+---------------------------------------------------+



In [14]:
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF

In [15]:
# Three short sentences used for the HashingTF / IDF example.
sentence_data = spark.createDataFrame(
    [
        (1, "Spark supports python"),
        (2, "Spark is fast"),
        (3, "Spark is easy"),
    ],
    schema=["id", "sentence"],
)

sentence_data.show(truncate=False)

+---+---------------------+
|id |sentence             |
+---+---------------------+
|1  |Spark supports python|
|2  |Spark is fast        |
|3  |Spark is easy        |
+---+---------------------+



In [16]:
# Rebind `tokenizer` to read the "sentence" column of sentence_data
# (the earlier tokenizer read a column named "sentences").
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
words_data = tokenizer.transform(sentence_data)
words_data.show(truncate=False)

+---+---------------------+-------------------------+
|id |sentence             |words                    |
+---+---------------------+-------------------------+
|1  |Spark supports python|[spark, supports, python]|
|2  |Spark is fast        |[spark, is, fast]        |
|3  |Spark is easy        |[spark, is, easy]        |
+---+---------------------+-------------------------+



In [17]:
# Turn each word list into a vector of size 10 (numFeatures=10) in "raw_features".
# NOTE(review): with so few buckets, different words may land on the same
# index — confirm 10 is intended for anything beyond this demo.
hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=10)
featurized_data = hashingTF.transform(words_data)
featurized_data.show(truncate=False)

+---+---------------------+-------------------------+--------------------------+
|id |sentence             |words                    |raw_features              |
+---+---------------------+-------------------------+--------------------------+
|1  |Spark supports python|[spark, supports, python]|(10,[4,6,9],[1.0,1.0,1.0])|
|2  |Spark is fast        |[spark, is, fast]        |(10,[3,6,9],[1.0,1.0,1.0])|
|3  |Spark is easy        |[spark, is, easy]        |(10,[0,6,9],[1.0,1.0,1.0])|
+---+---------------------+-------------------------+--------------------------+



In [18]:
# Fit IDF on the "raw_features" column and write the re-weighted vectors to "features".
idf = IDF(inputCol='raw_features', outputCol='features')
idf_model = idf.fit(featurized_data)
# Despite its name, tfid_model is the result of idf_model.transform(...) on
# featurized_data (it is queried with .select(...).show() in the next cell),
# not a model object.
tfid_model = idf_model.transform(featurized_data)

                                                                                

In [19]:
# Show each sentence next to its "features" vector; in the output below the
# indices shared by all three rows (6 and 9) carry weight 0.0.
tfid_model.select('sentence','features').show(truncate=False)

+---------------------+-----------------------------------------+
|sentence             |features                                 |
+---------------------+-----------------------------------------+
|Spark supports python|(10,[4,6,9],[0.6931471805599453,0.0,0.0])|
|Spark is fast        |(10,[3,6,9],[0.6931471805599453,0.0,0.0])|
|Spark is easy        |(10,[0,6,9],[0.6931471805599453,0.0,0.0])|
+---------------------+-----------------------------------------+



In [20]:
from pyspark.ml.feature import StopWordsRemover

In [21]:
# Already-tokenised input: the "sentence" column holds a list of words per row.
textData = spark.createDataFrame(
    [
        (1, ['Spark', 'is', 'an', 'open-source', 'distributed', 'computing', 'system']),
        (2, ['IT', 'has', 'interfaces', 'for', 'multiple', 'languages']),
        (3, ['It', 'has', 'a', 'wide', 'range', 'of', 'libraries', 'and', 'APIs']),
    ],
    schema=["id", "sentence"],
)

textData.show(truncate=False)

+---+------------------------------------------------------------+
|id |sentence                                                    |
+---+------------------------------------------------------------+
|1  |[Spark, is, an, open-source, distributed, computing, system]|
|2  |[IT, has, interfaces, for, multiple, languages]             |
|3  |[It, has, a, wide, range, of, libraries, and, APIs]         |
+---+------------------------------------------------------------+



In [22]:
# Remove stop words from the word lists in "sentence" and write the remaining
# words to "filtered_sentence".
remover = StopWordsRemover(inputCol='sentence', outputCol='filtered_sentence')
# textData is rebound to the transformed frame, so on a second run of this cell
# it already contains the output column and transform() would reject it.
# Dropping that column first (a no-op when it is absent) keeps the cell re-runnable.
textData = remover.transform(textData.drop(remover.getOutputCol()))
textData.show(truncate=False)

+---+------------------------------------------------------------+----------------------------------------------------+
|id |sentence                                                    |filtered_sentence                                   |
+---+------------------------------------------------------------+----------------------------------------------------+
|1  |[Spark, is, an, open-source, distributed, computing, system]|[Spark, open-source, distributed, computing, system]|
|2  |[IT, has, interfaces, for, multiple, languages]             |[interfaces, multiple, languages]                   |
|3  |[It, has, a, wide, range, of, libraries, and, APIs]         |[wide, range, libraries, APIs]                      |
+---+------------------------------------------------------------+----------------------------------------------------+



In [23]:
from pyspark.ml.feature import StringIndexer

# Six rows of color labels: yellow appears three times, red twice, blue once.
colors = spark.createDataFrame(
    [
        (0, "red"),
        (1, "red"),
        (2, "blue"),
        (3, "yellow"),
        (4, "yellow"),
        (5, "yellow"),
    ],
    schema=["id", "color"],
)

colors.show()

+---+------+
| id| color|
+---+------+
|  0|   red|
|  1|   red|
|  2|  blue|
|  3|yellow|
|  4|yellow|
|  5|yellow|
+---+------+



In [24]:
# Map each distinct color string to a numeric index in "color_index".
# In the output below yellow (3 rows) got 0.0, red (2 rows) 1.0 and blue (1 row) 2.0.
indexer = StringIndexer(inputCol='color', outputCol='color_index')
color_indexed = indexer.fit(colors).transform(colors)
color_indexed.show()

                                                                                

+---+------+-----------+
| id| color|color_index|
+---+------+-----------+
|  0|   red|        1.0|
|  1|   red|        1.0|
|  2|  blue|        2.0|
|  3|yellow|        0.0|
|  4|yellow|        0.0|
|  5|yellow|        0.0|
+---+------+-----------+



In [25]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors

In [26]:
# Three rows, each holding a dense 3-element vector in the "features" column.
data = [(1, Vectors.dense([70, 170, 17])),
        (2, Vectors.dense([80, 165, 25])),
        (3, Vectors.dense([65, 150, 135]))]
df = spark.createDataFrame(data, ["id", "features"])

df.show()

+---+------------------+
| id|          features|
+---+------------------+
|  1| [70.0,170.0,17.0]|
|  2| [80.0,165.0,25.0]|
|  3|[65.0,150.0,135.0]|
+---+------------------+



In [27]:
# Standardise the "features" vectors into "scaled_features", with both
# mean-centering (withMean) and standard-deviation scaling (withStd) enabled.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withMean=True,
    withStd=True,
)
# Fit on df, then apply the fitted model to the same frame.
scaler_model = scaler.fit(df)
scaled_df = scaler_model.transform(df)
scaled_df.show(truncate=False)

+---+------------------+-----------------------------------------------------------+
|id |features          |scaled_features                                            |
+---+------------------+-----------------------------------------------------------+
|1  |[70.0,170.0,17.0] |[-0.218217890235993,0.8006407690254367,-0.6369487984517485]|
|2  |[80.0,165.0,25.0] |[1.0910894511799611,0.3202563076101752,-0.5156252177942725]|
|3  |[65.0,150.0,135.0]|[-0.8728715609439701,-1.120897076635609,1.152574016246021] |
+---+------------------+-----------------------------------------------------------+



In [28]:
# Stop the first Spark session; a new one is created in the next cell.
spark.stop()

In [29]:
# Start a fresh session for the exercises below (the previous one was stopped
# in the preceding cell). The application name previously read "Traformation".
spark = SparkSession.builder.appName("Feature Extraction and Transformation").getOrCreate()

In [30]:
# Load proverbs.csv (relative path) using its first row as the header and
# letting Spark infer column types; the output below shows "id" and "text" columns.
text_data = spark.read.csv("proverbs.csv", header=True, inferSchema=True)
text_data.show(truncate=False)

+---+-----------------------------------------------------------+
|id |text                                                       |
+---+-----------------------------------------------------------+
|1  |When in Rome do as the Romans do.                          |
|2  |Do not judge a book by its cover.                          |
|3  |Actions speak louder than words.                           |
|4  |A picture is worth a thousand words.                       |
|5  |If at first you do not succeed try try again.              |
|6  |Practice makes perfect.                                    |
|7  |An apple a day keeps the doctor away.                      |
|8  |When the going gets tough the tough get going.             |
|9  |All is fair in love and war.                               |
|10 |Too many cooks spoil the broth.                            |
|11 |You can not make an omelette without breaking eggs.        |
|12 |The early bird catches the worm.                           |
|13 |Bette

In [32]:
# Load mpg.csv (relative path) with a header row and inferred column types,
# then preview the first 5 rows.
mpg_data = spark.read.csv("mpg.csv", header=True, inferSchema= True)
mpg_data.show(5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [34]:
# Tokenize the "text" column of the `text_data` dataframe and store the tokens
# in the column "words".

tokenizer = Tokenizer(inputCol='text', outputCol='words')
# text_data is rebound to the tokenized frame, so on a second run of this cell
# it already has a "words" column and transform() would reject it. Dropping the
# output column first (a no-op when it is absent) keeps the cell re-runnable.
text_data = tokenizer.transform(text_data.drop(tokenizer.getOutputCol()))

In [35]:
# Show the tokens per proverb. Note in the output below that trailing
# punctuation stays attached to the last word (e.g. "do.", "words.").
text_data.select("id", "words").show(truncate=False)

+---+------------------------------------------------------------------------+
|id |words                                                                   |
+---+------------------------------------------------------------------------+
|1  |[when, in, rome, do, as, the, romans, do.]                              |
|2  |[do, not, judge, a, book, by, its, cover.]                              |
|3  |[actions, speak, louder, than, words.]                                  |
|4  |[a, picture, is, worth, a, thousand, words.]                            |
|5  |[if, at, first, you, do, not, succeed, try, try, again.]                |
|6  |[practice, makes, perfect.]                                             |
|7  |[an, apple, a, day, keeps, the, doctor, away.]                          |
|8  |[when, the, going, gets, tough, the, tough, get, going.]                |
|9  |[all, is, fair, in, love, and, war.]                                    |
|10 |[too, many, cooks, spoil, the, broth.]         

In [36]:
# Count-vectorize the "words" column of text_data into a "features" column.
cv = CountVectorizer(inputCol="words", outputCol="features")
# text_data is rebound to the vectorized frame at the end of this cell, so on a
# second run it already has a "features" column, which fit() and transform()
# would both reject. Drop it first (a no-op when absent) to keep the cell re-runnable.
text_data = text_data.drop(cv.getOutputCol())
model = cv.fit(text_data)
text_data = model.transform(text_data)

In [37]:
# Show the count vector for each proverb; the output below reports vectors of size 99.
text_data.select('id','features').show(truncate=False)

+---+----------------------------------------------------------------------------+
|id |features                                                                    |
+---+----------------------------------------------------------------------------+
|1  |(99,[0,4,5,11,12,41,69,93],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])               |
|2  |(99,[1,3,4,19,20,31,44,54],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])               |
|3  |(99,[7,10,81,86,97],[1.0,1.0,1.0,1.0,1.0])                                  |
|4  |(99,[1,2,10,70,77,87],[2.0,1.0,1.0,1.0,1.0,1.0])                            |
|5  |(99,[3,4,16,17,22,35,53,62,64],[1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0])       |
|6  |(99,[24,27,34],[1.0,1.0,1.0])                                               |
|7  |(99,[0,1,13,48,57,60,63,89],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])              |
|8  |(99,[0,6,8,11,23,42,85],[2.0,1.0,2.0,1.0,1.0,1.0,1.0])                      |
|9  |(99,[2,5,37,39,61,68,96],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                     |
|10 

In [38]:
# Index the string "Origin" column into a numeric "origin_index" column.
indexer = StringIndexer(inputCol="Origin", outputCol="origin_index")

# Built from the untouched mpg_data, so re-running this cell starts from the raw frame.
mpg_transformed = indexer.fit(mpg_data).transform(mpg_data)

In [39]:
from pyspark.sql.functions import rand

# Show rows in random order so that several "Origin" values appear in the
# preview. A fixed seed is passed to rand() so the ordering does not change
# every time the cell is executed.
mpg_transformed.orderBy(rand(seed=42)).show()

+----+---------+-----------+----------+------+----------+----+--------+------------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|origin_index|
+----+---------+-----------+----------+------+----------+----+--------+------------+
|22.0|        4|      108.0|        94|  2379|      16.5|  73|Japanese|         1.0|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|         0.0|
|16.2|        6|      163.0|       133|  3410|      15.8|  78|European|         2.0|
|15.0|        6|      250.0|        72|  3432|      21.0|  75|American|         0.0|
|20.2|        6|      200.0|        88|  3060|      17.1|  81|American|         0.0|
|21.0|        6|      155.0|       107|  2472|      14.0|  73|American|         0.0|
|14.0|        8|      351.0|       153|  4129|      13.0|  72|American|         0.0|
|14.0|        8|      350.0|       165|  4209|      12.0|  71|American|         0.0|
|19.4|        8|      318.0|       140|  3735|      13.2|  78|Ame

In [41]:
from pyspark.ml.feature import VectorAssembler

# Combine the four numeric columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=["Cylinders", "Engine Disp", "Horsepower", "Weight"],
    outputCol="features",
)
# mpg_transformed is rebound to the assembled frame, so on a second run of this
# cell it already has a "features" column and transform() would reject it.
# Dropping the output column first (a no-op when it is absent) keeps the cell re-runnable.
mpg_transformed = assembler.transform(mpg_transformed.drop(assembler.getOutputCol()))

mpg_transformed.select('MPG', 'features').show(10)

+----+--------------------+
| MPG|            features|
+----+--------------------+
|15.0|[8.0,390.0,190.0,...|
|21.0|[6.0,199.0,90.0,2...|
|18.0|[6.0,199.0,97.0,2...|
|16.0|[8.0,304.0,150.0,...|
|14.0|[8.0,455.0,225.0,...|
|15.0|[8.0,350.0,165.0,...|
|18.0|[8.0,307.0,130.0,...|
|14.0|[8.0,454.0,220.0,...|
|15.0|[8.0,400.0,150.0,...|
|10.0|[8.0,307.0,200.0,...|
+----+--------------------+
only showing top 10 rows



In [42]:
# Standardise the assembled "features" vectors into "scaled_features", with
# both mean-centering (withMean) and standard-deviation scaling (withStd) enabled.
scaler = StandardScaler(inputCol="features", outputCol="scaled_features",
                        withMean=True, withStd=True)

# The result goes to a new name, so mpg_transformed itself is left unchanged.
scaled_mpg_transformed = scaler.fit(mpg_transformed).transform(mpg_transformed)

In [48]:
# Compare the raw and scaled feature vectors side by side, untruncated.
scaled_mpg_transformed.select('features', 'scaled_features').show(truncate=False)

+------------------------+-----------------------------------------------------------------------------------+
|features                |scaled_features                                                                    |
+------------------------+-----------------------------------------------------------------------------------+
|[8.0,390.0,190.0,3850.0]|[1.48205302652896,1.869079955831451,2.222084561602166,1.027093462353608]           |
|[6.0,199.0,90.0,2648.0] |[0.3095711165403583,0.043843985634147174,-0.37591456792553746,-0.38801882543985255]|
|[6.0,199.0,97.0,2774.0] |[0.3095711165403583,0.043843985634147174,-0.1940546288585982,-0.2396792678175763]  |
|[8.0,304.0,150.0,3433.0]|[1.48205302652896,1.0472459587792617,1.1828849097910845,0.5361601645084557]        |
|[8.0,455.0,225.0,3086.0]|[1.48205302652896,2.4902335582546176,3.131384256936862,0.12763773200901246]        |
|[8.0,350.0,165.0,3693.0]|[1.48205302652896,1.4868315851095026,1.57258477922024,0.8422576643639463]          |
|

In [49]:
# Stop the Spark session created for the exercises.
spark.stop()