In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import *

spark = SparkSession \
    .builder \
    .appName("Session-6") \
    .getOrCreate()

In [62]:
%%capture
! wget 'https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/master/data/retail-data/all/online-retail-dataset.csv'

In [64]:
sales = spark.read.csv('online-retail-dataset.csv', header= True, inferSchema= True)
sales.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [66]:
# SQL Transformer
from pyspark.ml.feature import SQLTransformer
basicTransformation = SQLTransformer()\
.setStatement("""
SELECT sum(Quantity), count(*), CustomerID
FROM __THIS__
GROUP BY CustomerID
""")
basicTransformation.transform(sales).show(5)

+-------------+--------+----------+
|sum(Quantity)|count(1)|CustomerID|
+-------------+--------+----------+
|          265|      30|     17420|
|          133|       8|     16861|
|          567|      86|     16503|
|         3065|     302|     15727|
|         7430|     224|     17389|
+-------------+--------+----------+
only showing top 5 rows



In [68]:
# RFormula based feature transformation
from pyspark.ml.feature import RFormula

dataset = spark.createDataFrame(
    [(7, "US", 18, 1.0),
     (8, "CA", 12, 0.0),
     (9, "NZ", 15, 0.0)],
    ["id", "country", "hour", "clicked"])

formula = RFormula(
    formula="clicked ~ country + hour",
    featuresCol="features",
    labelCol="label")

output = formula.fit(dataset).transform(dataset)
output.show()

+---+-------+----+-------+--------------+-----+
| id|country|hour|clicked|      features|label|
+---+-------+----+-------+--------------+-----+
|  7|     US|  18|    1.0|[0.0,0.0,18.0]|  1.0|
|  8|     CA|  12|    0.0|[1.0,0.0,12.0]|  0.0|
|  9|     NZ|  15|    0.0|[0.0,1.0,15.0]|  0.0|
+---+-------+----+-------+--------------+-----+



## Feature Assembling


In [78]:
# .. means that we are going to the preceding directory beofre reading the corresponding path
df = spark.read.json("../data/airlines.json")
airport_delays = df.select("Airport.Code", 
                          "Statistics.# of Delays.Security", 
                          "Time.Month",
                          "Time.Year")
airport_delays.show(5)

+----+--------+-----+----+
|Code|Security|Month|Year|
+----+--------+-----+----+
| ATL|      17|    6|2003|
| BOS|       3|    6|2003|
| BWI|       8|    6|2003|
| CLT|       2|    6|2003|
| DCA|       4|    6|2003|
+----+--------+-----+----+
only showing top 5 rows



In [82]:
# Vector Assembler
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler().setInputCols(["Month", "Year"])
va.transform(airport_delays).show(5)

+----+--------+-----+----+------------------------------------+
|Code|Security|Month|Year|VectorAssembler_2ed8900e8557__output|
+----+--------+-----+----+------------------------------------+
| ATL|      17|    6|2003|                        [6.0,2003.0]|
| BOS|       3|    6|2003|                        [6.0,2003.0]|
| BWI|       8|    6|2003|                        [6.0,2003.0]|
| CLT|       2|    6|2003|                        [6.0,2003.0]|
| DCA|       4|    6|2003|                        [6.0,2003.0]|
+----+--------+-----+----+------------------------------------+
only showing top 5 rows



## Working with Continuous features



### Bucketizer



In [84]:
from pyspark.ml.feature import Bucketizer

splits = [-float("inf"), -0.5, 0.0, 0.5, float("inf")]
data = [(-999.9,), (-0.5,), (-0.3,), (0.0,), (0.2,), (999.9,)]
dataFrame = spark.createDataFrame(data, ["features"])
bucketizer = Bucketizer(splits=splits, inputCol="features", outputCol="bucketedFeatures")

# Transform original data into its bucket index.
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits())-1))
bucketedData.show()

Bucketizer output with 4 buckets
+--------+----------------+
|features|bucketedFeatures|
+--------+----------------+
|  -999.9|             0.0|
|    -0.5|             1.0|
|    -0.3|             1.0|
|     0.0|             2.0|
|     0.2|             2.0|
|   999.9|             3.0|
+--------+----------------+



### Scaling and Normalization


In [None]:
%%capture
! wget 'https://raw.githubusercontent.com/Aditya-Mankar/Diabetes-Prediction/master/diabetes.csv'

In [89]:
df = spark.read.csv("diabetes.csv", header= True)
df= df.select(*(col(c).cast("float").alias(c) for c in df.columns))
va = VectorAssembler().setInputCols(['Pregnancies', 'Glucose', 'BloodPressure'])
df= va.transform(df)
vector_column= df.columns[-1]
df_scale= df.select(col(vector_column).alias("features"))

In [91]:
# Standard Scaler
from pyspark.ml.feature import StandardScaler
sScaler = StandardScaler().setInputCol("features")
sScaler.fit(df_scale).transform(df_scale).show(5, truncate= False)

+----------------+----------------------------------------------------------+
|features        |StandardScaler_26be9854c13e__output                       |
+----------------+----------------------------------------------------------+
|[6.0,148.0,72.0]|[1.7806383732194306,4.628960915766174,3.7198138711154307] |
|[1.0,85.0,66.0] |[0.29677306220323846,2.658524850271114,3.4098293818558116]|
|[8.0,183.0,64.0]|[2.3741844976259077,5.723647618818986,3.306501218769272]  |
|[1.0,89.0,66.0] |[0.29677306220323846,2.783631902048578,3.4098293818558116]|
|[0.0,137.0,40.0]|[0.0,4.284916523378148,2.0665632617307947]                |
+----------------+----------------------------------------------------------+
only showing top 5 rows



## Working with Categorical Features

In [99]:
%%capture
! wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip'
! unzip 'bank.zip'

In [101]:
bank= spark.read.csv("bank.csv", header= True, inferSchema= True, sep= ';')
bank.select("job", "marital", "education", "y").show(5)

+-----------+-------+---------+---+
|        job|marital|education|  y|
+-----------+-------+---------+---+
| unemployed|married|  primary| no|
|   services|married|secondary| no|
| management| single| tertiary| no|
| management|married| tertiary| no|
|blue-collar|married|secondary| no|
+-----------+-------+---------+---+
only showing top 5 rows



In [110]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer
sringIndexer = StringIndexer().setInputCol("job").setOutputCol("job_x")
bank= sringIndexer.fit(bank).transform(bank)
bank.select("job", "job_x").show(10)

+-------------+-----+
|          job|job_x|
+-------------+-----+
|   unemployed|  8.0|
|     services|  4.0|
|   management|  0.0|
|   management|  0.0|
|  blue-collar|  1.0|
|   management|  0.0|
|self-employed|  6.0|
|   technician|  2.0|
| entrepreneur|  7.0|
|     services|  4.0|
+-------------+-----+
only showing top 10 rows



In [114]:
encoder = OneHotEncoder(inputCols=["job_x"],
                        outputCols=["job_ohe"])
model = encoder.fit(bank)
encoded = model.transform(bank)
encoded.select("job", "job_x", "job_ohe").show(10, truncate= False)

+-------------+-----+--------------+
|job          |job_x|job_ohe       |
+-------------+-----+--------------+
|unemployed   |8.0  |(11,[8],[1.0])|
|services     |4.0  |(11,[4],[1.0])|
|management   |0.0  |(11,[0],[1.0])|
|management   |0.0  |(11,[0],[1.0])|
|blue-collar  |1.0  |(11,[1],[1.0])|
|management   |0.0  |(11,[0],[1.0])|
|self-employed|6.0  |(11,[6],[1.0])|
|technician   |2.0  |(11,[2],[1.0])|
|entrepreneur |7.0  |(11,[7],[1.0])|
|services     |4.0  |(11,[4],[1.0])|
+-------------+-----+--------------+
only showing top 10 rows



## Working with Textual Features

In [116]:
# Sales data
# Tokenization using WhiteSpace
from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("Tokenized_Description")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(5, False)

+-----------------------------------+------------------------------------------+
|Description                        |Tokenized_Description                     |
+-----------------------------------+------------------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |[white, hanging, heart, t-light, holder]  |
|WHITE METAL LANTERN                |[white, metal, lantern]                   |
|CREAM CUPID HEARTS COAT HANGER     |[cream, cupid, hearts, coat, hanger]      |
|KNITTED UNION FLAG HOT WATER BOTTLE|[knitted, union, flag, hot, water, bottle]|
|RED WOOLLY HOTTIE WHITE HEART.     |[red, woolly, hottie, white, heart.]      |
+-----------------------------------+------------------------------------------+
only showing top 5 rows



In [121]:
# Tokenization using Hyphen 
from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
.setInputCol("Description")\
.setOutputCol("Tokenized_Description")\
.setPattern("-")\
.setToLowercase(True)
rt.transform(sales.select("Description")).show(5, False)

+-----------------------------------+-------------------------------------+
|Description                        |Tokenized_Description                |
+-----------------------------------+-------------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER |[white hanging heart t, light holder]|
|WHITE METAL LANTERN                |[white metal lantern]                |
|CREAM CUPID HEARTS COAT HANGER     |[cream cupid hearts coat hanger]     |
|KNITTED UNION FLAG HOT WATER BOTTLE|[knitted union flag hot water bottle]|
|RED WOOLLY HOTTIE WHITE HEART.     |[red woolly hottie white heart.]     |
+-----------------------------------+-------------------------------------+
only showing top 5 rows



In [127]:
# Stopwords removal
from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
.setStopWords(englishStopWords)\
.setInputCol("Tokenized_Description")
stops.transform(tokenized).show(20)

+--------------------+---------------------+-------------------------------------+
|         Description|Tokenized_Description|StopWordsRemover_b1e9d2b661bf__output|
+--------------------+---------------------+-------------------------------------+
|WHITE HANGING HEA...| [white, hanging, ...|                 [white, hanging, ...|
| WHITE METAL LANTERN| [white, metal, la...|                 [white, metal, la...|
|CREAM CUPID HEART...| [cream, cupid, he...|                 [cream, cupid, he...|
|KNITTED UNION FLA...| [knitted, union, ...|                 [knitted, union, ...|
|RED WOOLLY HOTTIE...| [red, woolly, hot...|                 [red, woolly, hot...|
|SET 7 BABUSHKA NE...| [set, 7, babushka...|                 [set, 7, babushka...|
|GLASS STAR FROSTE...| [glass, star, fro...|                 [glass, star, fro...|
|HAND WARMER UNION...| [hand, warmer, un...|                 [hand, warmer, un...|
|HAND WARMER RED P...| [hand, warmer, re...|                 [hand, warmer, re...|
|ASS

In [None]:
# Creating word combinations- NGrams
from pyspark.ml.feature import NGram
bigram = NGram().setInputCol("Tokenized_Description").setN(2)
bigram.transform(tokenized.select("Tokenized_Description")).show(False)

In [139]:
# TF-IDF
tfIdfIn = tokenized\
.where("array_contains(Tokenized_Description, 'red')")\
.limit(10)
tfIdfIn.show(10, False)

+-------------------------------+------------------------------------+
|Description                    |Tokenized_Description               |
+-------------------------------+------------------------------------+
|RED WOOLLY HOTTIE WHITE HEART. |[red, woolly, hottie, white, heart.]|
|HAND WARMER RED POLKA DOT      |[hand, warmer, red, polka, dot]     |
|RED COAT RACK PARIS FASHION    |[red, coat, rack, paris, fashion]   |
|ALARM CLOCK BAKELIKE RED       |[alarm, clock, bakelike, red]       |
|SET/2 RED RETROSPOT TEA TOWELS |[set/2, red, retrospot, tea, towels]|
|RED TOADSTOOL LED NIGHT LIGHT  |[red, toadstool, led, night, light] |
|HAND WARMER RED POLKA DOT      |[hand, warmer, red, polka, dot]     |
|EDWARDIAN PARASOL RED          |[edwardian, parasol, red]           |
|RED WOOLLY HOTTIE WHITE HEART. |[red, woolly, hottie, white, heart.]|
|EDWARDIAN PARASOL RED          |[edwardian, parasol, red]           |
+-------------------------------+------------------------------------+



In [160]:
from pyspark.ml.feature import HashingTF, IDF
tf = HashingTF()\
.setInputCol("Tokenized_Description")\
.setOutputCol("TFOut")\
.setNumFeatures(10000)

idf = IDF()\
.setInputCol("TFOut")\
.setOutputCol("IDFOut")\
.setMinDocFreq(2)

idf.fit(tf.transform(tfIdfIn))\
.transform(tf.transform(tfIdfIn))\
.select("Tokenized_Description", "TFOut", "IDFOut")\
.show(10)

+---------------------+--------------------+--------------------+
|Tokenized_Description|               TFOut|              IDFOut|
+---------------------+--------------------+--------------------+
| [red, woolly, hot...|(10000,[52,388,69...|(10000,[52,388,69...|
| [hand, warmer, re...|(10000,[52,3197,3...|(10000,[52,3197,3...|
| [red, coat, rack,...|(10000,[52,477,20...|(10000,[52,477,20...|
| [alarm, clock, ba...|(10000,[52,4995,8...|(10000,[52,4995,8...|
| [set/2, red, retr...|(10000,[52,129,29...|(10000,[52,129,29...|
| [red, toadstool, ...|(10000,[52,773,17...|(10000,[52,773,17...|
| [hand, warmer, re...|(10000,[52,3197,3...|(10000,[52,3197,3...|
| [edwardian, paras...|(10000,[52,2397,5...|(10000,[52,2397,5...|
| [red, woolly, hot...|(10000,[52,388,69...|(10000,[52,388,69...|
| [edwardian, paras...|(10000,[52,2397,5...|(10000,[52,2397,5...|
+---------------------+--------------------+--------------------+



## Feature Manipulation

In [58]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

df = spark.read.csv("diabetes.csv", header= True)
df= df.select(*(col(c).cast("float").alias(c) for c in df.columns))

# Vector Assembler to assemble all the features into a single vector
va = VectorAssembler().setInputCols(['Pregnancies', 'Glucose', 'BloodPressure'])
df= va.transform(df)
vector_column= df.columns[-1]
df_scale= df.select(col(vector_column).alias("features"))

# Standard Scaler to scale the data
sScaler = StandardScaler().setInputCol("features")
df_scale= sScaler.fit(df_scale).transform(df_scale)
scaled_column= df.columns[-1]
df_scale= df.select(col(scaled_column).alias("scaled_features"))

In [59]:
from pyspark.ml.feature import PCA
pca = PCA().setInputCol("scaled_features").setK(2)
pca= pca.fit(df_scale).transform(df_scale)
pca_features= pca.columns[-1]
pca.select(col("scaled_features"), col(pca_features).alias("principal_components")).show(5, False)

+----------------+----------------------------------------+
|scaled_features |principal_components                    |
+----------------+----------------------------------------+
|[6.0,148.0,72.0]|[-156.77339030024655,50.38873651658909] |
|[1.0,85.0,66.0] |[-93.49160365513833,53.28385715428625]  |
|[8.0,183.0,64.0]|[-190.31428823265458,37.548082093334486]|
|[1.0,89.0,66.0] |[-97.45089137263243,52.716540398185955] |
|[0.0,137.0,40.0]|[-141.2674182991563,20.15663231791124]  |
+----------------+----------------------------------------+
only showing top 5 rows



In [60]:
from pyspark.ml.feature import PolynomialExpansion
pe = PolynomialExpansion().setInputCol("scaled_features").setDegree(2)
pe.transform(df_scale).show(5, truncate= False)

+----------------+---------------------------------------------------------+
|scaled_features |PolynomialExpansion_f4e60579d79f__output                 |
+----------------+---------------------------------------------------------+
|[6.0,148.0,72.0]|[6.0,36.0,148.0,888.0,21904.0,72.0,432.0,10656.0,5184.0] |
|[1.0,85.0,66.0] |[1.0,1.0,85.0,85.0,7225.0,66.0,66.0,5610.0,4356.0]       |
|[8.0,183.0,64.0]|[8.0,64.0,183.0,1464.0,33489.0,64.0,512.0,11712.0,4096.0]|
|[1.0,89.0,66.0] |[1.0,1.0,89.0,89.0,7921.0,66.0,66.0,5874.0,4356.0]       |
|[0.0,137.0,40.0]|[0.0,0.0,137.0,0.0,18769.0,40.0,0.0,5480.0,1600.0]       |
+----------------+---------------------------------------------------------+
only showing top 5 rows



## Feature Selection

In [70]:
# Chi-square based feature selection
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([
    (7, Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0,),
    (8, Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0,),
    (9, Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0,)], ["id", "features", "clicked"])

selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="clicked")

result = selector.fit(df).transform(df)

print("ChiSqSelector output with top %d features selected" 
      % selector.getNumTopFeatures())
result.show()

ChiSqSelector output with top 1 features selected
+---+------------------+-------+----------------+
| id|          features|clicked|selectedFeatures|
+---+------------------+-------+----------------+
|  7|[0.0,0.0,18.0,1.0]|    1.0|          [18.0]|
|  8|[0.0,1.0,12.0,0.0]|    0.0|          [12.0]|
|  9|[1.0,0.0,15.0,0.1]|    0.0|          [15.0]|
+---+------------------+-------+----------------+



## Data Pipeline

In [182]:
from pyspark.ml import Pipeline
tkn = Tokenizer().setInputCol("Description").setOutputCol("Tokenized_Description")
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover().setStopWords(englishStopWords)\
.setInputCol("Tokenized_Description")\
.setOutputCol("Removed_Stopwords")

tf = HashingTF()\
.setInputCol("Removed_Stopwords")\
.setOutputCol("TFOut")\
.setNumFeatures(10000)

stages= [tkn, stops, tf]
pipeline = Pipeline(stages = stages)

In [184]:
pipelineModel = pipeline.fit(sales)
df = pipelineModel.transform(sales)
df = df.select("Tokenized_Description", "Removed_Stopwords", "TFOut").show(5)

+---------------------+--------------------+--------------------+
|Tokenized_Description|   Removed_Stopwords|               TFOut|
+---------------------+--------------------+--------------------+
| [white, hanging, ...|[white, hanging, ...|(10000,[4618,4667...|
| [white, metal, la...|[white, metal, la...|(10000,[426,671,5...|
| [cream, cupid, he...|[cream, cupid, he...|(10000,[477,6575,...|
| [knitted, union, ...|[knitted, union, ...|(10000,[2352,2589...|
| [red, woolly, hot...|[red, woolly, hot...|(10000,[52,388,69...|
+---------------------+--------------------+--------------------+
only showing top 5 rows

