# What is Spark MLlib?

Apache Spark’s Machine Learning Library (MLlib) is designed for simplicity, scalability, and easy integration with other tools. With the scalability, language compatibility, and speed of Spark, data scientists can focus on their data problems and models instead of solving the complexities surrounding distributed data (such as infrastructure, configurations, and so on). Built on top of Spark, MLlib is a scalable machine learning library consisting of common learning algorithms and utilities, including classification, regression, clustering, collaborative filtering, dimensionality reduction, and underlying optimization primitives. Spark MLlib seamlessly integrates with other Spark components such as Spark SQL, Spark Streaming, and DataFrames and is installed in the Databricks runtime. The library is usable in Java, Scala, and Python as part of Spark applications, so that you can include it in complete workflows. MLlib allows for preprocessing, munging, training of models, and making predictions at scale on data. You can even use models trained in MLlib to make predictions in Structured Streaming. Spark provides a sophisticated machine learning API for performing a variety of machine learning tasks, from classification to regression, clustering to deep learning.

Source: https://databricks.com/glossary/what-is-machine-learning-library

# Loading Dataset

In [0]:
# Read every row and column of the JEWELLERY table into a Spark DataFrame,
# then render it with the notebook's display helper.
data = spark.sql(
    'SELECT * FROM JEWELLERY'
)
data.display()

Age,Income,SpendingScore,Savings
58,77769,0.7913287771988531,6559.829923004832
59,81799,0.7910820467274178,5417.661426197439
62,74751,0.7026569520102857,9258.992965034067
59,74373,0.7656795619984281,7346.334503537976
87,17760,0.3487775484305074,16869.507130301474
29,131578,0.847034102512837,3535.5143522162816
54,76500,0.7851978501165687,6878.884248553975
87,42592,0.3552896820382759,18086.287157859304
83,34384,0.3247186821733416,14783.3790857945
84,27693,0.3670629731870412,17879.558905551898


In [0]:
# Print the schema (column names and types) of the loaded DataFrame.
data.printSchema()

# Vector Assembler

In [0]:
# import vector assembler
from pyspark.ml.feature import VectorAssembler

In [0]:
# Show the DataFrame's column names.
data.columns

In [0]:
# Combine every column of `data` into a single vector column called
# 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=data.columns,
)

In [0]:
# Run the assembler over `data`; the result keeps the original columns and
# adds the assembled 'features' vector column.
final_data = assembler.transform(data)

In [0]:
# Notice the new 'features' column created by VectorAssembler.
# It is the same kind of column we have seen in the supervised experiments.
final_data.display()

Age,Income,SpendingScore,Savings,features
58,77769,0.7913287771988531,6559.829923004832,"Map(vectorType -> dense, length -> 4, values -> List(58.0, 77769.0, 0.7913287771988531, 6559.8299230048315))"
59,81799,0.7910820467274178,5417.661426197439,"Map(vectorType -> dense, length -> 4, values -> List(59.0, 81799.0, 0.7910820467274178, 5417.661426197439))"
62,74751,0.7026569520102857,9258.992965034067,"Map(vectorType -> dense, length -> 4, values -> List(62.0, 74751.0, 0.7026569520102857, 9258.992965034067))"
59,74373,0.7656795619984281,7346.334503537976,"Map(vectorType -> dense, length -> 4, values -> List(59.0, 74373.0, 0.7656795619984281, 7346.334503537976))"
87,17760,0.3487775484305074,16869.507130301474,"Map(vectorType -> dense, length -> 4, values -> List(87.0, 17760.0, 0.34877754843050746, 16869.507130301474))"
29,131578,0.847034102512837,3535.5143522162816,"Map(vectorType -> dense, length -> 4, values -> List(29.0, 131578.0, 0.847034102512837, 3535.5143522162816))"
54,76500,0.7851978501165687,6878.884248553975,"Map(vectorType -> dense, length -> 4, values -> List(54.0, 76500.0, 0.7851978501165687, 6878.884248553975))"
87,42592,0.3552896820382759,18086.287157859304,"Map(vectorType -> dense, length -> 4, values -> List(87.0, 42592.0, 0.35528968203827593, 18086.287157859304))"
83,34384,0.3247186821733416,14783.3790857945,"Map(vectorType -> dense, length -> 4, values -> List(83.0, 34384.0, 0.32471868217334166, 14783.3790857945))"
84,27693,0.3670629731870412,17879.558905551898,"Map(vectorType -> dense, length -> 4, values -> List(84.0, 27693.0, 0.3670629731870412, 17879.558905551898))"


# Data Scaling

In [0]:
from pyspark.ml.feature import StandardScaler

In [0]:
# Scale the assembled 'features' vector into a new 'scaledFeatures' column.
# NOTE(review): only the column names are set, so the scaler's default
# settings apply -- confirm they give the intended scaling (the displayed
# output suggests values are rescaled but not mean-centred).
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

In [0]:
# Fit the scaler on the assembled data first, then apply the fitted model.
# final_data is rebound to the result, which also has 'scaledFeatures'.
scaler_model = scaler.fit(final_data)
final_data = scaler_model.transform(final_data)

In [0]:
# Preview the data, which now also carries the 'scaledFeatures' column.
final_data.display()

Age,Income,SpendingScore,Savings,features,scaledFeatures
58,77769,0.7913287771988531,6559.829923004832,"Map(vectorType -> dense, length -> 4, values -> List(58.0, 77769.0, 0.7913287771988531, 6559.8299230048315))","Map(vectorType -> dense, length -> 4, values -> List(2.402646932775959, 2.1606748016282364, 3.047865672431294, 1.3254245435220333))"
59,81799,0.7910820467274178,5417.661426197439,"Map(vectorType -> dense, length -> 4, values -> List(59.0, 81799.0, 0.7910820467274178, 5417.661426197439))","Map(vectorType -> dense, length -> 4, values -> List(2.444071879892786, 2.272641259349974, 3.0469153704128433, 1.0946474995628297))"
62,74751,0.7026569520102857,9258.992965034067,"Map(vectorType -> dense, length -> 4, values -> List(62.0, 74751.0, 0.7026569520102857, 9258.992965034067))","Map(vectorType -> dense, length -> 4, values -> List(2.5683467212432665, 2.076824982917516, 2.7063390909505483, 1.8707949242886122))"
59,74373,0.7656795619984281,7346.334503537976,"Map(vectorType -> dense, length -> 4, values -> List(59.0, 74373.0, 0.7656795619984281, 7346.334503537976))","Map(vectorType -> dense, length -> 4, values -> List(2.444071879892786, 2.066322918148579, 2.9490756817387993, 1.4843391018058278))"
87,17760,0.3487775484305074,16869.507130301474,"Map(vectorType -> dense, length -> 4, values -> List(87.0, 17760.0, 0.34877754843050746, 16869.507130301474))","Map(vectorType -> dense, length -> 4, values -> List(3.6039703991639382, 0.4934303446992694, 1.3433444451988616, 3.4085119660205323))"
29,131578,0.847034102512837,3535.5143522162816,"Map(vectorType -> dense, length -> 4, values -> List(29.0, 131578.0, 0.847034102512837, 3535.5143522162816))","Map(vectorType -> dense, length -> 4, values -> List(1.2013234663879795, 3.655663169754531, 3.262419159791005, 0.7143565536612786))"
54,76500,0.7851978501165687,6878.884248553975,"Map(vectorType -> dense, length -> 4, values -> List(54.0, 76500.0, 0.7851978501165687, 6878.884248553975))","Map(vectorType -> dense, length -> 4, values -> List(2.2369471443086515, 2.1254178699039477, 3.024251919547923, 1.3898900005176007))"
87,42592,0.3552896820382759,18086.287157859304,"Map(vectorType -> dense, length -> 4, values -> List(87.0, 42592.0, 0.35528968203827593, 18086.287157859304))","Map(vectorType -> dense, length -> 4, values -> List(3.6039703991639382, 1.1833437635941038, 1.3684264453096906, 3.6543643938307064))"
83,34384,0.3247186821733416,14783.3790857945,"Map(vectorType -> dense, length -> 4, values -> List(83.0, 34384.0, 0.32471868217334166, 14783.3790857945))","Map(vectorType -> dense, length -> 4, values -> List(3.4382706106966308, 0.9552989286114684, 1.2506798098466652, 2.9870063258479878))"
84,27693,0.3670629731870412,17879.558905551898,"Map(vectorType -> dense, length -> 4, values -> List(84.0, 27693.0, 0.3670629731870412, 17879.558905551898))","Map(vectorType -> dense, length -> 4, values -> List(3.4796955578134576, 0.769401268905229, 1.4137722117948686, 3.6125946067076065))"


In [0]:
# Peek at just the first row of the scaled data.
final_data.head(1)

# KMeans Clustering

In [0]:
from pyspark.ml.clustering import KMeans

In [0]:
# Configure k-means with 4 clusters on the standardized feature vectors.
# The seed is pinned explicitly so the random centroid initialisation (and
# therefore the cluster label numbering) is repeatable across runs instead
# of depending on the library's default seed.
kmeans = KMeans(featuresCol='scaledFeatures', k=4, seed=42)

In [0]:
# Fit the k-means estimator on final_data; it reads the 'scaledFeatures'
# column configured above and returns the fitted model.
kmeans_fit = kmeans.fit(final_data)

In [0]:
# Inspect the cluster centers learned by the model. The model was fitted on
# 'scaledFeatures', so the centers are expressed in that scaled space.
kmeans_fit.clusterCenters()

In [0]:
# Check which distance measure the fitted model is using.
kmeans_fit.getDistanceMeasure()

In [0]:
# Do you know how to check all the methods and attributes available?
# Use Python's built-in "dir" on the fitted model.

dir(kmeans_fit)

# Generate Cluster Labels?

In [0]:
# Apply the fitted model to final_data; the displayed result carries an
# extra 'prediction' column holding each row's cluster label.
kmeans_fit.transform(final_data).display()

Age,Income,SpendingScore,Savings,features,scaledFeatures,prediction
58,77769,0.7913287771988531,6559.829923004832,"Map(vectorType -> dense, length -> 4, values -> List(58.0, 77769.0, 0.7913287771988531, 6559.8299230048315))","Map(vectorType -> dense, length -> 4, values -> List(2.402646932775959, 2.1606748016282364, 3.047865672431294, 1.3254245435220333))",1
59,81799,0.7910820467274178,5417.661426197439,"Map(vectorType -> dense, length -> 4, values -> List(59.0, 81799.0, 0.7910820467274178, 5417.661426197439))","Map(vectorType -> dense, length -> 4, values -> List(2.444071879892786, 2.272641259349974, 3.0469153704128433, 1.0946474995628297))",1
62,74751,0.7026569520102857,9258.992965034067,"Map(vectorType -> dense, length -> 4, values -> List(62.0, 74751.0, 0.7026569520102857, 9258.992965034067))","Map(vectorType -> dense, length -> 4, values -> List(2.5683467212432665, 2.076824982917516, 2.7063390909505483, 1.8707949242886122))",1
59,74373,0.7656795619984281,7346.334503537976,"Map(vectorType -> dense, length -> 4, values -> List(59.0, 74373.0, 0.7656795619984281, 7346.334503537976))","Map(vectorType -> dense, length -> 4, values -> List(2.444071879892786, 2.066322918148579, 2.9490756817387993, 1.4843391018058278))",1
87,17760,0.3487775484305074,16869.507130301474,"Map(vectorType -> dense, length -> 4, values -> List(87.0, 17760.0, 0.34877754843050746, 16869.507130301474))","Map(vectorType -> dense, length -> 4, values -> List(3.6039703991639382, 0.4934303446992694, 1.3433444451988616, 3.4085119660205323))",0
29,131578,0.847034102512837,3535.5143522162816,"Map(vectorType -> dense, length -> 4, values -> List(29.0, 131578.0, 0.847034102512837, 3535.5143522162816))","Map(vectorType -> dense, length -> 4, values -> List(1.2013234663879795, 3.655663169754531, 3.262419159791005, 0.7143565536612786))",3
54,76500,0.7851978501165687,6878.884248553975,"Map(vectorType -> dense, length -> 4, values -> List(54.0, 76500.0, 0.7851978501165687, 6878.884248553975))","Map(vectorType -> dense, length -> 4, values -> List(2.2369471443086515, 2.1254178699039477, 3.024251919547923, 1.3898900005176007))",1
87,42592,0.3552896820382759,18086.287157859304,"Map(vectorType -> dense, length -> 4, values -> List(87.0, 42592.0, 0.35528968203827593, 18086.287157859304))","Map(vectorType -> dense, length -> 4, values -> List(3.6039703991639382, 1.1833437635941038, 1.3684264453096906, 3.6543643938307064))",0
83,34384,0.3247186821733416,14783.3790857945,"Map(vectorType -> dense, length -> 4, values -> List(83.0, 34384.0, 0.32471868217334166, 14783.3790857945))","Map(vectorType -> dense, length -> 4, values -> List(3.4382706106966308, 0.9552989286114684, 1.2506798098466652, 2.9870063258479878))",0
84,27693,0.3670629731870412,17879.558905551898,"Map(vectorType -> dense, length -> 4, values -> List(84.0, 27693.0, 0.3670629731870412, 17879.558905551898))","Map(vectorType -> dense, length -> 4, values -> List(3.4796955578134576, 0.769401268905229, 1.4137722117948686, 3.6125946067076065))",0
