In [1]:
# Keep findspark.init() ahead of the pyspark imports below: the original cell
# runs it first, and it is what makes pyspark importable on a local install.
import findspark
findspark.init()

from IPython.display import display
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Session used by every cell below for reading data and running SQL.
spark = (
    SparkSession.builder
    .appName('laptop_everis')
    .getOrCreate()
)

# SparkContext handle (obtained with getOrCreate, like the session above).
sc = SparkContext.getOrCreate()

In [2]:
import math

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.mllib.evaluation import MulticlassMetrics

%matplotlib inline 
import matplotlib.pyplot as plt, numpy as np

In [3]:
# Print the tables visible to this Spark session.
tables_df = spark.sql("show tables")
tables_df.show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [None]:
%%time
ds = spark.read.csv("train.csv", header=True, nullValue="?", inferSchema=True)
#ds.printSchema()

# SELECT COLUMNS
cols_features = ds.columns[1:-1]
print(ds.count(), len(cols_features))

#VECTORIZER FEATURES
assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
ds = assembler.transform(ds).drop(*cols_features)
print(type(ds))
display(ds.show(1))

# SCALED FEATURES
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
ds = scaler.fit(ds).transform(ds).drop("features")
print(type(ds))
display(ds.show(1))

# DECLARE PARAMETERS
clusters = 3
max_iter = 500

# TRAIN AND PREDICT
kmeans_clf = GaussianMixture()\
         .setK(clusters)\
         .setMaxIter(max_iter)\
         .setFeaturesCol("scaled_features")\
         .setPredictionCol("cluster")

model_gauss = kmeans_clf.fit(ds)
print(dir(model_gauss))

predict_gauss = model_gauss.transform(ds)
display(predict_gauss.show(50))

262144 256
<class 'pyspark.sql.dataframe.DataFrame'>
+--------------------+------+--------------------+
|                  id|target|            features|
+--------------------+------+--------------------+
|707b395ecdcbb4dc2...|     0|[-2.070654,1.0181...|
+--------------------+------+--------------------+
only showing top 1 row



None

<class 'pyspark.sql.dataframe.DataFrame'>
+--------------------+------+--------------------+
|                  id|target|     scaled_features|
+--------------------+------+--------------------+
|707b395ecdcbb4dc2...|     0|[0.43070225358398...|
+--------------------+------+--------------------+
only showing top 1 row



None

# BISECTING-KMEANS

In [None]:
%%time
ds = spark.read.csv("train.csv", header=True, nullValue="?", inferSchema=True)
#ds.printSchema()

# SELECT COLUMNS
cols_features = ds.columns[1:-1]
print(ds.count(), len(cols_features))

#VECTORIZER FEATURES
assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
ds = assembler.transform(ds).drop(*cols_features)
print(type(ds))
display(ds.show(1))

# SCALED FEATURES
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
ds = scaler.fit(ds).transform(ds).drop("features")
print(type(ds))
display(ds.show(1))

# DECLARE PARAMETERS
clusters = 3
max_iter = 100

# TRAIN AND PREDICT
kmeans_clf = BisectingKMeans()\
         .setK(clusters)\
         .setMaxIter(max_iter)\
         .setFeaturesCol("scaled_features")\
         .setPredictionCol("cluster")\
         .setDistanceMeasure("euclidean")

model_km = kmeans_clf.fit(ds)
print(dir(model_km))

predict_knn = model_knn.transform(ds)
display(predict_km.show(10))


# Mostrar los centroides.
centers = model_km.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

    
# EVALUATION OF RESULTS
# Obtener la suma cuadrada de errores 'SSE'
SSE = model_km.computeCost(predict_knn)
print ("Suma cuadrada de errores: " + str(SSE))

# Obtener el número de elementos
n = predict_knn.count()

# Calcular el error cuadratico medio 'RMSE'
RMSE = math.sqrt(SSE/n)
print ("Error cuadratico medio: " + str(RMSE))

262144 256
<class 'pyspark.sql.dataframe.DataFrame'>
+--------------------+------+--------------------+
|                  id|target|            features|
+--------------------+------+--------------------+
|707b395ecdcbb4dc2...|     0|[-2.070654,1.0181...|
+--------------------+------+--------------------+
only showing top 1 row



None

<class 'pyspark.sql.dataframe.DataFrame'>
+--------------------+------+--------------------+
|                  id|target|     scaled_features|
+--------------------+------+--------------------+
|707b395ecdcbb4dc2...|     0|[0.43070225358398...|
+--------------------+------+--------------------+
only showing top 1 row



None

# KMEANS AND BISECTING-KMEANS WITH COSINE DISTANCE

In [None]:
%%time
ds = spark.read.csv("train.csv", header=True, nullValue="?", inferSchema=True)
#ds.printSchema()

# SELECT COLUMNS
cols_features = ds.columns[1:-1]
print(ds.count(), len(cols_features))

#VECTORIZER FEATURES
assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
ds = assembler.transform(ds).drop(*cols_features)
print(type(ds))
display(ds.show(1))

# SCALED FEATURES
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
ds = scaler.fit(ds).transform(ds).drop("features")
print(type(ds))
display(ds.show(1))

# DECLARE PARAMETERS
clusters = 3
max_iter = 500

# TRAIN AND PREDICT
kmeans_clf = KMeans()\
         .setK(clusters)\
         .setMaxIter(max_iter)\
         .setFeaturesCol("scaled_features")\
         .setPredictionCol("cluster")\
         .setDistanceMeasure("cosine")

model_km = kmeans_clf.fit(ds)
print(dir(model_km))

predict_knn = model_knn.transform(ds)
display(predict_km.show(10))


# Mostrar los centroides.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

    
# EVALUATION OF RESULTS
# Obtener la suma cuadrada de errores 'SSE'
SSE = model_km.computeCost(predict_knn)
print ("Suma cuadrada de errores: " + str(SSE))

# Obtener el número de elementos
n = predict_knn.count()

# Calcular el error cuadratico medio 'RMSE'
RMSE = math.sqrt(SSE/n)
print ("Error cuadratico medio: " + str(RMSE))

In [None]:

%%time
from pyspark.ml.clustering import BisectingKMeans
ds = spark.read.csv("train.csv", header=True, nullValue="?", inferSchema=True)
#ds.printSchema()

# SELECT COLUMNS
cols_features = ds.columns[1:-1]
print(ds.count(), len(cols_features))

#VECTORIZER FEATURES
assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
ds = assembler.transform(ds).drop(*cols_features)
print(type(ds))
display(ds.show(1))

# SCALED FEATURES
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
ds = scaler.fit(ds).transform(ds).drop("features")
print(type(ds))
display(ds.show(1))

# DECLARE PARAMETERS
clusters = 3
max_iter = 500

# TRAIN AND PREDICT
kmeans_clf = BisectingKMeans()\
         .setK(clusters)\
         .setMaxIter(max_iter)\
         .setFeaturesCol("scaled_features")\
         .setPredictionCol("cluster")\
         .setDistanceMeasure("cosine")

model_km = kmeans_clf.fit(ds)
print(dir(model_km))

predict_knn = model_knn.transform(ds)
display(predict_km.show(10))


# Mostrar los centroides.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

    
# EVALUATION OF RESULTS
# Obtener la suma cuadrada de errores 'SSE'
SSE = model_km.computeCost(predict_knn)
print ("Suma cuadrada de errores: " + str(SSE))

# Obtener el número de elementos
n = predict_knn.count()

# Calcular el error cuadratico medio 'RMSE'
RMSE = math.sqrt(SSE/n)
print ("Error cuadratico medio: " + str(RMSE))

In [None]:
%%time
ds = spark.read.csv("train.csv", header=True, nullValue="?", inferSchema=True)
#ds.printSchema()

cols_features = ds.columns[1:-1]
print(ds.count(), len(cols_features))

assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
ds = assembler.transform(ds).drop(*cols_features)
print(type(ds))

display(ds.show(1))

scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True)
ds = scaler.fit(ds).transform(ds).drop("features")

display(ds.show(1))

SEED = 29082013
from pyspark.ml.clustering import KMeans

cost = np.zeros(20)
for k in range(2,20):
    kmeans = KMeans().setK(k).setSeed(SEED).setFeaturesCol("scaled_features")
    model = kmeans.fit(ds.sample(False, 0.5, seed=SEED))
    cost[k] = model.computeCost(ds) # requires Spark 2.0 or later

fig, ax = plt.subplots(1,1, figsize =(14,6))
ax.plot(range(2,20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')