In [1]:
import findspark 
findspark.init()

from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

from pyspark.ml import Pipeline
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.mllib.evaluation import MulticlassMetrics

# Start (or reuse) the Spark session that backs the whole notebook.
spark = (
    SparkSession.builder
    .appName('laptop_everis')
    .getOrCreate()
)

# Handle to the active SparkContext, used later for the RDD API.
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

# METHOD 1: USING KMEANS

In [2]:
SEED = 29082013

# Read the raw CSV: first row is the header, "?" marks missing values,
# and column types are inferred from the data.
df = spark.read.csv("train.csv", header=True, inferSchema=True, nullValue="?")

# 90/10 train/test split, reproducible through SEED.
train, test = df.randomSplit(weights=[0.9, 0.1], seed=SEED)

# Feature columns: every column except the first and the last one.
cols_features = df.columns[1:-1]
print(df.count(), len(cols_features))
del df  # only the two splits are used from here on

# Row count and feature count of each split.
print(
    train.count(),
    len(train.columns[1:-1]),
    test.count(),
    len(test.columns[1:-1]),
)

262144 256
236195 256 25949 256


In [3]:
def vector_scaler_df(cols_features, ds, fit_on=None):
    """
    Assemble the feature columns into one vector column and min-max scale it.

    A two-stage pipeline (``VectorAssembler`` -> ``MinMaxScaler``) is fitted
    and then applied to ``ds``. The raw feature columns and the intermediate
    ``features`` vector are dropped, so the result keeps the remaining columns
    of ``ds`` plus ``scaled_features``.

    Args:
        cols_features: names of the columns to assemble into the vector.
        ds: DataFrame to transform.
        fit_on: optional DataFrame used to fit the pipeline. When omitted the
            pipeline is fitted on ``ds`` itself (the original behaviour). Pass
            the training split here when transforming a test split so both
            are scaled with the same min/max statistics.

    Returns:
        The transformed DataFrame.

    Side effects:
        Prints the first 5 rows of the transformed DataFrame.
    """
    assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
    pipeline = Pipeline(stages=[assembler, scaler])

    fitted = pipeline.fit(ds if fit_on is None else fit_on)
    scaled = fitted.transform(ds).drop(*cols_features).drop("features")

    # DataFrame.show() prints the rows itself and returns None; wrapping it in
    # display() only rendered a stray "None" below each table.
    scaled.show(5)

    return scaled

In [4]:
# Vectorise and scale both splits with the same helper.
# NOTE(review): the helper fits its pipeline on the frame it is given, so each
# split is scaled with its own min/max here - confirm that is intended.
train, test = [vector_scaler_df(cols_features, split) for split in (train, test)]

+--------------------+------+--------------------+
|                  id|target|     scaled_features|
+--------------------+------+--------------------+
|00006042e379fc155...|     0|[0.49528391025019...|
|00034f0082232ad8f...|     1|[0.58426583877505...|
|00036a71992c149e9...|     1|[0.57050461218803...|
|0007233584b5a85b4...|     1|[0.19173779654246...|
|000863c7e31c5f2c6...|     1|[0.54246210877028...|
+--------------------+------+--------------------+
only showing top 5 rows



None

+--------------------+------+--------------------+
|                  id|target|     scaled_features|
+--------------------+------+--------------------+
|0008851be01140e75...|     0|[0.53830965805633...|
|00161f97cdda20b20...|     0|[0.50902435894809...|
|002973cbedd83d830...|     1|[0.48272745364806...|
|002bde7fd91b49aaa...|     0|[0.52097871657607...|
|003cb39fb6c0a2556...|     1|[0.53506723477089...|
+--------------------+------+--------------------+
only showing top 5 rows



None

In [None]:
# Hyper-parameters
clusters = 3
max_iter = 100

# Configure the estimator and fit it on the scaled training split.
# Note: the variable is called `kmeans_clf`, but the estimator is BisectingKMeans.
kmeans_clf = BisectingKMeans(
    featuresCol="scaled_features",
    predictionCol="cluster",
    k=clusters,
    maxIter=max_iter,
    seed=SEED,
    distanceMeasure="euclidean",
)

model_km = kmeans_clf.fit(train)
print(dir(model_km))

In [80]:
# Exploration only: list the attribute names available on the estimator.
print(dir(kmeans_clf))

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_model', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_fit', '_fit_java', '_from_java', '_input_kwargs', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', 'copy', 'distanceMeasure', 'explainParam', 'explainParams', 'extractParamMap', '

In [78]:
# Apply the fitted model to `ds` and print the first rows of the result.
# NOTE(review): neither `model_gauss` nor `ds` is defined in the visible cells;
# this relies on leftover kernel state - confirm where they come from.
model_gauss.transform(ds).show()

+--------------------+------+--------------------+-------------------+-------------------+-------------------+-------+
|                  id|target|            features|distance_centroid_0|distance_centroid_1|distance_centroid_2|cluster|
+--------------------+------+--------------------+-------------------+-------------------+-------------------+-------+
|707b395ecdcbb4dc2...|     0|[-2.070654,1.0181...|           39.86638|          222.42633|          349.65533|      0|
|5880c03c6582a7b42...|     0|[-0.491702,0.0826...|           79.16347|          268.75504|          396.28262|      0|
|4ccbcb3d13e5072ff...|     1|[-1.680473,0.8605...|          107.12958|           94.70603|          219.66464|      0|
|e350f17a357f12a19...|     0|[0.183774,0.91913...|          55.019203|          242.87566|            370.334|      0|
|a8f910ea6075b6376...|     0|[-0.203933,-0.177...|          370.62872|          179.29964|          56.061794|      2|
|60cf059121122100e...|     1|[-1.114752,-0.718..

In [None]:
# NOTE(review): help() without an argument starts Python's interactive help
# prompt, which waits for input and stalls a "Run All" - consider removing.
help()

In [49]:
# Show the centroids.
import numpy as np

# Round every centroid to 6 decimals and convert to plain nested Python lists
# of floats.
centers = np.round(model_gauss.clusterCenters(), 6).tolist()
print(type(centers))

# For each centroid: its type, its dimensionality and its first 10 coordinates.
print("Cluster Centers: ")
for center in centers:
    print(type(center), len(center), center[:10])

<class 'list'>
Cluster Centers: 
<class 'list'> 256 [0.006021, -0.009121, -0.004668, -0.006624, 0.003569, 0.002426, 0.016601, -0.001, 0.013006, 0.011731]
<class 'list'> 256 [0.01769, 0.009033, -0.029007, -0.006799, 0.003283, 0.011599, -0.003436, -0.021562, -0.009004, 0.022601]
<class 'list'> 256 [-0.006023, 0.01291, -0.011754, -0.00271, -0.003982, 0.015037, -0.043657, -0.000137, -0.006309, 0.035831]


In [71]:
from scipy.spatial.distance import jaccard, cosine, euclidean
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
from scipy.spatial import distance

def _distance_to(centroid):
    """Build a row-level function returning the Euclidean distance to ``centroid``.

    The centroid is bound when this factory is called, so each generated
    function keeps its own centroid instead of reading a shared loop variable
    at whatever moment the function is eventually serialised or invoked.
    """
    def _distance(vector):
        return float(euclidean(vector, centroid))
    return _distance


# Add one column per centroid, named distance_centroid_<index>, holding the
# Euclidean distance between the row's `features` vector and that centroid.
for order, center in enumerate(centers):
    distance_udf = F.udf(_distance_to(center), FloatType())
    ds = ds.withColumn('distance_centroid_{}'.format(order), distance_udf(F.col('features')))

In [79]:
# Print the first 15 rows of `ds`, including the distance_centroid_* columns.
ds.show(15)

+--------------------+------+--------------------+-------------------+-------------------+-------------------+
|                  id|target|            features|distance_centroid_0|distance_centroid_1|distance_centroid_2|
+--------------------+------+--------------------+-------------------+-------------------+-------------------+
|707b395ecdcbb4dc2...|     0|[-2.070654,1.0181...|           39.86638|          222.42633|          349.65533|
|5880c03c6582a7b42...|     0|[-0.491702,0.0826...|           79.16347|          268.75504|          396.28262|
|4ccbcb3d13e5072ff...|     1|[-1.680473,0.8605...|          107.12958|           94.70603|          219.66464|
|e350f17a357f12a19...|     0|[0.183774,0.91913...|          55.019203|          242.87566|            370.334|
|a8f910ea6075b6376...|     0|[-0.203933,-0.177...|          370.62872|          179.29964|          56.061794|
|60cf059121122100e...|     1|[-1.114752,-0.718...|           28.37505|          190.73352|          317.77713|
|

In [47]:
from pyspark.sql.types import IntegerType, FloatType, DoubleType

# One row per centroid, with one column per original feature name.
df_centers = sc.parallelize(centers).toDF(cols_features)

# `assembler` exists only as a local variable inside vector_scaler_df, so it is
# not available at notebook scope on a fresh kernel. Build the assembler here
# with the same input/output columns instead of relying on leftover state.
centers_assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
df_centers = centers_assembler.transform(df_centers).drop(*cols_features)

In [48]:
# Fit a MinHash LSH model on the centroid vectors.
# NOTE(review): the MinHashLSH documentation states it targets Jaccard distance
# and treats every non-zero value as binary 1 - confirm that is what is wanted
# for these real-valued centroid vectors.
mh = (
    MinHashLSH()
    .setInputCol("features")
    .setOutputCol("hashes")
    .setSeed(SEED)
)
model = mh.fit(df_centers)

# Exploration only: list the attribute names available on the fitted model.
print(dir(model))

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_from_java', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', '_transform', 'approxNearestNeighbors', 'approxSimilarityJoin', 'copy', 'explainParam', 'explainParams', 'extractParamMap', 'getOrDefau

In [5]:
# Exploration only: print the documentation of the fitted model's transform().
help(model.transform)

Help on method transform in module pyspark.ml.base:

transform(dataset, params=None) method of pyspark.ml.feature.MinHashLSHModel instance
    Transforms the input dataset with optional parameters.
    
    :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`
    :param params: an optional param map that overrides embedded params.
    :returns: transformed dataset
    
    .. versionadded:: 1.3.0



In [7]:
# Exploration only: print the documentation of approxNearestNeighbors().
help(model.approxNearestNeighbors)

Help on method approxNearestNeighbors in module pyspark.ml.feature:

approxNearestNeighbors(dataset, key, numNearestNeighbors, distCol='distCol') method of pyspark.ml.feature.MinHashLSHModel instance
    Given a large dataset and an item, approximately find at most k items which have the
    closest distance to the item. If the :py:attr:`outputCol` is missing, the method will
    transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows
    caching of the transformed data when necessary.
    
    .. note:: This method is experimental and will likely change behavior in the next release.
    
    :param dataset: The dataset to search for nearest neighbors of the key.
    :param key: Feature vector representing the item to search for.
    :param numNearestNeighbors: The maximum number of nearest neighbors.
    :param distCol: Output column for storing the distance between each result row and the key.
                    Use "distCol" as default value if it's no

Help on method approxNearestNeighbors in module pyspark.ml.feature:

approxNearestNeighbors(dataset, key, numNearestNeighbors, distCol='distCol') method of pyspark.ml.feature.MinHashLSHModel instance
    Given a large dataset and an item, approximately find at most k items which have the
    closest distance to the item. If the :py:attr:`outputCol` is missing, the method will
    transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows
    caching of the transformed data when necessary.
    
    .. note:: This method is experimental and will likely change behavior in the next release.
    
    :param dataset: The dataset to search for nearest neighbors of the key.
    :param key: Feature vector representing the item to search for.
    :param numNearestNeighbors: The maximum number of nearest neighbors.
    :param distCol: Output column for storing the distance between each result row and the key.
                    Use "distCol" as default value if it's no

In [8]:
# Exploration only: list the attribute names available on the MinHashLSH estimator.
print(dir(mh))

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_model', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_fit', '_fit_java', '_from_java', '_input_kwargs', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', 'copy', 'explainParam', 'explainParams', 'extractParamMap', 'fit', 'fitMultiple'

In [9]:
# Exploration only: list the attribute names available on the fitted LSH model.
print(dir(model))

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_from_java', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', '_transform', 'approxNearestNeighbors', 'approxSimilarityJoin', 'copy', 'explainParam', 'explainParams', 'extractParamMap', 'getOrDefau

In [6]:
# Exploration only: print the class documentation of MinHashLSH.
help(MinHashLSH)

Help on class MinHashLSH in module pyspark.ml.feature:

class MinHashLSH(pyspark.ml.wrapper.JavaEstimator, LSHParams, pyspark.ml.param.shared.HasInputCol, pyspark.ml.param.shared.HasOutputCol, pyspark.ml.param.shared.HasSeed, pyspark.ml.util.JavaMLReadable, pyspark.ml.util.JavaMLWritable)
 |  .. note:: Experimental
 |  
 |  LSH class for Jaccard distance.
 |  The input can be dense or sparse vectors, but it is more efficient if it is sparse.
 |  For example, `Vectors.sparse(10, [(2, 1.0), (3, 1.0), (5, 1.0)])` means there are 10 elements
 |  in the space. This set contains elements 2, 3, and 5. Also, any input vector must have at
 |  least 1 non-zero index, and all non-zero values are treated as binary "1" values.
 |  
 |  .. seealso:: `Wikipedia on MinHash <https://en.wikipedia.org/wiki/MinHash>`_
 |  
 |  >>> from pyspark.ml.linalg import Vectors
 |  >>> from pyspark.sql.functions import col
 |  >>> data = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
 |  ...         (1, Vect