In [25]:
import findspark 
findspark.init()

from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import coalesce, udf, struct, col, lit, unix_timestamp, count, when, isnan, isnull, split

from pyspark.ml import Pipeline
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.mllib.evaluation import MulticlassMetrics

spark = SparkSession.builder.appName('laptop_everis').getOrCreate()

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

# 1ER METODO USANDO KMEANS

In [2]:
SEED = 29082013

df = spark.read.csv("train.csv", header=True, nullValue="?", inferSchema=True)
train, test = df.randomSplit([0.9, 0.1], seed=SEED)
#ds.printSchema()

# SELECT COLUMNS
cols_features = df.columns[1:-1]
print(df.count(), len(cols_features))
del df
print(train.count(), len(train.columns[1:-1]), test.count(), len(test.columns[1:-1]))

262144 256
236195 256 25949 256


In [3]:
def vector_scaler_df(cols_features, ds, scaled=True):
    """
        Vectorized and Scaled features 
    """
    assembler = VectorAssembler(inputCols=cols_features, outputCol="features")
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
    
    stages = [assembler]
    if scaled:
        stages.append(scaler)
    
    pipeline = Pipeline(stages=stages)
    _proccess = pipeline.fit(ds)
    
    ds = _proccess.transform(ds).drop(*cols_features)
    if scaled:
        ds = ds.drop("features")
        
    display(ds.show(5))
    return ds

In [4]:
train = vector_scaler_df(
    cols_features, train, scaled=False
)
test = vector_scaler_df(
    cols_features, test, scaled=False
)

+--------------------+------+--------------------+
|                  id|target|            features|
+--------------------+------+--------------------+
|00006042e379fc155...|     0|[-0.043731,-0.009...|
|00034f0082232ad8f...|     1|[2.749005,0.24356...|
|00036a71992c149e9...|     1|[2.317103,-4.3215...|
|0007233584b5a85b4...|     1|[-9.570656,-5.163...|
|000863c7e31c5f2c6...|     1|[1.436977,0.63941...|
+--------------------+------+--------------------+
only showing top 5 rows



None

+--------------------+------+--------------------+
|                  id|target|            features|
+--------------------+------+--------------------+
|0008851be01140e75...|     0|[1.633524,5.21490...|
|00161f97cdda20b20...|     0|[0.810057,0.28687...|
|002973cbedd83d830...|     1|[0.07062,5.837957...|
|002bde7fd91b49aaa...|     0|[1.146199,-0.0767...|
|003cb39fb6c0a2556...|     1|[1.542351,0.08234...|
+--------------------+------+--------------------+
only showing top 5 rows



None

In [5]:
# DECLARE PARAMETERS
clusters = 3
max_iter = 100

# TRAIN AND PREDICT
kmeans_clf = BisectingKMeans()\
         .setK(clusters)\
         .setMaxIter(max_iter)\
         .setSeed(SEED)\
         .setFeaturesCol("features")\
         .setPredictionCol("cluster")\
         .setDistanceMeasure("euclidean")

model_km = kmeans_clf.fit(train)
print(dir(model_km))

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_from_java', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', '_transform', 'clusterCenters', 'computeCost', 'copy', 'distanceMeasure', 'explainParam', 'explainParams', 'extractParamMap', 'features

In [6]:
print(dir(kmeans_clf))

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_model', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_fit', '_fit_java', '_from_java', '_input_kwargs', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', 'copy', 'distanceMeasure', 'explainParam', 'explainParams', 'extractParamMap', '

In [7]:
model_km.transform(train).show()

+--------------------+------+--------------------+-------+
|                  id|target|            features|cluster|
+--------------------+------+--------------------+-------+
|00006042e379fc155...|     0|[-0.043731,-0.009...|      2|
|00034f0082232ad8f...|     1|[2.749005,0.24356...|      0|
|00036a71992c149e9...|     1|[2.317103,-4.3215...|      1|
|0007233584b5a85b4...|     1|[-9.570656,-5.163...|      0|
|000863c7e31c5f2c6...|     1|[1.436977,0.63941...|      0|
|0008dfd4b7ed96f06...|     1|[-0.682157,0.8953...|      1|
|000911571fb8a1f13...|     1|[0.214299,-1.0811...|      0|
|000ae59f3d37d7798...|     0|[-0.399398,1.8554...|      0|
|000d87dbb6b79672c...|     0|[0.753058,0.87277...|      0|
|000f46bfffa3951a8...|     1|[-1.352792,-0.081...|      0|
|00119677d32d59472...|     1|[-1.680918,-0.032...|      1|
|0012110e04af66071...|     1|[-0.517875,-1.848...|      1|
|001243c85b77b2bda...|     1|[1.415826,-2.3050...|      1|
|00130de7cf79838a0...|     1|[2.343037,0.71798...|      

In [8]:
test_with_cluster = model_km.transform(test)
test_with_cluster.show()

+--------------------+------+--------------------+-------+
|                  id|target|            features|cluster|
+--------------------+------+--------------------+-------+
|0008851be01140e75...|     0|[1.633524,5.21490...|      2|
|00161f97cdda20b20...|     0|[0.810057,0.28687...|      0|
|002973cbedd83d830...|     1|[0.07062,5.837957...|      2|
|002bde7fd91b49aaa...|     0|[1.146199,-0.0767...|      0|
|003cb39fb6c0a2556...|     1|[1.542351,0.08234...|      0|
|0045ccf444c75d050...|     0|[0.421806,-0.2879...|      2|
|00637dfbac621f4e5...|     0|[-0.70026,0.74640...|      2|
|0066c2b57dfa271f4...|     0|[0.682159,0.43132...|      0|
|007856881856ea586...|     1|[0.029747,-0.3572...|      2|
|007d8978c0a14237b...|     0|[-1.499893,-1.557...|      0|
|0082a40276f7d89e9...|     0|[-1.442629,2.0681...|      0|
|0084c2f5fe4eb1d21...|     1|[1.760883,1.51477...|      0|
|009c0d9555ea91b54...|     0|[-4.279094,0.7883...|      0|
|00a017990137d4d4c...|     0|[-1.190355,-0.630...|      

In [9]:
for center in model_km.clusterCenters():
    print(type(center))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [10]:
# Mostrar los centroides.
import numpy as np

centers = [list([float(num) for num in _]) for _ in np.round(model_km.clusterCenters(), 6)]
print(type(centers))

print("Cluster Centers: ")
for center in centers:
    print(type(center), len(center), center[:10])

<class 'list'>
Cluster Centers: 
<class 'list'> 256 [0.004476, -0.00708, -0.003175, -0.009977, 0.00495, 0.004227, 0.015417, -0.002271, 0.012098, 0.012234]
<class 'list'> 256 [0.015768, 0.010681, -0.031701, -0.005772, 0.005109, 0.011642, -0.004091, -0.021232, -0.007003, 0.022971]
<class 'list'> 256 [-0.005795, 0.011798, -0.011847, -0.00244, -0.001008, 0.015484, -0.042012, 0.000765, -0.008965, 0.030481]


In [11]:
"""
from scipy.spatial.distance import jaccard, cosine, euclidean
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
from scipy.spatial import distance

order = 0

for center in centers:
    distance_udf = F.udf(lambda x: float(euclidean(x, center)), FloatType())
    cluster_train = cluster_train.withColumn('distance_centroid_{}'.format(order), distance_udf(F.col('scaled_features')))
    order += 1

cluster_train.show(15)
"""

"\nfrom scipy.spatial.distance import jaccard, cosine, euclidean\nimport pyspark.sql.functions as F\nfrom pyspark.sql.types import FloatType\nfrom scipy.spatial import distance\n\norder = 0\n\nfor center in centers:\n    distance_udf = F.udf(lambda x: float(euclidean(x, center)), FloatType())\n    cluster_train = cluster_train.withColumn('distance_centroid_{}'.format(order), distance_udf(F.col('scaled_features')))\n    order += 1\n\ncluster_train.show(15)\n"

In [12]:
"""
order = 0

for center in centers:
    distance_udf = F.udf(lambda x: float(euclidean(x, center)), FloatType())
    cluster_test = cluster_test.withColumn('distance_centroid_{}'.format(order), distance_udf(F.col('scaled_features')))
    order += 1

cluster_test.show(15)
"""

"\norder = 0\n\nfor center in centers:\n    distance_udf = F.udf(lambda x: float(euclidean(x, center)), FloatType())\n    cluster_test = cluster_test.withColumn('distance_centroid_{}'.format(order), distance_udf(F.col('scaled_features')))\n    order += 1\n\ncluster_test.show(15)\n"

In [13]:
from pyspark.sql.types import IntegerType, FloatType, DoubleType

df_centers = sc.parallelize(centers).toDF(cols_features)
df_centers = vector_scaler_df(cols_features, df_centers, scaled=False)
print(type(df_centers))
df_centers.show()

+--------------------+
|            features|
+--------------------+
|[0.004476,-0.0070...|
|[0.015768,0.01068...|
|[-0.005795,0.0117...|
+--------------------+



None

<class 'pyspark.sql.dataframe.DataFrame'>
+--------------------+
|            features|
+--------------------+
|[0.004476,-0.0070...|
|[0.015768,0.01068...|
|[-0.005795,0.0117...|
+--------------------+



In [14]:
train.columns

['id', 'target', 'features']

In [15]:
only_features = train.select('id', 'features')
only_features.show()

+--------------------+--------------------+
|                  id|            features|
+--------------------+--------------------+
|00006042e379fc155...|[-0.043731,-0.009...|
|00034f0082232ad8f...|[2.749005,0.24356...|
|00036a71992c149e9...|[2.317103,-4.3215...|
|0007233584b5a85b4...|[-9.570656,-5.163...|
|000863c7e31c5f2c6...|[1.436977,0.63941...|
|0008dfd4b7ed96f06...|[-0.682157,0.8953...|
|000911571fb8a1f13...|[0.214299,-1.0811...|
|000ae59f3d37d7798...|[-0.399398,1.8554...|
|000d87dbb6b79672c...|[0.753058,0.87277...|
|000f46bfffa3951a8...|[-1.352792,-0.081...|
|00119677d32d59472...|[-1.680918,-0.032...|
|0012110e04af66071...|[-0.517875,-1.848...|
|001243c85b77b2bda...|[1.415826,-2.3050...|
|00130de7cf79838a0...|[2.343037,0.71798...|
|0013c5d08f59e6310...|[0.134893,-1.2554...|
|00156f595cd45c8b5...|[0.233349,-0.5461...|
|00163d20a5e50c2c5...|[-0.684427,0.2366...|
|00188c43276013078...|[-0.252563,-1.770...|
|0018da43205bb7b22...|[-1.153068,2.3349...|
|0018f4b3976ac811c...|[-1.073428

In [16]:
"""
for row in only_features.take(5):
    print("----- ------------------", type(row))
    for vect in row:
        print("///// ", type(vect))
        for val in vect:
            print(type(val))
"""

'\nfor row in only_features.take(5):\n    print("----- ------------------", type(row))\n    for vect in row:\n        print("///// ", type(vect))\n        for val in vect:\n            print(type(val))\n'

In [17]:
"""
from pyspark.sql import types
only_features = only_features.withColumn(
    "features_x", only_features["features"].cast(
        types.ArrayType(
            types.DoubleType()
        )
    )
)
"""

'\nfrom pyspark.sql import types\nonly_features = only_features.withColumn(\n    "features_x", only_features["features"].cast(\n        types.ArrayType(\n            types.DoubleType()\n        )\n    )\n)\n'

In [18]:
only_features.schema
#StructType(List(StructField(features,VectorUDT,true)))

StructType(List(StructField(id,StringType,true),StructField(features,VectorUDT,true)))

In [19]:
only_features.count()

236195

In [20]:
# help(BucketedRandomProjectionLSH)

In [21]:
brp2 = BucketedRandomProjectionLSH(
    inputCol="features", outputCol="hashes", bucketLength=1
)
model_lsh = brp2.fit(df_centers)
print(dir(model_lsh))

['__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_call_java', '_clear', '_copyValues', '_copy_params', '_create_from_java_class', '_create_params_from_java', '_defaultParamMap', '_dummy', '_empty_java_param_map', '_from_java', '_java_obj', '_make_java_param_pair', '_new_java_array', '_new_java_obj', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_transfer_param_map_from_java', '_transfer_param_map_to_java', '_transfer_params_from_java', '_transfer_params_to_java', '_transform', 'approxNearestNeighbors', 'approxSimilarityJoin', 'bucketLength', 'copy', 'explainParam', 'explainParams', 'extractParamM

In [152]:
#model_lsh.transform(only_features).take(3)

In [22]:
help(model_lsh.approxNearestNeighbors)

Help on method approxNearestNeighbors in module pyspark.ml.feature:

approxNearestNeighbors(dataset, key, numNearestNeighbors, distCol='distCol') method of pyspark.ml.feature.BucketedRandomProjectionLSHModel instance
    Given a large dataset and an item, approximately find at most k items which have the
    closest distance to the item. If the :py:attr:`outputCol` is missing, the method will
    transform the data; if the :py:attr:`outputCol` exists, it will use that. This allows
    caching of the transformed data when necessary.
    
    .. note:: This method is experimental and will likely change behavior in the next release.
    
    :param dataset: The dataset to search for nearest neighbors of the key.
    :param key: Feature vector representing the item to search for.
    :param numNearestNeighbors: The maximum number of nearest neighbors.
    :param distCol: Output column for storing the distance between each result row and the key.
                    Use "distCol" as default

In [23]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import lit

order = 0

for center in model_km.clusterCenters():
    
    print(type(center), center.size, center[:10])
    _result = model_lsh.approxNearestNeighbors(
        only_features, Vectors.dense(center), 1
    )
    _result = _result.withColumn("cluster", lit(order))
    df_result = _result if order <= 0 else df_result.union(_result)
    
    order += 1

<class 'numpy.ndarray'> 256 [ 0.00447589 -0.00707993 -0.0031752  -0.00997688  0.00494996  0.0042267
  0.01541673 -0.00227112  0.01209811  0.01223353]
<class 'numpy.ndarray'> 256 [ 0.01576845  0.01068113 -0.03170115 -0.0057719   0.00510861  0.01164214
 -0.00409054 -0.02123242 -0.00700264  0.02297098]
<class 'numpy.ndarray'> 256 [-0.00579469  0.0117981  -0.0118467  -0.00244003 -0.00100815  0.01548432
 -0.04201236  0.0007651  -0.00896459  0.03048135]


In [26]:
df_result = df_result.select('cluster', col("id").alias("id_representant"))
df_result.show()

+-------+--------------------+
|cluster|     id_representant|
+-------+--------------------+
|      0|0eb9a64ef4ace1d18...|
|      1|00a4aae9bb201b156...|
|      2|b795fad07ee300251...|
+-------+--------------------+



In [27]:
df_result.count()

3

In [28]:
test_with_cluster = test_with_cluster.join(
    df_result, test_with_cluster.cluster == df_result.cluster, how='left'
)
test_with_cluster.show()

+--------------------+------+--------------------+-------+-------+--------------------+
|                  id|target|            features|cluster|cluster|     id_representant|
+--------------------+------+--------------------+-------+-------+--------------------+
|0008851be01140e75...|     0|[1.633524,5.21490...|      2|      2|b795fad07ee300251...|
|00161f97cdda20b20...|     0|[0.810057,0.28687...|      0|      0|0eb9a64ef4ace1d18...|
|002973cbedd83d830...|     1|[0.07062,5.837957...|      2|      2|b795fad07ee300251...|
|002bde7fd91b49aaa...|     0|[1.146199,-0.0767...|      0|      0|0eb9a64ef4ace1d18...|
|003cb39fb6c0a2556...|     1|[1.542351,0.08234...|      0|      0|0eb9a64ef4ace1d18...|
|0045ccf444c75d050...|     0|[0.421806,-0.2879...|      2|      2|b795fad07ee300251...|
|00637dfbac621f4e5...|     0|[-0.70026,0.74640...|      2|      2|b795fad07ee300251...|
|0066c2b57dfa271f4...|     0|[0.682159,0.43132...|      0|      0|0eb9a64ef4ace1d18...|
|007856881856ea586...|     1|[0.

In [29]:
test_with_cluster.count()

25949