In [1]:
import findspark 
findspark.init()

from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import coalesce, udf, struct, col, lit, unix_timestamp, count, when, isnan, isnull, split

from pyspark.ml import Pipeline
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.mllib.evaluation import MulticlassMetrics

# Create (or reuse) the notebook-wide Spark session.
spark = SparkSession.builder.appName('laptop_everis').getOrCreate()

# Take the context from the session rather than re-importing SparkContext and
# calling getOrCreate() a second time; the session already owns the context.
sc = spark.sparkContext

# METHOD 1: K-MEANS CLUSTERING (BisectingKMeans)

In [2]:
# Seed reused by the stochastic steps of this notebook.
SEED = 29082013

# Read the raw CSV: first row is the header, "?" cells become nulls and the
# column types are inferred from the data.
df = spark.read.csv(
    "train.csv",
    header=True,
    nullValue="?",
    inferSchema=True,
)

# Feature columns are everything except the first and the last column.
cols_features = df.columns[1:-1]
print(df.count(), len(cols_features))

# 90% / 10% random split, seeded so the same rows land in each part.
train, test = df.randomSplit([0.9, 0.1], seed=SEED)
del df

# Row count and feature-column count of each split.
print(train.count(), len(train.columns[1:-1]), test.count(), len(test.columns[1:-1]))

262144 256
236195 256 25949 256


In [3]:
def vector_scaler_df(cols_features, ds, scaled=True):
    """
    Assemble feature columns into one vector column and optionally scale it.

    The pipeline is fitted on ``ds`` itself, so with ``scaled=True`` every
    call learns its own min/max from the frame it receives.

    Args:
        cols_features: names of the columns to combine into the vector.
        ds: DataFrame containing every column listed in ``cols_features``.
        scaled: when True, min-max scale the assembled vector.

    Returns:
        ``ds`` without the ``cols_features`` columns, plus ``scaled_features``
        when ``scaled`` is True or ``features`` otherwise.

    Side effects:
        Prints the first five rows of the result.
    """
    stages = [VectorAssembler(inputCols=cols_features, outputCol="features")]
    if scaled:
        # Only build the scaler when it is actually part of the pipeline.
        stages.append(
            MinMaxScaler(inputCol="features", outputCol="scaled_features")
        )

    fitted_pipeline = Pipeline(stages=stages).fit(ds)

    ds = fitted_pipeline.transform(ds).drop(*cols_features)
    if scaled:
        # Keep only the scaled vector; the raw assembled one is redundant.
        ds = ds.drop("features")

    # show() prints the preview itself and returns None, so wrapping it in
    # display() only rendered an extra "None" under the table.
    ds.show(5)
    return ds

In [4]:
# Assemble the feature vector for both splits (train first, then test),
# without scaling; each call also prints a five-row preview.
train, test = (
    vector_scaler_df(cols_features, split, scaled=False)
    for split in (train, test)
)

+--------------------+------+--------------------+
|                  id|target|            features|
+--------------------+------+--------------------+
|00006042e379fc155...|     0|[-0.043731,-0.009...|
|00034f0082232ad8f...|     1|[2.749005,0.24356...|
|00036a71992c149e9...|     1|[2.317103,-4.3215...|
|0007233584b5a85b4...|     1|[-9.570656,-5.163...|
|000863c7e31c5f2c6...|     1|[1.436977,0.63941...|
+--------------------+------+--------------------+
only showing top 5 rows



None

+--------------------+------+--------------------+
|                  id|target|            features|
+--------------------+------+--------------------+
|0008851be01140e75...|     0|[1.633524,5.21490...|
|00161f97cdda20b20...|     0|[0.810057,0.28687...|
|002973cbedd83d830...|     1|[0.07062,5.837957...|
|002bde7fd91b49aaa...|     0|[1.146199,-0.0767...|
|003cb39fb6c0a2556...|     1|[1.542351,0.08234...|
+--------------------+------+--------------------+
only showing top 5 rows



None

In [None]:
# Fit on the training data to obtain the centroids of that same data.
clusters = 3
max_iter = 100

# Configure the estimator through constructor keywords, then fit it.
kmeans_clf = BisectingKMeans(
    k=clusters,
    maxIter=max_iter,
    seed=SEED,
    featuresCol="features",
    predictionCol="cluster",
    distanceMeasure="euclidean",
)

model_km = kmeans_clf.fit(train)

# Debug aid: list what the fitted model exposes.
print(dir(model_km))

In [None]:
# Debug aid: list the attributes and methods exposed by the estimator object.
print(dir(kmeans_clf))

In [None]:
# Apply the fitted model to the training split and preview the result.
model_km.transform(train).show()

In [None]:
# Apply the fitted model to the test split; the estimator was configured to
# write its prediction to the "cluster" column.
test_with_cluster = model_km.transform(test)
test_with_cluster.show()

In [None]:
# Data type of each centroid returned by the fitted model.

for center in model_km.clusterCenters():
    print(type(center))

In [None]:
# Show the centroids.
import numpy as np

# Round every coordinate to 6 decimals and convert the result to nested
# plain-Python lists of floats (one inner list per centroid).
centers = np.round(model_km.clusterCenters(), 6).tolist()
print(type(centers))

print("Cluster Centers: ")
for centroid_values in centers:
    # Type, dimensionality and the first ten coordinates of each centroid.
    print(type(centroid_values), len(centroid_values), centroid_values[:10])

In [None]:
"""
from scipy.spatial.distance import jaccard, cosine, euclidean
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType
from scipy.spatial import distance

order = 0

for center in centers:
    distance_udf = F.udf(lambda x: float(euclidean(x, center)), FloatType())
    cluster_train = cluster_train.withColumn('distance_centroid_{}'.format(order), distance_udf(F.col('scaled_features')))
    order += 1

cluster_train.show(15)
"""

In [None]:
"""
order = 0

for center in centers:
    distance_udf = F.udf(lambda x: float(euclidean(x, center)), FloatType())
    cluster_test = cluster_test.withColumn('distance_centroid_{}'.format(order), distance_udf(F.col('scaled_features')))
    order += 1

cluster_test.show(15)
"""

In [None]:
from pyspark.sql.types import IntegerType, FloatType, DoubleType

# One row per centroid, one column per original feature name.
rdd_centers = sc.parallelize(centers)

# Re-assemble those columns into a single "features" vector column, unscaled;
# the helper also prints a preview of the result.
df_centers = vector_scaler_df(
    cols_features,
    rdd_centers.toDF(cols_features),
    scaled=False,
)
print(type(df_centers))
df_centers.show()

In [None]:
# Columns left on the training frame after the feature vector was assembled.
train.columns

In [None]:
# Keep just the row identifier and the feature vector of the training split;
# this is the frame later searched for each centroid's nearest neighbour.
only_features = train.select('id', 'features')
only_features.show()

In [None]:
"""
for row in only_features.take(5):
    print("----- ------------------", type(row))
    for vect in row:
        print("///// ", type(vect))
        for val in vect:
            print(type(val))
"""

In [None]:
"""
from pyspark.sql import types
only_features = only_features.withColumn(
    "features_x", only_features["features"].cast(
        types.ArrayType(
            types.DoubleType()
        )
    )
)
"""

In [None]:
# Inspect the schema of the id + features frame.
only_features.schema
# NOTE(review): the sample below lists only `features`, while the frame is
# selected with both `id` and `features` - presumably stale; confirm.
#StructType(List(StructField(features,VectorUDT,true)))

In [None]:
# Number of training rows available as nearest-neighbour candidates.
only_features.count()

In [None]:
# help(BucketedRandomProjectionLSH)

In [None]:
# Random-projection LSH over the "features" vectors, fitted on the centroid
# frame. The seed is pinned to the notebook-wide SEED so the projections are
# the same on every run, as is already done for the split and the clustering.
brp2 = BucketedRandomProjectionLSH(
    inputCol="features", outputCol="hashes", bucketLength=1, seed=SEED
)
model_lsh = brp2.fit(df_centers)

# Debug aid: list what the fitted LSH model exposes.
print(dir(model_lsh))

In [None]:
#model_lsh.transform(only_features).take(3)

In [None]:
# Print the library's documentation for the nearest-neighbour lookup used below.
help(model_lsh.approxNearestNeighbors)

In [None]:
# For every centroid, look up its single approximate nearest row among the
# training features and collect those rows into one frame.
# `Vectors` and `lit` come from the imports at the top of the notebook.
df_result = None

for cluster_id, center in enumerate(model_km.clusterCenters()):
    print(type(center), center.size, center[:10])

    # Tag the neighbour with the centroid's position in clusterCenters().
    # NOTE(review): assumes that position equals the value the model writes
    # to the "cluster" prediction column - confirm.
    nearest = model_lsh.approxNearestNeighbors(
        only_features, Vectors.dense(center), 1
    ).withColumn("cluster", lit(cluster_id))

    # The first result starts the frame; later ones are appended to it.
    df_result = nearest if df_result is None else df_result.union(nearest)

In [None]:
# Reduce the neighbour frame to the cluster number and the neighbour's id,
# exposing the latter as "id_representant".
df_result = (
    df_result
    .select("cluster", "id")
    .withColumnRenamed("id", "id_representant")
)
df_result.show()

In [None]:
# Number of representative rows collected across the centroids.
df_result.count()

In [None]:
# Attach each cluster's representative id to the test rows. Joining on the
# column name keeps a single "cluster" column; joining on an equality
# expression kept both copies, leaving an ambiguous duplicate column.
# The left join keeps test rows whose cluster has no representative.
test_with_cluster = test_with_cluster.join(df_result, on="cluster", how="left")
test_with_cluster.show()

In [None]:
# Row count of the test frame after the representative ids were joined in.
test_with_cluster.count()