In [25]:
import findspark 
findspark.init()

from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col

from pyspark.ml import Pipeline
from IPython.display import display
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.mllib.evaluation import MulticlassMetrics

spark = SparkSession.builder.appName('laptop_everis').getOrCreate()

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [26]:
dataA = [(Vectors.dense([1.0, 1.0]),),
         (Vectors.dense([1.0, -1.0]),),
         (Vectors.dense([-1.0, -1.0]),),
         (Vectors.dense([-1.0, 1.0]),)]
dfA = spark.createDataFrame(dataA, ["features"])

dataB = [(Vectors.dense([1.0, 0.0]),),
         (Vectors.dense([-1.0, 0.0]),),
         (Vectors.dense([0.0, 1.0]),),
         (Vectors.dense([0.0, -1.0]),)]
dfB = spark.createDataFrame(dataB, ["features"])

key = Vectors.dense([1.0, 0.0])

In [27]:
print(type(dfA))
dfA.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+-----------+
|   features|
+-----------+
|  [1.0,1.0]|
| [1.0,-1.0]|
|[-1.0,-1.0]|
| [-1.0,1.0]|
+-----------+



In [28]:
dfA.dtypes

[('features', 'vector')]

In [29]:
dfA.schema

StructType(List(StructField(features,VectorUDT,true)))

In [30]:
dfA.show()

+-----------+
|   features|
+-----------+
|  [1.0,1.0]|
| [1.0,-1.0]|
|[-1.0,-1.0]|
| [-1.0,1.0]|
+-----------+



In [35]:
for row in dfA.take(5):
    print("----- ------------------", type(row))
    for vect in row:
        print("///// ", type(vect))
        for val in vect:
            print(type(val))

----- ------------------ <class 'pyspark.sql.types.Row'>
/////  <class 'pyspark.ml.linalg.DenseVector'>
<class 'numpy.float64'>
<class 'numpy.float64'>
----- ------------------ <class 'pyspark.sql.types.Row'>
/////  <class 'pyspark.ml.linalg.DenseVector'>
<class 'numpy.float64'>
<class 'numpy.float64'>
----- ------------------ <class 'pyspark.sql.types.Row'>
/////  <class 'pyspark.ml.linalg.DenseVector'>
<class 'numpy.float64'>
<class 'numpy.float64'>
----- ------------------ <class 'pyspark.sql.types.Row'>
/////  <class 'pyspark.ml.linalg.DenseVector'>
<class 'numpy.float64'>
<class 'numpy.float64'>


In [4]:
print(type(dfB))
dfB.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+----------+
|  features|
+----------+
| [1.0,0.0]|
|[-1.0,0.0]|
| [0.0,1.0]|
|[0.0,-1.0]|
+----------+



In [21]:
print(type(key))
key
print(dir(key))

<class 'pyspark.ml.linalg.DenseVector'>
['__UDT__', '__add__', '__class__', '__delattr__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__radd__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__rsub__', '__rtruediv__', '__setattr__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__weakref__', '_delegate', 'array', 'dot', 'norm', 'numNonzeros', 'squared_distance', 'toArray', 'values']


In [16]:
print(dir(key.array))

['T', '__abs__', '__add__', '__and__', '__array__', '__array_finalize__', '__array_function__', '__array_interface__', '__array_prepare__', '__array_priority__', '__array_struct__', '__array_ufunc__', '__array_wrap__', '__bool__', '__class__', '__complex__', '__contains__', '__copy__', '__deepcopy__', '__delattr__', '__delitem__', '__dir__', '__divmod__', '__doc__', '__eq__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__iadd__', '__iand__', '__ifloordiv__', '__ilshift__', '__imatmul__', '__imod__', '__imul__', '__index__', '__init__', '__init_subclass__', '__int__', '__invert__', '__ior__', '__ipow__', '__irshift__', '__isub__', '__iter__', '__itruediv__', '__ixor__', '__le__', '__len__', '__lshift__', '__lt__', '__matmul__', '__mod__', '__mul__', '__ne__', '__neg__', '__new__', '__or__', '__pos__', '__pow__', '__radd__', '__rand__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rlshift_

In [6]:
brp = BucketedRandomProjectionLSH(
    inputCol="features", outputCol="hashes", bucketLength=2.0, numHashTables=3
)
model = brp.fit(dfA)

# Feature Transformation
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate
# similarity join.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxSimilarityJoin(transformedA, transformedB, 1.5)`
#print("Approximately joining dfA and dfB on Euclidean distance smaller than 1.5:")
#model.approxSimilarityJoin(dfA, dfB, 1.5, distCol="EuclideanDistance")\
#    .select(col("datasetA.id").alias("idA"),
#            col("datasetB.id").alias("idB"),
#            col("EuclideanDistance")).show()

# Compute the locality sensitive hashes for the input rows, then perform approximate nearest
# neighbor search.
# We could avoid computing hashes by passing in the already-transformed dataset, e.g.
# `model.approxNearestNeighbors(transformedA, key, 2)`
print("Approximately searching dfA for 2 nearest neighbors of the key:")

The hashed dataset where hashed values are stored in the column 'hashes':
+-----------+--------------------+
|   features|              hashes|
+-----------+--------------------+
|  [1.0,1.0]|[[0.0], [-1.0], [...|
| [1.0,-1.0]|[[-1.0], [0.0], [...|
|[-1.0,-1.0]|[[-1.0], [0.0], [...|
| [-1.0,1.0]|[[0.0], [-1.0], [...|
+-----------+--------------------+

Approximately searching dfA for 2 nearest neighbors of the key:


In [7]:
model.approxNearestNeighbors(dfA, key, 2).show()

+----------+--------------------+-------+
|  features|              hashes|distCol|
+----------+--------------------+-------+
| [1.0,1.0]|[[0.0], [-1.0], [...|    1.0|
|[1.0,-1.0]|[[-1.0], [0.0], [...|    1.0|
+----------+--------------------+-------+



In [11]:
# $example off$
spark.stop()