In [1]:

import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "Sergey Grishaev vector app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
spark

In [4]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [5]:
from pyspark.ml.linalg import DenseVector, SparseVector

In [8]:
v2 = DenseVector([[1,2], [2,3], [3,4]])

In [6]:
v = DenseVector([1, 2, 3, 4])

In [7]:
type(v)

pyspark.ml.linalg.DenseVector

In [11]:
type(v2[0])

numpy.ndarray

In [12]:
v.values

array([1., 2., 3., 4.])

In [13]:
type(v.values)

numpy.ndarray

In [14]:
v.toArray()

array([1., 2., 3., 4.])

In [15]:
type(v.toArray())

numpy.ndarray

## Indexing

In [16]:
v[0]

1.0

In [17]:
v[-1]

4.0

In [18]:
v[2:4]

array([3., 4.])

## Operations

In [19]:
v - 2

DenseVector([-1.0, 0.0, 1.0, 2.0])

In [20]:
v / 3

DenseVector([0.3333, 0.6667, 1.0, 1.3333])

## L1 norm

In [21]:
v.norm(1)

10.0

In [22]:
(v * -1).norm(1) == v.norm(1)

True

## L2 norm

In [23]:
type(v)

pyspark.ml.linalg.DenseVector

In [24]:
v.norm(2)

5.477225575051661

In [25]:
v.norm(0)

4.0

In [26]:
v.values[0] = 0

In [27]:
v

DenseVector([0.0, 2.0, 3.0, 4.0])

In [28]:
v.norm(0)

3.0

In [29]:
from pyspark.ml.linalg import Vectors

In [35]:
u = Vectors.dense([1, 2, 3, 5, 7])

In [36]:
u

DenseVector([1.0, 2.0, 3.0, 5.0, 7.0])

In [32]:
v - u

DenseVector([-1.0, 0.0, 0.0, -1.0])

In [37]:
v.squared_distance(u)

AssertionError: dimension mismatch

## Cosine similarity

In [38]:
v.dot(u) / (v.norm(2) * u.norm(2))

AssertionError: dimension mismatch

## Sparse vectors

In [39]:
from pyspark.ml.linalg import Vectors

In [40]:
ndx_value = tuple(zip(range(4), range(1, 5)))

In [41]:
ndx_value

((0, 1), (1, 2), (2, 3), (3, 4))

In [42]:
v = SparseVector(len(ndx_value), ndx_value)

In [43]:
v

SparseVector(4, {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0})

In [44]:
DenseVector(v)

DenseVector([1.0, 2.0, 3.0, 4.0])

In [45]:
u = Vectors.sparse(110, [0,2,4,10], [1, 2, 3, 5])

In [46]:
u


SparseVector(110, {0: 1.0, 2: 2.0, 4: 3.0, 10: 5.0})

In [47]:
u_1 = DenseVector(u)


In [48]:
u_1

DenseVector([1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

In [49]:
u = SparseVector(4, range(4), [1, 2, 3, 5])

In [50]:
u

SparseVector(4, {0: 1.0, 1: 2.0, 2: 3.0, 3: 5.0})

In [51]:
d3 = DenseVector(u)

In [52]:
d3

DenseVector([1.0, 2.0, 3.0, 5.0])

In [53]:
v * 2

TypeError: unsupported operand type(s) for *: 'SparseVector' and 'int'

In [54]:
v - u

TypeError: unsupported operand type(s) for -: 'SparseVector' and 'SparseVector'

In [55]:
v.squared_distance(u)

1.0

In [56]:
v.dot(u) / (v.norm(2) * u.norm(2))

0.9939990885479664

In [57]:
spark.stop()