In [3]:

import glob
import os
import sys

# Environment for PySpark: the Python interpreter to use, the Spark
# installation directory, and submit arguments requesting three executors.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]
spark_python = os.path.join(spark_home, 'python')

# Find the bundled py4j archive by pattern instead of pinning one version, so
# the cell keeps working when the Spark client ships a different py4j release.
# If nothing matches, fall back to the previously hard-coded path.
py4j_default = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')
py4j_found = sorted(glob.glob(os.path.join(spark_python, 'lib', 'py4j-*-src.zip')))
py4j_zip = py4j_found[-1] if py4j_found else py4j_default

# Same insertion order as before: py4j ends up first on sys.path, followed by
# Spark's own python directory.
sys.path.insert(0, spark_python)
sys.path.insert(0, py4j_zip)

In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Configuration carrying only the application name ("spark.app.name").
conf = SparkConf().setAppName("natasha pritykovskaya vector app")

# Build the session from that configuration.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [5]:
spark

In [6]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message printed below). None of the visible cells appear
# to use those injected names; consider explicit imports instead. TODO confirm.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [7]:
# Vector types used throughout the notebook. Vectors (the factory class) is
# imported here too, because a cell further down calls Vectors.dense(...) and
# must find the name in scope on a fresh top-to-bottom run.
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors

In [8]:
# Build a dense vector from Python ints. As the following cells show, the
# elements are held as numpy.float64 values in a numpy array.
v = DenseVector([1, 2, 3, 4])

In [9]:
type(v)

pyspark.ml.linalg.DenseVector

In [10]:
type(v[0])

numpy.float64

In [11]:
v.values

array([1., 2., 3., 4.])

In [12]:
type(v.values)

numpy.ndarray

In [13]:
v.toArray()

array([1., 2., 3., 4.])

In [14]:
type(v.toArray())

numpy.ndarray

## Indexing

In [15]:
v[0]

1.0

In [16]:
v[-1]

4.0

In [17]:
# Slicing yields a plain numpy array (see the output), not a DenseVector.
v[2:4]

array([3., 4.])

## Operations

In [18]:
# Scalar arithmetic is applied element-wise and the result is a DenseVector.
v - 2

DenseVector([-1.0, 0.0, 1.0, 2.0])

In [19]:
v / 3

DenseVector([0.3333, 0.6667, 1.0, 1.3333])

## L1 norm

In [20]:
v.norm(1)

10.0

In [21]:
# Negating every element leaves the L1 norm unchanged.
(v * -1).norm(1) == v.norm(1)

True

## L2 norm, L0 norm and squared distance

In [63]:
type(v)

pyspark.ml.linalg.DenseVector

In [64]:
v.norm(2)

5.477225575051661

In [65]:
# norm(0) reports the number of non-zero entries: 4.0 here, and 3.0 once one
# element has been set to zero in the cells below.
v.norm(0)

4.0

In [66]:
# Writes through to the vector's underlying array, so v itself changes
# (it displays as [0.0, 2.0, 3.0, 4.0] in the next cell). Earlier cells that
# display v would show different values if re-run after this one.
v.values[0] = 0

In [67]:
v

DenseVector([0.0, 2.0, 3.0, 4.0])

In [68]:
v.norm(0)

3.0

In [69]:
# Vectors.dense is a factory call; the value displayed in the next cell is a
# DenseVector, the same type as v.
u = Vectors.dense([1, 2, 3, 5])

In [70]:
u

DenseVector([1.0, 2.0, 3.0, 5.0])

In [71]:
v - u

DenseVector([-1.0, 0.0, 0.0, -1.0])

In [72]:
# Squared Euclidean distance: the element-wise difference shown above is
# [-1, 0, 0, -1], and its squares sum to 2.0.
v.squared_distance(u)

2.0

## Cosine similarity

In [73]:
# Cosine similarity: dot product divided by the product of the L2 norms.
dot_vu = v.dot(u)
norm_product = v.norm(2) * u.norm(2)
dot_vu / norm_product

0.9812567051925868

## Sparse vectors

In [28]:
# NOTE(review): Vectors is also referenced by the earlier
# `u = Vectors.dense([1, 2, 3, 5])` cell, so the name has to be in scope
# before that cell on a top-to-bottom run, not only from this point on.
from pyspark.ml.linalg import Vectors

In [29]:
# (index, value) pairs: positions 0..3 paired with the values 1..4.
ndx_value = tuple((position, position + 1) for position in range(4))

In [30]:
ndx_value

((0, 1), (1, 2), (2, 3), (3, 4))

In [31]:
# First argument is the vector size, second the (index, value) pairs built
# above. This rebinds v, which held a DenseVector until now.
v = SparseVector(len(ndx_value), ndx_value)

In [32]:
v

SparseVector(4, {0: 1.0, 1: 2.0, 2: 3.0, 3: 4.0})

In [33]:
DenseVector(v)

DenseVector([1.0, 2.0, 3.0, 4.0])

In [34]:
u = Vectors.sparse(4, range(4), [1, 2, 3, 5])

In [35]:
# Rebinds u through the SparseVector constructor, passing the same
# (size, indices, values) arguments as the Vectors.sparse call in the
# previous cell.
u = SparseVector(4, range(4), [1, 2, 3, 5])

In [36]:
u

SparseVector(4, {0: 1.0, 1: 2.0, 2: 3.0, 3: 5.0})

In [37]:
# Multiplying a SparseVector by a scalar raises TypeError (that is the point
# of this cell). Catch and print it so the demonstration does not abort a
# top-to-bottom run before the remaining cells, including spark.stop().
try:
    v * 2
except TypeError as err:
    print("TypeError:", err)

TypeError: unsupported operand type(s) for *: 'SparseVector' and 'int'

In [38]:
# Subtracting two SparseVectors raises TypeError (that is the point of this
# cell). Catch and print it so the demonstration does not abort a
# top-to-bottom run before the remaining cells, including spark.stop().
try:
    v - u
except TypeError as err:
    print("TypeError:", err)

TypeError: unsupported operand type(s) for -: 'SparseVector' and 'SparseVector'

In [39]:
v.squared_distance(u)

1.0

In [40]:
# Cosine similarity of the two sparse vectors: dot product divided by the
# product of their L2 norms.
dot_vu = v.dot(u)
norm_product = v.norm(2) * u.norm(2)
dot_vu / norm_product

0.9939990885479664

In [41]:
# Stop the Spark session created at the top of the notebook.
spark.stop()