# Vectors & Matrices
#### https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.Vectors

In [1]:
import numpy as np
import scipy.sparse as sps
from pyspark.mllib.linalg import Vectors

# Dense

In [7]:
# A dense vector can be built from a single list of values ...
a = Vectors.dense([1, 2, 3])
# ... or from the values passed as separate positional arguments.
b = Vectors.dense(1.0, 2.0)

In [8]:
# Show the concrete class produced by each construction style.
print(type(a), type(b))

<class 'pyspark.mllib.linalg.DenseVector'> <class 'pyspark.mllib.linalg.DenseVector'>


# static sparse(size, *args)[source]
### Create a sparse vector, using either a dictionary, a list of (index, value) pairs, or two separate arrays of indices and values (sorted by index).

### Parameters
### size – Size of the vector.

### args – Non-zero entries, as a dictionary, list of tuples, or two sorted lists containing indices and values.

## To create a sparse vector, provide the length of the vector, the indices of the non-zero entries (which must be strictly increasing), and the corresponding non-zero values.

In [12]:
# Size-4 sparse vector; non-zero entries given as an {index: value} dictionary.
Vectors.sparse(4, {1: 1.0, 3: 5.5})

SparseVector(4, {1: 1.0, 3: 5.5})

In [13]:
# Same vector; non-zero entries given as a list of (index, value) pairs.
Vectors.sparse(4, [(1, 1.0), (3, 5.5)])

SparseVector(4, {1: 1.0, 3: 5.5})

In [14]:
# Same vector; non-zero entries given as two parallel lists: indices, then values.
Vectors.sparse(4, [1, 3], [1.0, 5.5])


SparseVector(4, {1: 1.0, 3: 5.5})

## Convert Sparse to Dense
### No direct method is available. First convert the sparse vector to an array with toArray(), then pass that array to Vectors.dense().

In [47]:
# Build a size-4 sparse vector with non-zero entries at indices 1 and 3.
sparseVector = Vectors.sparse(4, [1, 3], [1.0, 5.5])
# toArray() expands the vector to all four entries (zeros included);
# Vectors.dense() then wraps that array as a DenseVector.
Vectors.dense(sparseVector.toArray())

DenseVector([0.0, 1.0, 0.0, 5.5])

# static squared_distance(v1, v2)[source]
### Squared distance between two vectors. v1 and v2 can be of type SparseVector, DenseVector, np.ndarray or array.array.

In [15]:
# Rebind a and b: a is sparse (non-zeros at indices 0 and 3), b is dense.
a = Vectors.sparse(4, [(0, 1), (3, 4)])
b = Vectors.dense([2, 5, 4, 1])
# Use the static form named in the heading; sparse and dense operands can be mixed.
Vectors.squared_distance(a, b)

51.0

# static stringify(vector)
### Converts a vector into a string, which can be recognized by Vectors.parse().

In [16]:
# A sparse vector is rendered as '(size,[indices],[values])'.
Vectors.stringify(Vectors.sparse(2, [1], [1.0]))


'(2,[1],[1.0])'

In [17]:
# A dense vector is rendered as a bracketed, comma-separated list of values.
Vectors.stringify(Vectors.dense([0.0, 1.0]))

'[0.0,1.0]'

# static parse(s)[source]
### Parse a string representation back into the Vector.

In [18]:
# Bracketed input parses to a DenseVector; the trailing space inside the
# brackets is accepted.
Vectors.parse('[2,1,2 ]')

DenseVector([2.0, 1.0, 2.0])

In [19]:
# Parenthesised '(size,[indices],[values])' input parses to a SparseVector;
# the extra whitespace in this string is accepted.
Vectors.parse(' ( 100,  [0],  [2])')

SparseVector(100, {0: 2.0})

# Matrix
# class pyspark.mllib.linalg.DenseMatrix(numRows, numCols, values, isTransposed=False)[source]
### Bases: pyspark.mllib.linalg.Matrix

### Column-major dense matrix.
### toArray()[source]
### Return a numpy.ndarray

In [21]:
from pyspark.mllib.linalg import DenseMatrix
# 2x2 matrix from the values 0..3. Storage is column-major, so 0 and 1 fill
# the first column and 2 and 3 fill the second.
m = DenseMatrix(2, 2, range(4))
# View the matrix as a 2-D array.
m.toArray()

array([[0., 2.],
       [1., 3.]])

#  class pyspark.mllib.linalg.SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed=False)
### Bases: pyspark.mllib.linalg.Matrix

### Sparse Matrix stored in CSC format.
### toArray()
### Return a numpy.ndarray

### toDense()

In [48]:
from pyspark.mllib.linalg import SparseMatrix
# 3x2 CSC matrix. Arguments after the shape are colPtrs, rowIndices, values:
#   colPtrs    [0, 1, 3]   -> column 0 owns values[0:1], column 1 owns values[1:3]
#   rowIndices [0, 1, 2]   -> row of each stored value
#   values     [7, 11, 12] -> the non-zero entries themselves
m = SparseMatrix(3, 2, [0, 1, 3], [0, 1, 2], [7, 11, 12])
# Expand to a full 2-D array, zeros included.
m.toArray()

array([[ 7.,  0.],
       [ 0., 11.],
       [ 0., 12.]])

# class pyspark.mllib.linalg.Matrices
### Bases: object

### static dense(numRows, numCols, values)[source]
### Create a DenseMatrix


### static fromML(mat)
### Parameters
#### mat – a pyspark.ml.linalg.Matrix

### Returns
#### a pyspark.mllib.linalg.Matrix



### Matrices.sparse creates a matrix in Compressed Sparse Column (CSC) format, in column-major order.

In [28]:
from pyspark.mllib.linalg import Matrices
# Two 3x2 dense matrices created through the Matrices factory; the value
# lists are read in column-major order.
dm1 = Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])
dm2 = Matrices.dense(3, 2, [7, 0, 9, 0, 0, 12])

In [None]:
# The three CSC arrays used in the sparse-matrix examples, one per row:
[0, 1, 3,  # colPtrs
 0, 1, 2,  # rowIndices
 7, 11, 12  # values
]

In [38]:
# Build the 3x2 CSC matrix through the Matrices factory, naming each argument
# so the role of every array is explicit.
sm = Matrices.sparse(
    numRows=3,
    numCols=2,
    colPtrs=[0, 1, 3],
    rowIndices=[0, 1, 2],
    values=[7, 11, 12],
)

In [39]:
# Printing a sparse matrix lists its shape and each stored (row,col) value.
print(sm)

3 X 2 CSCMatrix
(0,0) 7.0
(1,1) 11.0
(2,1) 12.0


## Convert Sparse Matrix to Dense or Array

In [40]:
# toDense() yields a DenseMatrix holding the same entries, zeros included.
print(sm.toDense())

DenseMatrix([[ 7.,  0.],
             [ 0., 11.],
             [ 0., 12.]])


In [42]:
# toArray() yields the same entries as a plain 2-D array.
print(sm.toArray())

[[ 7.  0.]
 [ 0. 11.]
 [ 0. 12.]]
