In [1]:
import findspark
#findspark.init(spark_path)
findspark.init()

from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

In [2]:
environment_to_connect = 'local' # CHANGE IT IF CONNECTION TO A CLUSTER

conf = SparkConf().setAppName('matrix_multiplication').setMaster(environment_to_connect)
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

In [3]:
# Some other useful imports
import numpy as np

### Upload files with Spark

#### Matrix A

In [4]:
num_partitions_matrix_A = 1

In [5]:
matrix_A = sc.textFile('Matriz_Ejemplo_A.dat', num_partitions_matrix_A)
matrix_A

Matriz_Ejemplo_A.dat MapPartitionsRDD[1] at textFile at <unknown>:0

In [6]:
matrix_A_rows = matrix_A.count()
matrix_A_rows

1000

In [7]:
matrix_A.getNumPartitions()

1

In [None]:
matrix_A.take(2)

In [None]:
type(matrix_A.take(1)[0])

In [8]:
matrix_A = matrix_A.map( lambda line: list(map(float, line.split(' '))) ) # Tokenize values and convert them into float
matrix_A

PythonRDD[3] at RDD at PythonRDD.scala:53

In [9]:
print(matrix_A.take(3)[0][:5], len(matrix_A.take(3)[0]))
print(matrix_A.take(3)[1][:5], len(matrix_A.take(3)[1]))
print(matrix_A.take(3)[2][:5], len(matrix_A.take(3)[2]))
matrix_A_columns = len(matrix_A.take(3)[0])
matrix_A_columns

[4.1287, 2.2078, 8.4434, -0.40286, 2.4057] 128
[1.9492, -1.1705, 2.2879, 6.0707, 3.211] 128
[3.9418, -2.5248, 4.3379, 5.7163, 8.6705] 128


128

In [10]:
print('Rows: ' + str(matrix_A_rows), 'Columns: ' + str(matrix_A_columns))

Rows: 1000 Columns: 128


#### Matrix B

In [11]:
num_partitions_matrix_B = 1

In [12]:
matrix_B = sc.textFile('Matriz_Ejemplo_B.dat', num_partitions_matrix_B)
matrix_B

Matriz_Ejemplo_B.dat MapPartitionsRDD[12] at textFile at <unknown>:0

In [13]:
matrix_B_rows = matrix_B.count()
matrix_B_rows

128

In [14]:
matrix_B.getNumPartitions()

1

In [None]:
matrix_B.take(2)

In [16]:
matrix_B = matrix_B.map( lambda line: list(map(float, line.split(' '))) ) # Tokenize values and convert them into float
matrix_B

PythonRDD[14] at RDD at PythonRDD.scala:53

In [18]:
print(matrix_B.take(3)[0][:5], len(matrix_B.take(3)[0]))
print(matrix_B.take(3)[1][:5], len(matrix_B.take(3)[1]))
print(matrix_B.take(3)[2][:5], len(matrix_B.take(3)[2]))
matrix_B_columns = len(matrix_B.take(3)[0])
matrix_B_columns

[4.1588, 7.4706, 6.431, 8.8949, 6.1778] 60
[4.779, 5.0498, 8.9186, 9.8314, 3.2737] 60
[4.2157, 6.6605, 4.5704, 9.2966, 4.1862] 60


60

In [19]:
print('Rows: ' + str(matrix_B_rows), 'Columns: ' + str(matrix_B_columns))

Rows: 128 Columns: 60


#### Convert matrices to coordinates (sparse) format

In [20]:
def convert_row(line, row): # i : row number
    coo_row = []
    for j in range(len(line)):
        value = line[j]
        if value != 0: coo_row.append((row,j,value))
    return(coo_row)

In [21]:
matrix_A_list = matrix_A.collect()
matrix_B_list = matrix_B.collect()

In [22]:
get_coo_matrix = (lambda matrix_list: [(convert_row(matrix_list[index], index)) for index in range(len(matrix_list))])

In [23]:
coo_matrix_A = get_coo_matrix(matrix_A_list)
coo_matrix_B = get_coo_matrix(matrix_B_list)

In [24]:
coo_matrix_A = sc.parallelize(coo_matrix_A, num_partitions_matrix_A)
coo_matrix_A

ParallelCollectionRDD[22] at parallelize at PythonRDD.scala:195

In [25]:
coo_matrix_B = sc.parallelize(coo_matrix_B, num_partitions_matrix_B)
coo_matrix_B

ParallelCollectionRDD[23] at parallelize at PythonRDD.scala:195

In [26]:
coo_matrix_A.cache() # save RDD in main memory

ParallelCollectionRDD[22] at parallelize at PythonRDD.scala:195

In [27]:
coo_matrix_B.cache() # save RDD in main memory

ParallelCollectionRDD[23] at parallelize at PythonRDD.scala:195

In [29]:
print(coo_matrix_A.take(2)[0][:5])
print(coo_matrix_A.take(2)[1][:5])

[(0, 0, 4.1287), (0, 1, 2.2078), (0, 2, 8.4434), (0, 3, -0.40286), (0, 4, 2.4057)]
[(1, 0, 1.9492), (1, 1, -1.1705), (1, 2, 2.2879), (1, 3, 6.0707), (1, 4, 3.211)]


In [28]:
print(coo_matrix_B.take(2)[0][:5])
print(coo_matrix_B.take(2)[1][:5])

[(0, 0, 4.1588), (0, 1, 7.4706), (0, 2, 6.431), (0, 3, 8.8949), (0, 4, 6.1778)]
[(1, 0, 4.779), (1, 1, 5.0498), (1, 2, 8.9186), (1, 3, 9.8314), (1, 4, 3.2737)]


#### Perform Matrix Multiplication

In [30]:
first_map_matrix_A = coo_matrix_A.flatMap( lambda line: [(row[1], (row[0], row[2])) for row in line] )
first_map_matrix_A

PythonRDD[28] at RDD at PythonRDD.scala:53

In [31]:
first_map_matrix_A.count()

128000

In [32]:
first_map_matrix_A.take(5)

[(0, (0, 4.1287)),
 (1, (0, 2.2078)),
 (2, (0, 8.4434)),
 (3, (0, -0.40286)),
 (4, (0, 2.4057))]

In [33]:
first_map_matrix_B = coo_matrix_B.flatMap( lambda line: [(row[1], (row[0], row[2])) for row in line] )
first_map_matrix_B

PythonRDD[31] at RDD at PythonRDD.scala:53

In [34]:
first_map_matrix_B.count()

7680

In [35]:
first_map_matrix_B.take(5)

[(0, (0, 4.1588)),
 (1, (0, 7.4706)),
 (2, (0, 6.431)),
 (3, (0, 8.8949)),
 (4, (0, 6.1778))]

In [36]:
first_reduce_sec_map = first_map_matrix_A.join(first_map_matrix_B).map( lambda line: ((line[1][0][0], line[1][1][0]),  line[1][0][1] * line[1][1][1]) )
first_reduce_sec_map

PythonRDD[41] at RDD at PythonRDD.scala:53

In [37]:
first_reduce_sec_map.count()

7680000

In [38]:
first_reduce_sec_map.take(5)

[((0, 0), 17.170437560000003),
 ((0, 1), 19.7310573),
 ((0, 2), 17.40536059),
 ((0, 3), 31.75672179),
 ((0, 4), 35.68600558)]

In [39]:
second_reduce = first_reduce_sec_map.reduceByKey( lambda x, y: x + y )
second_reduce

PythonRDD[48] at RDD at PythonRDD.scala:53

In [40]:
second_reduce.count()

128000

In [41]:
second_reduce.take(5)

[((0, 0), 950.983764774),
 ((0, 2), 983.9421465149998),
 ((0, 4), 1013.5149463818),
 ((0, 6), 1092.5707014894),
 ((0, 8), 977.2944711572001)]

In [42]:
result = second_reduce.sortByKey()
result

PythonRDD[57] at RDD at PythonRDD.scala:53

In [43]:
result.take(5)

[((0, 0), 950.983764774),
 ((0, 1), 1020.9554380636),
 ((0, 2), 983.9421465149998),
 ((0, 3), 986.8061505594001),
 ((0, 4), 1013.5149463818)]