In [1]:
import findspark
#findspark.init(spark_path)
findspark.init()

from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

In [2]:
environment_to_connect = 'local' # CHANGE IT IF CONNECTION TO A CLUSTER

conf = SparkConf().setAppName('matrix_multiplication_analysis').setMaster(environment_to_connect)
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

In [3]:
# Some other useful imports
import numpy as np

### Upload files with Spark

#### Matrix A

In [4]:
num_partitions_matrix_A = 1

In [5]:
matrix_A = sc.textFile('Matriz_Ejemplo_A.dat', num_partitions_matrix_A)
matrix_A

Matriz_Ejemplo_A.dat MapPartitionsRDD[1] at textFile at <unknown>:0

In [6]:
matrix_A_rows = matrix_A.count()
matrix_A_rows

1000

In [7]:
matrix_A.getNumPartitions()

1

In [None]:
matrix_A.take(2)

In [None]:
type(matrix_A.take(1)[0])

In [7]:
matrix_A = matrix_A.map( lambda line: list(map(float, line.split(' '))) ) # Tokenize values and convert them into float
matrix_A

PythonRDD[3] at RDD at PythonRDD.scala:53

In [8]:
print(matrix_A.take(3)[0][:5], len(matrix_A.take(3)[0]))
print(matrix_A.take(3)[1][:5], len(matrix_A.take(3)[1]))
print(matrix_A.take(3)[2][:5], len(matrix_A.take(3)[2]))
matrix_A_columns = len(matrix_A.take(3)[0])
matrix_A_columns

[4.1287, 2.2078, 8.4434, -0.40286, 2.4057] 128
[1.9492, -1.1705, 2.2879, 6.0707, 3.211] 128
[3.9418, -2.5248, 4.3379, 5.7163, 8.6705] 128


128

In [9]:
print('Rows: ' + str(matrix_A_rows), 'Columns: ' + str(matrix_A_columns))

Rows: 1000 Columns: 128


#### Matrix B

In [10]:
num_partitions_matrix_B = 1

In [11]:
matrix_B = sc.textFile('Matriz_Ejemplo_B.dat', num_partitions_matrix_B)
matrix_B

Matriz_Ejemplo_B.dat MapPartitionsRDD[12] at textFile at <unknown>:0

In [12]:
matrix_B_rows = matrix_B.count()
matrix_B_rows

128

In [14]:
matrix_B.getNumPartitions()

1

In [None]:
matrix_B.take(2)

In [13]:
matrix_B = matrix_B.map( lambda line: list(map(float, line.split(' '))) ) # Tokenize values and convert them into float
matrix_B

PythonRDD[14] at RDD at PythonRDD.scala:53

In [14]:
print(matrix_B.take(3)[0][:5], len(matrix_B.take(3)[0]))
print(matrix_B.take(3)[1][:5], len(matrix_B.take(3)[1]))
print(matrix_B.take(3)[2][:5], len(matrix_B.take(3)[2]))
matrix_B_columns = len(matrix_B.take(3)[0])
matrix_B_columns

[4.1588, 7.4706, 6.431, 8.8949, 6.1778] 60
[4.779, 5.0498, 8.9186, 9.8314, 3.2737] 60
[4.2157, 6.6605, 4.5704, 9.2966, 4.1862] 60


60

In [15]:
print('Rows: ' + str(matrix_B_rows), 'Columns: ' + str(matrix_B_columns))

Rows: 128 Columns: 60


#### Convert matrices to coordinates (sparse) format

In [16]:
def convert_row(line, row): # i : row number
    coo_row = []
    for j in range(len(line)):
        value = line[j]
        if value != 0: coo_row.append((row,j,value))
    return(coo_row)

In [17]:
matrix_A_list = matrix_A.collect()
matrix_B_list = matrix_B.collect()

In [18]:
get_coo_matrix = (lambda matrix_list: [(convert_row(matrix_list[index], index)) for index in range(len(matrix_list))])

In [19]:
coo_matrix_A = get_coo_matrix(matrix_A_list)
coo_matrix_B = get_coo_matrix(matrix_B_list)

In [20]:
coo_matrix_A = sc.parallelize(coo_matrix_A, num_partitions_matrix_A)
coo_matrix_A

ParallelCollectionRDD[22] at parallelize at PythonRDD.scala:195

In [21]:
coo_matrix_B = sc.parallelize(coo_matrix_B, num_partitions_matrix_B)
coo_matrix_B

ParallelCollectionRDD[23] at parallelize at PythonRDD.scala:195

In [22]:
coo_matrix_A.cache() # save RDD in main memory

ParallelCollectionRDD[22] at parallelize at PythonRDD.scala:195

In [23]:
coo_matrix_B.cache() # save RDD in main memory

ParallelCollectionRDD[23] at parallelize at PythonRDD.scala:195

In [24]:
print(coo_matrix_A.take(2)[0][:5])
print(coo_matrix_A.take(2)[1][:5])

[(0, 0, 4.1287), (0, 1, 2.2078), (0, 2, 8.4434), (0, 3, -0.40286), (0, 4, 2.4057)]
[(1, 0, 1.9492), (1, 1, -1.1705), (1, 2, 2.2879), (1, 3, 6.0707), (1, 4, 3.211)]


In [25]:
print(coo_matrix_B.take(2)[0][:5])
print(coo_matrix_B.take(2)[1][:5])

[(0, 0, 4.1588), (0, 1, 7.4706), (0, 2, 6.431), (0, 3, 8.8949), (0, 4, 6.1778)]
[(1, 0, 4.779), (1, 1, 5.0498), (1, 2, 8.9186), (1, 3, 9.8314), (1, 4, 3.2737)]


#### Perform Matrix Multiplication

In [26]:
first_map_matrix_A = coo_matrix_A.flatMap( lambda line: [(row[1], (row[0], row[2])) for row in line] )
first_map_matrix_A

PythonRDD[28] at RDD at PythonRDD.scala:53

In [31]:
first_map_matrix_A.count()

128000

In [27]:
first_map_matrix_A.take(5)

[(0, (0, 4.1287)),
 (1, (0, 2.2078)),
 (2, (0, 8.4434)),
 (3, (0, -0.40286)),
 (4, (0, 2.4057))]

In [30]:
first_map_matrix_B = coo_matrix_B.flatMap( lambda line: [(row[0], (row[1], row[2])) for row in line] )
first_map_matrix_B

PythonRDD[32] at RDD at PythonRDD.scala:53

In [31]:
first_map_matrix_B.count()

7680

In [32]:
first_map_matrix_B.take(120)

[(0, (0, 4.1588)),
 (0, (1, 7.4706)),
 (0, (2, 6.431)),
 (0, (3, 8.8949)),
 (0, (4, 6.1778)),
 (0, (5, 8.7999)),
 (0, (6, 5.4387)),
 (0, (7, 4.1222)),
 (0, (8, 1.7036)),
 (0, (9, 6.6315)),
 (0, (10, 6.7626)),
 (0, (11, 4.2752)),
 (0, (12, 4.0949)),
 (0, (13, 6.2942)),
 (0, (14, 8.3809)),
 (0, (15, 6.9591)),
 (0, (16, 6.2074)),
 (0, (17, 4.9545)),
 (0, (18, 5.1017)),
 (0, (19, 3.7777)),
 (0, (20, 4.4296)),
 (0, (21, 5.2095)),
 (0, (22, 3.5372)),
 (0, (23, 4.5093)),
 (0, (24, 6.5367)),
 (0, (25, 3.088)),
 (0, (26, 4.2214)),
 (0, (27, 3.2965)),
 (0, (28, 6.8746)),
 (0, (29, 4.2818)),
 (0, (30, 5.6333)),
 (0, (31, 4.2198)),
 (0, (32, 6.0381)),
 (0, (33, 2.2611)),
 (0, (34, 6.887)),
 (0, (35, 5.4321)),
 (0, (36, 8.1058)),
 (0, (37, 5.6511)),
 (0, (38, 8.2948)),
 (0, (39, 7.068)),
 (0, (40, 3.8169)),
 (0, (41, 2.7906)),
 (0, (42, 4.9194)),
 (0, (43, 6.4473)),
 (0, (44, 4.1604)),
 (0, (45, 6.3529)),
 (0, (46, 4.2733)),
 (0, (47, 6.3724)),
 (0, (48, 6.7459)),
 (0, (49, 5.4399)),
 (0, (50, 7.22

In [64]:
first_reduce_sec_map = first_map_matrix_A.join(first_map_matrix_B).\
    map( lambda line: ((line[1][0][0], line[1][1][0]), line[1][0][1] * line[1][1][1]) )
first_reduce_sec_map

PythonRDD[126] at RDD at PythonRDD.scala:53

In [45]:
first_reduce_sec_map.count()

7680000

In [57]:
first_reduce_sec_map.take(5)

[((0, 0), 17.1704),
 ((0, 1), 30.8439),
 ((0, 2), 26.5517),
 ((0, 3), 36.7244),
 ((0, 4), 25.5063)]

In [65]:
second_reduce = first_reduce_sec_map.reduceByKey( lambda x, y: round(x + y, 4) )
second_reduce

PythonRDD[131] at RDD at PythonRDD.scala:53

In [48]:
second_reduce.count()

60000

In [66]:
second_reduce.take(5)

[((0, 0), 2022.5443),
 ((0, 2), 2145.9141),
 ((0, 4), 2079.6449),
 ((0, 6), 2203.4818),
 ((0, 8), 2179.9472)]

In [67]:
result = second_reduce.sortByKey()
result

PythonRDD[139] at RDD at PythonRDD.scala:53

In [68]:
result.take(70)

[((0, 0), 2022.5443),
 ((0, 1), 2194.2844),
 ((0, 2), 2145.9141),
 ((0, 3), 2232.1916),
 ((0, 4), 2079.6449),
 ((0, 5), 2286.4625),
 ((0, 6), 2203.4818),
 ((0, 7), 2181.6806),
 ((0, 8), 2179.9472),
 ((0, 9), 2139.8339),
 ((0, 10), 2201.6202),
 ((0, 11), 2246.1318),
 ((0, 12), 2112.8754),
 ((0, 13), 2089.9635),
 ((0, 14), 2230.4036),
 ((0, 15), 2326.9978),
 ((0, 16), 2209.185),
 ((0, 17), 2128.445),
 ((0, 18), 2100.9829),
 ((0, 19), 2264.4892),
 ((0, 20), 2215.9385),
 ((0, 21), 2203.7053),
 ((0, 22), 2202.9464),
 ((0, 23), 2317.13),
 ((0, 24), 2104.3664),
 ((0, 25), 2151.3703),
 ((0, 26), 2175.0896),
 ((0, 27), 2280.6067),
 ((0, 28), 2155.6193),
 ((0, 29), 2260.3449),
 ((0, 30), 2130.6846),
 ((0, 31), 2279.9108),
 ((0, 32), 2179.3534),
 ((0, 33), 2093.5267),
 ((0, 34), 2341.5597),
 ((0, 35), 2124.6643),
 ((0, 36), 2228.5736),
 ((0, 37), 2127.9422),
 ((0, 38), 2137.3472),
 ((0, 39), 2105.4985),
 ((0, 40), 2047.4022),
 ((0, 41), 2122.7344),
 ((0, 42), 2191.6016),
 ((0, 43), 2089.4202),
 (

#### Comparing result with dot product Numpy

In [81]:
print(len(matrix_A_list), len(matrix_B_list))
print(matrix_A_list[0][:5], len(matrix_A_list[0]))
print(matrix_B_list[0][:5], len(matrix_B_list[0]))

1000 128
[4.1287, 2.2078, 8.4434, -0.40286, 2.4057] 128
[4.1588, 7.4706, 6.431, 8.8949, 6.1778] 60


In [84]:
matrix_result = np.dot(np.array(matrix_A_list), np.array(matrix_B_list))
matrix_result.shape

(1000, 60)

In [89]:
for value in matrix_result[0][:10]: print(round(value, 4))
print()
for value in matrix_result[1][:10]: print(round(value, 4))

2022.5443
2194.2845
2145.9132
2232.1919
2079.6451
2286.4625
2203.4812
2181.6809
2179.9475
2139.834

2306.2889
2360.2181
2206.5806
2357.8556
2270.5474
2343.1441
2248.7314
2305.8916
2341.3737
2296.1787
