In [1]:
from pyspark import SparkConf, SparkContext
import pyspark.sql.functions as f

## Global variables

In [2]:
# Dimension placeholders for the two input matrices and the result matrix.
# They all start at zero; the main cell assigns the real values after it has
# scanned the input file.
firstMatRow = firstMatCol = 0
secondMatRow = secondMatCol = 0
resultMatRow = resultMatCol = 0

## Mapper1
Split each line of the original file on "," so that it is transformed into an RDD record that is easier to work with

In [3]:
def mapper1(line):
    """Parse one comma-separated input line into a 4-field record.

    Parameters
    ----------
    line : str
        Text of the form ``"matrix,row,col,value"``.

    Returns
    -------
    list
        ``[(matrix, row, col, value)]`` with every field kept as a string
        (surrounding whitespace removed), or ``[]`` for a blank line so that
        ``flatMap`` simply drops it.

    Raises
    ------
    ValueError
        If a non-blank line does not hold exactly four comma-separated fields.
    """
    if not line.strip():
        # A blank line carries no matrix entry; emitting nothing is what
        # flatMap expects for "skip this input".
        return []
    # Strip each field so that stray spaces do not break the exact
    # comparisons and int() conversions performed downstream.
    matrix, row, col, val = (field.strip() for field in line.split(","))
    return [(matrix, row, col, val)]

## Mapper2
Re-key each RDD record according to which matrix it belongs to.
Let's say an element of the first matrix is $M_{ij}$; for it we emit the key-value pair $(j, (M, i, M_{ij}))$.
For an element $N_{jk}$ of the second matrix, we emit the key-value pair $(j, (N, k, N_{jk}))$.
Because both matrices are now keyed by the shared index $j$, the matching elements can be paired up and multiplied easily.

In [4]:
def mapper2(line):
    """Re-key a parsed record by the index that M and N share.

    Parameters
    ----------
    line : sequence
        ``(matrix, row, col, value)`` as produced by ``mapper1``; the three
        numeric fields are converted with ``int``.

    Returns
    -------
    list
        ``[(col, ('M', row, value))]`` for an ``'M'`` record,
        ``[(row, ('N', col, value))]`` for an ``'N'`` record,
        and ``[]`` for any other matrix label.
    """
    label = line[0]
    if label == 'M':
        # M is keyed by its column index.
        return [(int(line[2]), (label, int(line[1]), int(line[3])))]
    if label == 'N':
        # N is keyed by its row index.
        return [(int(line[1]), (label, int(line[2]), int(line[3])))]
    return []

## Mapper3
By multiplying the elements that share the same key $j$, we get the key-value pair $((i, k), M_{ij} \cdot N_{jk})$

In [5]:
def mapper3(line):
    """Turn one joined pair into a partial product for a result cell.

    Parameters
    ----------
    line : tuple
        ``(key, (first, second))`` where ``first`` and ``second`` are
        ``(label, index, value)`` triples.

    Returns
    -------
    list
        ``[((first_index, second_index), first_value * second_value)]``.
    """
    joined = line[1]
    first, second = joined[0], joined[1]
    cell = (first[1], second[1])
    product = first[2] * second[2]
    return [(cell, product)]

## Reducer1
Add the values with the same key

In [6]:
def reducer1(x, y):
    """Combine two values that share a key by adding them.

    Returns ``x + y``.
    """
    total = x + y
    return total

## Main
I searched many related articles on the internet about MapReduce for matrix multiplication.
Most of the approaches they describe map the keys to the cells of the result matrix and attach to each key the values of M and N that share the same j.
After this, we simply multiply the values with the same j (but from different matrices),
then we add up those products to get the final result!

In [None]:
# Stop a SparkContext left over from a previous run of this cell. On a fresh
# kernel `sc` does not exist yet, so the NameError is expected and ignored;
# this keeps "Restart Kernel -> Run All" working.
try:
    sc.stop()
except NameError:
    pass

conf = SparkConf().setMaster("local").setAppName("MatrixMultiplication")
sc = SparkContext(conf=conf)
try:
    # Parsed records feed several actions below (four max() calls and the
    # join), so cache them instead of re-reading and re-parsing the file.
    inputfile = sc.textFile("500input.txt").flatMap(mapper1).cache()

    # find matrix row and col
    # Exact comparison, matching the test mapper2 applies to the label.
    matrixM = inputfile.filter(lambda x: x[0] == "M")
    matrixN = inputfile.filter(lambda x: x[0] == "N")
    # The indices are still strings at this point; compare them numerically,
    # otherwise "99" would be considered larger than "499".
    firstMatRow = matrixM.max(key=lambda x: int(x[1]))[1]
    firstMatCol = matrixM.max(key=lambda x: int(x[2]))[2]
    secondMatRow = matrixN.max(key=lambda x: int(x[1]))[1]
    secondMatCol = matrixN.max(key=lambda x: int(x[2]))[2]
    # The largest index plus one gives the dimension.
    resultMatRow = int(firstMatRow) + 1
    resultMatCol = int(secondMatCol) + 1

    # mapping1: key both matrices by the shared index j
    mappedfileM = matrixM.flatMap(mapper2)
    mappedfileN = matrixN.flatMap(mapper2)

    # mapping2: pair up entries with equal j, multiply, then sum per (i, k)
    newItems = mappedfileM.join(mappedfileN)
    reducedfile = newItems.flatMap(mapper3)
    finalReduce = reducedfile.reduceByKey(reducer1)
    sortedReduce = finalReduce.sortByKey()

    # One "row,col,value" line per result cell; `with` closes the file even
    # if collecting or writing fails.
    with open("Outputfile.txt", "w") as outputFile:
        for key, value in sortedReduce.collect():
            outputFile.write("%s,%s,%s\n" % (key[0], key[1], value))
finally:
    # Always release the context, including when the job above raised.
    sc.stop()