# Sparse Matrix-Matrix Multiplication using mrjob

Each line of input data holds a row index followed by one or more (column, value) pairs, separated by whitespace, for example:

i j     valuek

i j+1   valuek+1

i+1 j   valuek+2

i+1 j+1 valuek+3

## Configuration 

In [None]:
# Working directory that will hold the generated script, its output and its log.
mydir = "mymrjob"
%env mydir = $mydir

# Paths of the two input matrix files (A is the left operand, B the right).
matA = "data/mat/smat_100x10_A"
matB = "data/mat/smat_10x200_B"

# Export the paths as environment variables as well.
%env matA $matA
%env matB $matB

# Location of the MapReduce script written by the %%writefile cell.
myscript = mydir + "/sparse_matmat.py"
%env myscript $myscript

# Create the working directory (no error if it already exists) and name the
# files used for the job's standard output and standard error.
%system mkdir -p $mydir
%env myoutput $mydir/output
%env mylog $mydir/log

In [None]:
%%writefile $myscript

from mrjob.job import MRJob
from mrjob.compat import jobconf_from_env
import itertools
import sys

class SparseMatMult(MRJob):
    """Multiply two sparse matrices, A * B, in three MapReduce steps.

    Every input line is whitespace-separated: a row index followed by
    (column, value) pairs for the entries stored in that row.

    1. Join: A's entries are keyed by their column and B's rows by their
       row index; the reducer emits every partial product
       ``A[i, k] * B[k, j]`` keyed by the result cell ``(i, j)``.
    2. Sum: the partial products of each result cell are added up.
    3. Group: the cells of each result row are gathered into one record.
    """

    # NOTE(review): configure_options/add_passthrough_option and self.mr are
    # the older mrjob API; newer releases use configure_args/add_passthru_arg
    # and MRStep -- confirm against the installed mrjob version.
    def configure_options(self):
        """Register ``--A-matrix``, the text that identifies A's input file."""
        super(SparseMatMult, self).configure_options()
        self.add_passthrough_option('--A-matrix', default='A',
                                    dest='Amatname')

    def parsemat(self):
        """Return 1 if the current input file is the A matrix, otherwise 2.

        The decision is a substring test of the ``--A-matrix`` value against
        the full input file name, so the value must not also occur in the
        path of the B matrix (directory names included).

        Raises:
            ValueError: if the runtime does not expose the input file name.
        """
        fn = jobconf_from_env('map.input.file')
        if fn is None:
            # Without the file name the two matrices cannot be told apart;
            # fail with a clear message instead of a TypeError from ``in``.
            raise ValueError(
                "jobconf variable 'map.input.file' is not set; "
                "cannot tell matrix A from matrix B")
        if self.options.Amatname in fn:
            return 1
        return 2

    def joinmap(self, key, line):
        """Key A's entries by column and B's rows by row index.

        Args:
            key: unused.
            line: one text line, ``row col val [col val ...]``.

        Yields:
            For A: ``(col, (row, value))`` once per entry of the row.
            For B: ``(row, (rowvals,))`` where ``rowvals`` is the list of
            ``(col, value)`` pairs; the 1-tuple wrapper lets the reducer
            tell B records (length 1) from A records (length 2).
        """
        tokens = line.split()
        if not tokens:
            # Blank lines carry no matrix entries; skip them rather than
            # failing on the missing row index.
            return

        mtype = self.parsemat()
        vals = [float(v) for v in tokens]
        row = int(vals[0])
        rowvals = [(int(vals[i]), vals[i+1]) for i in range(1, len(vals), 2)]
        if mtype == 1:
            # Emit each entry of A's row under its column index so that it
            # meets the matching row of B in the reducer.
            for val in rowvals:
                yield (val[0], (row, val[1]))
        else:
            yield (row, (rowvals,))

    def joinred(self, key, vals):
        """Emit all partial products for one shared index ``key``.

        ``key`` is a column index of A and, at the same time, a row index
        of B. Two kinds of values arrive:
          * length 2: ``(arow, aval)``, one entry of A's column ``key``
          * length 1: ``(rowvals,)``, the ``(bcol, bval)`` pairs of B's
            row ``key``

        Yields:
            ``((arow, bcol), aval * bval)`` for every A/B combination.
        """
        # Both sides are loaded into memory before forming the products.
        brow = []
        acol = []

        for val in vals:
            if len(val) == 1:
                brow.extend(val[0])
            else:
                acol.append(val)

        for (bcol, bval) in brow:
            for (arow, aval) in acol:
                yield ((arow, bcol), aval*bval)

    def sumred(self, key, vals):
        """Sum the partial products of the result cell ``key``."""
        yield (key, sum(vals))

    def rowgroupmap(self, key, val):
        """Re-key a result cell ``((row, col), val)`` by its row."""
        yield key[0], (key[1], val)

    def appendred(self, key, vals):
        """Flatten the ``(col, val)`` pairs of a row into one list.

        Yields:
            ``(row, [col, val, col, val, ...])``.
        """
        yield key, list(itertools.chain.from_iterable(vals))

    def steps(self):
        """Return the three steps: join, sum, and group by row."""
        return [self.mr(mapper=self.joinmap, reducer=self.joinred),
            self.mr(mapper=None, reducer=self.sumred),
            self.mr(mapper=self.rowgroupmap, reducer=self.appendred)]

# Launch the job only when this file is executed as a script.
if __name__ == "__main__":
    SparseMatMult.run()

### Execute the code

In [None]:
# ! python $myscript $matA $matB 1> $myoutput 2> $mylog

### Show Output

In [None]:
# Print the file named by $myoutput; uncomment the second line to print
# the file named by $mylog as well.
%cat $myoutput
# %cat $mylog

### Additional credits

Jure Leskovec Stanford Univ.

Anand Rajaraman Milliway Labs

Jeffrey D. Ullman Stanford Univ.