In [1]:
import pyspark

In [2]:
import numpy as np

In [3]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry
from pyspark.sql.types import *

In [4]:
# Explicit schema for the JSON reader: only the three columns used below
# (author, score, subreddit), each declared nullable.
minimal_fields = [ 
          StructField("author", StringType(), True),
          StructField("score", LongType(), True),
          StructField("subreddit", StringType(), True)]

In [5]:
# Load the sample comments using the explicit three-column schema instead of
# letting Spark infer one.
# NOTE(review): sqlContext is not created in any visible cell — presumably it is
# provided by the PySpark shell/kernel; confirm.
sj = sqlContext.read.json("s3a://insight-ohoidn/sample3.json", StructType(minimal_fields))

In [5]:
# Check that the loaded DataFrame carries the schema passed to the reader.
sj.printSchema()

root
 |-- author: string (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit: string (nullable = true)



In [115]:
# Eyeball the loaded rows.
# NOTE(review): toPandas() brings the whole DataFrame to the driver; for large
# inputs consider sj.limit(n).toPandas() instead.
sj.toPandas()

Unnamed: 0,author,score,subreddit
0.0,stunt_penguin,1,Filmmakers
1.0,[deleted],2,Addons4Kodi
2.0,69ing,5,NotTimAndEric
3.0,ArchadianJudge,2,Saber
4.0,sglville,2,The_Donald
5.0,NEWORLDODOR,3,JordanPeterson
6.0,zachwad22,1,AskReddit
7.0,lurker4lyfe6969,2,Military
8.0,SuburbanStoner,2,trashy
9.0,ensanguine,18,KitchenConfidential


In [6]:
# Make the DataFrame queryable from SQL under the table name 'test'.
sj.registerTempTable('test')

In [52]:
# Smoke test: the 'test' table is reachable from SQL.
sqlContext.sql('select author from test').toPandas()

Unnamed: 0,author
0.0,stunt_penguin
1.0,[deleted]
2.0,69ing
3.0,ArchadianJudge
4.0,sglville
5.0,NEWORLDODOR
6.0,zachwad22
7.0,lurker4lyfe6969
8.0,SuburbanStoner
9.0,ensanguine


In [7]:
# Per (subreddit, author) pair, sum the comment scores into `tally` and attach
# dense integer ids for use as matrix coordinates:
#   uid - dense_rank over authors, ordered by name descending
#   rid - dense_rank over subreddits, ordered by name descending
# dense_rank() numbering starts at 1, so both ids are 1-based.
# NOTE(review): both windows have ORDER BY but no PARTITION BY — this presumably
# forces a global ordering on one partition; confirm it scales to the full data.
occurrences = sqlContext.sql("""
select *, dense_rank() over (order by subreddit desc) as rid 
from  (SELECT subreddit, author, sum(score) as tally, dense_rank() over (order by author desc) as uid
from test
group by subreddit, author)
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
# Register the result so later cells can query it by name.
occurrences.registerTempTable('occurrences')

In [8]:
# DataFrame holding only what the sparse matrix needs, in (row, column, value)
# order: rid (subreddit id), uid (author id), tally (summed score).
bare_occurrences = sqlContext.sql("""
select rid, uid, tally
from occurrences
""").persist(StorageLevel.MEMORY_AND_DISK_SER)

In [9]:
from operator import add

def coordinateMatrixMultiply(leftmat, rightmat):
    """
    Return the matrix product leftmat * rightmat as a CoordinateMatrix.

    Only stored entries take part: entries of both operands are keyed by the
    shared inner index (column of the left, row of the right), joined,
    multiplied pairwise, and the partial products are summed per output cell.

    leftmat  : CoordinateMatrix (n x k)
    rightmat : CoordinateMatrix (k x m)
    returns  : CoordinateMatrix (n x m)
    """
    # (k, (i, a_ik)) for the left operand, (k, (j, b_kj)) for the right one.
    left_by_col = leftmat.entries.map(lambda entry: (entry.j, (entry.i, entry.value)))
    right_by_row = rightmat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))

    def partial_product(joined):
        # joined == (k, ((i, a_ik), (j, b_kj)))
        (i, left_value), (j, right_value) = joined[1]
        return ((i, j), left_value * right_value)

    product_entries = left_by_col.join(right_by_row)\
        .map(partial_product)\
        .reduceByKey(add)\
        .map(lambda cell: MatrixEntry(cell[0][0], cell[0][1], cell[1]))

    # Pass the outer dimensions explicitly: if they were inferred from the
    # largest stored index, trailing all-zero rows/columns of the product
    # would be dropped and the result would have the wrong shape.
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(
        product_entries, leftmat.numRows(), rightmat.numCols())

In [121]:
def coordinateMatrixAdd(leftmat, rightmat, scalar):
    """
    Return leftmat + scalar * rightmat

    leftmat, rightmat : CoordinateMatrix
    scalar            : number applied to every stored entry of rightmat
    returns           : CoordinateMatrix with one entry for every cell stored
                        in either operand (cells that cancel are kept as 0.0)
    """
    left_cells = leftmat.entries.map(lambda entry: ((entry.i, entry.j), entry.value))
    scaled_right_cells = rightmat.entries.map(
        lambda entry: ((entry.i, entry.j), scalar * entry.value))

    # fullOuterJoin yields ((i, j), (left_or_None, right_or_None)); a side is
    # None when the cell is stored in only one operand, so sum whatever is
    # present. Plain sum() avoids the Python-2-only builtin reduce().
    matsum = left_cells.fullOuterJoin(scaled_right_cells)\
        .map(lambda tup: MatrixEntry(tup[0][0], tup[0][1],
                                     sum(value for value in tup[1] if value is not None)))

    return pyspark.mllib.linalg.distributed.CoordinateMatrix(matsum)

In [120]:
# Scratch check of the "sum the non-None values" expression used in
# coordinateMatrixAdd: a value of 0 must be kept (it is not None).
reduce(add, filter(lambda elt: elt is not None, (1, 0)))

1

In [91]:
def coordinateMatrixElementwise(mat, op):
    """
    Apply op to the value of every stored entry of mat.

    Positions are left untouched; cells that are not stored (implicit zeros)
    are never passed to op.

    mat : CoordinateMatrix
    op  : callable taking one entry value and returning the new value
    """
    def transform(entry):
        return MatrixEntry(entry.i, entry.j, op(entry.value))

    return pyspark.mllib.linalg.distributed.CoordinateMatrix(mat.entries.map(transform))

In [None]:
def coordinateMatrixScalarMult(mat, scalar):
    """
    return scalar * mat

    Named coordinateMatrixScalarMult so that it does not shadow
    coordinateMatrixElementwise(mat, op), which takes a callable rather than
    a number.

    mat    : CoordinateMatrix
    scalar : number multiplied into every stored entry
    """
    new_entries = mat.entries.map(lambda entry: MatrixEntry(entry.i, entry.j, scalar * entry.value))
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(new_entries)

Tests for the coordinate-matrix helpers

In [94]:
# Build the CoordinateMatrix fixture for test_array2.
# NOTE: ndarr_to_coord_array and test_array2 are defined in a later cell, so
# that cell has to be run first.
test_coordmat2 = ndarr_to_coord_array(test_array2)

In [97]:
# Show only the entries whose stored value is non-zero.
test_coordmat2.entries.filter(lambda entry: entry.value != 0).collect()

[MatrixEntry(0, 0, 1.0),
 MatrixEntry(0, 1, 2.0),
 MatrixEntry(0, 2, 3.0),
 MatrixEntry(1, 0, 4.0),
 MatrixEntry(1, 2, 6.0),
 MatrixEntry(1, 3, 1.0),
 MatrixEntry(2, 2, 8.0),
 MatrixEntry(2, 3, 2.0)]

In [95]:
# Show every stored entry, including any explicitly stored zeros.
test_coordmat2.entries.collect()

[MatrixEntry(0, 0, 1.0),
 MatrixEntry(0, 1, 2.0),
 MatrixEntry(0, 2, 3.0),
 MatrixEntry(0, 3, 0.0),
 MatrixEntry(1, 0, 4.0),
 MatrixEntry(1, 1, 0.0),
 MatrixEntry(1, 2, 6.0),
 MatrixEntry(1, 3, 1.0),
 MatrixEntry(2, 0, 0.0),
 MatrixEntry(2, 1, 0.0),
 MatrixEntry(2, 2, 8.0),
 MatrixEntry(2, 3, 2.0)]

In [123]:
def coordinate_matrix_to_ndarr(mat):
    """
    Materialise a CoordinateMatrix on the driver as a dense 2-D numpy array.

    Cells without a stored entry are left at 0. Intended for small test
    matrices only: every entry is collected to the driver.

    mat     : object exposing .entries (RDD of entries with i, j, value),
              .numRows() and .numCols()
    returns : numpy array of shape (mat.numRows(), mat.numCols())
    """
    arr = np.zeros((mat.numRows(), mat.numCols()))
    # A single collect() replaces the count() + take(count) pair, which
    # scanned the entries twice to obtain the same list.
    for elt in mat.entries.collect():
        arr[elt.i, elt.j] = elt.value
    return arr

def ndarr_to_coord_array(arr):
    """
    Convert a dense 2-D array (or list of rows) into a CoordinateMatrix.

    Only non-zero cells are stored as (i, j, value) entries. The dimensions
    of arr are passed explicitly so that trailing all-zero rows or columns
    are not lost when the matrix size is otherwise inferred from the largest
    stored index.

    Uses the notebook-global SparkContext `sc` to parallelise the entries.
    """
    n_rows = len(arr)
    n_cols = len(arr[0]) if n_rows else 0
    entries = [(i, j, arr[i][j])
               for i in range(n_rows)
               for j in range(n_cols)
               if arr[i][j] != 0]
    return CoordinateMatrix(sc.parallelize(entries), n_rows, n_cols)

# Dense fixtures and their CoordinateMatrix counterparts for the tests below.
# test_array is 3x3, test_array2 is 3x4 (both float, both containing zeros);
# test_array1d is a 1x4 row of ones with numpy's default integer dtype.
test_array = np.array([[1, 2, 3], [4, 0, 6], [0, 0, 8]], dtype = 'float')
test_array2 = np.array([[1, 2, 3, 0], [4, 0, 6, 1], [0, 0, 8, 2]], dtype = 'float')
test_array1d = np.array([[1, 1, 1, 1]])
test_coordmat = ndarr_to_coord_array(test_array)
test_coordmat2 = ndarr_to_coord_array(test_array2)
# Transposed 4x3 operand, so that test_coordmat2 * test_coordmat2_T is defined.
test_coordmat2_T = ndarr_to_coord_array(test_array2.T)


def test_multiply():
    """
    Check coordinateMatrixMultiply against np.dot on the dense fixtures.

    np.testing.assert_array_equal is used instead of assert np.all(a == b) so
    that a shape mismatch fails the test and the differing cells are reported.
    """
    # square * square
    np.testing.assert_array_equal(
        coordinate_matrix_to_ndarr(coordinateMatrixMultiply(test_coordmat, test_coordmat)),
        np.dot(test_array, test_array))
    # (3x4) * (4x3)
    np.testing.assert_array_equal(
        coordinate_matrix_to_ndarr(coordinateMatrixMultiply(test_coordmat2, test_coordmat2_T)),
        np.dot(test_array2, test_array2.T))
    # (1x4) * (4x1): row vector times column vector
    np.testing.assert_array_equal(
        coordinate_matrix_to_ndarr(coordinateMatrixMultiply(
            ndarr_to_coord_array(test_array1d), ndarr_to_coord_array(test_array1d.T))),
        np.dot(test_array1d, test_array1d.T))
    
# Dense 3x4 fixture with no zero cells and the same shape as test_array2;
# used as the left operand in test_add.
test_array3 = np.array([[ 0.63203118,  0.30233108,  0.40677762,  0.58962667],
       [ 0.98905039,  0.9516414 ,  0.20273982,  0.20800506],
       [ 0.7751541 ,  0.94623161,  0.22601002,  0.40736821]])

def test_add():
    """
    Check coordinateMatrixAdd(left, right, scalar) against numpy:
    test_array3 + (-2) * test_array2.

    The comparison uses a floating-point tolerance instead of exact equality,
    since the distributed and the numpy computation are not guaranteed to
    round identically.
    """
    actual = coordinate_matrix_to_ndarr(
        coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array2), -2))
    expected = test_array3 - 2 * test_array2
    np.testing.assert_allclose(actual, expected)

In [101]:
# Smoke test: test_array3 - 2 * test_array2 builds a CoordinateMatrix without error.
coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array2), -2)

<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde35f7b250>

In [90]:
# Row vector (1x4) times column vector (4x1): expect a 1x1 result equal to
# the dot product of the ones vector with itself.
coordinate_matrix_to_ndarr(coordinateMatrixMultiply(
        ndarr_to_coord_array(test_array1d), ndarr_to_coord_array(test_array1d.T)))

array([[ 4.]])

In [124]:
# Run the addition test; the multiplication test is currently disabled.
test_add()
#test_multiply()

In [78]:
# Scale every stored entry of test_array2 by 1.5 and view the dense result.
coordinate_matrix_to_ndarr(coordinateMatrixScalarMult(ndarr_to_coord_array(test_array2), 1.5))

array([[  1.5,   3. ,   4.5,   0. ],
       [  6. ,   0. ,   9. ,   1.5],
       [  0. ,   0. ,  12. ,   3. ]])

In [92]:
# Apply the natural log to every stored entry. np.log maps an explicitly
# stored 0.0 to -inf, whereas cells that are not stored stay 0 in the dense view.
coordinate_matrix_to_ndarr(coordinateMatrixElementwise(ndarr_to_coord_array(test_array2), lambda n: np.log(n)))

array([[ 0.        ,  0.69314718,  1.09861229,        -inf],
       [ 1.38629436,        -inf,  1.79175947,  0.        ],
       [       -inf,        -inf,  2.07944154,  0.69314718]])

In [93]:
# Dense reference input, for comparison with the results above.
test_array2

array([[ 1.,  2.,  3.,  0.],
       [ 4.,  0.,  6.,  1.],
       [ 0.,  0.,  8.,  2.]])

SVD implementation

In [80]:
from random import normalvariate


In [85]:
# Number of columns of the occurrence matrix (columns are indexed by uid).
cmat.numCols()

347112L

In [83]:
from random import normalvariate
def random_unit(n):
    """
    Return a random vector of length n with unit Euclidean norm.

    Each component is drawn from a standard normal distribution and the
    vector is then divided by its norm.
    """
    samples = [normalvariate(0, 1) for _ in range(n)]
    direction = np.array(samples)
    length = np.linalg.norm(direction)
    return direction / length

def SVD_1D(A, threshold = 1e-10):
    """
    Unfinished stub for a one-dimensional SVD on a distributed matrix.

    A : CoordinateMatrix
    threshold : currently unused — presumably meant as the convergence
        tolerance; confirm once implemented

    Only reads the matrix dimensions and draws a random unit start vector of
    length numCols(); nothing is computed from them and the function
    implicitly returns None.
    """
    n, m = A.numRows(), A.numCols()
    x = random_unit(m)
    # TODO finish this
    # TODO finish this

In [None]:
import numpy as np
from numpy.linalg import norm

def svd_1d(A, epsilon=1e-10):
    '''
    The one-dimensional SVD

    Repeatedly multiplies a vector by A^T A and renormalises it until two
    successive iterates are parallel to within epsilon.

    A       : dense 2-D numpy array of shape (n, m)
    epsilon : convergence tolerance; iteration stops once
              |<v_k, v_(k-1)>| > 1 - epsilon
    returns : unit vector of length m

    Note: there is no iteration cap, so the loop only ends on convergence.
    '''
    _, m = A.shape
    # Start from a random unit vector, using this notebook's random_unit helper.
    currentV = random_unit(m)
    B = np.dot(A.T, A)

    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentV = currentV / norm(currentV)

        # Both vectors have unit length, so the dot product is the cosine of
        # the angle between them.
        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV

In [61]:
# Build the sparse occurrence matrix from (rid, uid, tally) rows:
# rid becomes the row index, uid the column index and tally the value.
cmat = CoordinateMatrix(bare_occurrences.rdd.map(tuple))

In [62]:
# cmat + (-2) * cmat, i.e. every stored entry of cmat negated.
m2 = coordinateMatrixAdd(cmat, cmat, -2)

In [64]:
# Peek at two entries of the result.
m2.entries.take(2)

[MatrixEntry(3, 9, -2.0), MatrixEntry(8, 4, -1.0)]

In [25]:
# test_array3 + (-2) * test_array3, i.e. -test_array3, as a CoordinateMatrix.
matsum = coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array3), -2)

In [29]:
# Sparse occurrence matrix: rows indexed by rid, columns by uid, values are tallies.
# NOTE(review): same statement as an earlier cell — one of the two can probably go.
cmat = CoordinateMatrix(bare_occurrences.rdd.map(tuple))

In [43]:
# Peek at two stored entries of the occurrence matrix.
cmat.entries.take(2)

[MatrixEntry(1, 176355, 1.0), MatrixEntry(2, 2547, 1.0)]

In [74]:
# Number of stored entries in `mul`.
# NOTE(review): `mul` is not defined in any visible cell — presumably the
# result of a coordinateMatrixMultiply call; confirm and add the defining cell.
mul.entries.count()

595811

In [41]:
# Number of stored entries in the occurrence matrix.
cmat.entries.count()

475392

In [45]:
# Convert the coordinate representation to a BlockMatrix.
cmb = cmat.toBlockMatrix()

In [47]:
# Convert the block matrix to an IndexedRowMatrix (one sparse vector per row).
cmi = cmb.toIndexedRowMatrix()

In [51]:
# Fetch a single indexed row to inspect its sparse-vector layout.
row = cmi.rows.take(1)

[IndexedRow(3558, (347112,[54015,294324],[1.0,2.0]))]

In [68]:
# Largest author id, for comparison with the matrix column count.
sqlContext.sql("""
select max(uid)
from occurrences
""").toPandas()

Unnamed: 0,max(uid)
0,347111


In [57]:
# Inspect the aggregated occurrences table.
# NOTE(review): toPandas() collects every row to the driver; consider
# occurrences.limit(n).toPandas() for a preview.
occurrences.toPandas()

Unnamed: 0,subreddit,author,tally,uid,rid
0.0,zyramains,TricolorStar,1,176355,1
1.0,zylzon,youknowitsyaboy,1,2547,2
2.0,zweiteliga,Saminka,1,206115,3
3.0,zurich,maxwellmaxen,2,72503,4
4.0,zsh,__soddit,1,159502,5
5.0,zootopia,thawed_caveman,2,22475,6
6.0,zootopia,speisekarte,11,29937,6
7.0,zootopia,rodrigogirao,6,42380,6
8.0,zootopia,phobos136,4,53058,6
9.0,zootopia,midnightopheliac,9,70104,6


In [89]:
# TODO: why doesn't this match rid?
# NOTE(review): likely explanation — when no dimensions are passed,
# CoordinateMatrix reports numRows() as the largest row index plus one, and rid
# comes from dense_rank(), which starts at 1. numRows() would then be
# max(rid) + 1 with row 0 left empty — confirm against max(rid).
cmat.numRows()

17085L

In [71]:
import numpy as np

In [74]:
# Smallest value in the 'tally' column.
# NOTE(review): `df` is not defined in any visible cell — presumably
# occurrences.toPandas(); confirm and add the defining cell.
np.array(df['tally']).min()

-632

In [69]:
# Total score per (author, subreddit) pair, straight from the raw table.
sqlContext.sql("""
select author, subreddit, sum (score)
from test
group by author, subreddit
""").toPandas()

Unnamed: 0,author,subreddit,sum(score)
0.0,FormulaXDGame,formula1,31
1.0,HelluvaDeke,gaming,2
2.0,WhoKnowsWho2,xboxone,70
3.0,listentohim,DBZDokkanBattle,1
4.0,sevenzig,AskOuija,10
5.0,LumpyPick,leagueoflegends,4
6.0,volvostupidshit,Philippines,6
7.0,i_fuck_goats777,AskReddit,2
8.0,Ineeditunesalot,battlefield_4,11
9.0,martianlawrence,movies,11


In [78]:
# Inspect the loaded comments.
# NOTE(review): toPandas() collects the whole DataFrame to the driver; consider
# sj.limit(n).toPandas() for a preview.
sj.toPandas()

Unnamed: 0,author,author_cakeday,author_flair_css_class,author_flair_text,body,can_gild,collapsed,collapsed_reason,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,parent_id,retrieved_on,score,stickied,Unnamed: 21
0.0,stunt_penguin,,,,Wheelchairs make amazing dollys :D,True,False,,0,1501545600,,false,0,dkznc8h,False,t3_6qp8sw,t1_dkzbnn3,1503654247,1,False,...
1.0,[deleted],,,,[removed],True,False,,0,1501545600,,false,0,dkznc8i,False,t3_6qse6i,t1_dkzmgc3,1503654247,2,False,...
2.0,69ing,,,,I used to watch the shit out of these guys,True,False,,0,1501545600,,false,0,dkznc8j,False,t3_6qs8i1,t3_6qs8i1,1503654247,5,False,...
3.0,ArchadianJudge,,Archadianflair,,http://www.pixiv.net/member_illust.php?mode=me...,True,False,,0,1501545600,,false,0,dkznc8k,True,t3_6qsicx,t3_6qsicx,1503654247,2,False,...
4.0,sglville,,,,On the other yand you could say it's capitalis...,True,False,,0,1501545600,,false,0,dkznc8l,False,t3_6qryxf,t3_6qryxf,1503654247,2,False,...
5.0,NEWORLDODOR,,,,I'm not arguing that making university free an...,True,False,,0,1501545600,,false,0,dkznc8m,True,t3_6qrr1o,t1_dkzn053,1503654247,3,False,...
6.0,zachwad22,,,,"I like whisper-whistle obsessively, just barel...",True,False,,0,1501545600,,false,0,dkznc8n,False,t3_6qoe6s,t3_6qoe6s,1503654247,1,False,...
7.0,lurker4lyfe6969,,,,Served in the Chair Force. Can confirm,True,False,,0,1501545600,,false,0,dkznc8o,False,t3_6qodwi,t3_6qodwi,1503654247,2,False,...
8.0,SuburbanStoner,,,,"Nah, it's good right here",True,False,,0,1501545600,,false,0,dkznc8p,False,t3_6qn002,t1_dkz6ig2,1503654247,2,False,...
9.0,ensanguine,,,,I still stand by it not belonging on the plate...,True,False,,0,1501545600,,false,0,dkznc8q,False,t3_6qscet,t1_dkzn9vn,1503654247,18,False,...


In [27]:
# Load one month of comments with an explicit schema and persist the result.
# NOTE(review): `fields` is not defined in any visible cell (only
# minimal_fields is) — confirm which schema is intended.
rawDF = sqlContext.read.json("s3n://reddit-comments/2015/RC_2015-05", StructType(fields)).persist(StorageLevel.MEMORY_AND_DISK_SER)

In [28]:
# Make the full comment set queryable from SQL under the table name 'comments'.
rawDF.registerTempTable('comments')