In [1]:
import pyspark

In [2]:
import numpy as np

In [3]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry
from pyspark.sql.types import *

In [565]:
minimal_fields = [ 
          StructField("author", StringType(), True),
          StructField("score", LongType(), True),
          StructField("controversiality", LongType(), True),
          StructField("subreddit", StringType(), True)]

In [175]:
sj = sqlContext.read.json("s3a://insight-ohoidn/sample3.json", StructType(minimal_fields))

In [5]:
sj.printSchema()

root
 |-- author: string (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit: string (nullable = true)



In [115]:
sj.toPandas()

Unnamed: 0,author,score,subreddit
0.0,stunt_penguin,1,Filmmakers
1.0,[deleted],2,Addons4Kodi
2.0,69ing,5,NotTimAndEric
3.0,ArchadianJudge,2,Saber
4.0,sglville,2,The_Donald
5.0,NEWORLDODOR,3,JordanPeterson
6.0,zachwad22,1,AskReddit
7.0,lurker4lyfe6969,2,Military
8.0,SuburbanStoner,2,trashy
9.0,ensanguine,18,KitchenConfidential


In [541]:
sj.registerTempTable('test')

In [52]:
sqlContext.sql('select author from test').toPandas()

Unnamed: 0,author
0.0,stunt_penguin
1.0,[deleted]
2.0,69ing
3.0,ArchadianJudge
4.0,sglville
5.0,NEWORLDODOR
6.0,zachwad22
7.0,lurker4lyfe6969
8.0,SuburbanStoner
9.0,ensanguine


In [573]:
occurrences = sqlContext.sql("""
select *, dense_rank() over (order by subreddit desc) as rid 
from  (SELECT subreddit, author, sum(score) as tally,\
    sum(abs(score)) as activity, dense_rank() over (order by author desc) as uid
from test
group by subreddit, author)
where tally!=0
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
occurrences.registerTempTable('occurrences')

In [571]:
occurrences.count()

612

In [655]:
occurrences2 = sqlContext.sql("""
select *, dense_rank() over (order by subreddit desc) as rid 
from  (SELECT subreddit, author, sum(score) as tally,\
    sum(abs(score)) as activity, dense_rank() over (order by author desc) as uid
from test
where subreddit in (select subreddit from activity_full)
group by subreddit, author)
where tally!=0
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
occurrences2.registerTempTable('occurrences2')

In [656]:
occurrences2.toPandas()

Unnamed: 0,subreddit,author,tally,activity,uid,rid
0.0,worldnews,zzephyrus,2,2,13,1
1.0,worldnews,zulruhkin,2,2,73,1
2.0,worldnews,zschultz,21,21,94,1
3.0,worldnews,zoso1012,21,21,120,1
4.0,worldnews,zin33,27,27,215,1
5.0,worldnews,zidanetribal,19,19,235,1
6.0,worldnews,zehlazycookie,6,6,328,1
7.0,worldnews,zeddediah,1,1,339,1
8.0,worldnews,zcxsfsdfsdfdfdfdfdf,28,28,362,1
9.0,worldnews,zartz,6,6,376,1


In [658]:
sqlContext.sql("""
select distinct rid, subreddit
from occurrences2
""").toPandas()

Unnamed: 0,rid,subreddit
0,1,worldnews
1,2,whatisthisthing
2,3,videos
3,4,todayilearned
4,5,therewasanattempt
5,6,teenmom
6,7,technology
7,8,soccer
8,9,skyrim
9,10,science


In [605]:
# dataframe with required data for constructing the sparse matrix
bare_occurrences = sqlContext.sql("""
select rid, uid, tally
from occurrences
where rid in (select rid from active)
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
bare_occurrences.toPandas()

Unnamed: 0,rid,uid,tally
0.0,157,41,2
1.0,157,209,2
2.0,157,277,21
3.0,157,349,21
4.0,157,756,27
5.0,157,834,19
6.0,157,1214,6
7.0,157,1281,1
8.0,157,1349,28
9.0,157,1423,6


In [639]:
def mat_from_occurrences_df(occurrences_str, a1_str, a2_str):
    bare_occurrences = sqlContext.sql("""
    select rid, uid, tally
    from %s
    """ % occurrences_str).persist(StorageLevel.MEMORY_AND_DISK_SER)
    bare_occurrences.registerTempTable(occurrences_str)
    
    activity_df = sqlContext.sql("""
    select rid, sum(activity) as activity
    from occurrences
    group by rid
    """).persist(StorageLevel.MEMORY_AND_DISK_SER)
    activity_df.registerTempTable(a1_str)
    
    active = sqlContext.sql("""
    select *
    from activity_df
    where activity > 70000""")
    active.registerTempTable(a2_str)
    

    
    return activity_df, bare_occurrences

### Subreddit activity:

In [632]:
activity_df = sqlContext.sql("""
select rid, sum(activity) as activity
from occurrences
group by rid
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
activity_df.registerTempTable('activity_df')

In [612]:
occurrences

DataFrame[subreddit: string, author: string, tally: bigint, activity: bigint, uid: int, rid: int]

In [631]:
activity_full = sqlContext.sql("""
select rid, subreddit, sum(activity) as activity
from occurrences
where activity > 2000
group by rid, subreddit
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
activity_full.registerTempTable('activity_full')

In [622]:
activity_full.toPandas()

Unnamed: 0,rid,subreddit,activity
0,157,worldnews,25885
1,274,whatisthisthing,3607
2,441,videos,75139
3,900,todayilearned,50255
4,998,therewasanattempt,3173
5,1128,teenmom,2418
6,1140,technology,4423
7,1597,soccer,9005
8,1672,skyrim,2011
9,1961,science,4726


In [601]:
active = sqlContext.sql("""
select *
from activity_df
where activity > 70000""")
active.registerTempTable('active')

In [602]:
active.toPandas()

Unnamed: 0,rid,activity
0,157,92249
1,441,170466
2,900,109733
3,1597,89922
4,2571,205263
5,3242,97974
6,5109,95671
7,5120,183469
8,5182,71400
9,8796,94572


In [604]:
sqlContext.sql("""
select *
from occurrences
where rid=8796
""").toPandas()

Unnamed: 0,subreddit,author,tally,activity,uid,rid
0.0,The_Donald,zzyzzx2,1,1,17,8796
1.0,The_Donald,zz-zz,2,2,56,8796
2.0,The_Donald,zxscooby,5,5,104,8796
3.0,The_Donald,zwiebelsaft,6,6,143,8796
4.0,The_Donald,zweifaltspinsel,6,6,150,8796
5.0,The_Donald,zuul99,1,1,172,8796
6.0,The_Donald,zroxx2,101,101,295,8796
7.0,The_Donald,zkallywag,8,8,666,8796
8.0,The_Donald,zinnenator,1,17,743,8796
9.0,The_Donald,zinaca,16,16,755,8796


In [592]:
sqlContext.sql("""
select *
from occurrences
join occurrences on occurrences.rid=active.rid 
""")

ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))



AnalysisException: u"Reference 'occurrences.rid' is ambiguous, could be: rid#10521, rid#11060.; line 4 pos 20"

In [642]:
act, bare = mat_from_occurrences_df('occurrences2', 'b', 'c')

In [643]:
nusers = tf_ij.numCols()

# subreddit activity vector
gf_i = CoordinateMatrix(act.rdd.map(lambda row: (row.rid, 0, row.activity)))

# subreddit-activity matrix
tf_ij = CoordinateMatrix(bare.rdd.map(tuple))
p_ij = coordinate_matrix_elementwise_vector_division(tf_ij, gf_i)
logp_ij = coordinateMatrixElementwise(p_ij, lambda elt: np.log(elt)/np.log(nusers))

entropy_i = coordinateMatrixElementwise(coordinate_matrix_sumj(logp_ij), lambda elt: elt + 1) # + 1, but this has to be handled separately to conserve matrix sparsity


In [647]:
ndarr_tf = coordinate_matrix_to_ndarr(tf_ij)

In [649]:
np.shape(ndarr_tf)

(60, 94593)

In [664]:
def transform_tf_if(val):
    newval = np.log(np.abs(val) + 1)
    if val < 0:
        return -newval
    else:
        return newval

# TODO: maybe save this for later, when noise can be reduced via dimensionality reduction
log_tf_ij = coordinateMatrixElementwise(tf_ij, transform_tf_if)

a_ij = coordinatematrix_multiply_vector_elementwise(log_tf_ij, entropy_i)

In [337]:
a_ij.entries.count()

465503

In [325]:
tf_ij.numRows()

17045L

In [323]:
entropy_i.numRows()

17045L

In [None]:
logp_ij = coordinateMatrixElementwise

In [203]:
activity_df.toPandas()

Unnamed: 0,rid,activity
0.0,1,1
1.0,2,1
2.0,3,1
3.0,4,2
4.0,5,1
5.0,6,295
6.0,7,32
7.0,8,9
8.0,9,4
9.0,10,3


In [9]:
from operator import add

# TODO: check that zero entries are correctly filtered
def coordinateMatrixMultiply(leftmat, rightmat):
    m = leftmat.entries.map(lambda entry: (entry.j, (entry.i, entry.value)))
    n = rightmat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
    product_entries = m.join(n)\
    .map(lambda tup: ((tup[1][0][0], tup[1][1][0]), (tup[1][0][1] * tup[1][1][1])))\
    .reduceByKey(add)\
    .map(lambda record: MatrixEntry(record[0][0], record[0][1], record[1]))
    
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(product_entries)

In [121]:
def coordinateMatrixAdd(leftmat, rightmat, scalar):
    """
    Return leftmat + scalar * rightmat
    """
    m = leftmat.entries.map(lambda entry: ((entry.i, entry.j), entry.value))
    n = rightmat.entries.map(lambda entry: ((entry.i, entry.j), scalar * entry.value))
    matsum = m.fullOuterJoin(n)\
    .map(lambda tup: MatrixEntry(tup[0][0], tup[0][1],
                                 reduce(add, filter(lambda elt: elt is not None, tup[1]))))
    
    #return matsum
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(matsum)

In [246]:
def coordinate_matrix_elementwise_vector_division(mat, vec):
    """
    mat : CoordinateMatrix
    
    mat_{ij} -> mat_{ij}/vec_{i}
    """
    m = mat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
    v = vec.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
    matdiv = m.join(v).map(lambda tup: MatrixEntry(tup[0], tup[1][0][0], float(tup[1][0][1]) / tup[1][1][1]))
    
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(matdiv)

In [260]:
#from operator import multipl
def coordinate_matrix_elementwise_matrix_multiplication(mat1, mat2):
    """
    mat : CoordinateMatrix
    
    return matprod, where matprod_{ij} = mat1_{ij} * mat2_{ij}
    """
    m1 = mat1.entries.map(lambda entry: ((entry.i, entry.j), entry.value))
    m2 = mat2.entries.map(lambda entry: ((entry.i, entry.j), entry.value))
    
    matprod = m1.join(m2).map(lambda tup: MatrixEntry(tup[0][0], tup[0][1], tup[1][0] * tup[1][1]))
    #matprod = m1.join(m2).reduceByKey(lambda tup: MatrixEntry(tup[0][0], tup[0][1], tup[1][0] * tup[1][1]))
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(matprod)

In [265]:
def coordinate_matrix_sumj(mat):
    """
    mat : CoordinateMatrix
    """
    summed_entries = mat.entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(add)\
                          .map(lambda tup: MatrixEntry(tup[0], 0, tup[1]))

    return pyspark.mllib.linalg.distributed.CoordinateMatrix(summed_entries)

In [347]:
def coordinate_matrix_row(mat, i):
    """
    mat : CoordinateMatrix
    
    return the specified row vector
    """
    filtered_entries = mat.entries.filter(lambda entry: entry.i == i).\
        map(lambda entry: MatrixEntry(0, entry.j, entry.value))
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(filtered_entries)

In [355]:
def coordinate_vector_matrix_norm(vec):
    """
    TODO: type checking, confusing name?
    """
    return np.sqrt(vec.entries.map(lambda entry: entry.value**2).sum())

In [360]:
test_vec = np.array([[1,1,0]]).T
test_vec2 = np.array([[1,0,0]]).T

In [356]:
coordinate_vector_matrix_norm(ndarr_to_coord_array(test_vec))

1.4142135623730951

In [358]:
def coordinate_matrix_vector_l2(vec1, vec2):
    """
    Given two vectors of the data type CoordinateMatrix, return L2 norm of vec1/|vec1| - vec2/|vec2|
    """
    norm1, norm2 = map(coordinate_vector_matrix_norm, [vec1, vec2])

    vec1normed = coordinateMatrixScalarMult(vec1, 1./norm1)
    vec2normed = coordinateMatrixScalarMult(vec2, 1./norm2)
    
    diff = coordinateMatrixAdd(vec1normed, vec2normed, -1.)
    return coordinate_vector_matrix_norm(diff)

In [661]:
def sort_row_indices_by_distance(mat, vec):
    """
    Return a list of the row indices of mat sorted by the ascending L2 distance between normalized row vectors and vec/|vec|
    """
    size = mat.numCols()
    row_vectors = mat.entries.map(lambda entry: (entry.i, [(entry.j, entry.value)]))\
        .reduceByKey(add).map(lambda tup: (tup[0], Vectors.sparse(size, tup[1])))
    # TODO replace all 1D CoordinateMatrix instances by local sparse vectors
    compare_vector = vec.entries.map(lambda entry: ('', [(entry.j, entry.value)])).reduceByKey(add)\
        .map(lambda tup: Vectors.sparse(size, tup[1])).collect()[0]
    # TODO check vector normalization
    return row_vectors\
        .map(lambda tup: (tup[1].squared_distance(compare_vector)/(tup[1].norm(2) * compare_vector.norm(2)), tup[0])).sortByKey()\
        .map(lambda tup: tup[1]).collect()

In [553]:
def coordinatematrix_get_row(mat, i):
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(
                    mat.entries.filter(lambda entry: entry.i == i))

In [665]:
sort_row_indices_by_distance(a_ij, coordinatematrix_get_row(a_ij, 37))

[33L,
 3L,
 36L,
 6L,
 39L,
 9L,
 42L,
 12L,
 45L,
 15L,
 48L,
 18L,
 51L,
 21L,
 54L,
 24L,
 57L,
 27L,
 30L,
 1L,
 34L,
 4L,
 37L,
 7L,
 40L,
 10L,
 43L,
 13L,
 46L,
 16L,
 49L,
 19L,
 52L,
 22L,
 55L,
 25L,
 58L,
 28L,
 31L,
 32L,
 2L,
 35L,
 5L,
 38L,
 8L,
 41L,
 11L,
 44L,
 14L,
 47L,
 17L,
 50L,
 20L,
 53L,
 23L,
 56L,
 26L,
 59L,
 29L]

In [None]:
sort_row_indices_by_distance()

In [484]:
test_array5 = 2 * np.array([[1, 0, 0], [1, 1, 0], [0, 0,  1]])
vec = 2 * np.array([[1, 1, 0.001]])

In [487]:
cm = ndarr_to_coord_array(test_array5)
cv = ndarr_to_coord_array(vec)

In [563]:
td = coordinatematrix_get_row(cmat, 8797)

In [564]:
td.entries.count()

1

In [537]:
sort_row_indices_by_distance(cm, cv)

[1L, 0L, 2L]

In [474]:
from pyspark.mllib.linalg import Vectors, ArrayType

In [532]:
v = Vectors.sparse(3, [(1, 1), (2, 1)])

In [533]:
v.norm(2)

1.4142135623730951

In [492]:
cv.entries.map(lambda entry: (entry.j, entry.value)).collect()

[(0L, 2.0), (1L, 2.0), (2L, 0.002)]

In [493]:
def coordinatematrix_to_sparse_vector(mat):
    """
    mat : CoordinateMatrix
    
    Mat is assumed to have non-zero entries in only the 0th row index
    """
    size = mat.numCols()
    return Vectors.sparse(size, mat.entries.map(lambda entry: (entry.j, entry.value)).collect())

In [495]:
v = coordinatematrix_to_sparse_vector(cv)

In [471]:
mat_j.join(vec_j).

[(0L, ((0L, 2.0), 2.0)),
 (0L, ((1L, 2.0), 2.0)),
 (1L, ((1L, 2.0), 2.0)),
 (2L, (None, 0.002))]

In [466]:
mat_j.fullOuterJoin(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).collect()

[(1L, 0.0), (0L, 0.0), (1L, 0.0), (2L, 3.992004)]

In [454]:
mat_j.join(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).reduceByKey(add).collect()

[(0L, 0.0), (1L, 0.0)]

In [453]:
mat_j.join(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).reduceByKey(add)\
    .map(lambda tup: (tup[1], tup[0])).sortByKey().map(lambda tup: tup[1]).collect()

[0L, 1L]

In [439]:
ndarr_to_coord_array(test_array2).entries.map(lambda entry: (entry.i, [(entry.i, entry.j)]))\
.reduceByKey(add).collect()

[(0L, [(0L, 0L), (0L, 1L), (0L, 2L)]),
 (1L, [(1L, 0L), (1L, 2L), (1L, 3L)]),
 (2L, [(2L, 2L), (2L, 3L)])]

In [361]:
coordinate_matrix_vector_l2(ndarr_to_coord_array(test_vec), ndarr_to_coord_array(test_vec))

0.0

In [371]:
vec1, vec2 = ndarr_to_coord_array(test_vec), ndarr_to_coord_array(test_vec2)

In [372]:
norm1, norm2 = map(coordinate_vector_matrix_norm, [vec1, vec2])

vec1normed = coordinateMatrixScalarMult(vec1, 1./norm1)
vec2normed = coordinateMatrixScalarMult(vec2, 1./norm2)

diff = coordinateMatrixAdd(vec1normed, vec2normed, -1.)

In [373]:
diff.entries.collect()

[MatrixEntry(0, 0, -0.292893218813), MatrixEntry(1, 0, 0.707106781187)]

In [380]:
coordinate_matrix_vector_l2(ndarr_to_coord_array(test_vec), ndarr_to_coord_array(2 * test_vec2))

0.76536686473017945

In [366]:
test_vec2


array([[1],
       [0],
       [0]])

In [379]:
np.linalg.norm(test_vec.T[0] / np.linalg.norm(test_vec) - test_vec2.T[0] / np.linalg.norm(test_vec2))

0.76536686473017945

In [375]:
mat = ndarr_to_coord_array(test_array2)
mat.entries.filter(lambda entry: entry.i == 2).map(lambda entry: collect()

SyntaxError: invalid syntax (<ipython-input-375-dc50cd5dbed2>, line 2)

In [298]:
def coordinatematrix_multiply_vector_elementwise(mat, vec):
    """
    mat : CoordinateMatrix
    vec : CoordinateMatrix
    """
    mat_entries = mat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
    vec_entries = vec.entries.map(lambda entry: (entry.i, entry.value))

    prod = vec_entries.join(mat_entries).map(lambda tup: MatrixEntry(tup[0], tup[1][1][0], tup[1][0] * tup[1][1][1]))
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(prod)

In [91]:
def coordinateMatrixElementwise(mat, op):
    """
    elt -> op(elt) for each nonzero element elt of the matrix mat
    """
    new_entries = mat.entries.map(lambda entry: MatrixEntry(entry.i, entry.j, op(entry.value)))
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(new_entries)

In [None]:
def coordinateMatrixElementwiseMultiplication(mat, scalar):
    """
    return scalar * mat
    """
    new_entries = mat.entries.map(lambda entry: MatrixEntry(entry.i, entry.j, scalar * entry.value))
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(new_entries)

Implement LSA

Test stuff

In [94]:
test_coordmat2 = ndarr_to_coord_array(test_array2)

In [97]:
test_coordmat2.entries.filter(lambda entry: entry.value != 0).collect()

[MatrixEntry(0, 0, 1.0),
 MatrixEntry(0, 1, 2.0),
 MatrixEntry(0, 2, 3.0),
 MatrixEntry(1, 0, 4.0),
 MatrixEntry(1, 2, 6.0),
 MatrixEntry(1, 3, 1.0),
 MatrixEntry(2, 2, 8.0),
 MatrixEntry(2, 3, 2.0)]

In [123]:
def coordinate_matrix_to_ndarr(mat):
    size = mat.entries.count()
    elts = mat.entries.take(size)
    arr = np.zeros((mat.numRows(), mat.numCols()))
    for elt in elts:
        arr[elt.i][elt.j] = elt.value
    return arr

def ndarr_to_coord_array(arr):
    entries = []
    for i in range(len(arr)):
        for j in range(len(arr[0])):
            if arr[i][j] != 0:
                entries.append((i, j, arr[i][j]))
    return CoordinateMatrix(sc.parallelize(entries))

test_array = np.array([[1, 2, 3], [4, 0, 6], [0, 0, 8]], dtype = 'float')
test_array2 = np.array([[1, 2, 3, 0], [4, 0, 6, 1], [0, 0, 8, 2]], dtype = 'float')
test_array1d = np.array([[1, 1, 1, 1]])
test_coordmat = ndarr_to_coord_array(test_array)
test_coordmat2 = ndarr_to_coord_array(test_array2)
test_coordmat2_T = ndarr_to_coord_array(test_array2.T)


def test_multiply():
    assert np.all(coordinate_matrix_to_ndarr(coordinateMatrixMultiply(test_coordmat, test_coordmat)) ==\
        np.dot(test_array, test_array))
    assert np.all(coordinate_matrix_to_ndarr(coordinateMatrixMultiply(test_coordmat2, test_coordmat2_T)) ==\
        np.dot(test_array2, test_array2.T))
    #assert coordinate_matrix_to_ndarr(coordinateMatrixMultiply(
    #    ndarr_to_coord_array(test_array1d), ndarr_to_coord_array(test_array1d.T)))
    
test_array3 = np.array([[ 0.63203118,  0.30233108,  0.40677762,  0.58962667],
       [ 0.98905039,  0.9516414 ,  0.20273982,  0.20800506],
       [ 0.7751541 ,  0.94623161,  0.22601002,  0.40736821]])

def test_add():
    m = coordinate_matrix_to_ndarr(
        coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array2), -2))
    m2 = test_array3 - 2 * test_array2
    #return m, m2
    assert np.all(m == m2)

In [101]:
coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array2), -2)

<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde35f7b250>

In [90]:
coordinate_matrix_to_ndarr(coordinateMatrixMultiply(
        ndarr_to_coord_array(test_array1d), ndarr_to_coord_array(test_array1d.T)))

array([[ 4.]])

In [217]:
def eval_matrix_binop(ndarr1, ndarr2, op):
    cmat = op(ndarr_to_coord_array(ndarr1),
             ndarr_to_coord_array(ndarr2))
    ndmat = coordinate_matrix_to_ndarr(cmat)
    return cmat, ndmat

In [244]:
vec = ndarr_to_coord_array(np.array([[2, 2, 2, 2]]))
mat = ndarr_to_coord_array(np.ones((4, 4)))

m = mat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
v = vec.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
matdiv = m.join(v).map(lambda tup: MatrixEntry(tup[0], tup[1][0][0], float(tup[1][0][1]) / tup[1][1][1]))

In [263]:
eval_matrix_binop(2 * np.ones((4, 4)), 2 * np.ones((4, 4)), coordinate_matrix_elementwise_matrix_multiplication)

(<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde2b3eba90>,
 array([[ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.]]))

In [269]:
sumjtest = coordinate_matrix_sumj(ndarr_to_coord_array(np.ones((3, 4))))

In [270]:
coordinate_matrix_to_ndarr(sumjtest)

array([[ 4.],
       [ 4.],
       [ 4.]])

In [296]:
test_array2

array([[ 1.,  2.,  3.,  0.],
       [ 4.,  0.,  6.,  1.],
       [ 0.,  0.,  8.,  2.]])

In [300]:
eval_matrix_binop(test_array2, np.array([[2, 3, 4]]).T, coordinatematrix_multiply_vector_elementwise)

(<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde20ca84d0>,
 array([[  2.,   4.,   6.,   0.],
        [ 12.,   0.,  18.,   3.],
        [  0.,   0.,  32.,   8.]]))

In [351]:
coordinate_matrix_row(ndarr_to_coord_array(test_array2), 1).entries.collect()

[MatrixEntry(0, 0, 4.0), MatrixEntry(0, 2, 6.0), MatrixEntry(0, 3, 1.0)]

In [125]:
test_add()
test_multiply()

In [78]:
coordinate_matrix_to_ndarr(coordinateMatrixScalarMult(ndarr_to_coord_array(test_array2), 1.5))

array([[  1.5,   3. ,   4.5,   0. ],
       [  6. ,   0. ,   9. ,   1.5],
       [  0. ,   0. ,  12. ,   3. ]])

In [92]:
coordinate_matrix_to_ndarr(coordinateMatrixElementwise(ndarr_to_coord_array(test_array2), lambda n: np.log(n)))

array([[ 0.        ,  0.69314718,  1.09861229,        -inf],
       [ 1.38629436,        -inf,  1.79175947,  0.        ],
       [       -inf,        -inf,  2.07944154,  0.69314718]])

In [93]:
test_array2

array([[ 1.,  2.,  3.,  0.],
       [ 4.,  0.,  6.,  1.],
       [ 0.,  0.,  8.,  2.]])

SVD implementation

In [None]:
dfi = 

In [80]:
from random import normalvariate


In [85]:
cmat.numCols()

347112L

In [83]:
from random import normalvariate
def random_unit(n):
    vec = np.array([normalvariate(0, 1) for _ in range(n)])
    return vec / np.linalg.norm(vec)

def SVD_1D(A, threshold = 1e-10):
    """
    A : CoordinateMatrix
    """
    n, m = A.numRows(), A.numCols()
    x = random_unit(m)
    # TODO finish this

In [None]:
import numpy as np
from numpy.linalg import norm

def svd_1d(A, epsilon=1e-10):
    ''' The one-dimensional SVD '''
    n, m = A.shape
    x = randomUnitVector(m)
    lastV = None
    currentV = x
    B = np.dot(A.T, A)
 
    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentV = currentV / norm(currentV)
 
        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV

In [62]:
m2 = coordinateMatrixAdd(cmat, cmat, -2)

In [64]:
m2.entries.take(2)

[MatrixEntry(3, 9, -2.0), MatrixEntry(8, 4, -1.0)]

In [25]:
matsum = coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array3), -2)

In [29]:
cmat = CoordinateMatrix(bare_occurrences.rdd.map(tuple))

In [43]:
cmat.entries.take(2)

[MatrixEntry(1, 176355, 1.0), MatrixEntry(2, 2547, 1.0)]

In [74]:
mul.entries.count()

595811

In [41]:
cmat.entries.count()

475392

In [45]:
cmb = cmat.toBlockMatrix()

In [47]:
cmi = cmb.toIndexedRowMatrix()

In [51]:
row = cmi.rows.take(1)

[IndexedRow(3558, (347112,[54015,294324],[1.0,2.0]))]

In [144]:
sqlContext.sql("""
select max(uid)
from occurrences
""").toPandas()

Unnamed: 0,max(uid)
0,341578


In [57]:
occurrences.toPandas()

Unnamed: 0,subreddit,author,tally,uid,rid
0.0,zyramains,TricolorStar,1,176355,1
1.0,zylzon,youknowitsyaboy,1,2547,2
2.0,zweiteliga,Saminka,1,206115,3
3.0,zurich,maxwellmaxen,2,72503,4
4.0,zsh,__soddit,1,159502,5
5.0,zootopia,thawed_caveman,2,22475,6
6.0,zootopia,speisekarte,11,29937,6
7.0,zootopia,rodrigogirao,6,42380,6
8.0,zootopia,phobos136,4,53058,6
9.0,zootopia,midnightopheliac,9,70104,6


In [89]:
# TODO: why doesn't this match rid?
cmat.numRows()

17085L

In [71]:
import numpy as np

In [74]:
np.array(df['tally']).min()

-632

In [69]:
sqlContext.sql("""
select author, subreddit, sum (score)
from test
group by author, subreddit
""").toPandas()

Unnamed: 0,author,subreddit,sum(score)
0.0,FormulaXDGame,formula1,31
1.0,HelluvaDeke,gaming,2
2.0,WhoKnowsWho2,xboxone,70
3.0,listentohim,DBZDokkanBattle,1
4.0,sevenzig,AskOuija,10
5.0,LumpyPick,leagueoflegends,4
6.0,volvostupidshit,Philippines,6
7.0,i_fuck_goats777,AskReddit,2
8.0,Ineeditunesalot,battlefield_4,11
9.0,martianlawrence,movies,11


In [78]:
sj.toPandas()

Unnamed: 0,author,author_cakeday,author_flair_css_class,author_flair_text,body,can_gild,collapsed,collapsed_reason,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,parent_id,retrieved_on,score,stickied,Unnamed: 21
0.0,stunt_penguin,,,,Wheelchairs make amazing dollys :D,True,False,,0,1501545600,,false,0,dkznc8h,False,t3_6qp8sw,t1_dkzbnn3,1503654247,1,False,...
1.0,[deleted],,,,[removed],True,False,,0,1501545600,,false,0,dkznc8i,False,t3_6qse6i,t1_dkzmgc3,1503654247,2,False,...
2.0,69ing,,,,I used to watch the shit out of these guys,True,False,,0,1501545600,,false,0,dkznc8j,False,t3_6qs8i1,t3_6qs8i1,1503654247,5,False,...
3.0,ArchadianJudge,,Archadianflair,,http://www.pixiv.net/member_illust.php?mode=me...,True,False,,0,1501545600,,false,0,dkznc8k,True,t3_6qsicx,t3_6qsicx,1503654247,2,False,...
4.0,sglville,,,,On the other yand you could say it's capitalis...,True,False,,0,1501545600,,false,0,dkznc8l,False,t3_6qryxf,t3_6qryxf,1503654247,2,False,...
5.0,NEWORLDODOR,,,,I'm not arguing that making university free an...,True,False,,0,1501545600,,false,0,dkznc8m,True,t3_6qrr1o,t1_dkzn053,1503654247,3,False,...
6.0,zachwad22,,,,"I like whisper-whistle obsessively, just barel...",True,False,,0,1501545600,,false,0,dkznc8n,False,t3_6qoe6s,t3_6qoe6s,1503654247,1,False,...
7.0,lurker4lyfe6969,,,,Served in the Chair Force. Can confirm,True,False,,0,1501545600,,false,0,dkznc8o,False,t3_6qodwi,t3_6qodwi,1503654247,2,False,...
8.0,SuburbanStoner,,,,"Nah, it's good right here",True,False,,0,1501545600,,false,0,dkznc8p,False,t3_6qn002,t1_dkz6ig2,1503654247,2,False,...
9.0,ensanguine,,,,I still stand by it not belonging on the plate...,True,False,,0,1501545600,,false,0,dkznc8q,False,t3_6qscet,t1_dkzn9vn,1503654247,18,False,...


In [27]:
rawDF = sqlContext.read.json("s3n://reddit-comments/2015/RC_2015-05", StructType(fields)).persist(StorageLevel.MEMORY_AND_DISK_SER)

In [28]:
rawDF.registerTempTable('comments')