In [3]:
import pyspark

In [4]:
import numpy as np

In [5]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry
from pyspark.sql.types import *
from operator import add
from pyspark.mllib.linalg import Vectors, ArrayType

In [6]:
minimal_fields = [ 
          StructField("author", StringType(), True),
          StructField("score", LongType(), True),
          StructField("subreddit", StringType(), True)]

In [16]:
def df_most_active_subreddits(k = 1000):
    most_active = sqlContext.sql("""
    select * from
        (select *, dense_rank() over (order by activity desc) as ordered_id
        from (select rid, subreddit, sum(activity) as activity
            from occurrences
            group by rid, subreddit))
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
    most_active.registerTempTable('most_active')
    return most_active

def df_valid_users(min_subreddits = 2, max_subreddits = 20):
    """
    filter users by the number of subreddits they've posted in, among the above-defined most active subreddits
    """
    most_active_users = sqlContext.sql("""
    select * from
        (select author, count(subreddit) as count
        from 
            (select * from occurrences
            where subreddit in (select subreddit from most_active))
        group by author
        order by count desc)
    where count>=%d and count<=%d""" % (min_subreddits, max_subreddits)).persist(StorageLevel.MEMORY_AND_DISK_SER)
    most_active_users.registerTempTable('most_active_users')
    return most_active_users

def load_and_preprocess(json_uri, num_subreddit, user_min_active_subreddits = 4, user_max_active_subreddits = 20):
    """
    Load json and do preprocessing via some SQL queries
    """
    sj = sqlContext.read.json(json_uri, StructType(minimal_fields))
    sj.registerTempTable('test')
    
    occurrences = sqlContext.sql("""
    select *, dense_rank() over (order by subreddit desc) as rid 
    from  (SELECT subreddit, author, sum(sign(score)) as tally,\
        count(score) as activity, dense_rank() over (order by author desc) as uid
    from test
    group by subreddit, author)
    where tally!=0
    """).persist(StorageLevel.MEMORY_AND_DISK_SER)
    occurrences.registerTempTable('occurrences')
        
    df_most_active_subreddits()
    df_valid_users(user_min_active_subreddits, user_max_active_subreddits)
    
    test2 = sqlContext.sql("""
    select test.author, test.score, test.subreddit, most_active.ordered_id as ordered_id
    from test
    inner join most_active on most_active.subreddit=test.subreddit""")
    test2.registerTempTable('test2')
    
    occurrences_pruned = sqlContext.sql("""
    select *
    from  (SELECT test2.subreddit, author, test2.ordered_id, sum(score) as tally,\
        sum(abs(score)) as activity, dense_rank() over (order by author desc) as uid
        from test2
        where author in (select author from most_active_users)
        group by test2.subreddit, test2.ordered_id, author)
    where tally!=0
    """).persist(StorageLevel.MEMORY_AND_DISK_SER)
    occurrences_pruned.registerTempTable('occurrences_pruned')
    
    bare_occurrences = sqlContext.sql("""
    select ordered_id, uid, tally
    from occurrences_pruned
    """).persist(StorageLevel.MEMORY_AND_DISK_SER)
    bare_occurrences.registerTempTable('bare_occurrences')
    
def gen_frequency_matrix():
    # subreddit-activity matrix
    bare_occurrences = sqlContext.sql("""select * from bare_occurrences""")
    tf_ij = CoordinateMatrix(bare_occurrences.rdd.map(
            lambda row: (row.ordered_id, (row.uid, row.tally)))\
        .sortByKey().map(lambda entry: (entry[0] - 1, entry[1][0], entry[1][1])))
    return tf_ij

### Now initialize stuff

In [17]:
#json_uri = "s3a://insight-ohoidn/sample3.json"
json_uri = "s3a://insight-ohoidn/sample10M.json"
numreddits = 100
user_min_active_subreddits = 5
user_max_active_subreddits = 20

In [18]:
load_and_preprocess(json_uri, numreddits,
                    user_min_active_subreddits = user_min_active_subreddits,
                    user_max_active_subreddits = user_max_active_subreddits)

subreddit_mapper = dict(df_most_active_subreddits(numreddits)\
                        .rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())
idx_mapper = {v: k for k, v in subreddit_mapper.iteritems()}

def idx_to_subreddit(idx):
    return subreddit_mapper[idx + 1]

def subreddit_to_idx(sub):
    return idx_mapper[sub] - 1

subreddit_to_idx.inverse = idx_to_subreddit
idx_to_subreddit.inverse = subreddit_to_idx

tf_ij = gen_frequency_matrix()

In [19]:
tf_ij.entries.count()

870869

### Subreddit activity:

In [1247]:
i_sumtally_tuples = bare_occurrences.rdd.map(lambda row: (row.ordered_id, row.tally)).sortByKey()\
.reduceByKey(add)
gf_i = CoordinateMatrix(i_sumtally_tuples.map(lambda entry: (entry[0] - 1, 0, entry[1])))


nusers = tf_ij.numCols()

p_ij = coordinate_matrix_elementwise_vector_division(tf_ij, gf_i)

# TODO figure out nan issues
logp_ij = coordinateMatrixElementwise(p_ij, lambda elt: np.log((abs(elt))/np.log(nusers)))

entropy_i = coordinateMatrixElementwise(coordinate_matrix_sumj(logp_ij), lambda elt: elt + 1) # + 1, but this has to be handled separately to conserve matrix sparsity

In [28]:
%run cf_spark.py
%run cf_numpy.py

### numpy implementation:

In [1205]:
related_subs(tf_ij, 'DotA2', subreddit_to_idx)

[u'DotA2', u'2007scape', u'FireEmblemHeroes', u'leagueoflegends', u'ethtrader']

In [1202]:
spark_top_k_subs(sim, 'DotA2', subreddit_to_idx, k = 5)

[u'DotA2', u'2007scape', u'FireEmblemHeroes', u'leagueoflegends', u'ethtrader']

In [1236]:
related_subs(tf_ij, 'DotA2', subreddit_to_idx)

[u'DotA2', u'DestinyTheGame', u'aww', u'funny', u'Rainbow6']

In [1243]:
spark_top_k_subs(p_ij, 'Games', subreddit_to_idx, k = 5)

[u'Games', u'xboxone', u'RocketLeagueExchange', u'movies', u'FFBraveExvius']

In [1239]:
spark_top_k_subs(p_ij, 'DotA2', subreddit_to_idx, k = 5)

[u'DotA2', u'aww', u'funny', u'Rainbow6', u'TheSilphRoad']

In [1250]:
spark_top_k_subs(tf_ij, 'DotA2', subreddit_to_idx, k = 5)

[u'DotA2', u'DestinyTheGame', u'aww', u'funny', u'Rainbow6']

In [33]:
spark_top_k_subs(tf_ij, 'investing', subreddit_to_idx, k = 5)

[u'investing', u'stocks', u'RobinHood', u'wallstreetbets', u'pathofexile']

In [1249]:
related_subs(tf_ij, 'DotA2', subreddit_to_idx)

[u'DotA2', u'DestinyTheGame', u'aww', u'funny', u'Rainbow6']

In [1143]:
tf_ij.numCols()

1370L

In [504]:
def transform_tf_if(val):
    newval = np.log(np.abs(val) + 1)
    if val < 0:
        return -newval
    else:
        return newval

# TODO: maybe save this for later, when noise can be reduced via dimensionality reduction
log_tf_ij = coordinateMatrixElementwise(tf_ij, transform_tf_if)

a_ij = coordinatematrix_multiply_vector_elementwise(log_tf_ij, entropy_i)

In [37]:
def predict_nobias(ratings, similarity, kind='user'):
    if kind == 'user':
        user_bias = ratings.mean(axis=1)
        ratings = (ratings - user_bias[:, np.newaxis]).copy()
        pred = similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
        pred += user_bias[:, np.newaxis]
    elif kind == 'item':
        item_bias = ratings.mean(axis=0)
        ratings = (ratings - item_bias[np.newaxis, :]).copy()
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        pred += item_bias[np.newaxis, :]
        
    return pred

In [93]:
def predict_topk(ratings, similarity, kind='user', k=40):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in xrange(ratings.shape[0]):
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in xrange(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in xrange(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in xrange(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred

In [38]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [901]:
def top_k_movies(similarity, movie_idx, k=6):
    return [notebook_idx_to_subreddit(x) for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

In [1206]:
#get_mse(predict_nobias, test)

In [756]:
bare_occurrences.toPandas()

Unnamed: 0,rid,uid,tally
0.0,1,173573,1
1.0,2,2510,1
2.0,3,202843,1
3.0,4,71343,2
4.0,5,157008,1
5.0,6,22113,2
6.0,6,29476,11
7.0,6,41712,6
8.0,6,52207,4
9.0,6,68993,9


In [796]:
a_ij.numCols()

57010L

In [350]:
np.mean(ndmat, axis = 0).shape

(100,)

In [799]:
ndmat = coordinate_matrix_to_ndarr(tf_ij)
#ndmat/=np.mean(ndmat)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
train, test = train_test_split(ndmat)
similarity = fast_similarity(train)

sim [[  1.00000000e+00   1.00000000e-09   1.50000000e+01 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 [  1.00000000e-09   4.90000000e+01   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 [  1.50000000e+01   1.00000000e-09   2.25000000e+02 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 ..., 
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e+00
    1.00000000e-09   1.00000000e-09]
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e+00   1.00000000e-09]
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e-09   4.90000000e+01]]
diag [   1.   49.  225. ...,    1.    1.   49.]
[[  1.   7.  15. ...,   1.   1.   7.]]


In [919]:
import matplotlib.pyplot as plt

In [916]:
np.min(np.abs(similarity))

4.1997201680174538e-13

In [932]:
%run linalg.py

In [931]:
test_multiply()

In [854]:
sim = similarity_matrix(ndarr_to_coord_array(ndmat.T))

In [948]:
sim.entries.partitions

AttributeError: 'PipelinedRDD' object has no attribute 'partitions'

In [977]:
simnd = coordinate_matrix_to_ndarr(sim)

In [983]:
test_arr = np.array([[1, 2, 3, 4, 5, -1, -2, -3, -4, -5], range(10)])

In [997]:
np.array(sim.entries.map(lambda x: (x.i, [(x.j, x.value)]))\
.reduceByKey(add)\
.map(lambda row: map(lambda tup: tup[0], sorted(row[1], key = lambda tup: -tup[1]))[:10])\
.collect())

array([[0L, 39L, 9L, 62L, 13L, 91L, 18L, 3L, 38L, 43L],
       [72L, 6L, 27L, 7L, 25L, 94L, 18L, 35L, 1L, 10L],
       [1L, 91L, 16L, 15L, 39L, 10L, 6L, 7L, 98L, 13L],
       [73L, 38L, 70L, 29L, 25L, 96L, 9L, 0L, 84L, 43L],
       [74L, 30L, 78L, 66L, 36L, 31L, 99L, 38L, 48L, 43L],
       [2L, 49L, 97L, 27L, 54L, 6L, 58L, 99L, 71L, 25L],
       [75L, 84L, 56L, 27L, 39L, 55L, 93L, 37L, 33L, 40L],
       [3L, 97L, 41L, 28L, 91L, 85L, 11L, 71L, 4L, 56L],
       [76L, 39L, 19L, 70L, 0L, 43L, 34L, 5L, 2L, 61L],
       [4L, 62L, 65L, 96L, 3L, 10L, 51L, 91L, 26L, 13L],
       [5L, 60L, 86L, 51L, 19L, 59L, 49L, 39L, 16L, 22L],
       [77L, 87L, 23L, 86L, 46L, 90L, 60L, 3L, 65L, 47L],
       [78L, 43L, 74L, 20L, 7L, 35L, 51L, 87L, 0L, 21L],
       [6L, 39L, 27L, 85L, 72L, 11L, 82L, 54L, 12L, 33L],
       [79L, 67L, 10L, 82L, 43L, 87L, 60L, 86L, 23L, 25L],
       [7L, 38L, 43L, 17L, 64L, 72L, 9L, 96L, 66L, 44L],
       [8L, 93L, 42L, 11L, 87L, 46L, 60L, 58L, 86L, 23L],
       [80L, 33L, 22L, 48

In [936]:
np.sum(np.abs(similarity) > 0)

1874161

In [940]:
np.sum(np.abs(similarity) > 1e-5)

800925

In [802]:
sim2 = fast_similarity(ndmat)

sim [[  2.22000000e+02   1.00000000e-09   1.50000000e+01 ...,   7.00000000e+01
    1.00000000e-09   9.80000000e+01]
 [  1.00000000e-09   5.74000000e+02   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 [  1.50000000e+01   1.00000000e-09   2.02900000e+03 ...,   1.00000000e-09
    1.00000000e-09   2.00000000e+00]
 ..., 
 [  7.00000000e+01   1.00000000e-09   1.00000000e-09 ...,   3.10000000e+01
    1.00000000e-09   3.50000000e+01]
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e-09
    5.50000000e+01   1.00000000e-09]
 [  9.80000000e+01   1.00000000e-09   2.00000000e+00 ...,   3.50000000e+01
    1.00000000e-09   3.65100000e+03]]
diag [  222.   574.  2029. ...,    31.    55.  3651.]
[[ 14.89966443  23.9582971   45.04442252 ...,   5.56776436   7.41619849
   60.42350536]]


In [805]:
simnd = coordinate_matrix_to_ndarr(sim)

In [None]:
pwd

In [829]:
coordinate_matrix_to_ndarr(coordinatematrix_get_row(ndarr_to_coord_array(test_array3), 1))

array([[ 0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.98905039,  0.9516414 ,  0.20273982,  0.20800506]])

In [844]:
row = coordinatematrix_get_row(ndarr_to_coord_array(test_array3), 1)

In [846]:
coordinate_matrix_to_ndarr(row)

array([[ 0.98905039,  0.9516414 ,  0.20273982,  0.20800506]])

In [1089]:
%run cf_spark.py

In [852]:
subreddit_mapper

{1: u'AskReddit',
 2: u'politics',
 3: u'BigBrother',
 4: u'SquaredCircle',
 5: u'The_Donald',
 6: u'gameofthrones',
 7: u'nba',
 8: u'funny',
 9: u'RocketLeagueExchange',
 10: u'videos',
 11: u'worldnews',
 12: u'leagueoflegends',
 13: u'soccer',
 14: u'movies',
 15: u'DotA2',
 16: u'Showerthoughts',
 17: u'news',
 18: u'pics',
 19: u'gaming',
 20: u'MMA',
 21: u'teenagers',
 22: u'todayilearned',
 23: u'hearthstone',
 24: u'pathofexile',
 25: u'pcmasterrace',
 26: u'PUBATTLEGROUNDS',
 27: u'Overwatch',
 28: u'baseball',
 29: u'anime',
 30: u'NYYankees',
 31: u'asoiaf',
 32: u'relationships',
 33: u'Philippines',
 34: u'FireEmblemHeroes',
 35: u'me_irl',
 36: u'aww',
 37: u'gonewild',
 38: u'conspiracy',
 39: u'gifs',
 40: u'BlackPeopleTwitter',
 41: u'DBZDokkanBattle',
 42: u'hiphopheads',
 43: u'rupaulsdragrace',
 44: u'mildlyinteresting',
 45: u'Rainbow6',
 46: u'australia',
 47: u'ethtrader',
 48: u'GlobalOffensive',
 49: u'TheSilphRoad',
 50: u'thebachelor',
 51: u'GlobalOffensiv

In [904]:
top_k_movies(fast_similarity(ndmat, kind = 'item'), subreddit_to_idx('videos', idx_mapper))

[u'videos',
 u'india',
 u'grandorder',
 u'AskReddit',
 u'Warframe',
 u'relationships']

In [880]:
def run_spark_ndarray(ndmat, subreddit):
    """
    ndmat : np.ndarray
    
    row indices: users
    column indices: subreddits
    """
    sim = similarity_matrix(ndarr_to_coord_array(ndmat.T))
    return spark_top_k_subs(sim, subreddit_to_idx(subreddit, idx_mapper), subreddit_mapper)

In [1109]:
idx = 55
row = la.coordinatematrix_get_row(sim, idx)
movie_row = la.coordinate_matrix_to_ndarr(row)[0]
sm = [np.argsort(movie_row)[::-1]]

In [1125]:
sorted(sim.entries.map(lambda entry: entry.i).distinct().collect())

[0L,
 1L,
 2L,
 3L,
 4L,
 5L,
 6L,
 7L,
 8L,
 9L,
 10L,
 11L,
 12L,
 13L,
 14L,
 15L,
 16L,
 17L,
 18L,
 19L,
 20L,
 21L,
 22L,
 23L,
 24L,
 25L,
 26L,
 27L,
 28L,
 29L,
 30L,
 31L,
 32L,
 33L,
 34L,
 35L,
 36L,
 37L,
 38L,
 39L,
 40L,
 41L,
 42L,
 43L,
 44L,
 45L,
 46L,
 47L,
 48L,
 49L,
 50L,
 51L,
 52L,
 54L,
 55L,
 56L,
 57L,
 58L,
 59L,
 60L,
 61L,
 62L,
 63L,
 64L,
 65L,
 66L,
 67L,
 68L,
 69L,
 70L,
 71L,
 72L,
 73L,
 74L,
 75L,
 76L,
 77L,
 78L,
 79L,
 80L,
 81L,
 82L,
 83L,
 84L,
 85L,
 86L,
 87L,
 88L,
 89L,
 90L,
 91L,
 92L,
 93L,
 94L,
 95L,
 96L,
 97L,
 98L,
 99L]

In [1116]:
len(la.coordinatematrix_sort_rows(sim, 6))

99

In [1126]:
%run cf_spark.py

In [1104]:
subreddit_to_idx('Amd', idx_mapper)

55

In [1113]:
list(enumerate(la.coordinatematrix_sort_rows(sim, 6)))

[(0, [0L, 39L, 9L, 62L, 13L, 91L]),
 (1, [1L, 91L, 16L, 15L, 39L, 10L]),
 (2, [2L, 49L, 97L, 27L, 54L, 6L]),
 (3, [3L, 97L, 41L, 28L, 91L, 85L]),
 (4, [4L, 62L, 65L, 96L, 3L, 10L]),
 (5, [5L, 60L, 86L, 51L, 19L, 59L]),
 (6, [6L, 39L, 27L, 85L, 72L, 11L]),
 (7, [7L, 38L, 43L, 17L, 64L, 72L]),
 (8, [8L, 93L, 42L, 11L, 87L, 46L]),
 (9, [9L, 0L, 7L, 65L, 73L, 26L]),
 (10, [10L, 79L, 82L, 1L, 88L, 32L]),
 (11, [11L, 33L, 6L, 14L, 3L, 67L]),
 (12, [12L, 27L, 29L, 6L, 19L, 88L]),
 (13, [13L, 19L, 55L, 0L, 3L, 45L]),
 (14, [14L, 67L, 33L, 11L, 46L, 51L]),
 (15, [15L, 39L, 22L, 26L, 1L, 38L]),
 (16, [16L, 1L, 5L, 26L, 10L, 39L]),
 (17, [17L, 38L, 7L, 43L, 96L, 74L]),
 (18, [18L, 12L, 0L, 72L, 37L, 94L]),
 (19, [19L, 59L, 22L, 13L, 62L, 12L]),
 (20, [20L, 70L, 65L, 55L, 33L, 40L]),
 (21, [21L, 43L, 15L, 65L, 7L, 13L]),
 (22, [22L, 26L, 15L, 80L, 19L, 48L]),
 (23, [23L, 87L, 86L, 39L, 77L, 60L]),
 (24, [24L, 86L, 52L, 30L, 26L, 51L]),
 (25, [25L, 46L, 72L, 44L, 73L, 14L]),
 (26, [26L, 22L, 15L, 4

In [1112]:
la.coordinatematrix_sort_rows(sim, 6)[idx]

[[0L, 39L, 9L, 62L, 13L, 91L],
 [1L, 91L, 16L, 15L, 39L, 10L],
 [2L, 49L, 97L, 27L, 54L, 6L],
 [3L, 97L, 41L, 28L, 91L, 85L],
 [4L, 62L, 65L, 96L, 3L, 10L],
 [5L, 60L, 86L, 51L, 19L, 59L],
 [6L, 39L, 27L, 85L, 72L, 11L],
 [7L, 38L, 43L, 17L, 64L, 72L],
 [8L, 93L, 42L, 11L, 87L, 46L],
 [9L, 0L, 7L, 65L, 73L, 26L],
 [10L, 79L, 82L, 1L, 88L, 32L],
 [11L, 33L, 6L, 14L, 3L, 67L],
 [12L, 27L, 29L, 6L, 19L, 88L],
 [13L, 19L, 55L, 0L, 3L, 45L],
 [14L, 67L, 33L, 11L, 46L, 51L],
 [15L, 39L, 22L, 26L, 1L, 38L],
 [16L, 1L, 5L, 26L, 10L, 39L],
 [17L, 38L, 7L, 43L, 96L, 74L],
 [18L, 12L, 0L, 72L, 37L, 94L],
 [19L, 59L, 22L, 13L, 62L, 12L],
 [20L, 70L, 65L, 55L, 33L, 40L],
 [21L, 43L, 15L, 65L, 7L, 13L],
 [22L, 26L, 15L, 80L, 19L, 48L],
 [23L, 87L, 86L, 39L, 77L, 60L],
 [24L, 86L, 52L, 30L, 26L, 51L],
 [25L, 46L, 72L, 44L, 73L, 14L],
 [26L, 22L, 15L, 47L, 51L, 24L],
 [27L, 29L, 85L, 6L, 12L, 75L],
 [28L, 3L, 27L, 99L, 95L, 81L],
 [29L, 27L, 12L, 49L, 39L, 73L],
 [30L, 74L, 92L, 24L, 5L, 36L],
 [31L, 

In [1050]:
np.argsort(coordinate_matrix_to_ndarr(la.coordinatematrix_get_row(sim, 8))[::-1])

array([[67, 79, 43, 25, 26, 48, 47, 45, 44, 49, 80, 41, 40, 81, 38, 78, 50,
        51, 52, 66, 65, 64, 63, 71, 72, 37, 74, 75, 76, 56, 77, 54, 53, 59,
        36, 34, 68,  1,  2, 97,  4, 96,  6,  7, 95,  9, 94, 92, 91, 90, 89,
        88, 16, 17, 33, 82, 31, 30, 29, 28, 35, 27, 83, 22, 21, 20, 19, 85,
        24, 69,  0, 39, 15,  5, 18, 12, 10, 70, 55, 99, 73, 62, 14, 13, 98,
        61, 84, 32,  3, 57, 23, 86, 58, 60, 46, 87, 11, 42, 93,  8]])

In [1111]:
np.argsort(movie_row)[::-1]

array([55, 33, 40, 92, 52, 20, 13, 44, 75, 69, 56, 91, 84, 26, 14, 81, 97,
       28, 47, 42, 48,  9, 39, 24, 10, 43,  8,  6,  1, 51, 79, 87, 62, 23,
       96, 80,  0, 60, 18, 46, 61,  7, 95,  3, 27, 77, 86, 54, 73, 85,  5,
       12, 57, 11, 22, 17, 68, 93,  2, 16, 21, 99, 70, 25, 19, 98, 15, 66,
       67, 71, 72, 74, 37, 78, 82, 83, 88,  4, 89, 90, 94, 76, 64, 65, 29,
       35, 34, 38, 32, 31, 30, 41, 36, 45, 50, 53, 58, 59, 63, 49])

In [1018]:
run_spark_ndarray(ndmat, 'videos')

[u'videos',
 u'india',
 u'grandorder',
 u'AskReddit',
 u'Warframe',
 u'relationships']

In [905]:
run_spark_ndarray(ndmat, 'videos')

[u'videos',
 u'india',
 u'grandorder',
 u'AskReddit',
 u'Warframe',
 u'relationships']

In [1100]:
spark_top_k_subs(sim, subreddit_to_idx('Amd', idx_mapper), subreddit_mapper)

[u'Amd',
 u'FireEmblemHeroes',
 u'DBZDokkanBattle',
 u'Guildwars2',
 u'buildapc',
 u'teenagers']

In [1128]:
%run cf_spark.py

In [1130]:
spark_top_k_subs2(sim, subreddit_to_idx('buildapc', idx_mapper), subreddit_mapper)

[u'buildapc',
 u'Amd',
 u'SquaredCircle',
 u'pcmasterrace',
 u'science',
 u'gonewild']

In [811]:
np.all(np.isclose(simnd, sim2))

True

In [408]:
def func1(ndmat):
    ndmat = ndmat.copy()
    ndmat /= (epsilon + np.std(ndmat, axis = 0))
    ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)
    return ndmat

In [411]:
matfuncs = [func1]
def findtop_allmodels(sub_name):
    ndmat = coordinate_matrix_to_ndarr(tf_ij)
    ndmat = ndmat.T
    ndmat = ndmat[1:, 1:]
    
    def do_one(func, ndmat):
        ndmat = func(ndmat)
        train, test = train_test_split(ndmat)
        similarity = fast_similarity(train)

        item_similarity = fast_similarity(train, kind = 'item')
        print top_k_movies(item_similarity, subreddit_to_idx(sub_name), k = 10)
    [do_one(f, ndmat) for f in matfuncs]

In [426]:
import pickle

In [427]:
import md5

  if __name__ == '__main__':


In [873]:
def generate_ndmat():
    epsilon = 1e-9
    ndmat = coordinate_matrix_to_ndarr(tf_ij)
    ndmat = ndmat.T
    ndmat = ndmat[1:, 1:]

    #ndmat = np.mean(ndmat, axis = 0)
    ndmat /= (epsilon + np.std(ndmat, axis = 0))
    ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)
    
    print md5.md5(pickle.dumps(train)).digest()
    return ndmat

In [1009]:
epsilon = 1e-9
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

#ndmat = np.mean(ndmat, axis = 0)
ndmat /= (epsilon + np.std(ndmat, axis = 0))
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

train, test = train_test_split(ndmat)
similarity = fast_similarity(ndmat)

item_similarity = fast_similarity(ndmat, kind = 'item')

md5.md5(pickle.dumps(train)).digest()

'\x1b\xbb\xc46 O\xe3\xdb\x83\xb2\x15\x84Rt\x00\x06'

In [266]:
subreddit_mapper

{1: u'AskReddit',
 2: u'politics',
 3: u'BigBrother',
 4: u'SquaredCircle',
 5: u'The_Donald',
 6: u'gameofthrones',
 7: u'nba',
 8: u'funny',
 9: u'RocketLeagueExchange',
 10: u'videos',
 11: u'worldnews',
 12: u'leagueoflegends',
 13: u'soccer',
 14: u'movies',
 15: u'DotA2',
 16: u'Showerthoughts',
 17: u'news',
 18: u'pics',
 19: u'gaming',
 20: u'MMA',
 21: u'teenagers',
 22: u'todayilearned',
 23: u'hearthstone',
 24: u'pathofexile',
 25: u'pcmasterrace',
 26: u'PUBATTLEGROUNDS',
 27: u'Overwatch',
 28: u'baseball',
 29: u'anime',
 30: u'NYYankees',
 31: u'asoiaf',
 32: u'relationships',
 33: u'Philippines',
 34: u'FireEmblemHeroes',
 35: u'me_irl',
 36: u'aww',
 37: u'gonewild',
 38: u'conspiracy',
 39: u'gifs',
 40: u'BlackPeopleTwitter',
 41: u'DBZDokkanBattle',
 42: u'hiphopheads',
 43: u'rupaulsdragrace',
 44: u'mildlyinteresting',
 45: u'Rainbow6',
 46: u'australia',
 47: u'ethtrader',
 48: u'GlobalOffensive',
 49: u'TheSilphRoad',
 50: u'thebachelor',
 51: u'GlobalOffensiv

In [304]:
subreddit_mapper

{1: u'AskReddit',
 2: u'politics',
 3: u'BigBrother',
 4: u'SquaredCircle',
 5: u'The_Donald',
 6: u'gameofthrones',
 7: u'nba',
 8: u'funny',
 9: u'RocketLeagueExchange',
 10: u'videos',
 11: u'worldnews',
 12: u'leagueoflegends',
 13: u'soccer',
 14: u'movies',
 15: u'DotA2',
 16: u'Showerthoughts',
 17: u'news',
 18: u'pics',
 19: u'gaming',
 20: u'MMA',
 21: u'teenagers',
 22: u'todayilearned',
 23: u'hearthstone',
 24: u'pathofexile',
 25: u'pcmasterrace',
 26: u'PUBATTLEGROUNDS',
 27: u'Overwatch',
 28: u'baseball',
 29: u'anime',
 30: u'NYYankees',
 31: u'asoiaf',
 32: u'relationships',
 33: u'Philippines',
 34: u'FireEmblemHeroes',
 35: u'me_irl',
 36: u'aww',
 37: u'gonewild',
 38: u'conspiracy',
 39: u'gifs',
 40: u'BlackPeopleTwitter',
 41: u'DBZDokkanBattle',
 42: u'hiphopheads',
 43: u'rupaulsdragrace',
 44: u'mildlyinteresting',
 45: u'Rainbow6',
 46: u'australia',
 47: u'ethtrader',
 48: u'GlobalOffensive',
 49: u'TheSilphRoad',
 50: u'thebachelor',
 51: u'GlobalOffensiv

In [333]:
top_k_movies(item_similarity, subreddit_to_idx(u'videos'), k = 10)

[u'videos',
 u'anime',
 u'movies',
 u'Amd',
 u'gaming',
 u'hockey',
 u'hearthstone',
 u'rickandmorty',
 u'news',
 u'ffxiv']

In [444]:
findtop_allmodels('videos')

[u'videos', u'movies', u'anime', u'Amd', u'gaming', u'pics', u'asoiaf', u'rickandmorty', u'AdviceAnimals', u'nba']


In [530]:
import cf_numpy
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
subreddit_mapper = dict(act.rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())

In [535]:
%pdb

Automatic pdb calling has been turned OFF


In [541]:
ndmat = coordinate_matrix_to_ndarr(tf_ij)

In [543]:
train_test_split(ndmat)

ValueError: a must be non-empty

In [537]:
epsilon = 1e-9
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

# TODO: play with this
ndmat /= (epsilon + np.std(ndmat, axis = 0))
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

train, test = train_test_split(ndmat)
state['item_similarity_full'] = fast_similarity(ndmat, kind = 'item')
state['item_similarity_sampled'] = fast_similarity(train, kind = 'item')
state['subreddit_mapper'] = subreddit_mapper
state['idx_mapper'] = {v: k for k, v in subreddit_mapper.iteritems()}

ValueError: Cannot take a larger sample than population when 'replace=False'

In [567]:
reload(cf_numpy)
cf_numpy.init(ndmat, subreddit_mapper)

In [571]:
cf_numpy.related_subs('movies')

[u'movies',
 u'MMA',
 u'Amd',
 u'AskReddit',
 u'SquaredCircle',
 u'australia',
 u'Games',
 u'teslamotors',
 u'politics',
 u'funny']

In [523]:
top_k_movies(item_similarity, subreddit_to_idx(u'news'), k = 10)

[u'news',
 u'politics',
 u'gameofthrones',
 u'Overwatch',
 u'worldnews',
 u'BlackPeopleTwitter',
 u'DotA2',
 u'funny',
 u'science',
 u'relationships']

In [524]:
top_k_movies(item_similarity, subreddit_to_idx(u'leagueoflegends'), k = 10)

[u'leagueoflegends',
 u'FireEmblemHeroes',
 u'nba',
 u'DotA2',
 u'SquaredCircle',
 u'2007scape',
 u'RocketLeagueExchange',
 u'nfl',
 u'Bitcoin',
 u'ethtrader']

In [297]:
predictions = predict_nobias(train, item_similarity, kind = 'item')

In [300]:
predictions.shape

(227, 100)

In [258]:
idx_to_subreddit(90)

u'FashionReps'

In [224]:
predictions.shape

(227, 101)

In [337]:
a_ij.entries.count()

465503

In [103]:
tf_ij.numCols()

43758L

In [323]:
entropy_i.numRows()

17045L

In [None]:
logp_ij = coordinateMatrixElementwise

In [203]:
activity_df.toPandas()

Unnamed: 0,rid,activity
0.0,1,1
1.0,2,1
2.0,3,1
3.0,4,2
4.0,5,1
5.0,6,295
6.0,7,32
7.0,8,9
8.0,9,4
9.0,10,3


In [15]:
from operator import add

# TODO: check that zero entries are correctly filtered
def coordinateMatrixMultiply(leftmat, rightmat):
    m = leftmat.entries.map(lambda entry: (entry.j, (entry.i, entry.value)))
    n = rightmat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
    product_entries = m.join(n)\
    .map(lambda tup: ((tup[1][0][0], tup[1][1][0]), (tup[1][0][1] * tup[1][1][1])))\
    .reduceByKey(add)\
    .map(lambda record: MatrixEntry(record[0][0], record[0][1], record[1]))
    
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(product_entries)

In [16]:
def coordinateMatrixAdd(leftmat, rightmat, scalar):
    """
    Return leftmat + scalar * rightmat
    """
    m = leftmat.entries.map(lambda entry: ((entry.i, entry.j), entry.value))
    n = rightmat.entries.map(lambda entry: ((entry.i, entry.j), scalar * entry.value))
    matsum = m.fullOuterJoin(n)\
    .map(lambda tup: MatrixEntry(tup[0][0], tup[0][1],
                                 reduce(add, filter(lambda elt: elt is not None, tup[1]))))
    
    #return matsum
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(matsum)

In [360]:
test_vec = np.array([[1,1,0]]).T
test_vec2 = np.array([[1,0,0]]).T

In [356]:
coordinate_vector_matrix_norm(ndarr_to_coord_array(test_vec))

1.4142135623730951

In [665]:
sort_row_indices_by_distance(a_ij, coordinatematrix_get_row(a_ij, 37))

[33L,
 3L,
 36L,
 6L,
 39L,
 9L,
 42L,
 12L,
 45L,
 15L,
 48L,
 18L,
 51L,
 21L,
 54L,
 24L,
 57L,
 27L,
 30L,
 1L,
 34L,
 4L,
 37L,
 7L,
 40L,
 10L,
 43L,
 13L,
 46L,
 16L,
 49L,
 19L,
 52L,
 22L,
 55L,
 25L,
 58L,
 28L,
 31L,
 32L,
 2L,
 35L,
 5L,
 38L,
 8L,
 41L,
 11L,
 44L,
 14L,
 47L,
 17L,
 50L,
 20L,
 53L,
 23L,
 56L,
 26L,
 59L,
 29L]

In [None]:
sort_row_indices_by_distance()

In [484]:
test_array5 = 2 * np.array([[1, 0, 0], [1, 1, 0], [0, 0,  1]])
vec = 2 * np.array([[1, 1, 0.001]])

In [487]:
cm = ndarr_to_coord_array(test_array5)
cv = ndarr_to_coord_array(vec)

In [563]:
td = coordinatematrix_get_row(cmat, 8797)

In [564]:
td.entries.count()

1

In [537]:
sort_row_indices_by_distance(cm, cv)

[1L, 0L, 2L]

In [532]:
v = Vectors.sparse(3, [(1, 1), (2, 1)])

In [533]:
v.norm(2)

1.4142135623730951

In [492]:
cv.entries.map(lambda entry: (entry.j, entry.value)).collect()

[(0L, 2.0), (1L, 2.0), (2L, 0.002)]

In [599]:
v = ndarray_to_sparse_vector(np.zeros(5))

In [600]:
sparse_vector_to_ndarray(v)

array([ 0.,  0.,  0.,  0.,  0.])

In [495]:
v = coordinatematrix_to_sparse_vector(cv)

In [466]:
mat_j.fullOuterJoin(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).collect()

[(1L, 0.0), (0L, 0.0), (1L, 0.0), (2L, 3.992004)]

In [454]:
mat_j.join(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).reduceByKey(add).collect()

[(0L, 0.0), (1L, 0.0)]

In [453]:
mat_j.join(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).reduceByKey(add)\
    .map(lambda tup: (tup[1], tup[0])).sortByKey().map(lambda tup: tup[1]).collect()

[0L, 1L]

In [439]:
ndarr_to_coord_array(test_array2).entries.map(lambda entry: (entry.i, [(entry.i, entry.j)]))\
.reduceByKey(add).collect()

[(0L, [(0L, 0L), (0L, 1L), (0L, 2L)]),
 (1L, [(1L, 0L), (1L, 2L), (1L, 3L)]),
 (2L, [(2L, 2L), (2L, 3L)])]

In [361]:
coordinate_matrix_vector_l2(ndarr_to_coord_array(test_vec), ndarr_to_coord_array(test_vec))

0.0

In [371]:
vec1, vec2 = ndarr_to_coord_array(test_vec), ndarr_to_coord_array(test_vec2)

In [372]:
norm1, norm2 = map(coordinate_vector_matrix_norm, [vec1, vec2])

vec1normed = coordinateMatrixScalarMult(vec1, 1./norm1)
vec2normed = coordinateMatrixScalarMult(vec2, 1./norm2)

diff = coordinateMatrixAdd(vec1normed, vec2normed, -1.)

In [373]:
diff.entries.collect()

[MatrixEntry(0, 0, -0.292893218813), MatrixEntry(1, 0, 0.707106781187)]

In [380]:
coordinate_matrix_vector_l2(ndarr_to_coord_array(test_vec), ndarr_to_coord_array(2 * test_vec2))

0.76536686473017945

In [654]:
v.__dict__

{'indices': array([], dtype=int32),
 'size': 5,
 'values': array([], dtype=float64)}

In [379]:
np.linalg.norm(test_vec.T[0] / np.linalg.norm(test_vec) - test_vec2.T[0] / np.linalg.norm(test_vec2))

0.76536686473017945

In [375]:
mat = ndarr_to_coord_array(test_array2)
mat.entries.filter(lambda entry: entry.i == 2).map(lambda entry: collect()

SyntaxError: invalid syntax (<ipython-input-375-dc50cd5dbed2>, line 2)

Implement LSA

Test stuff

In [94]:
test_coordmat2 = ndarr_to_coord_array(test_array2)

In [97]:
test_coordmat2.entries.filter(lambda entry: entry.value != 0).collect()

[MatrixEntry(0, 0, 1.0),
 MatrixEntry(0, 1, 2.0),
 MatrixEntry(0, 2, 3.0),
 MatrixEntry(1, 0, 4.0),
 MatrixEntry(1, 2, 6.0),
 MatrixEntry(1, 3, 1.0),
 MatrixEntry(2, 2, 8.0),
 MatrixEntry(2, 3, 2.0)]

In [244]:
vec = ndarr_to_coord_array(np.array([[2, 2, 2, 2]]))
mat = ndarr_to_coord_array(np.ones((4, 4)))

m = mat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
v = vec.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
matdiv = m.join(v).map(lambda tup: MatrixEntry(tup[0], tup[1][0][0], float(tup[1][0][1]) / tup[1][1][1]))

In [263]:
eval_matrix_binop(2 * np.ones((4, 4)), 2 * np.ones((4, 4)), coordinate_matrix_elementwise_matrix_multiplication)

(<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde2b3eba90>,
 array([[ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.]]))

In [269]:
sumjtest = coordinate_matrix_sumj(ndarr_to_coord_array(np.ones((3, 4))))

In [270]:
coordinate_matrix_to_ndarr(sumjtest)

array([[ 4.],
       [ 4.],
       [ 4.]])

In [296]:
test_array2

array([[ 1.,  2.,  3.,  0.],
       [ 4.,  0.,  6.,  1.],
       [ 0.,  0.,  8.,  2.]])

In [300]:
eval_matrix_binop(test_array2, np.array([[2, 3, 4]]).T, coordinatematrix_multiply_vector_elementwise)

(<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde20ca84d0>,
 array([[  2.,   4.,   6.,   0.],
        [ 12.,   0.,  18.,   3.],
        [  0.,   0.,  32.,   8.]]))

In [351]:
coordinate_matrix_row(ndarr_to_coord_array(test_array2), 1).entries.collect()

[MatrixEntry(0, 0, 4.0), MatrixEntry(0, 2, 6.0), MatrixEntry(0, 3, 1.0)]

In [125]:
test_add()
test_multiply()

In [78]:
coordinate_matrix_to_ndarr(coordinateMatrixScalarMult(ndarr_to_coord_array(test_array2), 1.5))

array([[  1.5,   3. ,   4.5,   0. ],
       [  6. ,   0. ,   9. ,   1.5],
       [  0. ,   0. ,  12. ,   3. ]])

In [92]:
coordinate_matrix_to_ndarr(coordinateMatrixElementwise(ndarr_to_coord_array(test_array2), lambda n: np.log(n)))

array([[ 0.        ,  0.69314718,  1.09861229,        -inf],
       [ 1.38629436,        -inf,  1.79175947,  0.        ],
       [       -inf,        -inf,  2.07944154,  0.69314718]])

In [93]:
test_array2

array([[ 1.,  2.,  3.,  0.],
       [ 4.,  0.,  6.,  1.],
       [ 0.,  0.,  8.,  2.]])

SVD implementation

In [80]:
from random import normalvariate

In [85]:
cmat.numCols()

347112L

In [83]:
from random import normalvariate
def random_unit(n):
    vec = np.array([normalvariate(0, 1) for _ in range(n)])
    return vec / np.linalg.norm(vec)

def SVD_1D(A, threshold = 1e-10):
    """
    A : CoordinateMatrix
    """
    n, m = A.numRows(), A.numCols()
    x = random_unit(m)
    # TODO finish this

In [None]:
import numpy as np
from numpy.linalg import norm

def svd_1d(A, epsilon=1e-10):
    ''' The one-dimensional SVD '''
    n, m = A.shape
    x = randomUnitVector(m)
    lastV = None
    currentV = x
    B = np.dot(A.T, A)
 
    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentV = currentV / norm(currentV)
 
        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV

In [62]:
m2 = coordinateMatrixAdd(cmat, cmat, -2)

In [64]:
m2.entries.take(2)

[MatrixEntry(3, 9, -2.0), MatrixEntry(8, 4, -1.0)]

In [25]:
matsum = coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array3), -2)

In [29]:
cmat = CoordinateMatrix(bare_occurrences.rdd.map(tuple))

In [43]:
cmat.entries.take(2)

[MatrixEntry(1, 176355, 1.0), MatrixEntry(2, 2547, 1.0)]

In [74]:
mul.entries.count()

595811

In [41]:
cmat.entries.count()

475392

In [45]:
cmb = cmat.toBlockMatrix()

In [47]:
cmi = cmb.toIndexedRowMatrix()

In [51]:
row = cmi.rows.take(1)

[IndexedRow(3558, (347112,[54015,294324],[1.0,2.0]))]

In [144]:
sqlContext.sql("""
select max(uid)
from occurrences
""").toPandas()

Unnamed: 0,max(uid)
0,341578


In [57]:
occurrences.toPandas()

Unnamed: 0,subreddit,author,tally,uid,rid
0.0,zyramains,TricolorStar,1,176355,1
1.0,zylzon,youknowitsyaboy,1,2547,2
2.0,zweiteliga,Saminka,1,206115,3
3.0,zurich,maxwellmaxen,2,72503,4
4.0,zsh,__soddit,1,159502,5
5.0,zootopia,thawed_caveman,2,22475,6
6.0,zootopia,speisekarte,11,29937,6
7.0,zootopia,rodrigogirao,6,42380,6
8.0,zootopia,phobos136,4,53058,6
9.0,zootopia,midnightopheliac,9,70104,6


In [89]:
# TODO: why doesn't this match rid?
cmat.numRows()

17085L

In [71]:
import numpy as np

In [74]:
np.array(df['tally']).min()

-632

In [69]:
sqlContext.sql("""
select author, subreddit, sum (score)
from test
group by author, subreddit
""").toPandas()

Unnamed: 0,author,subreddit,sum(score)
0.0,FormulaXDGame,formula1,31
1.0,HelluvaDeke,gaming,2
2.0,WhoKnowsWho2,xboxone,70
3.0,listentohim,DBZDokkanBattle,1
4.0,sevenzig,AskOuija,10
5.0,LumpyPick,leagueoflegends,4
6.0,volvostupidshit,Philippines,6
7.0,i_fuck_goats777,AskReddit,2
8.0,Ineeditunesalot,battlefield_4,11
9.0,martianlawrence,movies,11


In [78]:
sj.toPandas()

Unnamed: 0,author,author_cakeday,author_flair_css_class,author_flair_text,body,can_gild,collapsed,collapsed_reason,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,parent_id,retrieved_on,score,stickied,Unnamed: 21
0.0,stunt_penguin,,,,Wheelchairs make amazing dollys :D,True,False,,0,1501545600,,false,0,dkznc8h,False,t3_6qp8sw,t1_dkzbnn3,1503654247,1,False,...
1.0,[deleted],,,,[removed],True,False,,0,1501545600,,false,0,dkznc8i,False,t3_6qse6i,t1_dkzmgc3,1503654247,2,False,...
2.0,69ing,,,,I used to watch the shit out of these guys,True,False,,0,1501545600,,false,0,dkznc8j,False,t3_6qs8i1,t3_6qs8i1,1503654247,5,False,...
3.0,ArchadianJudge,,Archadianflair,,http://www.pixiv.net/member_illust.php?mode=me...,True,False,,0,1501545600,,false,0,dkznc8k,True,t3_6qsicx,t3_6qsicx,1503654247,2,False,...
4.0,sglville,,,,On the other yand you could say it's capitalis...,True,False,,0,1501545600,,false,0,dkznc8l,False,t3_6qryxf,t3_6qryxf,1503654247,2,False,...
5.0,NEWORLDODOR,,,,I'm not arguing that making university free an...,True,False,,0,1501545600,,false,0,dkznc8m,True,t3_6qrr1o,t1_dkzn053,1503654247,3,False,...
6.0,zachwad22,,,,"I like whisper-whistle obsessively, just barel...",True,False,,0,1501545600,,false,0,dkznc8n,False,t3_6qoe6s,t3_6qoe6s,1503654247,1,False,...
7.0,lurker4lyfe6969,,,,Served in the Chair Force. Can confirm,True,False,,0,1501545600,,false,0,dkznc8o,False,t3_6qodwi,t3_6qodwi,1503654247,2,False,...
8.0,SuburbanStoner,,,,"Nah, it's good right here",True,False,,0,1501545600,,false,0,dkznc8p,False,t3_6qn002,t1_dkz6ig2,1503654247,2,False,...
9.0,ensanguine,,,,I still stand by it not belonging on the plate...,True,False,,0,1501545600,,false,0,dkznc8q,False,t3_6qscet,t1_dkzn9vn,1503654247,18,False,...


In [27]:
rawDF = sqlContext.read.json("s3n://reddit-comments/2015/RC_2015-05", StructType(fields)).persist(StorageLevel.MEMORY_AND_DISK_SER)

In [28]:
rawDF.registerTempTable('comments')

In [574]:
sj.select

<bound method DataFrame.select of DataFrame[author: string, score: bigint, controversiality: bigint, subreddit: string]>