In [603]:
import pyspark

In [2]:
import numpy as np

In [581]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry
from pyspark.sql.types import *
from operator import add
from pyspark.mllib.linalg import Vectors, ArrayType

In [4]:
minimal_fields = [ 
          StructField("author", StringType(), True),
          StructField("score", LongType(), True),
          StructField("controversiality", LongType(), True),
          StructField("subreddit", StringType(), True)]

In [5]:
sj = sqlContext.read.json("s3a://insight-ohoidn/sample3.json", StructType(minimal_fields))

In [5]:
sj.printSchema()

root
 |-- author: string (nullable = true)
 |-- score: long (nullable = true)
 |-- subreddit: string (nullable = true)



In [115]:
sj.toPandas()

Unnamed: 0,author,score,subreddit
0.0,stunt_penguin,1,Filmmakers
1.0,[deleted],2,Addons4Kodi
2.0,69ing,5,NotTimAndEric
3.0,ArchadianJudge,2,Saber
4.0,sglville,2,The_Donald
5.0,NEWORLDODOR,3,JordanPeterson
6.0,zachwad22,1,AskReddit
7.0,lurker4lyfe6969,2,Military
8.0,SuburbanStoner,2,trashy
9.0,ensanguine,18,KitchenConfidential


In [6]:
sj.registerTempTable('test')

In [52]:
sqlContext.sql('select author from test').toPandas()

Unnamed: 0,author
0.0,stunt_penguin
1.0,[deleted]
2.0,69ing
3.0,ArchadianJudge
4.0,sglville
5.0,NEWORLDODOR
6.0,zachwad22
7.0,lurker4lyfe6969
8.0,SuburbanStoner
9.0,ensanguine


In [572]:
sqlContext

<pyspark.sql.context.SQLContext at 0x7f83fc371350>

In [469]:
occurrences = sqlContext.sql("""
select *, dense_rank() over (order by subreddit desc) as rid 
from  (SELECT subreddit, author, sum(sign(score)) as tally,\
    count(score) as activity, dense_rank() over (order by author desc) as uid
from test
group by subreddit, author)
where tally!=0
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
occurrences.registerTempTable('occurrences')

In [470]:
def df_most_active_subreddits(k = 1000):
    most_active = sqlContext.sql("""
    select * from
        (select *, dense_rank() over (order by activity desc) as ordered_id
        from (select rid, subreddit, sum(activity) as activity
            from occurrences
            group by rid, subreddit))
    where ordered_id<=%s
""" % k).persist(StorageLevel.MEMORY_AND_DISK_SER)
    most_active.registerTempTable('most_active')
    return most_active

In [475]:
max_numreddits = 100
act = df_most_active_subreddits(max_numreddits)

In [476]:
act.toPandas()

Unnamed: 0,rid,subreddit,activity,ordered_id
0.0,16312,AskReddit,69641,1
1.0,2571,politics,18724,2
2.0,15972,BigBrother,17430,3
3.0,9395,SquaredCircle,14014,4
4.0,8796,The_Donald,11711,5
5.0,5120,gameofthrones,10861,6
6.0,3242,nba,9613,7
7.0,5182,funny,9236,8
8.0,10051,RocketLeagueExchange,8503,9
9.0,441,videos,7850,10


In [477]:
def df_valid_users(min_subreddits = 2, max_subreddits = 20):
    """
    filter users by the number of subreddits they've posted in, among the above-defined most active subreddits
    """
    most_active_users = sqlContext.sql("""
    select * from
        (select author, count(subreddit) as count
        from 
            (select * from occurrences
            where subreddit in (select subreddit from most_active))
        group by author
        order by count desc)
    where count>=%d and count<=%d""" % (min_subreddits, max_subreddits)).persist(StorageLevel.MEMORY_AND_DISK_SER)
    most_active_users.registerTempTable('most_active_users')
    return most_active_users

In [495]:
act_users = df_valid_users(min_subreddits = 4)

In [496]:
act_users.toPandas()

Unnamed: 0,author,count
0.0,JeopardyQBot,19
1.0,_youtubot_,18
2.0,Mentioned_Videos,17
3.0,imguralbumbot,17
4.0,HelperBot_,16
5.0,RemindMeBot,16
6.0,LinkFixBot,15
7.0,youtubefactsbot,12
8.0,Gyazo_Bot,11
9.0,timezone_bot,11


In [497]:
test2 = sqlContext.sql("""
select test.author, test.score, test.controversiality, test.subreddit, most_active.ordered_id as ordered_id
from test
inner join most_active on most_active.subreddit=test.subreddit""")
test2.registerTempTable('test2')

In [498]:
occurrences_pruned = sqlContext.sql("""
select *
from  (SELECT test2.subreddit, author, test2.ordered_id, sum(score) as tally,\
    sum(abs(score)) as activity, dense_rank() over (order by author desc) as uid
    from test2
    where author in (select author from most_active_users)
    group by test2.subreddit, test2.ordered_id, author)
where tally!=0
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
occurrences_pruned.registerTempTable('occurrences_pruned')

In [499]:
occurrences_pruned.toPandas()

Unnamed: 0,subreddit,author,ordered_id,tally,activity,uid
0.0,relationships,zylithi,32,3,3,1
1.0,legaladvice,zylithi,67,4,4,1
2.0,politics,zylithi,2,1,1,1
3.0,AskReddit,zylithi,1,14,14,1
4.0,BigBrother,zotsandcrambles,3,7,7,2
5.0,pokemongo,zotsandcrambles,59,19,19,2
6.0,todayilearned,zotsandcrambles,22,8,8,2
7.0,freefolk,zotsandcrambles,40,10,10,2
8.0,politics,zetec,2,15,15,3
9.0,nfl,zetec,55,6,28,3


In [501]:
bare_occurrences = sqlContext.sql("""
select ordered_id, uid, tally
from occurrences_pruned
""").persist(StorageLevel.MEMORY_AND_DISK_SER)

In [502]:
bare_occurrences.toPandas()

Unnamed: 0,ordered_id,uid,tally
0.0,32,1,3
1.0,67,1,4
2.0,2,1,1
3.0,1,1,14
4.0,3,2,7
5.0,59,2,19
6.0,22,2,8
7.0,40,2,10
8.0,2,3,15
9.0,55,3,6


### Subreddit activity:

In [738]:
df_most_active_subreddits(10).toPandas()

Unnamed: 0,rid,subreddit,activity,ordered_id
0,16313,AskReddit,1143263,1
1,2571,politics,205263,2
2,5120,gameofthrones,183469,3
3,441,videos,170466,4
4,15973,BigBrother,156128,5
5,900,todayilearned,109733,6
6,3242,nba,97974,7
7,5109,gaming,95671,8
8,8796,The_Donald,94572,9
9,9396,SquaredCircle,94065,10


In [91]:
tf_ij.entries.map(lambda entry: entry.i).distinct().count()

100

In [92]:
gf_i.numCols()

1L

In [93]:
tf_ij.numRows()

101L

In [877]:
act.toPandas()

Unnamed: 0,rid,subreddit,activity,ordered_id
0.0,16313,AskReddit,1143263,1
1.0,2571,politics,205263,2
2.0,5120,gameofthrones,183469,3
3.0,441,videos,170466,4
4.0,15973,BigBrother,156128,5
5.0,900,todayilearned,109733,6
6.0,3242,nba,97974,7
7.0,5109,gaming,95671,8
8.0,8796,The_Donald,94572,9
9.0,9396,SquaredCircle,94065,10


In [154]:
bare_occurrences.toPandas()

Unnamed: 0,ordered_id,uid,tally
0.0,47,1,10
1.0,99,1,9
2.0,78,1,2
3.0,34,1,1
4.0,82,1,1
5.0,24,1,10
6.0,61,1,2
7.0,88,1,25
8.0,58,1,13
9.0,40,1,5


In [167]:
bare_occurrences.rdd.map(lambda row: (row.ordered_id, (row.uid, row.tally))).sortByKey().collect()

[(1, (2, 2)),
 (1, (4, 14)),
 (1, (5, 2)),
 (1, (6, 4)),
 (1, (7, 14)),
 (1, (9, 1)),
 (1, (10, -2)),
 (1, (11, 2)),
 (1, (12, 5)),
 (1, (13, 75)),
 (1, (16, 29)),
 (1, (17, 307)),
 (1, (18, 3)),
 (1, (21, 79)),
 (1, (22, 21)),
 (1, (23, 2)),
 (1, (24, 2)),
 (1, (29, 36)),
 (1, (30, 6)),
 (1, (31, 22)),
 (1, (33, 17)),
 (1, (38, 37)),
 (1, (39, 8)),
 (1, (40, 2)),
 (1, (41, 1)),
 (1, (42, 17)),
 (1, (44, 21)),
 (1, (45, 15)),
 (1, (46, 2)),
 (1, (47, 14)),
 (1, (48, 9)),
 (1, (49, 12)),
 (1, (50, 12)),
 (1, (51, 2)),
 (1, (52, 1)),
 (1, (53, 16)),
 (1, (58, 1)),
 (1, (62, 16)),
 (1, (63, 9)),
 (1, (65, 17)),
 (1, (66, 7)),
 (1, (67, 12)),
 (1, (71, 15)),
 (1, (72, 8)),
 (1, (73, 10597)),
 (1, (74, 3)),
 (1, (75, 1031)),
 (1, (79, 1)),
 (1, (81, 167)),
 (1, (82, 6)),
 (1, (85, 2)),
 (1, (87, 1)),
 (1, (88, 57)),
 (1, (89, 1)),
 (1, (90, 47)),
 (1, (91, 14)),
 (1, (92, 48)),
 (1, (93, 16)),
 (1, (94, 1)),
 (1, (95, 4)),
 (1, (96, 10)),
 (1, (97, 4)),
 (1, (98, 3)),
 (1, (99, 111)),
 (1, 

In [503]:
i_sumtally_tuples = bare_occurrences.rdd.map(lambda row: (row.ordered_id, row.tally)).sortByKey()\
.reduceByKey(add)
gf_i = CoordinateMatrix(i_sumtally_tuples.map(lambda entry: (entry[0], 0, entry[1])))

# subreddit-activity matrix
tf_ij = CoordinateMatrix(bare_occurrences.rdd.map(lambda row: (row.ordered_id, (row.uid, row.tally))).sortByKey()\
.map(lambda entry: (entry[0], entry[1][0], entry[1][1])))
nusers = tf_ij.numCols()

p_ij = coordinate_matrix_elementwise_vector_division(tf_ij, gf_i)

# TODO figure out nan issues
logp_ij = coordinateMatrixElementwise(p_ij, lambda elt: np.log((abs(elt))/np.log(nusers)))

entropy_i = coordinateMatrixElementwise(coordinate_matrix_sumj(logp_ij), lambda elt: elt + 1) # + 1, but this has to be handled separately to conserve matrix sparsity

In [602]:
p_ij.numRows()

101L

In [506]:
tf_ij.numRows()

101L

In [504]:
def transform_tf_if(val):
    newval = np.log(np.abs(val) + 1)
    if val < 0:
        return -newval
    else:
        return newval

# TODO: maybe save this for later, when noise can be reduced via dimensionality reduction
log_tf_ij = coordinateMatrixElementwise(tf_ij, transform_tf_if)

a_ij = coordinatematrix_multiply_vector_elementwise(log_tf_ij, entropy_i)

In [37]:
def predict_nobias(ratings, similarity, kind='user'):
    if kind == 'user':
        user_bias = ratings.mean(axis=1)
        ratings = (ratings - user_bias[:, np.newaxis]).copy()
        pred = similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
        pred += user_bias[:, np.newaxis]
    elif kind == 'item':
        item_bias = ratings.mean(axis=0)
        ratings = (ratings - item_bias[np.newaxis, :]).copy()
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        pred += item_bias[np.newaxis, :]
        
    return pred

In [93]:
def predict_topk(ratings, similarity, kind='user', k=40):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in xrange(ratings.shape[0]):
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in xrange(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in xrange(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in xrange(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred

In [38]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [259]:
subreddit_mapper = dict(act.rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())
idx_mapper = {v: k for k, v in subreddit_mapper.iteritems()}

def idx_to_subreddit(idx):
    return subreddit_mapper[idx + 1]

def subreddit_to_idx(sub):
    return idx_mapper[sub] - 1

In [131]:
def top_k_movies(similarity, movie_idx, k=6):
    return [idx_to_subreddit(x) for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

[u'politics',
 u'television',
 u'worldnews',
 u'pics',
 u'OldSchoolCool',
 u'buildapc']

In [150]:
idx_mapper['DotA2']

15

In [80]:
idx_to_subreddit(0)

u'AskReddit'

In [68]:
get_mse(predict_nobias, test)

TypeError: 'function' object has no attribute '__getitem__'

In [843]:
np.ones((2, 2)).dot(np.ones((2, 1)))

array([[ 2.],
       [ 2.]])

In [756]:
bare_occurrences.toPandas()

Unnamed: 0,rid,uid,tally
0.0,1,173573,1
1.0,2,2510,1
2.0,3,202843,1
3.0,4,71343,2
4.0,5,157008,1
5.0,6,22113,2
6.0,6,29476,11
7.0,6,41712,6
8.0,6,52207,4
9.0,6,68993,9


In [796]:
a_ij.numCols()

57010L

In [350]:
np.mean(ndmat, axis = 0).shape

(100,)

In [529]:
ndmat = coordinate_matrix_to_ndarr(tf_ij)
#ndmat/=np.mean(ndmat)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
train, test = train_test_split(ndmat)
similarity = fast_similarity(train)

In [408]:
def func1(ndmat):
    ndmat = ndmat.copy()
    ndmat /= (epsilon + np.std(ndmat, axis = 0))
    ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)
    return ndmat

In [411]:
matfuncs = [func1]
def findtop_allmodels(sub_name):
    ndmat = coordinate_matrix_to_ndarr(tf_ij)
    ndmat = ndmat.T
    ndmat = ndmat[1:, 1:]
    
    def do_one(func, ndmat):
        ndmat = func(ndmat)
        train, test = train_test_split(ndmat)
        similarity = fast_similarity(train)

        item_similarity = fast_similarity(train, kind = 'item')
        print top_k_movies(item_similarity, subreddit_to_idx(sub_name), k = 10)
    [do_one(f, ndmat) for f in matfuncs]

In [426]:
import pickle

In [427]:
import md5

  if __name__ == '__main__':


In [539]:
epsilon = 1e-9
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

#ndmat = np.mean(ndmat, axis = 0)
ndmat /= (epsilon + np.std(ndmat, axis = 0))
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

train, test = train_test_split(ndmat)
similarity = fast_similarity(ndmat)


item_similarity = fast_similarity(ndmat, kind = 'item')

md5.md5(pickle.dumps(train)).digest()

'\x15\xb3$\xd6\xe6\x1c\xc0o.\xb4U\x8eZ\xa3\x18\xb6'

In [266]:
subreddit_mapper

{1: u'AskReddit',
 2: u'politics',
 3: u'BigBrother',
 4: u'SquaredCircle',
 5: u'The_Donald',
 6: u'gameofthrones',
 7: u'nba',
 8: u'funny',
 9: u'RocketLeagueExchange',
 10: u'videos',
 11: u'worldnews',
 12: u'leagueoflegends',
 13: u'soccer',
 14: u'movies',
 15: u'DotA2',
 16: u'Showerthoughts',
 17: u'news',
 18: u'pics',
 19: u'gaming',
 20: u'MMA',
 21: u'teenagers',
 22: u'todayilearned',
 23: u'hearthstone',
 24: u'pathofexile',
 25: u'pcmasterrace',
 26: u'PUBATTLEGROUNDS',
 27: u'Overwatch',
 28: u'baseball',
 29: u'anime',
 30: u'NYYankees',
 31: u'asoiaf',
 32: u'relationships',
 33: u'Philippines',
 34: u'FireEmblemHeroes',
 35: u'me_irl',
 36: u'aww',
 37: u'gonewild',
 38: u'conspiracy',
 39: u'gifs',
 40: u'BlackPeopleTwitter',
 41: u'DBZDokkanBattle',
 42: u'hiphopheads',
 43: u'rupaulsdragrace',
 44: u'mildlyinteresting',
 45: u'Rainbow6',
 46: u'australia',
 47: u'ethtrader',
 48: u'GlobalOffensive',
 49: u'TheSilphRoad',
 50: u'thebachelor',
 51: u'GlobalOffensiv

In [304]:
subreddit_mapper

{1: u'AskReddit',
 2: u'politics',
 3: u'BigBrother',
 4: u'SquaredCircle',
 5: u'The_Donald',
 6: u'gameofthrones',
 7: u'nba',
 8: u'funny',
 9: u'RocketLeagueExchange',
 10: u'videos',
 11: u'worldnews',
 12: u'leagueoflegends',
 13: u'soccer',
 14: u'movies',
 15: u'DotA2',
 16: u'Showerthoughts',
 17: u'news',
 18: u'pics',
 19: u'gaming',
 20: u'MMA',
 21: u'teenagers',
 22: u'todayilearned',
 23: u'hearthstone',
 24: u'pathofexile',
 25: u'pcmasterrace',
 26: u'PUBATTLEGROUNDS',
 27: u'Overwatch',
 28: u'baseball',
 29: u'anime',
 30: u'NYYankees',
 31: u'asoiaf',
 32: u'relationships',
 33: u'Philippines',
 34: u'FireEmblemHeroes',
 35: u'me_irl',
 36: u'aww',
 37: u'gonewild',
 38: u'conspiracy',
 39: u'gifs',
 40: u'BlackPeopleTwitter',
 41: u'DBZDokkanBattle',
 42: u'hiphopheads',
 43: u'rupaulsdragrace',
 44: u'mildlyinteresting',
 45: u'Rainbow6',
 46: u'australia',
 47: u'ethtrader',
 48: u'GlobalOffensive',
 49: u'TheSilphRoad',
 50: u'thebachelor',
 51: u'GlobalOffensiv

In [333]:
top_k_movies(item_similarity, subreddit_to_idx(u'videos'), k = 10)

[u'videos',
 u'anime',
 u'movies',
 u'Amd',
 u'gaming',
 u'hockey',
 u'hearthstone',
 u'rickandmorty',
 u'news',
 u'ffxiv']

In [444]:
findtop_allmodels('videos')

[u'videos', u'movies', u'anime', u'Amd', u'gaming', u'pics', u'asoiaf', u'rickandmorty', u'AdviceAnimals', u'nba']


In [530]:
import cf_numpy
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
subreddit_mapper = dict(act.rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())

In [535]:
%pdb

Automatic pdb calling has been turned OFF


In [541]:
ndmat = coordinate_matrix_to_ndarr(tf_ij)

In [543]:
train_test_split(ndmat)

ValueError: a must be non-empty

In [537]:
epsilon = 1e-9
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

# TODO: play with this
ndmat /= (epsilon + np.std(ndmat, axis = 0))
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

train, test = train_test_split(ndmat)
state['item_similarity_full'] = fast_similarity(ndmat, kind = 'item')
state['item_similarity_sampled'] = fast_similarity(train, kind = 'item')
state['subreddit_mapper'] = subreddit_mapper
state['idx_mapper'] = {v: k for k, v in subreddit_mapper.iteritems()}

ValueError: Cannot take a larger sample than population when 'replace=False'

In [567]:
reload(cf_numpy)
cf_numpy.init(ndmat, subreddit_mapper)

In [571]:
cf_numpy.related_subs('movies')

[u'movies',
 u'MMA',
 u'Amd',
 u'AskReddit',
 u'SquaredCircle',
 u'australia',
 u'Games',
 u'teslamotors',
 u'politics',
 u'funny']

In [523]:
top_k_movies(item_similarity, subreddit_to_idx(u'news'), k = 10)

[u'news',
 u'politics',
 u'gameofthrones',
 u'Overwatch',
 u'worldnews',
 u'BlackPeopleTwitter',
 u'DotA2',
 u'funny',
 u'science',
 u'relationships']

In [524]:
top_k_movies(item_similarity, subreddit_to_idx(u'leagueoflegends'), k = 10)

[u'leagueoflegends',
 u'FireEmblemHeroes',
 u'nba',
 u'DotA2',
 u'SquaredCircle',
 u'2007scape',
 u'RocketLeagueExchange',
 u'nfl',
 u'Bitcoin',
 u'ethtrader']

In [297]:
predictions = predict_nobias(train, item_similarity, kind = 'item')

In [300]:
predictions.shape

(227, 100)

In [258]:
idx_to_subreddit(90)

u'FashionReps'

In [224]:
predictions.shape

(227, 101)

In [337]:
a_ij.entries.count()

465503

In [103]:
tf_ij.numCols()

43758L

In [323]:
entropy_i.numRows()

17045L

In [None]:
logp_ij = coordinateMatrixElementwise

In [203]:
activity_df.toPandas()

Unnamed: 0,rid,activity
0.0,1,1
1.0,2,1
2.0,3,1
3.0,4,2
4.0,5,1
5.0,6,295
6.0,7,32
7.0,8,9
8.0,9,4
9.0,10,3


In [15]:
from operator import add

# TODO: check that zero entries are correctly filtered
def coordinateMatrixMultiply(leftmat, rightmat):
    m = leftmat.entries.map(lambda entry: (entry.j, (entry.i, entry.value)))
    n = rightmat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
    product_entries = m.join(n)\
    .map(lambda tup: ((tup[1][0][0], tup[1][1][0]), (tup[1][0][1] * tup[1][1][1])))\
    .reduceByKey(add)\
    .map(lambda record: MatrixEntry(record[0][0], record[0][1], record[1]))
    
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(product_entries)

In [16]:
def coordinateMatrixAdd(leftmat, rightmat, scalar):
    """
    Return leftmat + scalar * rightmat
    """
    m = leftmat.entries.map(lambda entry: ((entry.i, entry.j), entry.value))
    n = rightmat.entries.map(lambda entry: ((entry.i, entry.j), scalar * entry.value))
    matsum = m.fullOuterJoin(n)\
    .map(lambda tup: MatrixEntry(tup[0][0], tup[0][1],
                                 reduce(add, filter(lambda elt: elt is not None, tup[1]))))
    
    #return matsum
    return pyspark.mllib.linalg.distributed.CoordinateMatrix(matsum)

In [360]:
test_vec = np.array([[1,1,0]]).T
test_vec2 = np.array([[1,0,0]]).T

In [356]:
coordinate_vector_matrix_norm(ndarr_to_coord_array(test_vec))

1.4142135623730951

In [665]:
sort_row_indices_by_distance(a_ij, coordinatematrix_get_row(a_ij, 37))

[33L,
 3L,
 36L,
 6L,
 39L,
 9L,
 42L,
 12L,
 45L,
 15L,
 48L,
 18L,
 51L,
 21L,
 54L,
 24L,
 57L,
 27L,
 30L,
 1L,
 34L,
 4L,
 37L,
 7L,
 40L,
 10L,
 43L,
 13L,
 46L,
 16L,
 49L,
 19L,
 52L,
 22L,
 55L,
 25L,
 58L,
 28L,
 31L,
 32L,
 2L,
 35L,
 5L,
 38L,
 8L,
 41L,
 11L,
 44L,
 14L,
 47L,
 17L,
 50L,
 20L,
 53L,
 23L,
 56L,
 26L,
 59L,
 29L]

In [None]:
sort_row_indices_by_distance()

In [484]:
test_array5 = 2 * np.array([[1, 0, 0], [1, 1, 0], [0, 0,  1]])
vec = 2 * np.array([[1, 1, 0.001]])

In [487]:
cm = ndarr_to_coord_array(test_array5)
cv = ndarr_to_coord_array(vec)

In [563]:
td = coordinatematrix_get_row(cmat, 8797)

In [564]:
td.entries.count()

1

In [537]:
sort_row_indices_by_distance(cm, cv)

[1L, 0L, 2L]

In [532]:
v = Vectors.sparse(3, [(1, 1), (2, 1)])

In [533]:
v.norm(2)

1.4142135623730951

In [492]:
cv.entries.map(lambda entry: (entry.j, entry.value)).collect()

[(0L, 2.0), (1L, 2.0), (2L, 0.002)]

In [599]:
v = ndarray_to_sparse_vector(np.zeros(5))

In [600]:
sparse_vector_to_ndarray(v)

array([ 0.,  0.,  0.,  0.,  0.])

In [495]:
v = coordinatematrix_to_sparse_vector(cv)

In [466]:
mat_j.fullOuterJoin(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).collect()

[(1L, 0.0), (0L, 0.0), (1L, 0.0), (2L, 3.992004)]

In [454]:
mat_j.join(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).reduceByKey(add).collect()

[(0L, 0.0), (1L, 0.0)]

In [453]:
mat_j.join(vec_j).map(lambda tup: (tup[1][0][0], (tup[1][0][1] - tup[1][1])**2)).reduceByKey(add)\
    .map(lambda tup: (tup[1], tup[0])).sortByKey().map(lambda tup: tup[1]).collect()

[0L, 1L]

In [439]:
ndarr_to_coord_array(test_array2).entries.map(lambda entry: (entry.i, [(entry.i, entry.j)]))\
.reduceByKey(add).collect()

[(0L, [(0L, 0L), (0L, 1L), (0L, 2L)]),
 (1L, [(1L, 0L), (1L, 2L), (1L, 3L)]),
 (2L, [(2L, 2L), (2L, 3L)])]

In [361]:
coordinate_matrix_vector_l2(ndarr_to_coord_array(test_vec), ndarr_to_coord_array(test_vec))

0.0

In [371]:
vec1, vec2 = ndarr_to_coord_array(test_vec), ndarr_to_coord_array(test_vec2)

In [372]:
norm1, norm2 = map(coordinate_vector_matrix_norm, [vec1, vec2])

vec1normed = coordinateMatrixScalarMult(vec1, 1./norm1)
vec2normed = coordinateMatrixScalarMult(vec2, 1./norm2)

diff = coordinateMatrixAdd(vec1normed, vec2normed, -1.)

In [373]:
diff.entries.collect()

[MatrixEntry(0, 0, -0.292893218813), MatrixEntry(1, 0, 0.707106781187)]

In [380]:
coordinate_matrix_vector_l2(ndarr_to_coord_array(test_vec), ndarr_to_coord_array(2 * test_vec2))

0.76536686473017945

In [654]:
v.__dict__

{'indices': array([], dtype=int32),
 'size': 5,
 'values': array([], dtype=float64)}

In [379]:
np.linalg.norm(test_vec.T[0] / np.linalg.norm(test_vec) - test_vec2.T[0] / np.linalg.norm(test_vec2))

0.76536686473017945

In [375]:
mat = ndarr_to_coord_array(test_array2)
mat.entries.filter(lambda entry: entry.i == 2).map(lambda entry: collect()

SyntaxError: invalid syntax (<ipython-input-375-dc50cd5dbed2>, line 2)

Implement LSA

Test stuff

In [94]:
test_coordmat2 = ndarr_to_coord_array(test_array2)

In [97]:
test_coordmat2.entries.filter(lambda entry: entry.value != 0).collect()

[MatrixEntry(0, 0, 1.0),
 MatrixEntry(0, 1, 2.0),
 MatrixEntry(0, 2, 3.0),
 MatrixEntry(1, 0, 4.0),
 MatrixEntry(1, 2, 6.0),
 MatrixEntry(1, 3, 1.0),
 MatrixEntry(2, 2, 8.0),
 MatrixEntry(2, 3, 2.0)]

In [244]:
vec = ndarr_to_coord_array(np.array([[2, 2, 2, 2]]))
mat = ndarr_to_coord_array(np.ones((4, 4)))

m = mat.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
v = vec.entries.map(lambda entry: (entry.i, (entry.j, entry.value)))
matdiv = m.join(v).map(lambda tup: MatrixEntry(tup[0], tup[1][0][0], float(tup[1][0][1]) / tup[1][1][1]))

In [263]:
eval_matrix_binop(2 * np.ones((4, 4)), 2 * np.ones((4, 4)), coordinate_matrix_elementwise_matrix_multiplication)

(<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde2b3eba90>,
 array([[ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.],
        [ 4.,  4.,  4.,  4.]]))

In [269]:
sumjtest = coordinate_matrix_sumj(ndarr_to_coord_array(np.ones((3, 4))))

In [270]:
coordinate_matrix_to_ndarr(sumjtest)

array([[ 4.],
       [ 4.],
       [ 4.]])

In [296]:
test_array2

array([[ 1.,  2.,  3.,  0.],
       [ 4.,  0.,  6.,  1.],
       [ 0.,  0.,  8.,  2.]])

In [300]:
eval_matrix_binop(test_array2, np.array([[2, 3, 4]]).T, coordinatematrix_multiply_vector_elementwise)

(<pyspark.mllib.linalg.distributed.CoordinateMatrix at 0x7fde20ca84d0>,
 array([[  2.,   4.,   6.,   0.],
        [ 12.,   0.,  18.,   3.],
        [  0.,   0.,  32.,   8.]]))

In [351]:
coordinate_matrix_row(ndarr_to_coord_array(test_array2), 1).entries.collect()

[MatrixEntry(0, 0, 4.0), MatrixEntry(0, 2, 6.0), MatrixEntry(0, 3, 1.0)]

In [125]:
test_add()
test_multiply()

In [78]:
coordinate_matrix_to_ndarr(coordinateMatrixScalarMult(ndarr_to_coord_array(test_array2), 1.5))

array([[  1.5,   3. ,   4.5,   0. ],
       [  6. ,   0. ,   9. ,   1.5],
       [  0. ,   0. ,  12. ,   3. ]])

In [92]:
coordinate_matrix_to_ndarr(coordinateMatrixElementwise(ndarr_to_coord_array(test_array2), lambda n: np.log(n)))

array([[ 0.        ,  0.69314718,  1.09861229,        -inf],
       [ 1.38629436,        -inf,  1.79175947,  0.        ],
       [       -inf,        -inf,  2.07944154,  0.69314718]])

In [93]:
test_array2

array([[ 1.,  2.,  3.,  0.],
       [ 4.,  0.,  6.,  1.],
       [ 0.,  0.,  8.,  2.]])

SVD implementation

In [None]:
dfi = 

In [80]:
from random import normalvariate


In [85]:
cmat.numCols()

347112L

In [83]:
from random import normalvariate
def random_unit(n):
    vec = np.array([normalvariate(0, 1) for _ in range(n)])
    return vec / np.linalg.norm(vec)

def SVD_1D(A, threshold = 1e-10):
    """
    A : CoordinateMatrix
    """
    n, m = A.numRows(), A.numCols()
    x = random_unit(m)
    # TODO finish this

In [None]:
import numpy as np
from numpy.linalg import norm

def svd_1d(A, epsilon=1e-10):
    ''' The one-dimensional SVD '''
    n, m = A.shape
    x = randomUnitVector(m)
    lastV = None
    currentV = x
    B = np.dot(A.T, A)
 
    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentV = currentV / norm(currentV)
 
        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV

In [62]:
m2 = coordinateMatrixAdd(cmat, cmat, -2)

In [64]:
m2.entries.take(2)

[MatrixEntry(3, 9, -2.0), MatrixEntry(8, 4, -1.0)]

In [25]:
matsum = coordinateMatrixAdd(ndarr_to_coord_array(test_array3), ndarr_to_coord_array(test_array3), -2)

In [29]:
cmat = CoordinateMatrix(bare_occurrences.rdd.map(tuple))

In [43]:
cmat.entries.take(2)

[MatrixEntry(1, 176355, 1.0), MatrixEntry(2, 2547, 1.0)]

In [74]:
mul.entries.count()

595811

In [41]:
cmat.entries.count()

475392

In [45]:
cmb = cmat.toBlockMatrix()

In [47]:
cmi = cmb.toIndexedRowMatrix()

In [51]:
row = cmi.rows.take(1)

[IndexedRow(3558, (347112,[54015,294324],[1.0,2.0]))]

In [144]:
sqlContext.sql("""
select max(uid)
from occurrences
""").toPandas()

Unnamed: 0,max(uid)
0,341578


In [57]:
occurrences.toPandas()

Unnamed: 0,subreddit,author,tally,uid,rid
0.0,zyramains,TricolorStar,1,176355,1
1.0,zylzon,youknowitsyaboy,1,2547,2
2.0,zweiteliga,Saminka,1,206115,3
3.0,zurich,maxwellmaxen,2,72503,4
4.0,zsh,__soddit,1,159502,5
5.0,zootopia,thawed_caveman,2,22475,6
6.0,zootopia,speisekarte,11,29937,6
7.0,zootopia,rodrigogirao,6,42380,6
8.0,zootopia,phobos136,4,53058,6
9.0,zootopia,midnightopheliac,9,70104,6


In [89]:
# TODO: why doesn't this match rid?
cmat.numRows()

17085L

In [71]:
import numpy as np

In [74]:
np.array(df['tally']).min()

-632

In [69]:
sqlContext.sql("""
select author, subreddit, sum (score)
from test
group by author, subreddit
""").toPandas()

Unnamed: 0,author,subreddit,sum(score)
0.0,FormulaXDGame,formula1,31
1.0,HelluvaDeke,gaming,2
2.0,WhoKnowsWho2,xboxone,70
3.0,listentohim,DBZDokkanBattle,1
4.0,sevenzig,AskOuija,10
5.0,LumpyPick,leagueoflegends,4
6.0,volvostupidshit,Philippines,6
7.0,i_fuck_goats777,AskReddit,2
8.0,Ineeditunesalot,battlefield_4,11
9.0,martianlawrence,movies,11


In [78]:
sj.toPandas()

Unnamed: 0,author,author_cakeday,author_flair_css_class,author_flair_text,body,can_gild,collapsed,collapsed_reason,controversiality,created_utc,distinguished,edited,gilded,id,is_submitter,link_id,parent_id,retrieved_on,score,stickied,Unnamed: 21
0.0,stunt_penguin,,,,Wheelchairs make amazing dollys :D,True,False,,0,1501545600,,false,0,dkznc8h,False,t3_6qp8sw,t1_dkzbnn3,1503654247,1,False,...
1.0,[deleted],,,,[removed],True,False,,0,1501545600,,false,0,dkznc8i,False,t3_6qse6i,t1_dkzmgc3,1503654247,2,False,...
2.0,69ing,,,,I used to watch the shit out of these guys,True,False,,0,1501545600,,false,0,dkznc8j,False,t3_6qs8i1,t3_6qs8i1,1503654247,5,False,...
3.0,ArchadianJudge,,Archadianflair,,http://www.pixiv.net/member_illust.php?mode=me...,True,False,,0,1501545600,,false,0,dkznc8k,True,t3_6qsicx,t3_6qsicx,1503654247,2,False,...
4.0,sglville,,,,On the other yand you could say it's capitalis...,True,False,,0,1501545600,,false,0,dkznc8l,False,t3_6qryxf,t3_6qryxf,1503654247,2,False,...
5.0,NEWORLDODOR,,,,I'm not arguing that making university free an...,True,False,,0,1501545600,,false,0,dkznc8m,True,t3_6qrr1o,t1_dkzn053,1503654247,3,False,...
6.0,zachwad22,,,,"I like whisper-whistle obsessively, just barel...",True,False,,0,1501545600,,false,0,dkznc8n,False,t3_6qoe6s,t3_6qoe6s,1503654247,1,False,...
7.0,lurker4lyfe6969,,,,Served in the Chair Force. Can confirm,True,False,,0,1501545600,,false,0,dkznc8o,False,t3_6qodwi,t3_6qodwi,1503654247,2,False,...
8.0,SuburbanStoner,,,,"Nah, it's good right here",True,False,,0,1501545600,,false,0,dkznc8p,False,t3_6qn002,t1_dkz6ig2,1503654247,2,False,...
9.0,ensanguine,,,,I still stand by it not belonging on the plate...,True,False,,0,1501545600,,false,0,dkznc8q,False,t3_6qscet,t1_dkzn9vn,1503654247,18,False,...


In [27]:
rawDF = sqlContext.read.json("s3n://reddit-comments/2015/RC_2015-05", StructType(fields)).persist(StorageLevel.MEMORY_AND_DISK_SER)

In [28]:
rawDF.registerTempTable('comments')

In [574]:
sj.select

<bound method DataFrame.select of DataFrame[author: string, score: bigint, controversiality: bigint, subreddit: string]>