In [1]:
import pyspark

In [2]:
import numpy as np

In [3]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry
from pyspark.sql.types import *
from operator import add
from pyspark.mllib.linalg import Vectors, ArrayType

In [4]:
# Schema handed to the JSON reader: only these three nullable columns are loaded.
minimal_fields = [
    StructField(name="author", dataType=StringType(), nullable=True),
    StructField(name="score", dataType=LongType(), nullable=True),
    StructField(name="subreddit", dataType=StringType(), nullable=True),
]

In [5]:
#def parquet_cache(table_name):
#    def memo(f):
#        

In [6]:
def df_most_active_subreddits(num_subreddit = 1000, npartitions = 18):
    """
    Rank subreddits by total activity and keep the `num_subreddit` busiest ones.

    Reads the `occurrences` temp table, sums `activity` per (rid, subreddit),
    and assigns each subreddit a 1-based `ordered_id` in descending activity
    order.  The result is persisted and registered as the temp table
    `most_active`.

    `row_number()` (with `subreddit` as a tie-breaker) is used so that every
    subreddit receives its own `ordered_id`.  A `dense_rank()` would hand the
    same id to subreddits with equal activity; those could then no longer be
    told apart when the id is used as a matrix row index or dictionary key,
    and more than `num_subreddit` subreddits would pass the filter.

    :param num_subreddit: number of subreddits to keep
    :param npartitions: number of partitions of the returned DataFrame
    :return: the selected rows (rid, subreddit, activity, ordered_id),
             repartitioned into `npartitions` partitions
    """
    most_active = sqlContext.sql("""
    select * from
        (select *, row_number() over (order by activity desc, subreddit asc) as ordered_id
        from (select rid, subreddit, sum(activity) as activity
            from occurrences
            group by rid, subreddit))
        where ordered_id<=%d
""" % num_subreddit).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # The persisted (not yet repartitioned) frame is what later SQL sees by name.
    most_active.registerTempTable('most_active')
    return most_active.repartition(npartitions)

def df_valid_users(min_subreddits = 2, max_subreddits = 20):
    """
    Select the authors whose number of `occurrences` rows inside the
    `most_active` subreddits lies in [min_subreddits, max_subreddits].

    The result (columns `author`, `count`) is persisted, registered as the
    temp table `most_active_users`, and returned.
    """
    query = """
    select * from
        (select author, count(subreddit) as count
        from 
            (select * from occurrences
            where subreddit in (select subreddit from most_active))
        group by author
        order by count desc)
    where count>=%d and count<=%d""" % (min_subreddits, max_subreddits)

    valid_users = sqlContext.sql(query)
    valid_users = valid_users.persist(StorageLevel.MEMORY_AND_DISK_SER)
    valid_users.registerTempTable('most_active_users')
    return valid_users

def load_and_preprocess(json_uri, num_subreddit, user_min_active_subreddits = 4, user_max_active_subreddits = 20):
    """
    Read the JSON input and derive, through a chain of SQL queries, the temp
    tables the rest of the notebook works from.

    Temp tables registered, in order: `test` (raw input restricted to
    `minimal_fields`), `occurrences`, `most_active` and `most_active_users`
    (via the two helper functions), `test2`, `occurrences_pruned` and
    `bare_occurrences`.  Nothing is returned.

    :param json_uri: location of the JSON input
    :param num_subreddit: forwarded to df_most_active_subreddits
    :param user_min_active_subreddits: lower bound forwarded to df_valid_users
    :param user_max_active_subreddits: upper bound forwarded to df_valid_users
    """
    def run_query(query, table_name, cache = True):
        # Evaluate `query`, optionally persist the result, then expose it by name.
        frame = sqlContext.sql(query)
        if cache:
            frame = frame.persist(StorageLevel.MEMORY_AND_DISK_SER)
        frame.registerTempTable(table_name)
        return frame

    raw_comments = sqlContext.read.json(json_uri, StructType(minimal_fields))
    raw_comments.registerTempTable('test')

    # One row per (subreddit, author) with a non-zero net vote sign.
    run_query("""
    select *, dense_rank() over (order by subreddit desc) as rid 
    from  (SELECT subreddit, author, sum(sign(score)) as tally,\
        count(score) as activity, dense_rank() over (order by author desc) as uid
    from test
    group by subreddit, author)
    where tally!=0
    """, 'occurrences')

    # Both helpers read `occurrences` and register their own temp tables;
    # their return values are not needed here.
    df_most_active_subreddits(num_subreddit)
    df_valid_users(user_min_active_subreddits, user_max_active_subreddits)

    # Raw comments restricted to the most active subreddits, tagged with ordered_id.
    run_query("""
    select test.author, test.score, test.subreddit, most_active.ordered_id as ordered_id
    from test
    inner join most_active on most_active.subreddit=test.subreddit""", 'test2', cache = False)

    # Per (subreddit, author) score totals for the retained users only.
    run_query("""
    select *
    from  (SELECT test2.subreddit, author, test2.ordered_id, sum(score) as tally,\
        sum(abs(score)) as activity, dense_rank() over (order by author desc) as uid
        from test2
        where author in (select author from most_active_users)
        group by test2.subreddit, test2.ordered_id, author)
    where tally!=0
    """, 'occurrences_pruned')

    # Just the (row id, column id, value) triples needed to build the matrix.
    run_query("""
    select ordered_id, uid, tally
    from occurrences_pruned
    """, 'bare_occurrences')
    
def gen_frequency_matrix(npartitions = 18, bias_correction = False):
    """
    Build the subreddit-by-user tally matrix from the `bare_occurrences` table.

    :param npartitions: number of partitions the table is repartitioned into
    :param bias_correction: accepted but not used by this function
    :return: (rows, tf_ij) where `rows` is an RDD of
             (ordered_id, (uid, tally)) sorted by ordered_id, and `tf_ij` is a
             CoordinateMatrix with entries (ordered_id - 1, uid, tally)
    """
    def to_keyed_pair(record):
        return (record.ordered_id, (record.uid, record.tally))

    def to_matrix_entry(pair):
        subreddit_id, (user_id, tally) = pair
        # ordered_id is shifted to a 0-based row index; uid is used as-is.
        return (subreddit_id - 1, user_id, tally)

    occurrences_df = sqlContext.sql("""select * from bare_occurrences""").repartition(npartitions)
    rows = occurrences_df.rdd.map(to_keyed_pair).sortByKey()
    tf_ij = CoordinateMatrix(rows.map(to_matrix_entry))

    return rows, tf_ij

In [316]:
sqlContext.createDataFrame(rows).write.parquet(hdfs_prefix + "/test")

In [319]:
sqlContext.read.parquet(hdfs_prefix + "/test").rdd.getNumPartitions()

18

In [72]:
sqlContext.sql("""select
ordered_id, subreddit
from most_active
group by ordered_id, subreddit""").toPandas()

    ordered_id             subreddit
0            1             AskReddit
1            2              politics
2            3            The_Donald
3            4                soccer
4            5                   nba
5            6  RocketLeagueExchange
6            7             worldnews
7            8            BigBrother
8            9         SquaredCircle
9           10       leagueoflegends
10          11                  news
11          12                 funny
12          13         gameofthrones
13          14                gaming
14          15                 DotA2
15          16                videos
16          17                  pics
17          18       PUBATTLEGROUNDS
18          19             Overwatch
19          20           hearthstone
20          21         todayilearned
21          22                   nfl
22          23       DBZDokkanBattle
23          24                movies
24          25           pathofexile
25          26         relationships
2

In [66]:
sqlContext.sql("""select
distinct subreddit
from most_active""").collect()

[Row(subreddit=u'AskReddit'),
 Row(subreddit=u'politics'),
 Row(subreddit=u'The_Donald'),
 Row(subreddit=u'soccer'),
 Row(subreddit=u'nba'),
 Row(subreddit=u'RocketLeagueExchange'),
 Row(subreddit=u'worldnews'),
 Row(subreddit=u'BigBrother'),
 Row(subreddit=u'SquaredCircle'),
 Row(subreddit=u'leagueoflegends'),
 Row(subreddit=u'news'),
 Row(subreddit=u'funny'),
 Row(subreddit=u'gameofthrones'),
 Row(subreddit=u'gaming'),
 Row(subreddit=u'DotA2'),
 Row(subreddit=u'videos'),
 Row(subreddit=u'pics'),
 Row(subreddit=u'PUBATTLEGROUNDS'),
 Row(subreddit=u'Overwatch'),
 Row(subreddit=u'hearthstone'),
 Row(subreddit=u'todayilearned'),
 Row(subreddit=u'nfl'),
 Row(subreddit=u'DBZDokkanBattle'),
 Row(subreddit=u'movies'),
 Row(subreddit=u'pathofexile'),
 Row(subreddit=u'relationships'),
 Row(subreddit=u'pcmasterrace'),
 Row(subreddit=u'conspiracy'),
 Row(subreddit=u'gifs'),
 Row(subreddit=u'anime'),
 Row(subreddit=u'GlobalOffensive'),
 Row(subreddit=u'MMA'),
 Row(subreddit=u'Showerthoughts'),
 R

In [16]:
r.getNumPartitions()

1

In [43]:
#sqlContext.sql("set spark.sql.shuffle.partitions=4").toPandas()

Unnamed: 0,key,value
0,spark.sql.shuffle.partitions,4


### Configure the data source and run the preprocessing

In [7]:
# Location of the JSON input handed to load_and_preprocess; the commented-out
# URIs are alternative inputs.
#json_uri = "s3a://insight-ohoidn/sample3.json"
#json_uri = "s3a://insight-ohoidn/sample10M.json"
json_uri = "s3a://insight-ohoidn/RC_2017-08"
# Prefix prepended to HDFS paths when writing/reading parquet files.
hdfs_prefix = "hdfs://ip-10-0-0-4:9000"
# Number of most-active subreddits to keep (passed as num_subreddit).
numreddits = 5000
# Bounds passed to df_valid_users: a user is kept only if their subreddit count
# within the most-active set lies in [min, max].
user_min_active_subreddits = 8
user_max_active_subreddits = 100
# Partition count used when repartitioning bare_occurrences at notebook level.
default_partitions = 6

In [320]:
def datasource_hash():
    """
    Build a key identifying the current data source and preprocessing settings.

    The key joins the module-level `json_uri`, `numreddits`,
    `user_min_active_subreddits` and `user_max_active_subreddits` with "_".
    The separator matters: gluing the numbers together directly lets different
    settings produce the same key (e.g. 50/1/20 and 501/2/0 both give "50120").

    :return: a string of the form "<json_uri>_<numreddits>_<min>_<max>"
    """
    parts = [json_uri, numreddits, user_min_active_subreddits, user_max_active_subreddits]
    return "_".join(str(part) for part in parts)

In [8]:
# Register all temp tables for the configured data source.
load_and_preprocess(json_uri, numreddits,
                    user_min_active_subreddits = user_min_active_subreddits,
                    user_max_active_subreddits = user_max_active_subreddits)

# ordered_id (1-based) -> subreddit name.  The pairs go straight into a dict,
# so sorting them on the cluster first would only add a shuffle.
subreddit_mapper = dict(df_most_active_subreddits(numreddits)
                        .rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())
# subreddit name -> ordered_id.  items() works on both Python 2 and Python 3.
idx_mapper = {v: k for k, v in subreddit_mapper.items()}

def idx_to_subreddit(idx):
    """
    Return the subreddit name for a 0-based index (ordered_id - 1).

    Raises KeyError if no subreddit has that index.
    """
    return subreddit_mapper[idx + 1]

def subreddit_to_idx(sub):
    """
    Return the 0-based index (ordered_id - 1) of the subreddit named `sub`.

    Raises KeyError if `sub` is not among the selected subreddits.
    """
    return idx_mapper[sub] - 1

# Each mapping exposes the other one as its inverse.
subreddit_to_idx.inverse = idx_to_subreddit
idx_to_subreddit.inverse = subreddit_to_idx

rows, tf_ij = gen_frequency_matrix()

In [52]:
tf_ij.entries.count()

5472178

In [38]:
pwd

u'/home/ubuntu'

In [54]:
tf_ij.numRows()

2000L

In [46]:
size

2098083L

In [73]:
%run svd.py

In [93]:
from sparsesvd import sparsesvd as svd2

In [54]:
# Two identical runs of (i, (i, i)) for i in 0..19, so every key occurs exactly twice.
tups = list((i, (i, i)) for i in range(20)) * 2

In [57]:
tp = sc.parallelize(tups)

In [66]:
list(tp.groupByKey().collect()[0][1])

[(0, 0), (0, 0)]

In [69]:
rows.take(2)

[(1, (266, 3)), (1, (313, 2))]

In [74]:
size = tf_ij.numCols()
vectors = rows.groupByKey().map(lambda tup: Vectors.sparse(size, list(tup[1])))

In [85]:
# One sparse vector of length `size` per subreddit, built from its (uid, tally) pairs.
# Vectors is a namespace of factory methods and cannot be called itself, so
# Vectors.sparse is the constructor to use here.
vectors = rows.groupByKey().map(lambda tup: Vectors.sparse(size, list(tup[1])))

In [86]:
mat = RowMatrix(vectors)

In [87]:
# NOTE(review): this call raised AttributeError on this kernel (the version cell
# reports PySpark 2.1.1). RowMatrix.computeSVD is presumably only exposed to
# Python in a later Spark release; confirm against the RowMatrix API docs.
dsvd = mat.computeSVD(500, computeU=True)

AttributeError: 'RowMatrix' object has no attribute 'computeSVD'

In [293]:
bare_occurrences.write?

In [84]:
pyspark.__version__

'2.1.1+hadoop2.7'

In [82]:
mat.numRows()

4824L

In [None]:
# Collect, per subreddit id, the list of its (uid, tally) pairs.  groupByKey
# does the grouping directly; concatenating one-element lists pairwise with
# reduceByKey(add) re-copies the growing list on every merge.
vectors = rows.groupByKey().mapValues(list).collect()

In [18]:
tf_ij.numRows()

4825L

In [19]:
tf_ij.numCols()

833254L

In [38]:
tf_ij.entries.getNumPartitions()

4

### Subreddit activity:

In [9]:
import linalg

In [10]:
bare_occurrences = sqlContext.sql("""select * from bare_occurrences""").repartition(default_partitions)
# Sum of |tally| per subreddit.  reduceByKey repartitions by key itself, so
# sorting beforehand would only add a shuffle whose order is then discarded.
i_sumtally_tuples = (bare_occurrences.rdd
                     .map(lambda row: (row.ordered_id, abs(row.tally)))
                     .reduceByKey(add))
# Single-column matrix of those totals, indexed by the 0-based subreddit row.
gf_i = CoordinateMatrix(i_sumtally_tuples.map(lambda entry: (entry[0] - 1, 0, entry[1])))


# NOTE(review): tf_ij uses uid directly as the column index and uid comes from
# dense_rank(), so numCols() is presumably (number of users + 1) — confirm
# whether that offset matters where nusers is used.
nusers = tf_ij.numCols()

# Each tally divided by its subreddit's total absolute tally.
p_ij = linalg.coordinate_matrix_elementwise_vector_division(tf_ij, gf_i)

In [13]:
# TODO figure out nan issues
#p_ij_logp_ij = linalg.coordinateMatrixElementwise(p_ij, lambda elt: np.abs(elt) * np.log(np.abs(elt))/np.log(nusers))

#g_i = linalg.coordinateMatrixAddConstant(linalg.coordinate_matrix_sumj(p_ij_logp_ij), 1.)

#log_tf_ij = linalg.coordinateMatrixElementwise(tf_ij, lambda elt: elt * np.log(1 + np.abs(elt)))

#a_ij = linalg.coordinatematrix_multiply_vector_elementwise(log_tf_ij, g_i)
#entropy_i = coordinateMatrixElementwise(coordinate_matrix_sumj(logp_ij), lambda elt: elt + 1) # + 1, but this has to be handled separately to conserve matrix sparsity

In [28]:
an = np.array(a_ij.entries.map(lambda entry: entry.value).collect())

In [33]:
a_ij.entries.count()

7631400

In [12]:
np.sum(np.isnan(np.array(a_ij.entries.map(lambda entry: entry.value).collect())))

0

### TODO: try entropy

In [None]:
#p_ij_logp_ij = linalg.coordinateMatrixElementwise(p_ij, lambda elt: np.abs(elt) * np.log(np.abs(elt))/np.log(nusers))

# Per-row weight 1/sqrt(sum_j tf_ij^2), built step by step.
tf_ij_squared = linalg.coordinateMatrixElementwise(tf_ij, lambda value: value * value)
row_square_sums = linalg.coordinate_matrix_sumj(tf_ij_squared)
weight_i_sqrt = linalg.coordinateMatrixElementwise(
    row_square_sums, lambda total: 1./np.sqrt(total))

In [23]:
# Per-entry term -|p| * log|p| / log(nusers), summed over j, then shifted by 1.
entropy_terms_ij = linalg.coordinateMatrixElementwise(
    p_ij, lambda p: -abs(p) * np.log(abs(p)) / np.log(nusers))
entropy_sum_i = linalg.coordinate_matrix_sumj(entropy_terms_ij)
weight_i = linalg.coordinateMatrixElementwise(entropy_sum_i, lambda total: 1. + total)

#g_i = linalg.coordinateMatrixAddConstant(linalg.coordinate_matrix_sumj(p_ij_logp_ij), 1.)

# Log-damped tallies: log(1 + |tf_ij|).
log_tf_ij = linalg.coordinateMatrixElementwise(tf_ij, lambda tally: np.log(1 + abs(tally)))

# Weighted matrix: damped tallies combined with the per-row weights.
a_ij = linalg.coordinatematrix_multiply_vector_elementwise(log_tf_ij, weight_i)
#entropy_i = coordinateMatrixElementwise(coordinate_matrix_sumj(logp_ij), lambda elt: elt + 1) # + 1, but this has to be handled separately to conserve matrix sparsity

In [171]:
t_entries = sc.parallelize([MatrixEntry(0, 0, 1), MatrixEntry(2, 0, 1)])

In [172]:
cm = CoordinateMatrix(t_entries)

In [174]:
cm.numCols()

1L

In [178]:
tf_ij.numRows()

2000L

In [186]:
set2 = set(g_i.entries.map(lambda entry: entry.i).distinct().collect())

In [184]:
set1 = set(tf_ij.entries.map(lambda entry: entry.i).distinct().collect())

In [189]:
a_ij.entries.map(lambda entry: entry.i).distinct().count()

1999

In [161]:
tf_ij.numCols()

354598L

In [152]:
p_ij_logp_ij.entries.take(10)

[MatrixEntry(0, 14, -1.03349869227e-06),
 MatrixEntry(0, 30, -3.33088877597e-07),
 MatrixEntry(0, 66, -1.62489458268e-06),
 MatrixEntry(0, 89, -2.54582275764e-07),
 MatrixEntry(0, 95, -2.46455142761e-08),
 MatrixEntry(0, 140, -1.5221000875e-06),
 MatrixEntry(0, 189, -4.73772988391e-08),
 MatrixEntry(0, 209, nan),
 MatrixEntry(0, 226, -6.93867580806e-08),
 MatrixEntry(0, 258, -2.46455142761e-08)]

In [15]:
%run cf_spark.py
%run cf_numpy.py

In [122]:
import linalg

In [132]:
reload(linalg)

<module 'linalg' from 'linalg.py'>

In [24]:
nda = linalg.coordinate_matrix_to_ndarr(tf_ij)

In [19]:
%run svd.py

In [47]:
sparse_arr.shape

(4826, 2098083)

In [77]:
from pyspark.mllib.linalg.distributed import RowMatrix

In [None]:
# Collect, per row index, the list of its (column, value) pairs.  groupByKey
# avoids re-copying a growing list on every merge, which reduceByKey(add) over
# one-element lists does.
# NOTE(review): this rebinds `rows` from the RDD returned by
# gen_frequency_matrix to a plain list; cells calling RDD methods on `rows`
# will fail after this runs.
rows = tf_ij.entries.map(lambda entry: (entry.i, (entry.j, entry.value))).groupByKey().mapValues(list).collect()

In [14]:
%run svd.py

In [17]:
a_ij.numRows()

4825L

In [24]:
#bare_occurrences = sqlContext.sql("""select * from bare_occurrences""").repartition(default_partitions)
#flat_coords = np.array(bare_occurrences.rdd.map(lambda row: (row.ordered_id, row.uid, row.tally)).collect())
# Bring every (i, j, value) entry of a_ij to the driver and assemble a float32 CSR matrix.
# NOTE(review): np.array over mixed int/float triples presumably yields a float
# array, so the index arrays handed to csr_matrix are floats too — confirm.
entry_triples = a_ij.entries.map(lambda entry: (entry.i, entry.j, entry.value))
flat_coords = np.array(entry_triples.collect())
row_ind, col_ind, data = flat_coords.T
sparse_arr = csr_matrix((data.astype('float32'), (row_ind, col_ind)))

In [270]:
vals, svc = svd_correlation_matrix(sparse_arr, 170, normalize=False)

MemoryError: 

In [21]:
vals2, svc2 = svd_correlation_matrix(sparse_arr, 500, normalize = True)

In [16]:
vals3, svc3 = svd_correlation_matrix(sparse_arr, 300, normalize = True)

In [31]:
vals4, svc4 = svd_correlation_matrix(sparse_arr, 150, normalize = True)

In [39]:
vals5, svc5 = svd_correlation_matrix(sparse_arr,300, normalize = False)

In [41]:
vals6, svc6 = svd_correlation_matrix(sparse_arr, 3, normalize = True)

In [25]:
vals7, svc7 = svd_correlation_matrix(sparse_arr, 300, normalize = True)

In [27]:
vals, svc = svd_correlation_matrix(sparse_arr, 75)

In [119]:
# Small 4x5 fixture: rows 1 and 2 are identical, rows 0 and 3 share column 1.
test_matrix = csr_matrix(
    np.array([
        [0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 1.0, 0.0],
        [0.0, 0.0, 1.0, 1.0, 0.0],
        [0.0, 2.0, 0.5, 0.0, 0.0],
    ]),
    dtype='float32',
)

In [120]:
a, b = svd_correlation_matrix(test_matrix, 2)

In [121]:
print b

[[ 1.         -0.07949956 -0.07949942  0.98295999]
 [-0.07949957  1.          0.99999994  0.10509278]
 [-0.07949942  0.99999994  1.          0.10509291]
 [ 0.98295999  0.10509279  0.10509291  1.        ]]


In [46]:
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx(''),:])[:-10-1:-1]]

[u'LinkinPark',
 u'leagueoflegends',
 u'IdiotsFightingThings',
 u'SupersRP',
 u'NBASpurs',
 u'UniversityOfHouston',
 u'panthers',
 u'ROCD',
 u'ChildrenFallingOver',
 u'SuicideWatch']

In [291]:
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('anime'),:])[:-20-1:-1]]

[u'anime',
 u'Animemes',
 u'manga',
 u'anime_irl',
 u'Overwatch',
 u'OnePiece',
 u'ftfanime',
 u'polandball',
 u'gameofthrones',
 u'AskReddit',
 u'FireEmblemHeroes',
 u'grandorder',
 u'worldnews',
 u'facepalm',
 u'Naruto',
 u'nfl',
 u'eu4',
 u'DotA2',
 u'leagueoflegends',
 u'BokuNoHeroAcademia']

In [38]:
[idx_to_subreddit(x) for x in np.argsort(svc2[subreddit_to_idx('ForeverAlone'),:])[:-20-1:-1]]

[u'ForeverAlone',
 u'RightwingLGBT',
 u'cormacmccarthy',
 u'farcry',
 u'BreakUps',
 u'GalaxyS7',
 u'Audi',
 u'socialanxiety',
 u'OtogiSSA',
 u'discordapp',
 u'ImagesOfCalifornia',
 u'Skullgirls',
 u'FreeKarma4You',
 u'DebateAltRight',
 u'PeopleBeingJerks',
 u'arresteddevelopment',
 u'IncelReddit',
 u'CSURams',
 u'LazyMan',
 u'Miitopia']

In [22]:
[idx_to_subreddit(x) for x in np.argsort(svc3[subreddit_to_idx('Showerthoughts'),:])[:-20-1:-1]]

[u'Showerthoughts',
 u'sissyhypno',
 u'MurderedByWords',
 u'Roast_Me',
 u'HelloInternet',
 u'cincinnati',
 u'ESOGuilds',
 u'ToolBand',
 u'Presetbay',
 u'Starcitizen_trades',
 u'Infinitewarfare',
 u'PickUpTorrents',
 u'EverythingScience',
 u'civ5',
 u'shittyadvice',
 u'sips',
 u'TryingForABaby',
 u'2healthbars',
 u'LofiHipHop',
 u'vainglorygame']

In [73]:
[idx_to_subreddit(x) for x in np.argsort(svc3[subreddit_to_idx('MDMA'),:])[:-20-1:-1]]

[u'MDMA',
 u'researchchemicals',
 u'RCSources',
 u'DankNation',
 u'Drugs',
 u'DarkNetMarkets',
 u'AsiaTripper',
 u'drugscirclejerk',
 u'phenibut',
 u'Dream_Market',
 u'1P_LSD',
 u'shrooms',
 u'DrugStashes',
 u'EU_RCSources',
 u'Angular2',
 u'moomooio',
 u'FinalFantasyXII',
 u'benzodiazepines',
 u'Fraudnet2',
 u'Etizolam']

In [36]:
[idx_to_subreddit(x) for x in np.argsort(svc4[subreddit_to_idx('ForeverAlone'),:])[:-20-1:-1]]

[u'ForeverAlone',
 u'RightwingLGBT',
 u'JordanPeterson',
 u'AskThe_Donald',
 u'The_Donald',
 u'tucker_carlson',
 u'IncelReddit',
 u'BPDlovedones',
 u'CSURams',
 u'nflstreams',
 u'DebateAltRight',
 u'POLITIC',
 u'metacanada',
 u'marriedredpill',
 u'privacy',
 u'MiniDayZ',
 u'ElitePS',
 u'Narcolepsy',
 u'Incels',
 u'CastleClash']

In [40]:
[idx_to_subreddit(x) for x in np.argsort(svc5[subreddit_to_idx('ForeverAlone'),:])[:-20-1:-1]]

[u'ForeverAlone',
 u'Incels',
 u'RightwingLGBT',
 u'IncelTears',
 u'IncelReddit',
 u'truerateme',
 u'The_Donald',
 u'arresteddevelopment',
 u'dbz',
 u'childfree',
 u'MEIOUandTaxes',
 u'badwomensanatomy',
 u'askgaybros',
 u'PurplePillDebate',
 u'GalaxyS7',
 u'ProtectAndServe',
 u'amiugly',
 u'DotA2',
 u'AskMen',
 u'coys']

In [65]:
[idx_to_subreddit(x) for x in np.argsort(svc6[subreddit_to_idx('aww'),:])[:-20-1:-1]]

[u'aww',
 u'funny',
 u'todayilearned',
 u'watchpeopledie',
 u'GetMotivated',
 u'mildlyinteresting',
 u'pics',
 u'news',
 u'OldSchoolCool',
 u'Futurology',
 u'Jokes',
 u'science',
 u'ImagesOfNewYork',
 u'TwoXChromosomes',
 u'videos',
 u'sports',
 u'ants',
 u'UpliftingNews',
 u'gaming',
 u'nottheonion']

In [33]:
[idx_to_subreddit(x) for x in np.argsort(svc7[subreddit_to_idx('electronicmusic'),:])[:-20-1:-1]]

[u'electronicmusic',
 u'edmproduction',
 u'WeAreTheMusicMakers',
 u'vinyl',
 u'trap',
 u'audioengineering',
 u'ableton',
 u'Jazz',
 u'DJs',
 u'synthesizers',
 u'LetsTalkMusic',
 u'HVAC',
 u'baltimore',
 u'ContestOfChampionsLFG',
 u'spotify',
 u'Psychonaut',
 u'musictheory',
 u'KGATLW',
 u'Meditation',
 u'FL_Studio']

In [28]:
[idx_to_subreddit(x) for x in np.argsort(svc2[subreddit_to_idx('wow'),:])[:-20-1:-1]]

[u'wow',
 u'CompetitiveWoW',
 u'SubaruForester',
 u'AirBnB',
 u'TrueSTL',
 u'fakehistoryporn',
 u'behindthegifs',
 u'copypasta',
 u'asianminivans',
 u'ImaginaryBehemoths',
 u'EngageMetv',
 u'rccars',
 u'heroesofthestorm',
 u'coaxedintoasnafu',
 u'Awesomenauts',
 u'ColoradoAvalanche',
 u'trebuchetmemes',
 u'Hydroponics',
 u'MicrosoftBand',
 u'AnimalsWithoutNecks']

In [297]:
len(vals2)

100

In [298]:
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('anime'),:])[:-20-1:-1]]

[u'anime',
 u'Animemes',
 u'manga',
 u'anime_irl',
 u'Overwatch',
 u'OnePiece',
 u'ftfanime',
 u'polandball',
 u'gameofthrones',
 u'AskReddit',
 u'FireEmblemHeroes',
 u'grandorder',
 u'worldnews',
 u'facepalm',
 u'Naruto',
 u'nfl',
 u'eu4',
 u'DotA2',
 u'leagueoflegends',
 u'BokuNoHeroAcademia']

In [285]:
print subreddit_to_idx('anime'), subreddit_to_idx('boston')

26 391


In [286]:
[idx_to_subreddit(x) for x in np.argsort(svc2[subreddit_to_idx('anime'),:])[:-20-1:-1]]

[u'anime',
 u'ftfanime',
 u'fatestaynight',
 u'visualnovels',
 u'swordartonline',
 u'RPGStuck',
 u'Animesuggest',
 u'Animemes',
 u'MemoryDefrag',
 u'anime_irl',
 u'grandorder',
 u'OnePiece',
 u'kancolle',
 u'Naruto',
 u'SchoolIdolFestival',
 u'Persona5',
 u'manga',
 u'Gundam',
 u'BokuNoHeroAcademia',
 u'PvZHeroes']

In [284]:
[idx_to_subreddit(x) for x in np.argsort(svc2[subreddit_to_idx('boston'),:])[:-20-1:-1]]

[u'boston',
 u'TrumpCriticizesTrump',
 u'Nexus6P',
 u'politics',
 u'LosAngeles',
 u'photocritique',
 u'Judaism',
 u'TickTockManitowoc',
 u'NorthCarolina',
 u'leafs',
 u'BitcoinMarkets',
 u'mattcolville',
 u'ipad',
 u'baltimore',
 u'CarAV',
 u'homestuck',
 u'ShinyPokemon',
 u'FanTheories',
 u'friendsafari',
 u'StLouis']

In [272]:
[idx_to_subreddit(x) for x in np.argsort(svc2[subreddit_to_idx('politics'),:])[:-20-1:-1]]

[u'politics',
 u'Dream_Market',
 u'friendsafari',
 u'CarAV',
 u'BitcoinMarkets',
 u'PuzzleAndDragons',
 u'Glocks',
 u'sanfrancisco',
 u'thelongdark',
 u'learnmath',
 u'Kanye',
 u'AskNYC',
 u'Nexus6P',
 u'California',
 u'medicalschool',
 u'aznidentity',
 u'MrRobot',
 u'TheNewRight',
 u'Lollapalooza',
 u'boston']

In [104]:
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('Games'),:])[:-10-1:-1]]

[u'Games',
 u'motorcycles',
 u'funkopop',
 u'Mariners',
 u'oculus',
 u'AFL',
 u'churning',
 u'keto',
 u'Sneakers',
 u'opiates']

In [122]:
vals, svc = svd_correlation_matrix(nda, 500)
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('The_Donald'),:])[:-10-1:-1]]

[u'The_Donald',
 u'AskThe_Donald',
 u'milliondollarextreme',
 u'AskTrumpSupporters',
 u'Warhammer40k',
 u'ContestOfChampions',
 u'uncensorednews',
 u'albiononline',
 u'asktrp',
 u'Tekken']

In [117]:
vals, svc = svd_correlation_matrix(nda, 100)
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('The_Donald'),:])[:-10-1:-1]]

[u'The_Donald',
 u'AskTrumpSupporters',
 u'AskThe_Donald',
 u'milliondollarextreme',
 u'Warhammer40k',
 u'bjj',
 u'btc',
 u'ethtrader',
 u'Libertarian',
 u'Conservative']

In [118]:
vals, svc = svd_correlation_matrix(nda, 200)
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('The_Donald'),:])[:-10-1:-1]]

[u'The_Donald',
 u'AskTrumpSupporters',
 u'AskThe_Donald',
 u'Warhammer40k',
 u'uncensorednews',
 u'milliondollarextreme',
 u'ContestOfChampions',
 u'TheRedPill',
 u'bjj',
 u'Advice']

In [119]:
vals, svc = svd_correlation_matrix(nda, 300)
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('The_Donald'),:])[:-10-1:-1]]

[u'The_Donald',
 u'AskTrumpSupporters',
 u'AskThe_Donald',
 u'ContestOfChampions',
 u'4chan4trump',
 u'Watches',
 u'albiononline',
 u'Warhammer40k',
 u'NoMansSkyTheGame',
 u'Advice']

In [130]:
vals, svc = svd_correlation_matrix(nda, 470)
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('The_Donald'),:])[:-10-1:-1]]

[u'The_Donald',
 u'thedivision',
 u'Whatcouldgowrong',
 u'CringeAnarchy',
 u'AskThe_Donald',
 u'AskTrumpSupporters',
 u'sadcringe',
 u'FreeKarma4You',
 u'conspiracy',
 u'SquaredCircle']

In [78]:
[idx_to_subreddit(x) for x in np.argsort(svc[subreddit_to_idx('The_Donald'),:])[:-10-1:-1]]

[u'The_Donald',
 u'funkopop',
 u'fakeid',
 u'blackdesertonline',
 u'streetwear',
 u'oculus',
 u'sysadmin',
 u'funny',
 u'Mariners',
 u'worldnews']

### numpy implementation:

In [17]:
spark_top_k_subs(tf_ij, 'The_Donald', subreddit_to_idx, k = 10)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 17 in stage 259.0 failed 4 times, most recent failure: Lost task 17.3 in stage 259.0 (TID 3030, 10.0.0.11, executor 1): java.net.SocketException: Connection reset
	at java.net.SocketInputStream.read(SocketInputStream.java:209)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
	at java.io.DataInputStream.readInt(DataInputStream.java:387)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:390)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor84.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketException: Connection reset
	at java.net.SocketInputStream.read(SocketInputStream.java:209)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
	at java.io.DataInputStream.readInt(DataInputStream.java:387)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:166)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:390)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)


In [18]:
spark_top_k_subs(tf_ij2, 'The_Donald', subreddit_to_idx, k = 20)

[u'The_Donald',
 u'AskThe_Donald',
 u'AskTrumpSupporters',
 u'Whatcouldgowrong',
 u'CringeAnarchy',
 u'uncensorednews',
 u'milliondollarextreme',
 u'sadcringe',
 u'conspiracy',
 u'Conservative',
 u'Warhammer40k',
 u'SquaredCircle',
 u'KotakuInAction',
 u'AdviceAnimals',
 u'Watches',
 u'Mcat',
 u'Libertarian',
 u'ImGoingToHellForThis',
 u'Tekken',
 u'Advice']

In [136]:
spark_top_k_subs(tf_ij, 'baseball', subreddit_to_idx, k = 10)

[u'baseball',
 u'WahoosTipi',
 u'redsox',
 u'CHICubs',
 u'NYYankees',
 u'nfl',
 u'Dodgers',
 u'fantasybaseball',
 u'nba',
 u'magicTCG']

In [1243]:
spark_top_k_subs(p_ij, 'Games', subreddit_to_idx, k = 5)

[u'Games', u'xboxone', u'RocketLeagueExchange', u'movies', u'FFBraveExvius']

In [26]:
spark_top_k_subs(tf_ij, 'DotA2', subreddit_to_idx, k = 5)

[u'DotA2', u'learndota2', u'pawg', u'bravefrontier', u'forwardsfromgrandma']




In [11]:
spark_top_k_subs(tf_ij, 'DotA2', subreddit_to_idx, k = 5)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 62.0 failed 4 times, most recent failure: Lost task 3.3 in stage 62.0 (TID 1438, 10.0.0.11, executor 1): java.net.SocketException: Connection reset
	at java.net.SocketInputStream.read(SocketInputStream.java:209)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
	at java.io.DataInputStream.readFully(DataInputStream.java:195)
	at java.io.DataInputStream.readFully(DataInputStream.java:169)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:169)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:504)
	at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:328)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1951)
	at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketException: Connection reset
	at java.net.SocketInputStream.read(SocketInputStream.java:209)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
	at java.io.DataInputStream.readFully(DataInputStream.java:195)
	at java.io.DataInputStream.readFully(DataInputStream.java:169)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:169)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:156)
	at org.apache.spark.api.python.PythonRunner$$anon$1.next(PythonRDD.scala:152)
	at org.apache.spark.InterruptibleIterator.next(InterruptibleIterator.scala:43)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:504)
	at org.apache.spark.api.python.PythonRunner$WriterThread$$anonfun$run$3.apply(PythonRDD.scala:328)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1951)
	at org.apache.spark.api.python.PythonRunner$WriterThread.run(PythonRDD.scala:269)


In [9]:
spark_top_k_subs(tf_ij, 'technology', subreddit_to_idx, k = 5)

In [10]:
lst

[u'DaftPunk', u'gorillaz', u'baltimore', u'INDYCAR', u'cowboys']

In [34]:
import sqlite3

In [35]:
# Open the SQLite database file that receives the recommendation rows.
# `cursor` is read as a module-level global by insert_one_svd / insert_one.
connection = sqlite3.connect("redicommend5.db")
cursor = connection.cursor()

In [36]:
# DDL for the output table: `key` is a subreddit name and `related` is the
# ', '-joined list of related subreddit names written by insert_one_svd / insert_one.
sql_command = """
CREATE TABLE reddit ( 
key VARCHAR(30), 
related VARCHAR(150));"""

In [16]:
#cursor.execute(sql_command)

In [37]:
cursor.execute(sql_command)

<sqlite3.Cursor at 0x7f81924656c0>

In [27]:
cursor.execute("delete FROM reddit")

OperationalError: no such table: reddit

In [38]:
def insert_one_svd(sim, subreddit, k = 20):
    """
    Insert one row into the `reddit` table listing the subreddits that score
    highest against `subreddit` in the similarity matrix `sim`.

    sim : 2-D array indexed as sim[row, :]; the row used is
          subreddit_to_idx(subreddit)
    subreddit : name written to the `key` column
    k : number of highest-scoring columns to look up. The best-scoring entry is
        dropped before storing (presumably it is the subreddit itself -- confirm
        against how `sim` is built), so at most k - 1 names are written.

    Writes through the module-level `cursor`; nothing is committed here.
    """
    scores = sim[subreddit_to_idx(subreddit), :]
    # argsort is ascending, so reverse it and keep the first k column indices.
    best_first = np.argsort(scores)[::-1][:k]
    related = [idx_to_subreddit(idx) for idx in best_first]
    val = ', '.join(related[1:])

    # Bind the values instead of formatting them into the statement, so names
    # containing quotes can neither break the SQL nor inject into it.
    cursor.execute("INSERT INTO reddit (key, related) VALUES (?, ?)",
                   (subreddit, val))

In [35]:
def insert_one(subreddit):
    """
    Insert one row into the `reddit` table listing the subreddits that
    spark_top_k_subs returns for `subreddit` (k = 20, matrix `tf_ij2`).

    subreddit : name written to the `key` column

    Nothing is written when spark_top_k_subs returns None. The first returned
    name is dropped before storing (in the query outputs shown in this notebook
    it is the queried subreddit itself).

    Writes through the module-level `cursor`; nothing is committed here.
    """
    related = spark_top_k_subs(tf_ij2, subreddit, subreddit_to_idx, k = 20)
    if related is None:
        return
    val = ', '.join(related[1:])

    # Bind the values instead of formatting them into the statement, so names
    # containing quotes can neither break the SQL nor inject into it.
    cursor.execute("INSERT INTO reddit (key, related) VALUES (?, ?)",
                   (subreddit, val))

In [19]:
subreddit_mapper.keys()

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [39]:
# Write one row per key of idx_mapper (subreddit names), ranking against the
# `svc7` matrix with insert_one_svd's default k. `svc7` is not defined in the
# visible cells -- NOTE(review): confirm where it comes from.
for sub in idx_mapper.keys():
    insert_one_svd(svc7, sub)

In [40]:
# Persist the inserted rows, then close the connection. The inserts above are
# not committed anywhere else.
connection.commit()
connection.close()

In [74]:
idx_mapper.keys()

[u'danganronpa',
 u'Roofing',
 u'AsianBeautyAdvice',
 u'AppleWatch',
 u'relationships',
 u'instant_regret',
 u'SteamKiwi',
 u'SamandTolki',
 u'USMCboot',
 u'PRLegacyWars',
 u'Honda',
 u'datingoverthirty',
 u'HistoryPorn',
 u'detroitlions',
 u'2007scape',
 u'fountainpens',
 u'skyrimmods',
 u'PokemonShuffle',
 u'ofcoursethatsathing',
 u'KaynMains',
 u'bestoflegaladvice',
 u'Steam',
 u'Pokemongiveaway',
 u'Freeclams',
 u'HollowKnight',
 u'askgaybros',
 u'SteamGameSwap',
 u'BiggerThanYouThought',
 u'Bayonetta',
 u'starcitizen',
 u'polyamory',
 u'BackYardChickens',
 u'fixit',
 u'totalwar',
 u'entp',
 u'electricians',
 u'AskAcademia',
 u'MechanicAdvice',
 u'kratom',
 u'LongDistance',
 u'Hammers',
 u'razer',
 u'LSD',
 u'AdventureCapitalist',
 u'PlaceNostalgia',
 u'HaloStory',
 u'GCdebatesQT',
 u'bipolar',
 u'gamedev',
 u'Nioh',
 u'Addons4Kodi',
 u'90sAlternative',
 u'Beatmatch',
 u'titanfall',
 u'3DMark',
 u'ModelUSHouse',
 u'AskThe_Donald',
 u'melbourne',
 u'HaggardGarage',
 u'Shoplifting',


In [1249]:
#related_subs(tf_ij, 'DotA2', subreddit_to_idx)

[u'DotA2', u'DestinyTheGame', u'aww', u'funny', u'Rainbow6']

In [None]:
related_subs(tf_ij, 'DotA2', subreddit_to_idx)

In [1143]:
tf_ij.numCols()

1370L

In [504]:
def transform_tf_if(val):
    """
    Signed log scaling of a scalar: log(|val| + 1), negated when val < 0.
    """
    magnitude = np.log(np.abs(val) + 1)
    return -magnitude if val < 0 else magnitude

# TODO: maybe save this for later, when noise can be reduced via dimensionality reduction
# Apply the signed log transform to every entry of tf_ij.
log_tf_ij = coordinateMatrixElementwise(tf_ij, transform_tf_if)

# Combine the log-scaled matrix with the `entropy_i` vector element-wise.
# NOTE(review): `entropy_i` and both helper functions are defined outside the
# visible cells; which axis the vector is applied along cannot be told from here.
a_ij = coordinatematrix_multiply_vector_elementwise(log_tf_ij, entropy_i)

In [37]:
def predict_nobias(ratings, similarity, kind='user'):
    """
    Predict ratings as a similarity-weighted average of mean-centred ratings,
    adding the removed mean back afterwards.

    ratings : 2-D float ndarray; rows are treated as users and columns as items
              (the 'user' bias is the per-row mean, the 'item' bias the
              per-column mean)
    similarity : square ndarray, (n_rows, n_rows) for kind='user' or
                 (n_cols, n_cols) for kind='item'
    kind : 'user' or 'item'

    Returns an ndarray with the same shape as `ratings`.
    Raises ValueError for any other `kind` (previously this surfaced as an
    UnboundLocalError on the return statement).
    """
    if kind == 'user':
        user_bias = ratings.mean(axis=1)[:, np.newaxis]
        centred = ratings - user_bias
        # Row u of `similarity` supplies the weights for user u's prediction,
        # so each output row is normalised by that row's absolute sum.
        weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        return similarity.dot(centred) / weight_totals + user_bias

    if kind == 'item':
        item_bias = ratings.mean(axis=0)[np.newaxis, :]
        centred = ratings - item_bias
        # centred.dot(similarity) weights output column j by similarity[:, j],
        # so the matching normaliser is the column sum (axis=0). For a
        # symmetric similarity matrix this equals the row sum used before.
        weight_totals = np.abs(similarity).sum(axis=0)[np.newaxis, :]
        return centred.dot(similarity) / weight_totals + item_bias

    raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))

In [93]:
def predict_topk(ratings, similarity, kind='user', k=40):
    """
    Predict ratings from only the k most similar neighbours.

    ratings : 2-D ndarray, rows indexed by i (users) and columns by j (items)
    similarity : square ndarray, (n_rows, n_rows) for kind='user' or
                 (n_cols, n_cols) for kind='item'
    kind : 'user' or 'item'; any other value leaves the result all zeros
    k : number of neighbours kept per user / item

    Returns an ndarray of ratings.shape. Each prediction is the neighbours'
    ratings weighted by similarity and divided by the sum of absolute weights.

    Neighbours are ranked on column i (or j) of `similarity` while the weights
    are read from the matching row, exactly as before; the two agree when
    `similarity` is symmetric.
    """
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            # argsort is ascending; the reversed slice yields the k largest.
            # Index with the array itself: wrapping it in a list is non-tuple
            # sequence indexing, which current NumPy reads as a 2-D index.
            top_k_users = np.argsort(similarity[:, i])[:-k-1:-1]
            weights = similarity[i, top_k_users]
            # The weights do not depend on j, so fill the whole row at once.
            pred[i, :] = weights.dot(ratings[top_k_users, :]) / np.sum(np.abs(weights))
    elif kind == 'item':
        for j in range(ratings.shape[1]):
            top_k_items = np.argsort(similarity[:, j])[:-k-1:-1]
            weights = similarity[j, top_k_items]
            # The weights do not depend on i, so fill the whole column at once.
            pred[:, j] = ratings[:, top_k_items].dot(weights) / np.sum(np.abs(weights))

    return pred

In [38]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    """
    Mean squared error between `pred` and `actual`, evaluated only at the
    positions where `actual` is nonzero (zero entries are skipped).
    """
    rated = actual.nonzero()
    return mean_squared_error(pred[rated].flatten(), actual[rated].flatten())

In [901]:
def top_k_movies(similarity, movie_idx, k=6):
    """
    Names of the k highest-scoring columns in row `movie_idx` of `similarity`,
    best first, translated through notebook_idx_to_subreddit.
    """
    scores = similarity[movie_idx, :]
    # argsort is ascending: reverse it, then keep the first k indices.
    best_first = np.argsort(scores)[::-1][:k]
    return [notebook_idx_to_subreddit(idx) for idx in best_first]

In [1206]:
#get_mse(predict_nobias, test)

In [756]:
bare_occurrences.toPandas()

Unnamed: 0,rid,uid,tally
0.0,1,173573,1
1.0,2,2510,1
2.0,3,202843,1
3.0,4,71343,2
4.0,5,157008,1
5.0,6,22113,2
6.0,6,29476,11
7.0,6,41712,6
8.0,6,52207,4
9.0,6,68993,9


In [799]:
# Materialise tf_ij as a dense array.
ndmat = coordinate_matrix_to_ndarr(tf_ij)
#ndmat/=np.mean(ndmat)
# Transpose, then drop the first row and first column.
# NOTE(review): presumably index 0 is unused because ids start at 1 -- confirm.
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
# Similarity is computed on the training part only; `test` is not used in this cell.
train, test = train_test_split(ndmat)
similarity = fast_similarity(train)

sim [[  1.00000000e+00   1.00000000e-09   1.50000000e+01 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 [  1.00000000e-09   4.90000000e+01   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 [  1.50000000e+01   1.00000000e-09   2.25000000e+02 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 ..., 
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e+00
    1.00000000e-09   1.00000000e-09]
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e+00   1.00000000e-09]
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e-09   4.90000000e+01]]
diag [   1.   49.  225. ...,    1.    1.   49.]
[[  1.   7.  15. ...,   1.   1.   7.]]


In [880]:
def run_spark_ndarray(ndmat, subreddit):
    """
    Build a similarity matrix from a dense array and query it for `subreddit`.

    ndmat : np.ndarray
        row indices: users
        column indices: subreddits
    subreddit : name resolved through subreddit_to_idx with the global idx_mapper

    Returns whatever spark_top_k_subs returns for that query.
    """
    # The array is transposed before conversion to the coordinate form.
    coord = ndarr_to_coord_array(ndmat.T)
    sim = similarity_matrix(coord)
    query = subreddit_to_idx(subreddit, idx_mapper)
    return spark_top_k_subs(sim, query, subreddit_mapper)

In [408]:
def func1(ndmat):
    """
    Return a copy of `ndmat` divided twice by (epsilon + per-column standard
    deviation); the input array is left untouched.

    Relies on the module-level `epsilon`.
    """
    scaled = ndmat.copy()
    column_std = np.std(scaled, axis = 0)
    scaled /= (epsilon + column_std)
    # NOTE(review): std over axis 1 of the transpose is again the per-column
    # std (and .T on a 1-D result is a no-op), so this second pass rescales
    # columns, not rows. Confirm whether row scaling was intended.
    column_std_again = np.std(scaled.T, axis = 1).T
    scaled /= (epsilon + column_std_again)
    return scaled

In [411]:
matfuncs = [func1]
def findtop_allmodels(sub_name):
    """
    For every transform in the global `matfuncs`, print the 10 entries that
    top_k_movies ranks highest for `sub_name`.

    Each transform is applied to the dense form of tf_ij (transposed, first row
    and column dropped); item similarity is then computed on the training part
    of a fresh train/test split.
    """
    base = coordinate_matrix_to_ndarr(tf_ij).T[1:, 1:]

    for transform in matfuncs:
        transformed = transform(base)
        train, _ = train_test_split(transformed)
        # Result unused; the call is kept so any output or side effects of
        # fast_similarity stay exactly as before.
        fast_similarity(train)

        item_similarity = fast_similarity(train, kind = 'item')
        print(top_k_movies(item_similarity, subreddit_to_idx(sub_name), k = 10))

In [873]:
def generate_ndmat():
    """
    Build the normalised dense matrix from tf_ij and print an MD5 fingerprint
    of it.

    Steps: densify tf_ij, transpose, drop the first row and column, then divide
    twice by (epsilon + per-column standard deviation).

    Returns the normalised ndarray.
    """
    # hashlib replaces the Python-2-only `md5` module; the digest bytes are the same.
    import hashlib

    epsilon = 1e-9
    ndmat = coordinate_matrix_to_ndarr(tf_ij)
    ndmat = ndmat.T
    ndmat = ndmat[1:, 1:]

    #ndmat = np.mean(ndmat, axis = 0)
    ndmat /= (epsilon + np.std(ndmat, axis = 0))
    # NOTE(review): std over axis 1 of the transpose is again the per-column
    # std, so this second pass rescales columns, not rows -- confirm intent.
    ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

    # Fingerprint the matrix built here. The previous code hashed the global
    # `train`, which is unrelated to this function's result and is undefined
    # on a fresh kernel.
    print(hashlib.md5(pickle.dumps(ndmat)).digest())
    return ndmat

In [1009]:
# Small constant added to every standard deviation so the divisions below
# never divide by zero.
epsilon = 1e-9
# Densify tf_ij, transpose it, and drop the first row and first column.
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

#ndmat = np.mean(ndmat, axis = 0)
# Scale each column by its standard deviation.
ndmat /= (epsilon + np.std(ndmat, axis = 0))
# NOTE(review): std over axis 1 of the transpose is again the per-column std,
# so this second pass rescales columns, not rows -- confirm intent.
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

# The split is taken but both similarity matrices below are built from the
# full `ndmat`, not from `train`.
train, test = train_test_split(ndmat)
similarity = fast_similarity(ndmat)

item_similarity = fast_similarity(ndmat, kind = 'item')

# Fingerprint of the training split, displayed as the cell output.
md5.md5(pickle.dumps(train)).digest()

'\x1b\xbb\xc46 O\xe3\xdb\x83\xb2\x15\x84Rt\x00\x06'

In [52]:
import cf_numpy
# Rebuild the dense matrix (transposed, first row and column dropped).
# As recorded below, this run failed with NameError because
# coordinate_matrix_to_ndarr was not defined in the kernel at that point.
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
# Map ordered_id -> subreddit name, collected to the driver from `act`.
subreddit_mapper = dict(act.rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())

NameError: name 'coordinate_matrix_to_ndarr' is not defined

In [51]:
subreddit_mapper1 = subreddit_mapper

In [537]:
epsilon = 1e-9
# NOTE(review): this transposes and trims whatever `ndmat` currently holds
# instead of rebuilding it, so every re-run flips and shrinks the matrix
# further. That is a likely cause of the recorded "larger sample than
# population" ValueError -- confirm.
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

# TODO: play with this
# Both divisions scale columns: std over axis 1 of the transpose equals the
# per-column std of `ndmat`.
ndmat /= (epsilon + np.std(ndmat, axis = 0))
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

train, test = train_test_split(ndmat)
# Item similarity from the full matrix and from the training split only.
state['item_similarity_full'] = fast_similarity(ndmat, kind = 'item')
state['item_similarity_sampled'] = fast_similarity(train, kind = 'item')
# Forward mapping plus its inverse (value -> key) for reverse lookups.
state['subreddit_mapper'] = subreddit_mapper
state['idx_mapper'] = {v: k for k, v in subreddit_mapper.iteritems()}

ValueError: Cannot take a larger sample than population when 'replace=False'

In [567]:
# Re-import cf_numpy to pick up edits made to the module, then initialise it
# with the current matrix and id -> subreddit mapping.
reload(cf_numpy)
cf_numpy.init(ndmat, subreddit_mapper)

In [297]:
predictions = predict_nobias(train, item_similarity, kind = 'item')

Implement LSA

Test stuff

In [94]:
test_coordmat2 = ndarr_to_coord_array(test_array2)

SVD implementation

In [80]:
from random import normalvariate

In [83]:
from random import normalvariate
def random_unit(n):
    """
    Draw n standard-normal samples and return them as a vector scaled to
    unit Euclidean length.
    """
    draws = np.fromiter((normalvariate(0, 1) for _ in range(n)), dtype=float, count=n)
    length = np.linalg.norm(draws)
    return draws / length

def SVD_1D(A, threshold = 1e-10):
    """
    Unfinished one-dimensional SVD for a distributed matrix.

    A : CoordinateMatrix
    threshold : not used yet

    So far this only reads the matrix dimensions and draws a random unit
    start vector with one entry per column; it returns None.
    """
    n = A.numRows()
    m = A.numCols()
    x = random_unit(m)
    # TODO finish this

In [None]:
import numpy as np
from numpy.linalg import norm

def svd_1d(A, epsilon=1e-10):
    ''' The one-dimensional SVD '''
    n, m = A.shape
    x = randomUnitVector(m)
    lastV = None
    currentV = x
    B = np.dot(A.T, A)
 
    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentV = currentV / norm(currentV)
 
        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV