In [1]:
import pyspark

In [2]:
import numpy as np

In [3]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry
from pyspark.sql.types import *
from operator import add
from pyspark.mllib.linalg import Vectors, ArrayType

In [4]:
minimal_fields = [ 
          StructField("author", StringType(), True),
          StructField("score", LongType(), True),
          StructField("subreddit", StringType(), True)]

In [5]:
def df_most_active_subreddits(k = 1000):
    most_active = sqlContext.sql("""
    select * from
        (select *, dense_rank() over (order by activity desc) as ordered_id
        from (select rid, subreddit, sum(activity) as activity
            from occurrences
            group by rid, subreddit))
""").persist(StorageLevel.MEMORY_AND_DISK_SER)
    most_active.registerTempTable('most_active')
    return most_active

def df_valid_users(min_subreddits = 2, max_subreddits = 20):
    """
    filter users by the number of subreddits they've posted in, among the above-defined most active subreddits
    """
    most_active_users = sqlContext.sql("""
    select * from
        (select author, count(subreddit) as count
        from 
            (select * from occurrences
            where subreddit in (select subreddit from most_active))
        group by author
        order by count desc)
    where count>=%d and count<=%d""" % (min_subreddits, max_subreddits)).persist(StorageLevel.MEMORY_AND_DISK_SER)
    most_active_users.registerTempTable('most_active_users')
    return most_active_users

def load_and_preprocess(json_uri, num_subreddit, user_min_active_subreddits = 4, user_max_active_subreddits = 20):
    """
    Load json and do preprocessing via some SQL queries
    """
    sj = sqlContext.read.json(json_uri, StructType(minimal_fields))
    sj.registerTempTable('test')
    
    occurrences = sqlContext.sql("""
    select *, dense_rank() over (order by subreddit desc) as rid 
    from  (SELECT subreddit, author, sum(sign(score)) as tally,\
        count(score) as activity, dense_rank() over (order by author desc) as uid
    from test
    group by subreddit, author)
    where tally!=0
    """).persist(StorageLevel.MEMORY_AND_DISK_SER)
    occurrences.registerTempTable('occurrences')
        
    df_most_active_subreddits()
    df_valid_users(user_min_active_subreddits, user_max_active_subreddits)
    
    test2 = sqlContext.sql("""
    select test.author, test.score, test.subreddit, most_active.ordered_id as ordered_id
    from test
    inner join most_active on most_active.subreddit=test.subreddit""")
    test2.registerTempTable('test2')
    
    occurrences_pruned = sqlContext.sql("""
    select *
    from  (SELECT test2.subreddit, author, test2.ordered_id, sum(score) as tally,\
        sum(abs(score)) as activity, dense_rank() over (order by author desc) as uid
        from test2
        where author in (select author from most_active_users)
        group by test2.subreddit, test2.ordered_id, author)
    where tally!=0
    """).persist(StorageLevel.MEMORY_AND_DISK_SER)
    occurrences_pruned.registerTempTable('occurrences_pruned')
    
    bare_occurrences = sqlContext.sql("""
    select ordered_id, uid, tally
    from occurrences_pruned
    """).persist(StorageLevel.MEMORY_AND_DISK_SER)
    bare_occurrences.registerTempTable('bare_occurrences')
    
def gen_frequency_matrix():
    # subreddit-activity matrix
    bare_occurrences = sqlContext.sql("""select * from bare_occurrences""")
    tf_ij = CoordinateMatrix(bare_occurrences.rdd.map(
            lambda row: (row.ordered_id, (row.uid, row.tally)))\
        .sortByKey().map(lambda entry: (entry[0] - 1, entry[1][0], entry[1][1])))
    return tf_ij

### Now initialize stuff

In [6]:
#json_uri = "s3a://insight-ohoidn/sample3.json"
json_uri = "s3a://insight-ohoidn/sample10M.json"
numreddits = 100
user_min_active_subreddits = 5
user_max_active_subreddits = 20

In [7]:
load_and_preprocess(json_uri, numreddits,
                    user_min_active_subreddits = user_min_active_subreddits,
                    user_max_active_subreddits = user_max_active_subreddits)

subreddit_mapper = dict(df_most_active_subreddits(numreddits)\
                        .rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())
idx_mapper = {v: k for k, v in subreddit_mapper.iteritems()}

def idx_to_subreddit(idx):
    return subreddit_mapper[idx + 1]

def subreddit_to_idx(sub):
    return idx_mapper[sub] - 1

subreddit_to_idx.inverse = idx_to_subreddit
idx_to_subreddit.inverse = subreddit_to_idx

tf_ij = gen_frequency_matrix()

In [None]:
tf_ij.entries.count()

### Subreddit activity:

In [1247]:
i_sumtally_tuples = bare_occurrences.rdd.map(lambda row: (row.ordered_id, row.tally)).sortByKey()\
.reduceByKey(add)
gf_i = CoordinateMatrix(i_sumtally_tuples.map(lambda entry: (entry[0] - 1, 0, entry[1])))


nusers = tf_ij.numCols()

p_ij = coordinate_matrix_elementwise_vector_division(tf_ij, gf_i)

# TODO figure out nan issues
logp_ij = coordinateMatrixElementwise(p_ij, lambda elt: np.log((abs(elt))/np.log(nusers)))

entropy_i = coordinateMatrixElementwise(coordinate_matrix_sumj(logp_ij), lambda elt: elt + 1) # + 1, but this has to be handled separately to conserve matrix sparsity

In [27]:
%run cf_spark.py
%run cf_numpy.py

### numpy implementation:

In [1205]:
related_subs(tf_ij, 'DotA2', subreddit_to_idx)

[u'DotA2', u'2007scape', u'FireEmblemHeroes', u'leagueoflegends', u'ethtrader']

In [1202]:
spark_top_k_subs(sim, 'DotA2', subreddit_to_idx, k = 5)

[u'DotA2', u'2007scape', u'FireEmblemHeroes', u'leagueoflegends', u'ethtrader']

In [1236]:
related_subs(tf_ij, 'DotA2', subreddit_to_idx)

[u'DotA2', u'DestinyTheGame', u'aww', u'funny', u'Rainbow6']

In [1243]:
spark_top_k_subs(p_ij, 'Games', subreddit_to_idx, k = 5)

[u'Games', u'xboxone', u'RocketLeagueExchange', u'movies', u'FFBraveExvius']

In [1239]:
spark_top_k_subs(p_ij, 'DotA2', subreddit_to_idx, k = 5)

[u'DotA2', u'aww', u'funny', u'Rainbow6', u'TheSilphRoad']

In [1250]:
spark_top_k_subs(tf_ij, 'DotA2', subreddit_to_idx, k = 5)

[u'DotA2', u'DestinyTheGame', u'aww', u'funny', u'Rainbow6']

In [None]:
#related_subs(tf_ij, 'investing', subreddit_to_idx)

In [9]:
lst = spark_top_k_subs(tf_ij, 'DaftPunk', subreddit_to_idx, k = 5)

In [10]:
lst

[u'DaftPunk', u'gorillaz', u'baltimore', u'INDYCAR', u'cowboys']

In [11]:
import sqlite3

In [13]:
connection = sqlite3.connect("redicommend2.db")
cursor = connection.cursor()

In [14]:
sql_command = """
CREATE TABLE reddit ( 
key VARCHAR(30), 
related VARCHAR(150));"""

In [16]:
#cursor.execute(sql_command)

In [92]:
sql_command = """INSERT INTO reddit (key, related)
    VALUES ("testing", "foo, bar, baz");""" 

In [93]:
cursor.execute(sql_command)

<sqlite3.Cursor at 0x7fc17079adc0>

In [37]:
cursor.execute("delete FROM reddit")

<sqlite3.Cursor at 0x7f429faaea40>

In [38]:
cursor.fetchall() 

[]

<sqlite3.Cursor at 0x7f429faaea40>

[(u'intel, nvidia, AyyMD, simracing',)]

In [49]:
related_subs_from_sql('Amd')

u'intel, nvidia, AyyMD, simracing'

In [39]:
def insert_one(subreddit):
    related = spark_top_k_subs(tf_ij, subreddit, subreddit_to_idx, k = 5)
    if related is None:
        return
    val = ', '.join(related[1:])
    
    sql_command = """INSERT INTO reddit (key, related)
    VALUES ("%s", "%s");""" % (subreddit, val)
    cursor.execute(sql_command)

In [19]:
subreddit_mapper.keys()

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [40]:
for sub in idx_mapper.keys():
    insert_one(sub)

key not found 787
key not found 634
key not found 1229
key not found 442
key not found 803


In [50]:
connection.commit()
connection.close()

In [74]:
idx_mapper.keys()

[u'danganronpa',
 u'Roofing',
 u'AsianBeautyAdvice',
 u'AppleWatch',
 u'relationships',
 u'instant_regret',
 u'SteamKiwi',
 u'SamandTolki',
 u'USMCboot',
 u'PRLegacyWars',
 u'Honda',
 u'datingoverthirty',
 u'HistoryPorn',
 u'detroitlions',
 u'2007scape',
 u'fountainpens',
 u'skyrimmods',
 u'PokemonShuffle',
 u'ofcoursethatsathing',
 u'KaynMains',
 u'bestoflegaladvice',
 u'Steam',
 u'Pokemongiveaway',
 u'Freeclams',
 u'HollowKnight',
 u'askgaybros',
 u'SteamGameSwap',
 u'BiggerThanYouThought',
 u'Bayonetta',
 u'starcitizen',
 u'polyamory',
 u'BackYardChickens',
 u'fixit',
 u'totalwar',
 u'entp',
 u'electricians',
 u'AskAcademia',
 u'MechanicAdvice',
 u'kratom',
 u'LongDistance',
 u'Hammers',
 u'razer',
 u'LSD',
 u'AdventureCapitalist',
 u'PlaceNostalgia',
 u'HaloStory',
 u'GCdebatesQT',
 u'bipolar',
 u'gamedev',
 u'Nioh',
 u'Addons4Kodi',
 u'90sAlternative',
 u'Beatmatch',
 u'titanfall',
 u'3DMark',
 u'ModelUSHouse',
 u'AskThe_Donald',
 u'melbourne',
 u'HaggardGarage',
 u'Shoplifting',


In [1249]:
#related_subs(tf_ij, 'DotA2', subreddit_to_idx)

[u'DotA2', u'DestinyTheGame', u'aww', u'funny', u'Rainbow6']

In [None]:
related_subs(tf_ij, 'DotA2', subreddit_to_idx)

In [1143]:
tf_ij.numCols()

1370L

In [504]:
def transform_tf_if(val):
    newval = np.log(np.abs(val) + 1)
    if val < 0:
        return -newval
    else:
        return newval

# TODO: maybe save this for later, when noise can be reduced via dimensionality reduction
log_tf_ij = coordinateMatrixElementwise(tf_ij, transform_tf_if)

a_ij = coordinatematrix_multiply_vector_elementwise(log_tf_ij, entropy_i)

In [37]:
def predict_nobias(ratings, similarity, kind='user'):
    if kind == 'user':
        user_bias = ratings.mean(axis=1)
        ratings = (ratings - user_bias[:, np.newaxis]).copy()
        pred = similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
        pred += user_bias[:, np.newaxis]
    elif kind == 'item':
        item_bias = ratings.mean(axis=0)
        ratings = (ratings - item_bias[np.newaxis, :]).copy()
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        pred += item_bias[np.newaxis, :]
        
    return pred

In [93]:
def predict_topk(ratings, similarity, kind='user', k=40):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in xrange(ratings.shape[0]):
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in xrange(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in xrange(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in xrange(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred

In [38]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [901]:
def top_k_movies(similarity, movie_idx, k=6):
    return [notebook_idx_to_subreddit(x) for x in np.argsort(similarity[movie_idx,:])[:-k-1:-1]]

In [1206]:
#get_mse(predict_nobias, test)

In [756]:
bare_occurrences.toPandas()

Unnamed: 0,rid,uid,tally
0.0,1,173573,1
1.0,2,2510,1
2.0,3,202843,1
3.0,4,71343,2
4.0,5,157008,1
5.0,6,22113,2
6.0,6,29476,11
7.0,6,41712,6
8.0,6,52207,4
9.0,6,68993,9


In [799]:
ndmat = coordinate_matrix_to_ndarr(tf_ij)
#ndmat/=np.mean(ndmat)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
train, test = train_test_split(ndmat)
similarity = fast_similarity(train)

sim [[  1.00000000e+00   1.00000000e-09   1.50000000e+01 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 [  1.00000000e-09   4.90000000e+01   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 [  1.50000000e+01   1.00000000e-09   2.25000000e+02 ...,   1.00000000e-09
    1.00000000e-09   1.00000000e-09]
 ..., 
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e+00
    1.00000000e-09   1.00000000e-09]
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e+00   1.00000000e-09]
 [  1.00000000e-09   1.00000000e-09   1.00000000e-09 ...,   1.00000000e-09
    1.00000000e-09   4.90000000e+01]]
diag [   1.   49.  225. ...,    1.    1.   49.]
[[  1.   7.  15. ...,   1.   1.   7.]]


In [880]:
def run_spark_ndarray(ndmat, subreddit):
    """
    ndmat : np.ndarray
    
    row indices: users
    column indices: subreddits
    """
    sim = similarity_matrix(ndarr_to_coord_array(ndmat.T))
    return spark_top_k_subs(sim, subreddit_to_idx(subreddit, idx_mapper), subreddit_mapper)

In [408]:
def func1(ndmat):
    ndmat = ndmat.copy()
    ndmat /= (epsilon + np.std(ndmat, axis = 0))
    ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)
    return ndmat

In [411]:
matfuncs = [func1]
def findtop_allmodels(sub_name):
    ndmat = coordinate_matrix_to_ndarr(tf_ij)
    ndmat = ndmat.T
    ndmat = ndmat[1:, 1:]
    
    def do_one(func, ndmat):
        ndmat = func(ndmat)
        train, test = train_test_split(ndmat)
        similarity = fast_similarity(train)

        item_similarity = fast_similarity(train, kind = 'item')
        print top_k_movies(item_similarity, subreddit_to_idx(sub_name), k = 10)
    [do_one(f, ndmat) for f in matfuncs]

In [873]:
def generate_ndmat():
    epsilon = 1e-9
    ndmat = coordinate_matrix_to_ndarr(tf_ij)
    ndmat = ndmat.T
    ndmat = ndmat[1:, 1:]

    #ndmat = np.mean(ndmat, axis = 0)
    ndmat /= (epsilon + np.std(ndmat, axis = 0))
    ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)
    
    print md5.md5(pickle.dumps(train)).digest()
    return ndmat

In [1009]:
epsilon = 1e-9
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

#ndmat = np.mean(ndmat, axis = 0)
ndmat /= (epsilon + np.std(ndmat, axis = 0))
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

train, test = train_test_split(ndmat)
similarity = fast_similarity(ndmat)

item_similarity = fast_similarity(ndmat, kind = 'item')

md5.md5(pickle.dumps(train)).digest()

'\x1b\xbb\xc46 O\xe3\xdb\x83\xb2\x15\x84Rt\x00\x06'

In [530]:
import cf_numpy
ndmat = coordinate_matrix_to_ndarr(tf_ij)
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]
subreddit_mapper = dict(act.rdd.map(lambda entry: (entry.ordered_id, entry.subreddit)).collect())

In [537]:
epsilon = 1e-9
ndmat = ndmat.T
ndmat = ndmat[1:, 1:]

# TODO: play with this
ndmat /= (epsilon + np.std(ndmat, axis = 0))
ndmat /= (epsilon + np.std(ndmat.T, axis = 1).T)

train, test = train_test_split(ndmat)
state['item_similarity_full'] = fast_similarity(ndmat, kind = 'item')
state['item_similarity_sampled'] = fast_similarity(train, kind = 'item')
state['subreddit_mapper'] = subreddit_mapper
state['idx_mapper'] = {v: k for k, v in subreddit_mapper.iteritems()}

ValueError: Cannot take a larger sample than population when 'replace=False'

In [567]:
reload(cf_numpy)
cf_numpy.init(ndmat, subreddit_mapper)

In [297]:
predictions = predict_nobias(train, item_similarity, kind = 'item')

Implement LSA

Test stuff

In [94]:
test_coordmat2 = ndarr_to_coord_array(test_array2)

SVD implementation

In [80]:
from random import normalvariate

In [83]:
from random import normalvariate
def random_unit(n):
    vec = np.array([normalvariate(0, 1) for _ in range(n)])
    return vec / np.linalg.norm(vec)

def SVD_1D(A, threshold = 1e-10):
    """
    A : CoordinateMatrix
    """
    n, m = A.numRows(), A.numCols()
    x = random_unit(m)
    # TODO finish this

In [None]:
import numpy as np
from numpy.linalg import norm

def svd_1d(A, epsilon=1e-10):
    ''' The one-dimensional SVD '''
    n, m = A.shape
    x = randomUnitVector(m)
    lastV = None
    currentV = x
    B = np.dot(A.T, A)
 
    iterations = 0
    while True:
        iterations += 1
        lastV = currentV
        currentV = np.dot(B, lastV)
        currentV = currentV / norm(currentV)
 
        if abs(np.dot(currentV, lastV)) > 1 - epsilon:
            print("converged in {} iterations!".format(iterations))
            return currentV