In [130]:
train_dataset = "/Users/arun/Downloads/Restaurants/train/PA_train_yelp_academic_dataset_review.csv"
test_dataset='/Users/arun/Downloads/Restaurants/test/PA_test_yelp_academic_dataset_review.csv'
valid_dataset = '/Users/arun/Downloads/Restaurants/valid/PA_valid_yelp_academic_dataset_review.csv'

In [131]:
import pyspark
from pyspark import SparkContext

In [163]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("graph analysis").getOrCreate()

In [133]:
sc = spark.sparkContext

In [134]:
pyspark.SparkContext.setCheckpointDir(sc, '/tmp/spark-checkpoints')

In [135]:
train_df = spark.read.csv(train_dataset, header=True, quote='"', escape='"', multiLine=True)
valid_df = spark.read.csv(valid_dataset, header=True, quote='"', escape='"', multiLine=True)
test_df = spark.read.csv(test_dataset, header=True, quote='"', escape='"', multiLine=True)

In [136]:
train_df.columns

['funny',
 'user_id',
 'review_id',
 'text',
 'business_id',
 'stars',
 'date',
 'useful',
 'cool',
 '1overN',
 '2overN',
 'percentile\r']

In [137]:
train_df.count()

84887

In [138]:
test_df.count()

65591

In [139]:
valid_df.count()

27255

In [140]:
train_rdd = train_df.select("user_id", "business_id", "stars").rdd.map(lambda (user, business, star): (user, business, int(star)))
test_rdd = test_df.select("user_id", "business_id", "stars").rdd.map(lambda (user, business, star): (user, business, int(star)))

In [141]:
train_rdd.take(5)

[(u'eG6HneK9zLcuZpVuKcsCGQ', u'XqNDr54eLDLRfZwo4l4dVA', 4),
 (u'AlzerMK7z84E4KU6GjPzIQ', u'PyTHy9VPOhBCiGLsi-PA2Q', 3),
 (u'AlzerMK7z84E4KU6GjPzIQ', u'zzwhN7x37nyjP0ZM8oiHmw', 4),
 (u'AlzerMK7z84E4KU6GjPzIQ', u'Ul6JwluSTm12PVDIqnNaTg', 4),
 (u'AlzerMK7z84E4KU6GjPzIQ', u'2Ezp_HYCIVE-h7hpBBvtxw', 4)]

In [142]:
def preprocess_data(train, test):
    
    train_dev_test = sc.union([train, test])
    
    user_index = train_dev_test.map(lambda x: x[0]).distinct().zipWithIndex()
    bus_index = train_dev_test.map(lambda x: x[1]).distinct().zipWithIndex()
    
    train_index = train.map(lambda x: (x[0], (x[1], x[2]))).join(user_index)\
    .map(lambda x: (x[1][0][0], (x[1][0][1], x[1][1]))).join(bus_index)\
    .map(lambda x: (x[1][0][1], x[1][1], int(x[1][0][0])))
    
    test_index = test.map(lambda x: (x[0], (x[1], x[2]))).join(user_index)\
    .map(lambda x: (x[1][0][0], (x[1][0][1], x[1][1]))).join(bus_index)\
    .map(lambda x: (x[1][0][1], x[1][1], int(x[1][0][0])))
    
    return (train_index, test_index, user_index, bus_index)

In [143]:
train, test, user_index, bus_index = preprocess_data(train_rdd, test_rdd)

In [144]:
train.checkpoint()
# dev.checkpoint()
test.checkpoint()

In [145]:
train.take(5)

[(50221, 932, 5),
 (11002, 932, 2),
 (48676, 932, 5),
 (14468, 932, 5),
 (49833, 932, 5)]

## Compute weight matrix

In [146]:
from scipy import sparse
import numpy as np

In [147]:
rcv = np.array(train.collect())

In [148]:
user_indexes = rcv[:, 0]
business_indexes = rcv[:, 1]
ratings = rcv[:, 2]

In [149]:
num_users = max(user_indexes) + 1
num_businesses = max(business_indexes) + 1

In [150]:
weight_matrix = sparse.csr_matrix((ratings, (user_indexes, business_indexes)), shape=(num_users, num_businesses))

In [151]:
weight_matrix.shape

(56141, 3782)

## User and business diagonal vectors

In [152]:
d_user = np.array(train.map(lambda x: (x[0], x[2])).reduceByKey(lambda x, y: x+y).mapValues(lambda x: 1/np.sqrt(x)).collect())

In [153]:
user_diag_matrix = sparse.csr_matrix((d_user[:, 1], (d_user[:,0], d_user[:,0])), shape=(num_users, num_users))

In [154]:
d_business = np.array(train.map(lambda x: (x[1], x[2])).reduceByKey(lambda x, y: x+y).mapValues(lambda x: 1/np.sqrt(x)).collect())
bus_diag_matrix = sparse.csr_matrix((d_business[:, 1], (d_business[:,0], d_business[:,0])), shape=(num_businesses, num_businesses))


## Product of all 3 matrices

In [155]:
%%time
S = sparse.csr_matrix.dot((sparse.csr_matrix.dot(user_diag_matrix, weight_matrix)), bus_diag_matrix)

CPU times: user 8.46 ms, sys: 45.4 ms, total: 53.9 ms
Wall time: 167 ms


In [72]:
## Is it ok to initialize all values to less than 0


In [156]:
p0 = np.zeros(num_businesses)
u0 = np.zeros(num_users)

In [157]:
np.random.seed(100)

p = np.random.rand(num_businesses)
u = np.random.rand(num_users)

print p
print u
alpha = 0.9
beta = 0.9


for i in range(2000):
    p_new = alpha * sparse.csc_matrix.dot(S.T, u) + (1 - alpha) * p0
    u_new = beta * sparse.csr_matrix.dot(S, p) + (1 - beta) * u0
    
    # Change
    if i%500 == 0:
        print np.linalg.norm(p_new - p)
        print np.linalg.norm(u_new - u)
    
    p = p_new
    u = u_new

[0.54340494 0.27836939 0.42451759 ... 0.32243079 0.00827528 0.97578537]
[0.27138646 0.36400738 0.15940905 ... 0.06023534 0.44394428 0.6714796 ]
32.26305413906152
131.85173805941514
2.235201699724393e-22
3.1379647877223362e-22
2.955094942859544e-45
4.148611678406025e-45
3.906844792749965e-68
5.484758441377981e-68


In [99]:
b_index = dict()
reverse_b_index = dict()

for row in bus_index.collect():
    b_index[row[0]] = row[1]
    reverse_b_index[row[1]] = row[0]

In [172]:
u_index = dict()
reverse_u_index = dict()

for row in user_index.collect():
    u_index[row[0]] = row[1]
    reverse_u_index[row[1]] = row[0]

In [124]:
most_popular_business = np.argmax(p)
np.sum(weight_matrix[:,most_popular_business].toarray())

5583

In [125]:
most_active_user = np.argmax(u)
np.sum(weight_matrix[most_active_user].toarray())

3618

## Predict top 50 business for each business in test

In [160]:
test_user_set = set()

for (user, business, rating) in test.collect():
    test_user_set.add(user)

In [189]:
%%time

predictions = dict()
count = 0
for user in test_user_set:
    count += 1
    u0 = get_user_prior(user)
    p0 = np.zeros(num_businesses)

    p = np.random.rand(num_businesses)
    u = np.random.rand(num_users)

    alpha = 0.1
    beta = 0.2

    for i in range(500):
        p_new = alpha * sparse.csc_matrix.dot(S.T, u) + (1 - alpha) * p0
        u_new = beta * sparse.csr_matrix.dot(S, p) + (1 - beta) * u0

        # Change
#         if i%500 == 0:
#             print np.linalg.norm(p_new - p)
#             print np.linalg.norm(u_new - u)

        p = p_new
        u = u_new
        
    predictions[user] = get_business_predictions(p)
    
    if count > 100:
        break

CPU times: user 1min 8s, sys: 940 ms, total: 1min 9s
Wall time: 1min 19s


In [190]:
write_predictions(predictions)

In [166]:
def get_user_prior(user_id):
    u0 = np.zeros(num_users)
    t_f = train.filter(lambda (x, y, z): x==user_id)
    for (u, b, r) in t_f.collect():
        u0[b] = r
    
    return u0

In [178]:
def get_business_predictions(b_vector):
    return np.argsort(b_vector)[-50:]

In [180]:
def write_predictions(predictions):
    with open("birank_predictions.txt", "w") as out:
        for user in predictions:
            res = [reverse_u_index[user]]
            for pred in predictions[user]:
                res.append(reverse_b_index[pred])
            out.write(",".join(res) + '\n')