In [30]:
import time
import os
import pandas as pd
import numpy as np

### Loading training and testing review data

In [31]:
# directory that holds the review CSV files
data_path = './data/'

# full paths of the training and testing review files
train_file, test_file = (
    os.path.join(data_path, file_name)
    for file_name in ('review_train.csv', 'review_test.csv')
)


In [32]:
# load the training and testing reviews into DataFrames (training file first)
train_data, test_data = [pd.read_csv(csv_path) for csv_path in (train_file, test_file)]


### Generating business pairs for the Pearson correlation

In [33]:
# The similarity of (item1, item2) is the same as that of (item2, item1), so
# each pair is stored in sorted order and kept only once to reduce computation.

# users that appear in both the training and the testing data
users = list(set(train_data['uid']).intersection(set(test_data['uid'])))
train_user_group = train_data.groupby('uid')
test_user_group = test_data.groupby('uid')

# Collect the pairs straight into a set: duplicates are dropped as they are
# produced instead of first being accumulated in one large list.
business_pairs_set = set()
for uid in users:
    train_b_list = list(train_user_group.get_group(uid)['bid'])
    test_b_list = list(test_user_group.get_group(uid)['bid'])
    # pair every business the user rated in training with every business the
    # same user has in testing, skipping a business paired with itself
    business_pairs_set.update(
        tuple(sorted((a, b)))
        for a in train_b_list
        for b in test_b_list
        if a != b
    )

# downstream cells expect a list of two-element lists
business_pairs = [list(x) for x in business_pairs_set]


### Computing the weight matrix (Pearson similarity)

In [34]:
# build a lookup table of ratings: {(uid, bid): rating}
# NOTE(review): relies on the first three columns being uid, bid, rating in
# that order -- confirm against the CSV header
review_dict = {}
for row in train_data.values.tolist():
    review_dict[(row[0], row[1])] = row[2]

# for every business, the list of users who rated it: {bid: [uid, ...]}
business_groups = train_data.groupby('bid')
business_user_dict = {}
for bid in business_groups.groups:
    business_user_dict[bid] = list(business_groups.get_group(bid)['uid'])


In [35]:
# define a function to find common (co-rated) user list for two given businesses

def find_co_rated_users(bid_1, bid_2):
    """Return the set of users who rated both ``bid_1`` and ``bid_2``.

    Both businesses are looked up in the module-level ``business_user_dict``;
    a business that is not in it contributes no users, so the result is then
    an empty set.
    """
    raters_1 = set(business_user_dict.get(bid_1, []))
    raters_2 = set(business_user_dict.get(bid_2, []))
    return raters_1 & raters_2


In [36]:
# define a function to compute Pearson correlation

import math

def compute_pearson_correlation(rating_list_1, rating_list_2):
    """Compute the Pearson correlation between two aligned rating lists.

    Args:
        rating_list_1: numeric ratings of the first item.
        rating_list_2: numeric ratings of the second item, aligned
            index-by-index with ``rating_list_1``.

    Returns:
        The Pearson correlation coefficient as a float. Returns 0.0 when the
        lists are empty or when either list has zero variance, because the
        coefficient is undefined in those cases.

    Raises:
        ValueError: if the two lists have different lengths.
    """
    n_ele = len(rating_list_1)
    if n_ele != len(rating_list_2):
        raise ValueError('rating lists must have the same length')
    if n_ele == 0:
        # nothing to correlate; also avoids dividing by zero below
        return 0.0

    avg_rating_1 = float(sum(rating_list_1)) / float(n_ele)
    avg_rating_2 = float(sum(rating_list_2)) / float(n_ele)

    # deviation of every rating from the mean of its own list
    var_star_1 = [x - avg_rating_1 for x in rating_list_1]
    var_star_2 = [x - avg_rating_2 for x in rating_list_2]

    # Every element contributes to all three sums. Skipping the elements whose
    # cross product is zero would also drop their squared deviations from the
    # denominator and inflate the coefficient.
    weight_sum = sum(d1 * d2 for d1, d2 in zip(var_star_1, var_star_2))
    weight_1 = sum(d1 * d1 for d1 in var_star_1)
    weight_2 = sum(d2 * d2 for d2 in var_star_2)

    if weight_1 == 0.0 or weight_2 == 0.0:
        return 0.0
    return weight_sum / math.sqrt(weight_1) / math.sqrt(weight_2)



In [37]:
# compute the Pearson correlation for each business pair

import time

# a pair is kept only when its correlation is strictly above this threshold
PEARSON_THRED = 0.1
weight_matrix = []

start_time = time.time()

for bid_1, bid_2 in business_pairs:

    # find the users who rated both businesses
    co_rated_users = find_co_rated_users(bid_1, bid_2)

    # pairs with fewer than two co-rated users are skipped
    if len(co_rated_users) > 1:

        # ratings of both businesses, listed in the same user order
        rating_list_1 = [review_dict[(u, bid_1)] for u in co_rated_users]
        rating_list_2 = [review_dict[(u, bid_2)] for u in co_rated_users]

        weight = compute_pearson_correlation(rating_list_1, rating_list_2)

        # filter out the weakly correlated pairs
        if weight > PEARSON_THRED:
            weight_matrix.append([bid_1, bid_2, weight])

elapsed = time.time() - start_time
print('Time for computing weight matrix = {}.'.format(elapsed))


Time for computing weight matrix = 74.5968849658966.


In [39]:
# save the weight matrix as a CSV file (one row per business pair, no index column)
weight_matrix_file_path = os.path.join(data_path, 'weight_matrix.csv')
weight_matrix_df = pd.DataFrame(data=weight_matrix, columns=['bid1', 'bid2', 'corr'])
weight_matrix_df.to_csv(path_or_buf=weight_matrix_file_path, index=False)
