In [21]:
import time
import os
import pandas as pd
import numpy as np

### Loading training and testing review data

In [22]:
data_path = './data/'

# define file path
train_file = os.path.join(data_path, 'review_train.csv')

# read datasets
train_data = pd.read_csv(train_file)


### Generating business pairs for the Pearson correlation

In [23]:
# notice that the similarity of (item1, item2) is the same as (item2, item1)
# reduce the number of pairs in order to reduce the computation

businesses = list(set(train_data['bid']))
business_pairs = []

for i in range(len(businesses)):
    for j in range((i + 1), len(businesses)):
        business_pairs.append(sorted([businesses[i], businesses[j]]))
    

In [24]:
business_pairs[:5]

[['sKA6EOpxvBtCg7Ipuhl1RQ', 'xDvl_i1g1HCJ4EmIRmwfqg'],
 ['RNi6tW22UMgHwWLAb0mYdA', 'sKA6EOpxvBtCg7Ipuhl1RQ'],
 ['e2ApirIzYID9xIye0r_gKQ', 'sKA6EOpxvBtCg7Ipuhl1RQ'],
 ['WXSsJIO_uGGSxS9qC8x1gQ', 'sKA6EOpxvBtCg7Ipuhl1RQ'],
 ['l_GV0hgEoTUf70uJVT0_hg', 'sKA6EOpxvBtCg7Ipuhl1RQ']]

### Computing the weight matrix (Pearson similarity)

In [11]:
# convert the review data into a dictionary {(uid, bid): rating}
review_dict = {(row[0], row[1]): row[2] for row in train_data.values.tolist()}

# map each business_id to a list of users rated on that {business_id: list[user_ids]}
business_groups = train_data.groupby('bid')
business_user_dict = {bid: list(business_groups.get_group(bid)['uid']) for bid in business_groups.groups}


In [12]:
# define a function to find common (co-rated) user list for two given businesses

def find_co_rated_users(bid_1, bid_2):
        
    user_list_1 = business_user_dict.get(bid_1, [])
    user_list_2 = business_user_dict.get(bid_2, [])
    co_rated_users = list(set(user_list_1).intersection(set(user_list_2)))
    return co_rated_users


In [17]:
# define a function to compute Pearson correlation

import math

def compute_pearson_correlation(rating_list_1, rating_list_2):

    n_ele = len(rating_list_1)
    avg_rating_1 = float(sum(rating_list_1))/float(len(rating_list_1))
    avg_rating_2 = float(sum(rating_list_2))/float(len(rating_list_2))

    var_star_1 = [x - avg_rating_1 for x in rating_list_1]
    var_star_2 = [x - avg_rating_2 for x in rating_list_2]
    weight_sum, weight_1, weight_2 = 0.0, 0.0, 0.0

    for i in range(n_ele):
        weight_sum += var_star_1[i] * var_star_2[i]
        weight_1 += var_star_1[i] * var_star_1[i]
        weight_2 += var_star_2[i] * var_star_2[i]

    if weight_1 == 0.0 or weight_2 == 0.0:
        return 0.0
    else:
        return weight_sum / math.sqrt(weight_1) / math.sqrt(weight_2)



In [18]:
# compute the Pearson correlation for each business pair

import time

weight_matrix = []
PEARSON_THRED = 0.1

start_time = time.time()

for pair in business_pairs:
    
    bid_1, bid_2 = pair[0], pair[1]
    
    # fine the co-rated users
    co_rated_users = find_co_rated_users(bid_1, bid_2) 
    
    if len(co_rated_users) <= 1:
        continue
    
    # get the rating list of the co-rated users
    rating_list_1 = [review_dict[(u, bid_1)]for u in co_rated_users]
    rating_list_2 = [review_dict[(u, bid_2)] for u in co_rated_users]

    # compute the Pearson correlation
    weight = compute_pearson_correlation(rating_list_1, rating_list_2)

    if weight > PEARSON_THRED:  # you can set some threshold to filter the low correlated values
        weight_matrix.append([bid_1, bid_2, weight])

print('Time for computing weight matrix = {}.'.format(time.time() - start_time))


Time for computing weight matrix = 18.731312036514282.


In [19]:
weight_matrix[:5]

[['WXSsJIO_uGGSxS9qC8x1gQ', 'sKA6EOpxvBtCg7Ipuhl1RQ', 0.9999999999999999],
 ['l_GV0hgEoTUf70uJVT0_hg', 'sKA6EOpxvBtCg7Ipuhl1RQ', 0.1720618004029213],
 ['GJ_bXUPv672YwNg4TneJog', 'sKA6EOpxvBtCg7Ipuhl1RQ', 0.30151134457776363],
 ['Yqgyx8SJ5SqFYc-4yH6Z1g', 'sKA6EOpxvBtCg7Ipuhl1RQ', 0.8660254037844385],
 ['-95mbLJsa0CxXhpaNL4LvA', 'sKA6EOpxvBtCg7Ipuhl1RQ', 0.7071067811865475]]

In [16]:
# write the results to a CSV file

weight_matrix_df = pd.DataFrame(weight_matrix, columns=['bid1', 'bid2', 'corr'])
weight_matrix_file_path = os.path.join(data_path, 'weight_matrix.csv')
weight_matrix_df.to_csv(weight_matrix_file_path, index=False)
