In [2]:
import os
import pandas as pd
import numpy as np

### Loading the training review data

In [3]:
# base directory that holds the CSV files used by this notebook
data_path = './data/'

# full path of the training review file
train_file = os.path.join(data_path, 'review_train.csv')


In [4]:
# load the training reviews and shorten the column names in one chained step
train_data = pd.read_csv(train_file).rename(
    columns={'user_id': 'uid', 'business_id': 'bid', 'stars': 'rating'}
)


### Generating business pairs for the Pearson correlation

In [6]:
# The similarity of (item1, item2) is the same as that of (item2, item1), so
# only unordered pairs are generated, which halves the computation.
#
# The unique ids are sorted once up front: iterating a bare set of strings can
# yield a different order on every kernel start (string hash randomization),
# which would make the order of `business_pairs` -- and of everything derived
# from it -- change between runs. With sorted ids, every
# [businesses[i], businesses[j]] with i < j is already in ascending order, so
# no per-pair sort is needed.

businesses = sorted(set(train_data['bid']))

# each pair is a two-element list [smaller_bid, larger_bid]
business_pairs = [
    [bid_a, bid_b]
    for idx, bid_a in enumerate(businesses)
    for bid_b in businesses[idx + 1:]
]
    

### Computing the weight matrix (Pearson similarity)

In [9]:
# convert the review data into a dictionary {(uid, bid): rating}
# Columns are selected by name rather than by position, so the mapping stays
# correct whatever the column order of the CSV (or if it has extra columns).
review_dict = {
    (uid, bid): rating
    for uid, bid, rating in zip(train_data['uid'], train_data['bid'], train_data['rating'])
}

# map each business id to the list of users who rated it {business_id: list[user_ids]}
# a single pass over the groups avoids one get_group() lookup per business
business_groups = train_data.groupby('bid')
business_user_dict = {bid: list(group['uid']) for bid, group in business_groups}


In [None]:
# define a function to find common (co-rated) user list for two given businesses

def find_co_rated_users(bid_1, bid_2):
    """Return the users who rated both ``bid_1`` and ``bid_2``.

    The user lists are looked up in the module-level ``business_user_dict``
    ({business_id: list[user_ids]}).

    Parameters
    ----------
    bid_1, bid_2 : business ids to compare.

    Returns
    -------
    list
        User ids present in both businesses' user lists, without duplicates,
        in the order they first appear in the list of ``bid_1``. An empty list
        is returned when either business is unknown or nothing is shared.
    """
    # a business missing from the dictionary simply has no raters
    user_list_1 = business_user_dict.get(bid_1, [])
    user_list_2 = business_user_dict.get(bid_2, [])

    # set membership keeps the intersection linear in the list sizes
    users_2 = set(user_list_2)

    # dict.fromkeys() drops repeated user ids while preserving order, so the
    # result is deterministic (a plain set intersection would not be)
    co_rated_users = [uid for uid in dict.fromkeys(user_list_1) if uid in users_2]
    return co_rated_users


In [None]:
# define a function to compute Pearson correlation

import math

def compute_pearson_correlation(rating_list_1, rating_list_2):
    """Compute the Pearson correlation between two aligned rating lists.

    ``rating_list_1[k]`` and ``rating_list_2[k]`` are expected to be the
    ratings that the same user gave to the two items being compared.

    Parameters
    ----------
    rating_list_1, rating_list_2 : sequences of numbers of equal length.

    Returns
    -------
    float
        The correlation in [-1.0, 1.0]. ``0.0`` is returned when the lists are
        empty or when either list has zero variance, because the correlation
        is undefined there and the formula would divide by zero.

    Raises
    ------
    ValueError
        If the two lists have different lengths.
    """
    count = len(rating_list_1)
    if count != len(rating_list_2):
        raise ValueError('rating lists must have the same length')
    if count == 0:
        return 0.0

    avg_rating_1 = sum(rating_list_1) / count
    avg_rating_2 = sum(rating_list_2) / count

    # weight_sum is the numerator (co-deviation); weight_1 and weight_2 are the
    # squared deviations whose square roots form the denominator
    weight_sum, weight_1, weight_2 = 0.0, 0.0, 0.0
    for rating_1, rating_2 in zip(rating_list_1, rating_list_2):
        diff_1 = rating_1 - avg_rating_1
        diff_2 = rating_2 - avg_rating_2
        weight_sum += diff_1 * diff_2
        weight_1 += diff_1 * diff_1
        weight_2 += diff_2 * diff_2

    # a constant rating list has no variance: correlation undefined, report 0.0
    if weight_1 == 0.0 or weight_2 == 0.0:
        return 0.0

    correlation = weight_sum / math.sqrt(weight_1) / math.sqrt(weight_2)

    # clamp away floating-point overshoot just outside [-1, 1]
    return max(-1.0, min(1.0, correlation))


In [None]:
# compute the Pearson correlation for each business pair

PEARSON_THRED = 0.1  # only correlations strictly above this value are kept
weight_matrix = []

for pair in business_pairs:

    bid_1, bid_2 = pair[0], pair[1]

    co_rated_users = find_co_rated_users(bid_1, bid_2)

    if len(co_rated_users) <= 1:
        # a correlation needs at least two co-rated users
        weight = 0.0
    else:
        # ratings of the co-rated users on each business, aligned by user
        rating_list_1 = [review_dict[(uid, bid_1)] for uid in co_rated_users]
        rating_list_2 = [review_dict[(uid, bid_2)] for uid in co_rated_users]

        if len(set(rating_list_1)) == 1 or len(set(rating_list_2)) == 1:
            # constant ratings have zero variance: the correlation is
            # undefined (division by zero), so treat the pair as uncorrelated
            weight = 0.0
        else:
            weight = compute_pearson_correlation(rating_list_1, rating_list_2)

    if weight > PEARSON_THRED:  # you can set some threshold to filter the low correlated values
        weight_matrix.append([bid_1, bid_2, weight])  # (bid_1, bid_2) is in alphabetical order


In [None]:
# persist the filtered correlations as a CSV file next to the input data

weight_matrix_file_path = os.path.join(data_path, 'weight_matrix.csv')

# one row per kept pair: the two business ids and their correlation
weight_matrix_df = pd.DataFrame(data=weight_matrix, columns=['bid1', 'bid2', 'corr'])
weight_matrix_df.to_csv(path_or_buf=weight_matrix_file_path, index=False)
