In [1]:
import os
import pandas as pd
import numpy as np

### Loading datasets

In [2]:
data_path = './data'

# define file path
train_file = os.path.join(data_path, 'review_train.csv')
test_file = os.path.join(data_path, 'review_test.csv')
weight_matrix_file = os.path.join(data_path, 'weight_matrix.csv')


In [3]:
# read datasets
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
weight_matrix = pd.read_csv(weight_matrix_file)


### Item-based CF Prediction

In [5]:
# convert the review data into a dictionary {(uid, bid): rating}
review_dict = {(row[0], row[1]): row[2] for row in train_data.values.tolist()}

# map each user_id to a list of business {user_id: list[business_ids]}
user_groups = train_data.groupby('uid')
user_business_dict = {uid: list(user_groups.get_group(uid)['bid']) for uid in user_groups.groups}

# transfer weight_matrix to a dictionary {(bid1, bid2): weight}
weight_matrix_dict = {(row[0], row[1]): row[2] for row in weight_matrix.values.tolist()}


In [24]:
# find the most N correlated business according to the weight matrix

def find_n_nearest_business(target_bid, neighbor_business, n=3):
    
    ### pseudocode ###
    
    # for bid in neighbor_business:
    #     get the correlation between the target_bid and bid
    
    neighbor_business_weight = {}

    for bid in neighbor_business:
        b_pair = tuple(sorted([target_bid, bid]))
        if weight_matrix_dict.get(b_pair):
            neighbor_business_weight[bid] = weight_matrix_dict[b_pair]

    sorted_neighbor_business_weight = sorted(neighbor_business_weight.items(), key=lambda kv: kv[1], reverse=True)
    
    if len(sorted_neighbor_business_weight) > n:
        return sorted_neighbor_business_weight[:n]
    else:
        return sorted_neighbor_business_weight

    # find the most N correlated business
    # return top N business


In [31]:
# compute weighted average over neighborhood set

def weighted_average_prediction(target_user, target_business, nearest_business_weight):
    
    ### pseudocode ###
    
    weighted_sum, sum_weight = 0.0, 0.0
    
    w_list, r_list = [], []
    
    for item in nearest_business_weight:  
        bid, weight = item[0], item[1]
        rating = review_dict[(target_user, bid)]
        w_list.append(weight)
        r_list.append(rating)

    weighted_sum = sum([w_list[i] * r for i, r in enumerate(r_list)]) if len(r_list) > 0 else 0.0
    sum_weight = sum(w_list) if len(w_list) > 0 else 0.0
    
    prediction = weighted_sum / sum_weight if sum_weight != 0.0 else 0.0
    
    #     rating = the rating of (target_user, bid)
    #     weighted_sum += rating * weight
    #     sum_weight += weight
    
    return prediction



In [60]:
# make prediction for each given (user, business) pair in the testing data

results = []
NEIGHBOR_THRE = 3

for pair in test_data.values.tolist():
    
    target_user, target_business = pair[0], pair[1]
    
    if target_user != 't3RHt1j91D8GP5XD5Mvxow' or target_business != 'mDR12Hafvr84ctpsV6YLag':
        continue

    neighbor_business = user_business_dict.get(target_user)
    print(neighbor_business)
    
    if neighbor_business is None:
        results.append([target_user, target_business, 0.0])
        continue

    ### pseudocode ###
    n_nearest_business = find_n_nearest_business(target_business, neighbor_business, n=3)
    print(n_nearest_business)

    prediction =  weighted_average_prediction(target_user, target_business, n_nearest_business)
    
    results.append([target_user, target_business, prediction])
    
    

['JxKWfZ6hG8iRnK8QL65jTw', 'PQER4ba8Q0zqB1G2QiXamQ']
[('JxKWfZ6hG8iRnK8QL65jTw', 1.0)]


In [34]:
results

[['HEvyblFw4I-UsMqgPGYY_Q', 'iA8Ve2sZKN5Vz3mYKrtCaQ', 3.664494905923594],
 ['DNNkLmbwfI0ufKGqQfmvKQ', 'faPVqws-x-5k2CQKDNtHxw', 4.42614776657187],
 ['F_5_UNX-wrAFCXuAkBZRDw', 'UPIYuRaZvknINOd1w8kqRQ', 3.9825510440634764],
 ['ldqh2aWLTW6D2RHDCj_2TA', 'DHUAQ4pzH9KKzGZDm1jZLg', 3.6666666666666665],
 ['f17-l69K0G7WAeTmPHtptw', 'HYKTKG3X7jtLe6elxp63JQ', 0.0],
 ['FAjCZoxiGw9HJKueB8YWTg', 'O7ot_LMlCfLpOP9tBqeNfw', 3.5414754769936088],
 ['leyNDNVu09Ldbg5ujPWMhQ', 'RESDUcs7fIiihp38-d6_6g', 3.6666666666666665],
 ['yobIvoK2taIhooUPSrjuiQ', 'R7LyTeiOHLyTNkA8HssBRQ', 3.3909606591267694],
 ['HrKnIiuaXdl6oj26mleUuA', '4JNXUYY8wbaaDmk3BPzlWw', 4.0],
 ['QnaHUkoOaLQ5ywDJmoZVGQ', 'UmfEj1Q2Ts0tTsC0LGs1Tw', 0.0],
 ['kmE8w5Y785eZmodsx0V6Ag', 'Zrx25j1M794Nh8fUGB8E9A', 3.0],
 ['YQPDBJKKFFseQybsvo2rog', 'YZGSNhgTS6YeyUYoivD-Ww', 3.0],
 ['U-nR7ND8CDN2x8ia73CH2Q', 'CVRTly4ag_q5ua0R3iqrVw', 3.3333333333333335],
 ['UrOofvcBl0gAbKPAEaj83A', 'N7yuiiu8jhQ-Fl9Npflreg', 2.614692547327901],
 ['P3FlF8WA18fTXSj77AX69g', '

In [44]:
# finalize the results
# fill the missing predictions with average values

avg_rating = sum(list(train_data['ratings'])) / len(train_data)

def quick_check(x):
    if x > 5:
        return 5.0
    elif x < 1:
        return 1.0
    else:
        return x

final_results = []

for result in results:
    uid, bid, prediction = result[0], result[1], result[2]
    prediction = avg_rating if prediction == 0.0 else prediction
    prediction = quick_check(prediction)
    final_results.append([uid, bid, prediction])


In [45]:
len(final_results)

36480

In [46]:
len(test_data)

36480

In [47]:
# write the results to a CSV file

results_df = pd.DataFrame(final_results, columns=['uid', 'bid', 'prediction'])
results_file_path = os.path.join(data_path, 'review_prediction.csv')
results_df.to_csv(results_file_path, index=False)


### Evaluation

In [48]:
# load ground truth for the testing data

test_ground_truth_file = os.path.join(data_path, 'review_test_ground_truth.csv')
test_ground_truth_data = pd.read_csv(test_ground_truth_file)

evaluation = test_ground_truth_data.merge(results_df, on=['uid', 'bid'])


In [50]:
print(evaluation[:10])

                      uid                     bid  ratings  prediction
0  HEvyblFw4I-UsMqgPGYY_Q  iA8Ve2sZKN5Vz3mYKrtCaQ      3.0    3.664495
1  HEvyblFw4I-UsMqgPGYY_Q  iA8Ve2sZKN5Vz3mYKrtCaQ      3.0    3.664495
2  HEvyblFw4I-UsMqgPGYY_Q  iA8Ve2sZKN5Vz3mYKrtCaQ      1.0    3.664495
3  HEvyblFw4I-UsMqgPGYY_Q  iA8Ve2sZKN5Vz3mYKrtCaQ      1.0    3.664495
4  DNNkLmbwfI0ufKGqQfmvKQ  faPVqws-x-5k2CQKDNtHxw      5.0    4.426148
5  F_5_UNX-wrAFCXuAkBZRDw  UPIYuRaZvknINOd1w8kqRQ      4.0    3.982551
6  ldqh2aWLTW6D2RHDCj_2TA  DHUAQ4pzH9KKzGZDm1jZLg      3.0    3.666667
7  f17-l69K0G7WAeTmPHtptw  HYKTKG3X7jtLe6elxp63JQ      4.0    3.764051
8  FAjCZoxiGw9HJKueB8YWTg  O7ot_LMlCfLpOP9tBqeNfw      4.0    3.541475
9  leyNDNVu09Ldbg5ujPWMhQ  RESDUcs7fIiihp38-d6_6g      3.0    3.666667


In [57]:
# method 1: count the segments

segments = {0:0, 1:0, 2:0, 3:0, 4:0}

for _, row in evaluation.iterrows():
    if row['delta'] < 1.0:
        segments[0] += 1
    elif 1.0 <= row['delta'] < 2.0:
        segments[1] += 1
    elif 2.0 <= row['delta'] < 3.0:
        segments[2] += 1
    elif 3.0 <= row['delta'] < 4.0:
        segments[3] += 1
    elif row['delta'] >= 4.0:
        segments[4] += 1

print(segments)


{0: 28147, 1: 7838, 2: 1449, 3: 258, 4: 60}


In [58]:

segments = []

for _, row in evaluation.iterrows():
    if row['delta'] >= 4.0:
        segments.append(row.values.tolist())
    
segments


[['t3RHt1j91D8GP5XD5Mvxow', 'mDR12Hafvr84ctpsV6YLag', 5.0, 1.0, 4.0],
 ['qr06IGEdVEIQmyv-NZe-og', 'hihud--QRriCYZw1zZvW4g', 5.0, 1.0, 4.0],
 ['EzXDaJjdCwMtWdBJiRcC6Q', 'GI-CAiZ_Gg3h21PwrANB4Q', 5.0, 1.0, 4.0],
 ['ZjHrVjxHDFiGv_5ufMYs6Q', 'DkYS3arLOhA8si5uUEmHOw', 5.0, 1.0, 4.0],
 ['H8k77jyHPfl38P-5o525WA', '16Fplxu-OwVmTEFxQAUP4g', 5.0, 1.0, 4.0],
 ['mi-Wx6DGzx6loUWbGRW0-A', 'pH0BLkL4cbxKzu471VZnuA', 5.0, 1.0, 4.0],
 ['0uU-AkvZMbzTMSLRtm1m3A', 'Z0YOHYA6YtW131xULbnMzQ', 5.0, 1.0, 4.0],
 ['N4K69ZdyU-B8NzVYxwjK0A', 'EAwh1OmG6t6p3nRaZOW_AA', 5.0, 1.0, 4.0],
 ['DQ5Szn4T14ddEd6F5jHHpg', '364hhL5st0LV16UcBHRJ3A', 5.0, 1.0, 4.0],
 ['fBEwo1ogY45H85oRW86L_w', 'JDZ6_yycNQFTpUZzLIKHUg', 5.0, 1.0, 4.0],
 ['fBEwo1ogY45H85oRW86L_w', 'JDZ6_yycNQFTpUZzLIKHUg', 5.0, 1.0, 4.0],
 ['fBEwo1ogY45H85oRW86L_w', 'JDZ6_yycNQFTpUZzLIKHUg', 5.0, 1.0, 4.0],
 ['fBEwo1ogY45H85oRW86L_w', 'JDZ6_yycNQFTpUZzLIKHUg', 5.0, 1.0, 4.0],
 ['HQnfONrTQKBdHPV7iebtxQ', 'iLlPXAA6NYTgWaG4DgCNEA', 5.0, 1.0, 4.0],
 ['3zczu4YDXk5M27xMQ

In [55]:
segments

[]

In [51]:
# method 2: compute RMSE

evaluation['delta'] = evaluation['ratings'] - evaluation['prediction']

RMSE = (sum(evaluation['delta'] ** 2) / len(evaluation)) ** 0.5
print('RMSE = {}.'.format(RMSE))


RMSE = 1.2209066307426057.
