In [1]:
import os
import pandas as pd
import numpy as np

### Loading datasets

In [2]:
data_path = './data/'

# define file path
train_file = os.path.join(data_path, 'review_train.csv')
test_file = os.path.join(data_path, 'review_test.csv')
weight_matrix_file = os.path.join(data_path, 'weight_matrix.csv')


In [3]:
# read datasets
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
weight_matrix = pd.read_csv(weight_matrix_file)


### Item-based CF Prediction

In [4]:
# convert the review data into a dictionary {(uid, bid): rating}
review_dict = {(row[0], row[1]): row[2] for row in train_data.values.tolist()}

# map each user_id to a list of business {user_id: list[business_ids]}
user_groups = train_data.groupby('uid')
user_business_dict = {uid: list(user_groups.get_group(uid)['bid']) for uid in user_groups.groups}

# transfer weight_matrix to a dictionary {(bid1, bid2): weight}
weight_matrix_dict = {(row[0], row[1]): row[2] for row in weight_matrix.values.tolist()}


In [5]:
# find the most N correlated business according to the weight matrix

WEIGHT_THRESHOLD = 0.0

def find_n_nearest_business(target_bid, neighbor_business, n=3):
    
    neighbor_business_weight = {}
    
    for bid in neighbor_business:
        b_pair = tuple(sorted([target_bid, bid]))
        if weight_matrix_dict.get(b_pair) and weight_matrix_dict[b_pair] > WEIGHT_THRESHOLD:
            neighbor_business_weight[bid] = weight_matrix_dict[b_pair]
            
    sorted_neighbor_business_weight = sorted(neighbor_business_weight.items(), key=lambda kv: kv[1], reverse=True)

    if len(sorted_neighbor_business_weight) >= n: 
        return sorted_neighbor_business_weight[:n]
    elif len(sorted_neighbor_business_weight) <= 1:
        return []
    else:
        return sorted_neighbor_business_weight


In [6]:
# compute weighted average over neighborhood set

def weighted_average_prediction(target_user, target_business, nearest_business):
    
    w_list, r_list = [], []

    for item in nearest_business:
        
        bid, weight = item[0], item[1]
        rating = review_dict[(target_user, bid)]
        w_list.append(weight)
        r_list.append(rating)

#     print(w_list, r_list)
    weighted_sum = sum([w_list[i] * r for i, r in enumerate(r_list)]) if len(r_list) > 0 else 0.0
    sum_weight = sum([abs(x) for x in w_list]) if len(w_list) > 0 else 0.0
    prediction = weighted_sum / sum_weight if sum_weight != 0.0 else 0.0
    
    return prediction


In [7]:
# make prediction for each given (user, business) pair in the testing data

results = []

for pair in test_data.values.tolist():
    
    target_user, target_business = pair[0], pair[1]
    neighbor_business = user_business_dict.get(target_user)
    
    if neighbor_business:
        
        nearest_business = find_n_nearest_business(target_business, neighbor_business, n=3)
        prediction = weighted_average_prediction(target_user,target_business, nearest_business)
        results.append([target_user, target_business, prediction])
    
    else:  # cannot do the prediction if a user never appears in training data => cold start problem
        results.append([target_user, target_business, 0.0])


In [8]:
print(len(results))

36480


In [9]:
len(test_data)

36480

In [10]:
# finalize the results
# fill the missing predictions with average values

avg_rating = sum(list(train_data['ratings'])) / len(train_data)

def quick_check(x):
    if x > 5:
        return 5.0
    elif x < 1:
        return 1.0
    else:
        return x

final_results = []

for result in results:
    uid, bid, prediction = result[0], result[1], result[2]
    prediction = avg_rating if prediction == 0.0 else prediction
    prediction = quick_check(prediction)
    final_results.append([uid, bid, prediction])


In [11]:
# write the results to a CSV file

results_df = pd.DataFrame(final_results, columns=['uid', 'bid', 'prediction'])
results_file_path = os.path.join(data_path, 'review_prediction.csv')
results_df.to_csv(results_file_path, index=False)


### Evaluation

In [12]:
# load ground truth for the testing data

test_ground_truth_file = os.path.join(data_path, 'review_test_ground_truth.csv')
test_ground_truth_data = pd.read_csv(test_ground_truth_file)

evaluation = test_ground_truth_data.merge(results_df, on=['uid', 'bid'])
evaluation['delta'] = evaluation['ratings'] - evaluation['prediction']

RMSE = (sum(evaluation['delta'] ** 2) / len(evaluation)) ** 0.5
print('RMSE = {}.'.format(RMSE))


RMSE = 1.169495745354903.


In [13]:
evaluation[:10]

Unnamed: 0,uid,bid,ratings,prediction,delta
0,HEvyblFw4I-UsMqgPGYY_Q,iA8Ve2sZKN5Vz3mYKrtCaQ,3.0,3.631157,-0.631157
1,HEvyblFw4I-UsMqgPGYY_Q,iA8Ve2sZKN5Vz3mYKrtCaQ,3.0,3.631157,-0.631157
2,HEvyblFw4I-UsMqgPGYY_Q,iA8Ve2sZKN5Vz3mYKrtCaQ,1.0,3.631157,-2.631157
3,HEvyblFw4I-UsMqgPGYY_Q,iA8Ve2sZKN5Vz3mYKrtCaQ,1.0,3.631157,-2.631157
4,DNNkLmbwfI0ufKGqQfmvKQ,faPVqws-x-5k2CQKDNtHxw,5.0,4.426148,0.573852
5,F_5_UNX-wrAFCXuAkBZRDw,UPIYuRaZvknINOd1w8kqRQ,4.0,4.260522,-0.260522
6,ldqh2aWLTW6D2RHDCj_2TA,DHUAQ4pzH9KKzGZDm1jZLg,3.0,3.666667,-0.666667
7,f17-l69K0G7WAeTmPHtptw,HYKTKG3X7jtLe6elxp63JQ,4.0,3.764051,0.235949
8,FAjCZoxiGw9HJKueB8YWTg,O7ot_LMlCfLpOP9tBqeNfw,4.0,3.541475,0.458525
9,leyNDNVu09Ldbg5ujPWMhQ,RESDUcs7fIiihp38-d6_6g,3.0,3.562917,-0.562917


In [14]:
segments = {0:0, 1:0, 2:0, 3:0, 4:0}

for _, row in evaluation.iterrows():
    if row['delta'] < 1.0:
        segments[0] += 1
    elif 1.0 <= row['delta'] < 2.0:
        segments[1] += 1
    elif 2.0 <= row['delta'] < 3.0:
        segments[2] += 1
    elif 3.0 <= row['delta'] < 4.0:
        segments[3] += 1
    elif row['delta'] >= 4.0:
        segments[4] += 1

print(segments)


{0: 28302, 1: 8328, 2: 995, 3: 121, 4: 6}


In [12]:
data_path = './data'

# define file path
review_file = os.path.join(data_path, 'review_train.csv')
business_file = os.path.join(data_path, 'sub_business.csv')

review_data = pd.read_csv(review_file)
business_data = pd.read_csv(business_file)

reviews_with_location = review_data.merge(business_data, on='bid')
reviews_with_location_file_path = os.path.join(data_path, 'reviews_with_location.csv')
reviews_with_location.to_csv(reviews_with_location_file_path, index=False)
