In [2]:
import os
import pandas as pd
import numpy as np

### Loading datasets

In [7]:
# Root directory for every CSV this notebook reads or writes.
# Set the RECSYS_DATA_DIR environment variable to point the notebook at another
# directory; when it is unset the original author's local directory is used,
# so existing runs behave exactly as before.
data_path = os.environ.get(
    'RECSYS_DATA_DIR',
    '/Users/yijunlin/PycharmProjects/recommendation_system/data/',
)

# define file path
train_file = os.path.join(data_path, 'review_train.csv')
test_file = os.path.join(data_path, 'review_test.csv')
weight_matrix_file = os.path.join(data_path, 'weight_matrix.csv')


In [8]:
# Load the training and testing reviews and shorten their column names in the
# same step; the weight matrix is loaded as-is.
train_data = pd.read_csv(train_file).rename(
    columns={'user_id': 'uid', 'business_id': 'bid', 'stars': 'rating'}
)
test_data = pd.read_csv(test_file).rename(
    columns={'user_id': 'uid', 'business_id': 'bid'}
)
weight_matrix = pd.read_csv(weight_matrix_file)


### Item-based CF Prediction

In [33]:
# convert the review data into a dictionary {(uid, bid): rating}
# Columns are addressed by name, so the lookup does not depend on the column
# order of the training CSV and each value keeps its own column's type.
review_dict = {
    (uid, bid): rating
    for uid, bid, rating in zip(train_data['uid'], train_data['bid'], train_data['rating'])
}

# map each user_id to a list of business {user_id: list[business_ids]}
# One grouped pass collects every user's businesses (in their original row
# order) instead of fetching each group separately.
user_groups = train_data.groupby('uid')
user_business_dict = user_groups['bid'].apply(list).to_dict()

# transfer weight_matrix to a dictionary {(bid1, bid2): weight}
# The weight matrix columns are read by position (first two columns form the
# key, the third is the weight) because their names are not fixed here.
weight_matrix_dict = {
    (row[0], row[1]): row[2]
    for row in weight_matrix.itertuples(index=False, name=None)
}


In [34]:
# find the most N correlated business according to the weight matrix

def find_n_nearest_business(target_bid, neighbor_business, n=3):
    """Return up to ``n`` ``(bid, weight)`` pairs with the largest weights.

    Each business in ``neighbor_business`` is paired with ``target_bid`` and
    looked up in ``weight_matrix_dict`` under the sorted ``(bid1, bid2)`` key.
    Pairs that have no entry, or whose stored weight is falsy (e.g. 0.0), are
    skipped. The result is ordered from highest to lowest weight and may hold
    fewer than ``n`` items.
    """
    weights_by_bid = {}

    for candidate in neighbor_business:
        pair_key = tuple(sorted((target_bid, candidate)))
        pair_weight = weight_matrix_dict.get(pair_key)
        if not pair_weight:
            continue
        weights_by_bid[candidate] = pair_weight

    ranked = list(weights_by_bid.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    # slicing already returns the whole list when it holds fewer than n items
    return ranked[:n]


In [35]:
# compute weighted average over neighborhood set

def weighted_average_prediction(target_user, target_business, nearest_business):
    """Predict a rating as the weighted average of the user's neighbor ratings.

    Computes ``sum(weight * rating) / sum(abs(weight))`` over the given
    neighbors, where each rating is read from ``review_dict`` under the key
    ``(target_user, bid)``.

    Parameters
    ----------
    target_user : user id whose ratings are looked up in ``review_dict``.
    target_business : business being predicted; not used in the computation,
        kept so the call signature stays unchanged.
    nearest_business : iterable of ``(bid, weight)`` items.

    Returns
    -------
    float
        The weighted average, or 0.0 when there are no neighbors or the
        absolute weights sum to zero.

    Raises
    ------
    KeyError
        If ``(target_user, bid)`` is missing from ``review_dict``.
    """
    weighted_sum = 0.0
    sum_weight = 0.0

    for item in nearest_business:
        bid, weight = item[0], item[1]
        # apply each weight exactly once; multiplying by it again would turn
        # the numerator into sum(weight**2 * rating)
        weighted_sum += review_dict[(target_user, bid)] * weight
        sum_weight += abs(weight)

    return weighted_sum / sum_weight if sum_weight != 0.0 else 0.0


In [36]:
# make prediction for each given (user, business) pair in the testing data
# every row of `results` is [user, business, prediction]

results = []

for pair in test_data.values.tolist():

    target_user, target_business = pair[0], pair[1]
    neighbor_business = user_business_dict.get(target_user)

    if not neighbor_business:
        # cannot do the prediction if a user never appears in training data => cold start problem
        results.append([target_user, target_business, 0.0])
        continue

    # rank the businesses this user reviewed by their weight to the target,
    # then average the user's ratings over the top ones
    nearest_business = find_n_nearest_business(target_business, neighbor_business)
    prediction = weighted_average_prediction(target_user, target_business, nearest_business)
    results.append([target_user, target_business, prediction])


In [None]:
# finalize the results
# fill the missing predictions with average values

# Mean training rating, computed with pandas' vectorized Series.mean(), which
# skips missing ratings by default instead of letting one NaN make the whole
# average NaN. Converted to a plain Python float for the final results.
avg_rating = float(train_data['rating'].mean())

def quick_check(x):
    """Clamp ``x`` to the range [1, 5].

    Values greater than 5 come back as 5.0 and values less than 1 as 1.0;
    anything else is returned unchanged.
    """
    # x is passed first to max/min so that a value already inside the range
    # (including exactly 1 or 5) is handed back as the same object
    return min(max(x, 1.0), 5.0)

# A prediction of exactly 0.0 is replaced by the average rating; every
# prediction is then passed through quick_check.
final_results = [
    [uid, bid, quick_check(avg_rating if prediction == 0.0 else prediction)]
    for uid, bid, prediction in results
]


In [37]:
# write the results to a CSV file

# destination: review_prediction.csv inside the data directory
results_file_path = os.path.join(data_path, 'review_prediction.csv')

# one row per (uid, bid) pair; the DataFrame index is not written out
results_df = pd.DataFrame(final_results, columns=['uid', 'bid', 'prediction'])
results_df.to_csv(results_file_path, index=False)


In [None]:
# NOTE(review): this rebinds `data_path`, which an earlier cell set to an
# absolute directory; any cell run after this one sees the relative './data'.
data_path = './data'

# define file path
review_file = os.path.join(data_path, 'review_train.csv')
business_file = os.path.join(data_path, 'sub_business.csv')

# load the training reviews and the business table
review_data = pd.read_csv(review_file)
business_data = pd.read_csv(business_file)

# Join business_data onto agg_review using their shared 'bid' column.
# NOTE(review): `agg_review` is not defined in any cell visible here, so this
# line raises NameError on a fresh kernel unless it is created elsewhere —
# TODO confirm where it comes from. `review_data` is loaded above but is not
# used in this cell; presumably agg_review was meant to be derived from it.
output_review = agg_review.merge(business_data, on='bid')
