In [10]:
import numpy as np
import pandas as pd
import warnings
import random
from scipy import sparse
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import mean_squared_error



In [11]:
warnings.filterwarnings('ignore')

def seed_everything(random_seed):
    np.random.seed(random_seed)
    random.seed(random_seed)

seed = 1
seed_everything(seed)

In [30]:
# dataset loading
train_data = pd.read_csv('./New_data/train_data.csv', sep=',')
train, valid = train_test_split(train_data, test_size=0.15, stratify=train_data['user_id'], random_state=1234)

test_data = pd.read_csv('./New_data/test_data.csv', sep=',')
testID_lookup = pd.read_csv('./New_data/testID_lookup.csv', sep=',')
test = pd.DataFrame(columns=['user_id','item_id','rating'])
test['user_id'] = test_data['user_id']
test['item_id'] = test_data['item_id']
test['rating'] = 0

user_list = list(np.union1d(train_data['user_id'].unique(), test_data['user_id'].unique()))
item_list = list(np.union1d(train_data['item_id'].unique(), test_data['item_id'].unique()))

num_users = len(user_list)
num_items = len(item_list)
print(f"# of users: {num_users},  # of items: {num_items}")

train = train.to_numpy()
valid = valid.to_numpy()
test = test.to_numpy()

matrix = sparse.lil_matrix((num_users, num_items))
for (u, i, r) in train:
    matrix[u, i] = r
train = sparse.csr_matrix(matrix)

matrix = sparse.lil_matrix((num_users, num_items))
for (u, i, r) in valid:
    matrix[u, i] = r
valid = sparse.csr_matrix(matrix)


train = train.toarray()
valid = valid.toarray()



# of users: 2141,  # of items: 586


In [31]:
def UserKNN(train, valid, test, top_k):
    num_users = train.shape[0]
    num_items = train.shape[1]

    for i, row in enumerate(train):
        train[i, np.where(row < 0.5)[0]] = np.nan

    user_mean = np.nanmean(train, axis=1)
    user_mean[np.isnan(user_mean)] = 0.0
    train = train - user_mean[:, None]
    user_user_sim_matrix = np.zeros((num_users, num_users))

    for user_i in tqdm(range(0, num_users), desc='user_user_sim_matrix (k=%d)' % top_k):
        for user_j in range(user_i+1, num_users):
            a = train[user_i]
            b = train[user_j]

            co_rated = ~np.logical_or(np.isnan(a), np.isnan(b))
            a = np.compress(co_rated, a)
            b = np.compress(co_rated, b)

            if len(a) == 0:
                continue 

            dot_a_b = np.dot(a, b)
            if dot_a_b == 0:
                continue

            user_user_sim_matrix[user_i, user_j] = dot_a_b / (np.linalg.norm(a) * np.linalg.norm(b))
    
    user_user_sim_matrix = (user_user_sim_matrix + user_user_sim_matrix.T)

    print("model evaluation")

    ori_top_k = top_k
    rmse_list = []


    for user_id in range(len(train)):
        test_by_user = valid[user_id]
        target_u = np.where(test_by_user >= 0.5)[0]
        target_u_score = test_by_user[target_u]      

        predicted_values=[]

        for one_missing_item in target_u:
            # item i를 시청한 사용자들
            rated_users = np.where(~np.isnan(train[:, one_missing_item]))[0]
            unsorted_sim = user_user_sim_matrix[user_id, rated_users]

            # 유사도 정렬
            sorted_users = np.argsort(unsorted_sim)
            sorted_users = sorted_users[::-1]

            # Top K 이웃 구하기
            if ori_top_k > len(sorted_users):
                top_k = len(sorted_users)
            else:
                top_k = ori_top_k
            sorted_users = sorted_users[0:top_k]
            top_k_users = rated_users[sorted_users]

            # 예측 값 구하기
            if top_k == 0:
                predicted_values.append(0.0)
            else:
                users_rate = train[top_k_users, one_missing_item]
                users_sim = user_user_sim_matrix[user_id, top_k_users]
                users_sim[users_sim < 0.0] = 0.0

                if np.sum(users_sim) == 0.0:
                    predicted_rate = user_mean[user_id]
                else:
                    predicted_rate = user_mean[user_id] + np.sum(users_rate*users_sim)/np.sum(users_sim)
                predicted_values.append(predicted_rate)
        
        
        if target_u_score != []:
            rmse = mean_squared_error(target_u_score, predicted_values)
            rmse_list.append(rmse)
        
        
    
    for i in range(len(test)):
        rated_users = np.where(~np.isnan(train[:,test[i,1]]))[0]
        unsorted_sim = user_user_sim_matrix[test[i,0], rated_users]

        sorted_users = np.argsort(unsorted_sim)
        sorted_users = sorted_users[::-1]

        if ori_top_k > len(sorted_users):
            top_k = len(sorted_users)
        else:
            top_k = ori_top_k
        sorted_users = sorted_users[0:top_k]
        top_k_users = rated_users[sorted_users]

        users_rate = train[top_k_users, test[i,1]]
        users_sim = user_user_sim_matrix[test[i,0], top_k_users]
        users_sim[users_sim < 0.0] = 0.0

        if np.sum(users_sim) == 0.0:
            predicted_rate = user_mean[test[i,0]]
        else:
            predicted_rate = user_mean[test[i,0]] + np.sum(users_rate*users_sim)/np.sum(users_sim)
        if predicted_rate < 1:
            predicted_rate = 1
        elif predicted_rate > 10:
            predicted_rate = 10
        test[i,2]=predicted_rate

    submission = pd.DataFrame(columns = ['user_id','item_id','rating'])
    submission['user_id'] = test[:,0]
    submission['item_id'] = test[:,1]
    submission['rating'] = test[:,2]

    return np.mean(rmse_list), submission

In [28]:
def ItemKNN(train, valid, test, top_k):
    num_users = train.shape[0]
    num_items = train.shape[1]

    for i, row in enumerate(train):
        train[i, np.where(row < 0.5)[0]] = np.nan

    user_mean = np.nanmean(train, axis=1)
    user_mean[np.isnan(user_mean)] = 0.0
    train = train - user_mean[:, None]
    item_item_sim_matrix = np.zeros((num_items, num_items))

    for item_i in tqdm(range(0, num_users), desc='item_item_sim_matrix (k=%d)' % top_k):
        for item_j in range(item_i+1, num_items):
            a = train.T[item_i]
            b = train.T[item_j]

            co_rated = ~np.logical_or(np.isnan(a), np.isnan(b))
            a = np.compress(co_rated, a)
            b = np.compress(co_rated, b)

            if len(a) == 0:
                continue 

            dot_a_b = np.dot(a, b)
            if dot_a_b == 0:
                continue

            item_item_sim_matrix[item_i, item_j] = dot_a_b / (np.linalg.norm(a) * np.linalg.norm(b))
    
    item_item_sim_matrix = (item_item_sim_matrix + item_item_sim_matrix.T)

    print("model evaluation")

    ori_top_k = top_k
    rmse_list = []


    for item_id in range(len(train.T)):
        test_by_item = valid.T[item_id]
        target_u = np.where(test_by_item >= 0.5)[0]
        # target_u_score = test_by_item[target_u]      

        predicted_values=[]

        for one_missing_user in target_u:
            # item i를 시청한 사용자들
            rated_items = np.where(~np.isnan(train.T[:, one_missing_user]))[0]
            unsorted_sim = item_item_sim_matrix[item_id, rated_items]

            # 유사도 정렬
            sorted_items = np.argsort(unsorted_sim)
            sorted_items = sorted_items[::-1]

            # Top K 이웃 구하기
            if ori_top_k > len(sorted_items):
                top_k = len(sorted_items)
            else:
                top_k = ori_top_k
            sorted_items = sorted_items[0:top_k]
            top_k_items = rated_items[sorted_items]

            # 예측 값 구하기
            if top_k == 0:
                predicted_values.append(0.0)
            else:
                item_rate = train[one_missing_user, top_k_items] + user_mean[one_missing_user]
                items_sim = item_item_sim_matrix[item_id, top_k_items]
                items_sim[items_sim < 0.0] = 0.0

                if np.sum(items_sim) == 0.0:
                    predicted_rate = np.sum(items_rate)/len(items_rate)
                else:
                    predicted_rate = np.sum(items_sim*items_rate)/np.sum(items_sim)
                predicted_values.append(predicted_rate)
        pred_matrix[missing_user_ids, item_id] = predicted_values
        
    for user_id in range(len(train_data)):
        test_by_user = test_data[user_id]
        target_u = np.where(test_by_user >= 0.5)[0]
        target_u_score = test_by_user[target_u]

        pred_u_score = pred_matrix[user_id, target_u]

        rmse = mean_squared_error(target_u_score, pred_u_score)
        rmse_list.append(rmse)
        
        
        
        
        
    
    for i in range(len(test.T)):
        rated_items = np.where(~np.isnan(train.T[test[i,1],:]))[0]
        unsorted_sim = item_user_sim_matrix[test[0,i], rated_items]

        sorted_items = np.argsort(unsorted_sim)
        sorted_items = sorted_items[::-1]

        if ori_top_k > len(sorted_items):
            top_k = len(sorted_items)
        else:
            top_k = ori_top_k
        sorted_items = sorted_items[0:top_k]
        top_k_items = rated_items[sorted_items]

        items_rate = train.T[test[i,1],top_k_items]
        items_sim = item_item_sim_matrix[top_k_items,test[i,0]]
        items_sim[items_sim < 0.0] = 0.0

        if np.sum(items_sim) == 0.0:
            predicted_rate = np.sum(items_rate)/len(items_rate)
        else:
            predicted_rate = np.sum(items_sim*items_rate)/np.sum(items_sim)
        if predicted_rate < 1:
            predicted_rate = 1
        elif predicted_rate > 10:
            predicted_rate = 10
        test[i,2]=predicted_rate

    submission = pd.DataFrame(columns = ['user_id','item_id','rating'])
    submission['user_id'] = test[:,0]
    submission['item_id'] = test[:,1]
    submission['rating'] = test[:,2]

    return np.mean(rmse_list), submission

In [32]:
valid_RMSE, Submission = UserKNN(train = train, valid = valid, test = test, top_k=10)
print("Valid RMSE: ", valid_RMSE)

new_submission = pd.DataFrame(columns=['test_id','rating'])
new_submission['test_id'] = range(1, len(Submission['rating'])+1)
new_submission['rating'] = Submission['rating']
# make submission
new_submission.to_csv("submission2.csv",sep=',',index=False)

user_user_sim_matrix (k=10): 100%|██████████| 2141/2141 [00:30<00:00, 71.33it/s] 


model evaluation
Valid RMSE:  4.2794367636727015
