In [32]:
import pandas as pd
import numpy as np

from pathlib2 import Path
from sklearn import model_selection as cv
from sklearn.metrics.pairwise import pairwise_distances
from metric import ndcg, mae_matrix, rmse_matrix

# Load data

In [13]:
data_dir = Path('movie_lens_100k')
train_df = pd.read_csv(data_dir / 'movielens_100k_u1.base', sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
test_df = pd.read_csv(data_dir / 'movielens_100k_u1.test', sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
n_users = 943 
n_items = 1682

In [34]:
user_item_matrix = np.zeros((n_users, n_items))
for row in train_df.itertuples():
    user_item_matrix[row[1]-1, row[2]-1] = row[3]
    
test_data_matrix = np.zeros((n_users, n_items))
for row in test_df.itertuples():
    test_data_matrix[row[1]-1, row[2]-1] = row[3]
test_mask = test_data_matrix > 0

In [15]:
item_similarity = np.ones((n_items, n_items)) - pairwise_distances(user_item_matrix.T, metric='cosine')
user_similarity = np.ones((n_users,n_users)) - pairwise_distances(user_item_matrix, metric='cosine')

## Collabrative filtering

In [16]:
def cf_predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])    
    return pred

In [40]:
user_based_prediction = cf_predict(user_item_matrix, user_similarity, type='user')
user_based_prediction = np.clip(user_based_prediction, 1, 5)
item_based_prediction = cf_predict(user_item_matrix, item_similarity, type='item')
item_based_prediction = np.clip(item_based_prediction, 1, 5)

In [41]:
print('rmse for user based collaborative: {}'.format(rmse_matrix(user_based_prediction, test_mask, test_data_matrix)))
print('rmse for item based collaborative: {}'.format(rmse_matrix(item_based_prediction, test_mask, test_data_matrix)))
print('ndcg for user based collaborative: {}'.format(ndcg(user_based_prediction, test_data_matrix)))
print('ndcg for item based collaborative: {}'.format(ndcg(item_based_prediction, test_data_matrix)))
print('mae for user based collaborative: {}'.format(mae_matrix(user_based_prediction, test_mask, test_data_matrix)))
print('mae for item based collaborative: {}'.format(mae_matrix(item_based_prediction, test_mask, test_data_matrix)))

rmse for user based collaborative: 2.6531998983609273
rmse for item based collaborative: 2.7775689746676595
ndcg for user based collaborative: 0.8215091624249355
ndcg for item based collaborative: 0.8174741502448656
mae for user based collaborative: 2.406420567461062
mae for item based collaborative: 2.5274213393493827


## Random walk

In [22]:
def random_walk_predict(ratings, item_similarity, n_items, alpha=0.8, beta=0.8):
    ones = np.eye(n_items)
    transition_matrix = np.zeros((n_items, n_items))

    for i in range(n_items):
        sum_ = item_similarity[i].sum()
        for j in range(n_items):
            transition_matrix[i][j] = beta*item_similarity[i][j]/sum_+(1-beta)/n_items

    p_tilde = np.linalg.pinv(ones-alpha*transition_matrix)
    final_rating = alpha*np.dot(ratings, np.dot(transition_matrix, p_tilde))
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaled_prediction = (scaler.fit_transform(final_rating.T)*5).T
    return scaled_prediction

In [42]:
randomwork_based_prediction = random_walk_predict(user_item_matrix, item_similarity, n_items)
random_walk_prediction = np.clip(randomwork_based_prediction, 1, 5)

In [45]:
print('rmse for random walk: {}'.format(rmse_matrix(randomwork_based_prediction, test_mask, test_data_matrix)))
print('ndcg for random walk: {}'.format(ndcg(randomwork_based_prediction, test_data_matrix)))
print('mae for random walk: {}'.format(mae_matrix(randomwork_based_prediction, test_mask, test_data_matrix)))

rmse for random walk: 1.324171422227052
ndcg for random walk: 0.8419438847857781
mae for random walk: 1.0525639684335009
