In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy import sparse as spsp

In [2]:
import pickle
data = pickle.load(open('movielens_data.pickle', 'rb'))

Using backend: pytorch


In [3]:
ratings = data.ratings
user_id = np.array(ratings['user_idx'])
movie_id = np.array(ratings['movie_idx'])
user_movie_spm = spsp.coo_matrix((np.ones((len(user_id),)), (user_id, movie_id)))
num_users, num_movies = user_movie_spm.shape
print('#user-movie iterations:', len(movie_id))
print('#users:', num_users)
print('#movies:', num_movies)

#user-movie iterations: 1000205
#users: 6040
#movies: 3702


In [4]:
# coo_matrix((data, (i, j)), shape=(M, N)])
# 아래와 같은 3개의 배열을 이용해 만든다.
# data[:] 는 순서에 상관없이 matrix 전체를 이용
# i[:]는 matrix의 행 색인을 이용
# j[:]는 matrix의 열 색인을 이용
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_count,timerank,test_mask,valid_mask,user_idx,movie_idx
0,1,1193,5,978300760,1725,11,False,False,0,1103
1,1,661,3,978302109,525,31,False,False,0,638
2,1,914,3,978301968,636,26,False,False,0,852
3,1,3408,4,978300275,1315,7,False,False,0,3174
4,1,2355,5,978824291,1703,49,False,False,0,2159
...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,373,182,False,False,6039,1018
1000205,6040,1094,5,956704887,1229,49,False,False,6039,1021
1000206,6040,562,5,956704746,478,37,False,False,6039,547
1000207,6040,1096,4,956715648,344,109,False,False,6039,1023


In [5]:
ratings_train = ratings[~(ratings['valid_mask'] | ratings['test_mask'])]
user_latest_item_indices = (
        ratings_train.groupby('user_id')['timestamp'].transform(pd.Series.max) ==
        ratings_train['timestamp'])
user_latest_item = ratings_train[user_latest_item_indices]
user_latest_item = dict(
        zip(user_latest_item['user_idx'].values, user_latest_item['movie_idx'].values))

In [6]:
# The training dataset
user_id = np.array(ratings_train['user_idx'])
movie_id = np.array(ratings_train['movie_idx'])
# print(len(user_id),len(movie_id))
user_movie_spm = spsp.coo_matrix((np.ones((len(user_id),)), (user_id, movie_id)))
assert num_users == user_movie_spm.shape[0]
assert num_movies == user_movie_spm.shape[1]
train_size = len(user_id)
print('#training size:', train_size)

# The validation and testing dataset
users_valid = ratings[ratings['valid_mask']]['user_idx'].values
movies_valid = ratings[ratings['valid_mask']]['movie_idx'].values
users_test = ratings[ratings['test_mask']]['user_idx'].values
movies_test = ratings[ratings['test_mask']]['movie_idx'].values
valid_size = len(users_valid)
test_size = len(users_test)
print('#valid size:', valid_size)
print('#test size:', test_size)

#training size: 986669
#valid size: 8164
#test size: 5372


In [7]:
from SLIM import SLIM, SLIMatrix
from slim_load import read_csr

def gen_slim(user_movie_spm):
    model = SLIM()
    params = {'algo': 'cd', 'nthreads': 2, 'l1r': 1.0, 'l2r': 1.0}
    trainmat = SLIMatrix(user_movie_spm.tocsr())
    print(trainmat)
    model.train(params, trainmat)
    model.save_model(modelfname='slim_model.csr', mapfname='slim_map.csr')

    # Load the SLIM similarity matrix into DGL. We store the vertex similarity as edge data on DGL.
    movie_spm = read_csr('slim_model.csr')
    print('#edges:', movie_spm.nnz)
    print('most similar:', np.max(movie_spm.data))
    print('most unsimilar:', np.min(movie_spm.data))

    return movie_spm

In [10]:
item_matrix_slim = gen_slim(user_movie_spm)

<SLIM.core.SLIMatrix object at 0x7f4f184be6d8>
Learning takes 31.780 secs.
#edges: 305219
most similar: 0.679996
most unsimilar: 0.0


In [12]:
item_matrix_slim

<3702x3702 sparse matrix of type '<class 'numpy.float32'>'
	with 305219 stored elements in COOrdinate format>

In [13]:
downsample_factor = 1e-6

def gen_cooccur(user_movie_spm):
    user_id = user_movie_spm.row
    movie_id = user_movie_spm.col
    spm_t = user_movie_spm.transpose()
    
    movie_deg = spm_t.dot(np.ones((num_users,)))
    movie_ratio = movie_deg / np.sum(movie_deg)
    # 1e-6 is a hyperparameter for this dataset.
    movie_sample_prob = 1 - np.maximum(1 - np.sqrt(downsample_factor / movie_ratio), 0)
    sample_prob = movie_sample_prob[movie_id]
    sample = np.random.uniform(size=(len(movie_id),))
    print(sample_prob)
    print(sample)
    
    user_id = user_id[sample_prob > sample]
    movie_id = movie_id[sample_prob > sample]
    print('#samples:', len(user_id))
    user_movie_spm = spsp.coo_matrix((np.ones((len(user_id),)), (user_id, movie_id)))
    movie_deg = spm_t.dot(np.ones((num_users,)))
    print(np.sum(movie_deg == 0))
    
    movie_spm = np.dot(user_movie_spm.transpose(), user_movie_spm)
    #dense_movie = np.sort(movie_spm.todense())
    #topk_movie = dense_movie[:,-50]
    #movie_spm1 = movie_spm >= topk_movie
    print('most similar:', np.max(movie_spm.data))
    print('most unsimilar:', np.min(movie_spm.data))
    
    return movie_spm

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
topk = 20
def gen_cosine(user_movie_spm):
    movie_spm = cosine_similarity(user_movie_spm.transpose(),dense_output=False)

    dense_movie = np.sort(movie_spm.todense())
    topk_movie = dense_movie[:,-topk]
    topk_movie_spm = movie_spm > topk_movie
    topk_movie_spm = movie_spm.multiply(topk_movie_spm)

    return topk_movie_spm

In [15]:
item_matrix_coocurrence = gen_cooccur(user_movie_spm)

[0.02624909 0.04368582 0.0395745  ... 0.04548064 0.05386991 0.02178505]
[0.96788039 0.93071604 0.44196522 ... 0.22323823 0.1565952  0.08350952]
986669 986669
#samples: 48238
0
most similar: 68.0
most unsimilar: 1.0


In [16]:
item_matrix_cosine = gen_cosine(user_movie_spm)

In [17]:
item_matrix_cosine

<3702x3702 sparse matrix of type '<class 'numpy.float64'>'
	with 11300724 stored elements in COOrdinate format>