# ML Train/Valid/Test split

In [129]:
import numpy as np
import pandas as pd
import random
import torch

In [197]:
# Field layouts of the '::'-separated MovieLens files:
#   UserID::Gender::Age::Occupation::Zip-code
#   MovieID::Title::Genres
#   UserID::MovieID::Rating::Timestamp (5-star scale)

# Load the ML-100K ratings: tab-separated, no header row, so the columns are
# labelled 0..3.
# The ML-1M side files ('./data/ml_1m/movies.dat', './data/ml_1m/users.dat')
# can be read the same way with sep='::'; they are not needed here.
ratings = pd.read_csv(
    './data/ml_100k/u.data',
    sep='\t',
    header=None,
    engine='python',
    encoding='latin-1',
)


In [198]:
# Shuffle all ratings once, then cut the shuffled frame into
# 85% train / 5% validation / 10% test.
# A fixed random_state makes the split reproducible from run to run, in line
# with the explicitly seeded shuffles used for the other datasets below.
total_length = len(ratings)
ratings = ratings.sample(frac=1, random_state=1234)

# NOTE: len_train and len_val are cumulative end offsets into the shuffled
# frame (85% and 90% of the rows), not the sizes of the individual splits.
len_train = int(total_length * 0.85)
len_val = int(total_length * 0.9)

rating_train = ratings[:len_train]
rating_val = ratings[len_train:len_val]
rating_test = ratings[len_val:]

In [199]:
# Dimensions of the rating tensors allocated in the next cell
# (shape: rating_cnt x num_users x num_items).
# The values in the trailing comments (6040 / 3706) are the sizes used for
# ML-1M further down in this notebook.
num_users  = 943#6040
num_items = 1682#3706
# Number of distinct rating levels; ratings are shifted by -1 to index this axis.
rating_cnt = 5

In [200]:
# Map raw (0-based) user / item ids to dense indices in order of first
# appearance. Both dicts are shared by the three splits, so a given id always
# receives the same index.
user_dict = {}
item_dict = {}

# Element-wise sum of the one-hot rating tensors of all three splits.
rating_sum = torch.zeros(rating_cnt, num_users, num_items)

# The loop variable is deliberately not called `ratings`, so the shuffled
# DataFrame of that name is not overwritten by the last split.
for i, split_df in enumerate([rating_train, rating_val, rating_test]):
    # rating_mtx[r, u, v] == 1 iff, in this split, the user with dense index u
    # gave the item with dense index v the rating r + 1.
    rating_mtx = torch.zeros(rating_cnt, num_users, num_items)

    # itertuples yields plain tuples instead of building a Series per row
    # (as iterrows does); positions 0..2 are user id, item id and rating.
    for row in split_df.itertuples(index=False, name=None):
        raw_u = int(row[0]) - 1
        raw_v = int(row[1]) - 1
        r = int(row[2]) - 1

        # setdefault returns the existing index, or registers the next free one.
        u = user_dict.setdefault(raw_u, len(user_dict))
        v = item_dict.setdefault(raw_v, len(item_dict))

        rating_mtx[r, u, v] = 1

    # Row-normalisation of the rating tensor (disabled, kept for reference):
    # rowsum = torch.sum(torch.sum(rating_mtx, 0),1)
    # r_inv = torch.pow(rowsum, -1).view(-1)
    # r_inv[torch.isinf(r_inv)] = 0.
    # r_mat_inv = torch.diag(r_inv)
    # mx = torch.matmul(rating_mtx.permute(0,2,1), r_mat_inv)
    # mx = torch.stack((mx.permute(0,2,1), rating_mtx), 0)

    rating_sum += rating_mtx

    #torch.save(mx, './data/rating_norm_%d.pkl'%i)

In [218]:
# Inspect the sizes of the id maps and of the rating tensors.
# NOTE(review): no cell above this one assigns `mx` (the only assignments are
# inside a disabled string block and in the later cell In [235]), so running
# the notebook top to bottom raises NameError here. The recorded output below
# presumably comes from an out-of-order execution — confirm before relying on it.
len(user_dict), len(item_dict), mx.size(), rating_mtx.size()

(943, 1682, torch.Size([71, 6000, 6000]), torch.Size([5, 3000, 3000]))

In [235]:
# For every rating level build the block matrix
#     [[ 0 , A ],
#      [A^T, 0 ]]
# where A is that level's (first-axis-size x second-axis-size) slice of rating_sum.
A = rating_sum
n_levels, n_left, n_right = A.size()

upper_blocks = torch.cat((torch.zeros(n_levels, n_left, n_left), A), 2)
lower_blocks = torch.cat((A.permute(0, 2, 1), torch.zeros(n_levels, n_right, n_right)), 2)
mx = torch.cat((upper_blocks, lower_blocks), 1)

# Square each per-level block matrix, then add the squares up over all levels.
square = torch.stack([torch.mm(level, level) for level in mx], 0)
square = torch.sum(square, 0)


In [236]:
# Report how dense `square` is: first the fraction of non-zero entries, then
# the non-zero count next to both dimensions and the total number of entries.
nnz_square = len(torch.nonzero(square))
sq_rows, sq_cols = square.size(0), square.size(1)

print(nnz_square/(sq_rows*sq_cols))
print(nnz_square, sq_rows, sq_cols, sq_cols*sq_rows)


0.10603991666666666
3817437 6000 6000 36000000


# ML-1M

In [89]:
def map_data(data):
    """
    Map data to contiguous indices in case the values are not in a [0, N) range.

    Every distinct value is replaced by its rank among the sorted distinct
    values, so the smallest value becomes 0, the next one 1, and so on.

    Parameters
    ----------
    data : 1-D array or sequence of hashable, mutually comparable values

    Returns
    -------
    mapped_data : integer np.ndarray with one entry per element of `data`
    id_dict : dict mapping each original value to its new index
    n : number of distinct values in `data`

    """
    uniq = sorted(set(data))
    id_dict = {old: new for new, old in enumerate(uniq)}

    # dtype=int keeps the result an integer array even for empty input;
    # np.array([]) would default to float64, which cannot be used as an index.
    mapped = np.array([id_dict[x] for x in data], dtype=int)

    return mapped, id_dict, len(uniq)


In [90]:
seed = 1234
sep = r'\:\:'
filename = './data/ml_1m/ratings.dat'
# Passed to read_csv as `converters`, i.e. each numpy type is called on the
# parsed field of its column; also reused below for the astype() casts.
dtypes = {
            'u_nodes': np.int64, 'v_nodes': np.int64,
            'ratings': np.float32, 'timestamp': np.float64}

# use engine='python' to ignore warning about switching to python backend when using regexp for sep
data = pd.read_csv(filename, sep=sep, header=None,
                   names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], converters=dtypes, engine='python')

# shuffle here like cf-nade paper with python's own random class
# make sure to convert to list, otherwise random.shuffle acts weird on it without a warning
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same 2-D ndarray
# and is available in old and new pandas versions alike.
data_array = data.values.tolist()
random.seed(seed)
random.shuffle(data_array)
data_array = np.array(data_array)

# Columns of the shuffled array: 0 = user id, 1 = item id, 2 = rating.
u_nodes_ratings = data_array[:, 0].astype(dtypes['u_nodes'])
v_nodes_ratings = data_array[:, 1].astype(dtypes['v_nodes'])
ratings = data_array[:, 2].astype(dtypes['ratings'])

# Re-index users and items to contiguous 0-based ids.
u_nodes_ratings, u_dict, num_users = map_data(u_nodes_ratings)
v_nodes_ratings, v_dict, num_items = map_data(v_nodes_ratings)

u_nodes_ratings, v_nodes_ratings = u_nodes_ratings.astype(np.int64), v_nodes_ratings.astype(np.int64)
ratings = ratings.astype(np.float32)


  mask |= (ar1 == a)


In [94]:
# Shorter aliases for the re-indexed user / item id arrays.
u_nodes, v_nodes = u_nodes_ratings, v_nodes_ratings

In [95]:
# Value stored in `labels` for every (user, item) cell without a rating.
neutral_rating = -1
num_users = 6040
num_items = 3706

# Class index of a rating value = its rank among the distinct rating values.
rating_dict = {value: rank for rank, value in enumerate(np.sort(np.unique(ratings)).tolist())}

# Dense user x item grid of class indices, flattened row-major so that cell
# (u, v) lives at position u * num_items + v.
label_grid = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
label_grid[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings])
labels = label_grid.reshape([-1])

# number of test and validation edges
num_ratings = ratings.shape[0]
num_test = int(np.ceil(num_ratings * 0.1))
num_val = int(np.ceil(num_ratings * 0.9 * 0.05))
num_train = num_ratings - num_val - num_test

# One (user, item) row per rating, plus the matching flat index into `labels`.
pairs_nonzero = np.column_stack((u_nodes, v_nodes))
idx_nonzero = pairs_nonzero[:, 0] * num_items + pairs_nonzero[:, 1]

# The ratings are consumed in order: train first, then validation, then test.
train_part = slice(0, num_train)
val_part = slice(num_train, num_train + num_val)
test_part = slice(num_train + num_val, None)

train_idx, val_idx, test_idx = (
    idx_nonzero[part] for part in (train_part, val_part, test_part)
)
train_pairs_idx, val_pairs_idx, test_pairs_idx = (
    pairs_nonzero[part] for part in (train_part, val_part, test_part)
)

u_test_idx, v_test_idx = test_pairs_idx.T
u_val_idx, v_val_idx = val_pairs_idx.T
u_train_idx, v_train_idx = train_pairs_idx.T

# create labels
train_labels, val_labels, test_labels = (
    labels[flat] for flat in (train_idx, val_idx, test_idx)
)


In [119]:
# Write one one-hot rating tensor per split; the file suffix is the position
# in this tuple (0 = train, 1 = validation, 2 = test).
split_arrays = (
    (u_train_idx, v_train_idx, train_labels),
    (u_val_idx, v_val_idx, val_labels),
    (u_test_idx, v_test_idx, test_labels),
)

for i, (split_users, split_items, split_classes) in enumerate(split_arrays):
    # rating_mx[r, u, v] == 1 iff user u rated item v with class index r.
    rating_mx = torch.zeros(len(rating_dict), num_users, num_items)

    for u, v, r in zip(split_users, split_items, split_classes):
        rating_mx[r, u, v] = 1

    torch.save(rating_mx, './data/ml_1m_%d.pkl'%i)

# Flixster/Douban/YahooMusic

In [32]:
def load_matlab_file(path_file, name_field):
    """
    Load one variable from a MATLAB file that h5py can open.

    Parameters
    ----------
    path_file : path of the file, opened read-only
    name_field : name of the variable to read from the file

    Returns
    -------
    out : float32 scipy.sparse.csc_matrix if the variable is stored as a group
          with 'data' / 'ir' / 'jc' members, otherwise a float32 np.ndarray
          (transposed, see below)

    Raises
    ------
    ValueError : if the variable is a group without an 'ir' member, i.e. it is
                 neither a dense array nor a sparse matrix this loader understands
    """
    # Imported locally so the function works regardless of cell order: the
    # notebook imports h5py only in a later cell and never binds `sp`.
    import h5py
    import scipy.sparse as sp

    # The context manager closes the file even if reading raises.
    with h5py.File(path_file, 'r') as db:
        ds = db[name_field]
        try:
            members = ds.keys()
        except AttributeError:
            # No keys(): the variable is a plain (dense) dataset.
            # Transpose because of the row- vs column-major ordering between python and matlab
            return np.asarray(ds).astype(np.float32).T

        if 'ir' not in members:
            raise ValueError(
                "field %r in %r is a group without an 'ir' member; "
                "cannot interpret it as a sparse matrix" % (name_field, path_file))

        # Compressed-sparse-column triple: values, row indices, column pointers.
        data = np.asarray(ds['data'])
        ir = np.asarray(ds['ir'])
        jc = np.asarray(ds['jc'])
        return sp.csc_matrix((data, ir, jc)).astype(np.float32)

In [229]:
import numpy as np
import h5py

# Dataset to process; the trailing comment lists the other supported names.
dataset = 'douban'#'flixster'#yahoo_music'#'
path_dataset = './data/{}/training_test_dataset.mat'.format(dataset)
name_field= 'W_users'

# M is the rating matrix; Otraining / Otest mark which entries belong to the
# training and test sets respectively.
M = load_matlab_file(path_dataset, 'M')
Otraining = load_matlab_file(path_dataset, 'Otraining')
Otest = load_matlab_file(path_dataset, 'Otest')

num_users, num_items = M.shape[:2]

# Side information: use the stored matrix where the file provides one,
# otherwise fall back to an identity matrix.
if dataset == 'flixster':
    Wrow = load_matlab_file(path_dataset, 'W_users')
    Wcol = load_matlab_file(path_dataset, 'W_movies')
    u_features, v_features = Wrow, Wcol
    # print(num_items, v_features.shape)
    #v_features = np.eye(num_items)

elif dataset == 'douban':
    Wrow = load_matlab_file(path_dataset, 'W_users')
    u_features, v_features = Wrow, np.eye(num_items)
elif dataset == 'yahoo_music':
    Wcol = load_matlab_file(path_dataset, 'W_tracks')
    u_features, v_features = np.eye(num_users), Wcol

# Coordinates and values of all non-zero entries of M.
u_nodes_ratings, v_nodes_ratings = np.where(M)
ratings = M[u_nodes_ratings, v_nodes_ratings]

In [230]:
u_nodes_ratings = u_nodes_ratings.astype(np.int64)
v_nodes_ratings = v_nodes_ratings.astype(np.int32)
ratings = ratings.astype(np.float64)

u_nodes, v_nodes = u_nodes_ratings, v_nodes_ratings

# How many distinct users / items actually carry at least one rating.
print('number of users = ', np.unique(u_nodes).size)
print('number of item = ', np.unique(v_nodes).size)

# Value stored in `labels` for cells without a rating.
# (alternative once considered: int(np.ceil(np.float(num_classes)/2.)) - 1)
neutral_rating = 0

# Class index of a rating value = its rank among the distinct rating values.
# assumes that ratings_train contains at least one example of every rating type
rating_dict = {value: rank for rank, value in enumerate(np.sort(np.unique(ratings)).tolist())}
rating_classes = np.array([rating_dict[r] for r in ratings])

labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32)
labels[u_nodes, v_nodes] = rating_classes

# Sanity check: every rated cell holds the class index of its rating.
assert (labels[u_nodes, v_nodes] == rating_classes).all()

# Flatten row-major: cell (u, v) ends up at position u * num_items + v.
labels = labels.reshape([-1])

number of users =  2999
number of item =  3000


In [231]:
# Split sizes: the test size is the number of non-zero entries of Otest;
# 20% (rounded up) of the non-zero Otraining entries are held out for
# validation and the remainder is used for training.
num_marked_train = len(np.where(Otraining)[0])
num_test = len(np.where(Otest)[0])

num_val = int(np.ceil(num_marked_train * 0.2))
num_train = num_marked_train - num_val


In [232]:
# (row, column) coordinates of the non-zero entries of each mask, together
# with the matching flat index into the row-major `labels` vector.
pairs_nonzero_train = np.column_stack(np.where(Otraining))
idx_nonzero_train = pairs_nonzero_train[:, 0] * num_items + pairs_nonzero_train[:, 1]

pairs_nonzero_test = np.column_stack(np.where(Otest))
idx_nonzero_test = pairs_nonzero_test[:, 0] * num_items + pairs_nonzero_test[:, 1]

# Internally shuffle training set (before splitting off validation set)
rand_idx = list(range(len(idx_nonzero_train)))
np.random.seed(42)
np.random.shuffle(rand_idx)
idx_nonzero_train = idx_nonzero_train[rand_idx]
pairs_nonzero_train = pairs_nonzero_train[rand_idx]

# Layout after concatenation: [validation | train | test].
idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0)
pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)

val_part = slice(0, num_val)
train_part = slice(num_val, num_train + num_val)
test_part = slice(num_train + num_val, None)

val_idx = idx_nonzero[val_part]
train_idx = idx_nonzero[train_part]
test_idx = idx_nonzero[test_part]

assert(len(test_idx) == num_test)

val_pairs_idx = pairs_nonzero[val_part]
train_pairs_idx = pairs_nonzero[train_part]
test_pairs_idx = pairs_nonzero[test_part]

u_test_idx, v_test_idx = test_pairs_idx.T
u_val_idx, v_val_idx = val_pairs_idx.T
u_train_idx, v_train_idx = train_pairs_idx.T

# create labels
train_labels, val_labels, test_labels = (
    labels[flat] for flat in (train_idx, val_idx, test_idx)
)


In [233]:
# Smallest and largest class index occurring in any of the three splits.
split_labels = (test_labels, train_labels, val_labels)
minimum = min(part.min() for part in split_labels)
maximum = max(part.max() for part in split_labels)
print(minimum, maximum)


0 4


In [234]:
import torch

# One (user index, item index, class index) triple per rating, for each split.
# NOTE: zip objects are single-use; they are consumed by the loop below.
rating_train = zip(u_train_idx, v_train_idx, train_labels)
rating_val = zip(u_val_idx, v_val_idx, val_labels)
rating_test = zip(u_test_idx, v_test_idx, test_labels)

# Size the tensors from the data (num_users / num_items come from M.shape)
# instead of hard-coding 3000 x 3000; int() turns the numpy integer into a
# plain Python int for torch.zeros.
num_classes = int(maximum - minimum + 1)
tensor_shape = (num_classes, num_users, num_items)

# Element-wise sum of the one-hot rating tensors of all three splits.
rating_sum = torch.zeros(tensor_shape)

# The loop variable is deliberately not called `ratings`, so the array of
# rating values with that name is not overwritten.
for i, split_triples in enumerate([rating_train, rating_val, rating_test]):
    # rating_mtx[r, u, v] == 1 iff user u rated item v with class index r.
    rating_mtx = torch.zeros(tensor_shape)

    for (u, v, r) in split_triples:
        rating_mtx[r, u, v] = 1

    rating_sum += rating_mtx
    #torch.save(rating_mtx, './data/%s_%d.pkl'%(dataset,i))

In [64]:
# Check whether class index 0 occurs among the training labels.
0 in train_labels

True