In [1]:
import numpy as np

In [2]:
# Seed NumPy's global RNG so the np.random.choice sampling below is repeatable.
np.random.seed(555)
# Dataset key: selects the data/<DATASET>/ directory and the per-dataset settings.
DATASET = 'movie'  # or 'book'

In [3]:
# Per-dataset settings, keyed by the DATASET name.
# The 'news' entries are placeholders: that dataset has not been included yet.

# Name of the raw rating file inside data/<DATASET>/.
RATING_FILE_NAME = {
    'movie': 'movie_ratings.dat',
    'book': 'book_ratings.csv',
    'news': 'ratings.txt',
}

# Field separator used by each rating file.
SEP = {
    'movie': '::',
    'book': ';',
    'news': '\t',
}

# Ratings at or above this value are treated as positive.
THRESHOLD = {
    'movie': 4,
    'book': 0,
    'news': 0,
}

# Number of heading lines at the top of the rating file (0 = no header).
SKIP_LINE = {
    'movie': 0,
    'book': 1,
    'news': 0,
}

In [4]:
# Global lookup tables, filled in by the reader/converter functions.
entity_id2index = {}      # KG entity id (str) -> contiguous integer index
item_index_old2new = {}   # item id in the rating file (str) -> contiguous integer index
relation_id2index = {}    # KG relation name (str) -> contiguous integer index

In [5]:
def read_item_index_to_entity_id_file():
    """Fill ``item_index_old2new`` and ``entity_id2index`` from the mapping file.

    Reads ``data/<DATASET>/item_index2entity_id_rehashed.txt``, which maps the
    item ids of the movie/book rating dataset to the ids of the same items in
    the knowledge graph (satori ids). Each line is ``item_id<TAB>satori_id``.

    The 0-based line number becomes the new contiguous index, and it is stored
    for both ids of that line, so an item and its KG entity share one index.
    Both global dicts are updated in place; nothing is returned.
    """
    file = 'data/' + DATASET + '/item_index2entity_id_rehashed.txt'
    print('reading item index to entity id file: ' + file + ' ...')
    # Use a context manager so the handle is closed even if a line is malformed
    # (the previous version never closed the file).
    with open(file, encoding='utf-8') as reader:
        for i, line in enumerate(reader):
            fields = line.strip().split('\t')  # split once instead of once per field
            item_index = fields[0]  # item id from the rating dataset
            satori_id = fields[1]  # satori id from the kg
            item_index_old2new[item_index] = i
            entity_id2index[satori_id] = i

In [6]:
# Populate item_index_old2new and entity_id2index from the mapping file.
read_item_index_to_entity_id_file()

reading item index to entity id file: data/movie/item_index2entity_id_rehashed.txt ...


In [7]:
# Peek at the first 20 (KG entity id -> new index) pairs.
list(entity_id2index.items())[:20]

[('0', 0),
 ('1', 1),
 ('2', 2),
 ('3', 3),
 ('4', 4),
 ('5', 5),
 ('6', 6),
 ('7', 7),
 ('8', 8),
 ('9', 9),
 ('10', 10),
 ('11', 11),
 ('12', 12),
 ('13', 13),
 ('14', 14),
 ('15', 15),
 ('16', 16),
 ('17', 17),
 ('18', 18),
 ('19', 19)]

In [8]:
# Peek at the first 20 (original item id -> new index) pairs.
list(item_index_old2new.items())[:20]

[('1', 0),
 ('2', 1),
 ('3', 2),
 ('4', 3),
 ('5', 4),
 ('8', 5),
 ('10', 6),
 ('11', 7),
 ('12', 8),
 ('13', 9),
 ('14', 10),
 ('15', 11),
 ('17', 12),
 ('18', 13),
 ('19', 14),
 ('20', 15),
 ('21', 16),
 ('25', 17),
 ('27', 18),
 ('29', 19)]

In [9]:
# New (contiguous) indices of all items listed in the mapping file,
# i.e. the items (movies or books) that are linked to the KG.
item_set = set(item_index_old2new.values())
len(item_set)

2445

In [10]:
def convert_rating():
    """Convert the raw rating file into binary-labelled ``ratings_final.txt``.

    Reads ``data/<DATASET>/<RATING_FILE_NAME[DATASET]>``, keeps only ratings
    whose item id is present in ``item_index_old2new`` (items linked to the KG)
    and splits them per user into positive (rating >= ``THRESHOLD[DATASET]``)
    and negative sets.

    Writes ``data/<DATASET>/ratings_final.txt`` with one
    ``user_index<TAB>item_index<TAB>label`` line per sample:

    * label 1 for every positively rated item of a user;
    * label 0 for items sampled at random (without replacement) from the items
      the user has not rated at all, as many as the user has positives.

    Only users with at least one positive rating are written; they are
    re-indexed contiguously from 0 in order of first appearance. Relies on
    NumPy's global RNG, so seed it beforehand for repeatable output.
    """
    file = 'data/' + DATASET + '/' + RATING_FILE_NAME[DATASET]
    # 0-based index of the heading line to skip, or -1 (never matches) when
    # the file has no heading.
    skip_line = SKIP_LINE[DATASET] - 1

    print('reading rating file ...')
    item_set = set(item_index_old2new.values())  # new indices of all KG-linked items

    # change scaled ratings to binary: positive or negative
    user_pos_ratings = dict()
    user_neg_ratings = dict()

    # Context manager: the previous version never closed the rating file.
    with open(file, encoding='utf-8') as reader:
        for i, line in enumerate(reader):
            if i == skip_line:  # skip heading if needed
                continue
            # fields are user-SEP-item-SEP-rating; the separator depends on the dataset
            array = line.strip().split(SEP[DATASET])

            # remove prefix and suffix quotation marks for book rating dataset
            if DATASET == 'book':
                array = [field[1:-1] for field in array]

            item_index_old = array[1]  # item id as it appears in the rating file

            # only items that are linked to the kg are used; skip the rest
            if item_index_old not in item_index_old2new:
                continue
            item_index = item_index_old2new[item_index_old]

            user_index_old = int(array[0])

            rating = float(array[2])
            if rating >= THRESHOLD[DATASET]:
                user_pos_ratings.setdefault(user_index_old, set()).add(item_index)
            else:
                user_neg_ratings.setdefault(user_index_old, set()).add(item_index)

    print('converting rating file ...')
    user_cnt = 0
    user_index_old2new = dict()
    # Context manager: the output is flushed and closed even if sampling fails.
    with open('data/' + DATASET + '/ratings_final.txt', 'w', encoding='utf-8') as writer:
        for user_index_old, pos_item_set in user_pos_ratings.items():
            if user_index_old not in user_index_old2new:
                user_index_old2new[user_index_old] = user_cnt
                user_cnt += 1
            user_index = user_index_old2new[user_index_old]

            for item in pos_item_set:
                writer.write('%d\t%d\t1\n' % (user_index, item))  # user TAB item TAB 1

            # Candidates for label 0: items the user has not rated at all.
            # Items the user rated below the threshold are removed from the
            # candidates, so they get neither label 1 nor label 0.
            # See https://github.com/hwwang55/RippleNet/issues/18 for discussion.
            unwatched_set = item_set - pos_item_set
            if user_index_old in user_neg_ratings:
                unwatched_set -= user_neg_ratings[user_index_old]

            # Sampling without replacement raises if more samples are requested
            # than candidates exist; cap the request so a user who rated most
            # of the catalogue does not abort the conversion half-way.
            n_negative = min(len(pos_item_set), len(unwatched_set))
            if n_negative > 0:
                for item in np.random.choice(list(unwatched_set), size=n_negative, replace=False):
                    writer.write('%d\t%d\t0\n' % (user_index, item))  # user TAB item TAB 0
    print('number of users: %d' % user_cnt)
    print('number of items: %d' % len(item_set))

In [11]:
# Write data/<DATASET>/ratings_final.txt (user TAB item TAB 0/1 label).
convert_rating()

reading rating file ...
converting rating file ...
number of users: 6036
number of items: 2445


In [12]:
def convert_kg():
    """Re-index the knowledge-graph triples and write ``kg_final.txt``.

    Reads the rehashed KG file(s) of the current ``DATASET`` (two parts for
    'movie', a single file otherwise). Each input line is a tab-separated
    triple ``head<TAB>relation<TAB>tail``, e.g. ``2451 film.actor.film 2452``.

    Entities not yet in ``entity_id2index`` and relations not yet in
    ``relation_id2index`` receive the next free integer index; both global
    dicts are updated in place. Every triple is written to
    ``data/<DATASET>/kg_final.txt`` as ``head_index<TAB>relation_index<TAB>tail_index``.
    """
    print('converting kg file ...')
    entity_cnt = len(entity_id2index)
    # Continue from the relations already indexed so that re-running this cell
    # reports the true count instead of 0 (it is 0 on a fresh run, as before).
    relation_cnt = len(relation_id2index)

    if DATASET == 'movie':
        kg_paths = [
            'data/' + DATASET + '/kg_part1_rehashed.txt',
            'data/' + DATASET + '/kg_part2_rehashed.txt',
        ]
    else:
        kg_paths = ['data/' + DATASET + '/kg_rehashed.txt']

    # Context managers close the output and every input file, even on error
    # (the previous version never closed the input files).
    with open('data/' + DATASET + '/kg_final.txt', 'w', encoding='utf-8') as writer:
        for kg_path in kg_paths:
            with open(kg_path, encoding='utf-8') as reader:
                for line in reader:
                    array = line.strip().split('\t')  # head TAB relation TAB tail
                    head_old = array[0]
                    relation_old = array[1]
                    tail_old = array[2]

                    if head_old not in entity_id2index:
                        entity_id2index[head_old] = entity_cnt
                        entity_cnt += 1
                    head = entity_id2index[head_old]

                    if tail_old not in entity_id2index:
                        entity_id2index[tail_old] = entity_cnt
                        entity_cnt += 1
                    tail = entity_id2index[tail_old]

                    if relation_old not in relation_id2index:
                        relation_id2index[relation_old] = relation_cnt
                        relation_cnt += 1
                    relation = relation_id2index[relation_old]

                    writer.write('%d\t%d\t%d\n' % (head, relation, tail))

    print('number of entities (containing items): %d' % entity_cnt)
    print('number of relations: %d' % relation_cnt)

In [13]:
# Write data/<DATASET>/kg_final.txt (head TAB relation TAB tail, all as integer indices).
convert_kg()

converting kg file ...
number of entities (containing items): 182011
number of relations: 12


The following cells are adapted from tools/load_data.py.

In [14]:
import numpy as np

In [15]:
# Load the converted ratings; each row is (user index, item index, 0/1 label).
# NOTE(review): the path is hard-coded to the movie dataset and ignores DATASET.
rating_np = np.loadtxt("./data/movie/ratings_final.txt",  dtype=np.int32)

In [16]:
# Number of rating rows (positive and sampled negative).
rating_np.shape[0]

753774

In [17]:
# Fraction of the rating rows held out for testing.
test_ratio = 0.2
n_ratings = rating_np.shape[0]  # total number of ratings, movie 753774
n_ratings

753774

In [18]:
# Draw the test row indices: test_ratio (20%) of all rows, without replacement.
# Uses NumPy's global RNG, so the split depends on the seed and on every
# random call made before this cell.
test_indices = np.random.choice(n_ratings,
                                size=int(n_ratings * test_ratio),
                                replace=False)
# Every remaining row index goes to the training split (80%).
train_indices = set(range(n_ratings)) - set(test_indices)  # train 80%
len(train_indices)

603020

In [19]:
# Collect, for every user, the items they rated positively in the training
# split: user -> list of item indices (label == 1 rows only).
user_history_dict = {}
for i in train_indices:
    user, item, rating = rating_np[i, 0], rating_np[i, 1], rating_np[i, 2]
    if rating != 1:  # sampled negative: not part of the user's history
        continue
    user_history_dict.setdefault(user, []).append(item)

# Keep only the rows (train and test) whose user has at least one positive
# training item, i.e. appears in user_history_dict.
train_indices = [idx for idx in train_indices if rating_np[idx, 0] in user_history_dict]
test_indices = [idx for idx in test_indices if rating_np[idx, 0] in user_history_dict]
print(len(train_indices))

# Materialise both splits as arrays of (user, item, label) rows.
train_data = rating_np[train_indices]
test_data = rating_np[test_indices]

603020


In [20]:
# Rows left after filtering; for the movie dataset this equals n_ratings,
# i.e. no user was dropped by the filter above.
total = len(train_data) + len(test_data)
total

753774

In [21]:
# First two entries of the dict (users 0 and 1) with their positive training items.
list(user_history_dict.items())[:2]

[(0,
  [0,
   1669,
   1419,
   780,
   1687,
   1179,
   670,
   1696,
   2082,
   939,
   1093,
   1225,
   341,
   1624,
   1760,
   737,
   1889,
   1383,
   624,
   767]),
 (1,
  [768,
   1169,
   1554,
   1174,
   1175,
   669,
   418,
   930,
   167,
   1833,
   1451,
   1327,
   309,
   1974,
   1207,
   1342,
   574,
   2114,
   1091,
   837,
   1225,
   206,
   975,
   1361,
   1875,
   1883,
   733,
   737,
   738,
   226,
   740,
   231,
   745,
   1515,
   748,
   237,
   1773,
   2031,
   1906,
   379,
   380,
   767])]

In [22]:
# Load the converted KG; each row is (head entity index, relation index, tail entity index).
# NOTE(review): the path is hard-coded to the movie dataset and ignores DATASET.
kg_np = np.loadtxt('./data/movie/kg_final.txt', dtype=np.int32)
kg_np.shape

(1241995, 3)

In [23]:
# First two (head, relation, tail) triples.
kg_np[:2]

array([[2445,    0, 2446],
       [2447,    1, 2448]], dtype=int32)

In [24]:
# Distinct entities are those that occur as a head or as a tail of any triple.
entity_ids = set(kg_np[:, 0]).union(kg_np[:, 2])
# Distinct relations come from the middle column.
relation_ids = set(kg_np[:, 1])
n_entity, n_relation = len(entity_ids), len(relation_ids)
print(n_entity, n_relation)

182011 12


In [25]:
import collections
# Build an adjacency list from the triples: head -> [(tail, relation), ...].
# Note that each stored pair is ordered (tail, relation), not (relation, tail).
print('constructing knowledge graph ...')
kg = collections.defaultdict(list)
for head, relation, tail in kg_np:
    kg[head].append((tail, relation))
print('done')

constructing knowledge graph ...
done


In [26]:
# Example lookup: every (tail, relation) pair whose head entity is 4279.
kg[4279] # 4279 is the head item followed by all (tail, relation)

[(2450, 2),
 (3264, 2),
 (2458, 3),
 (2533, 6),
 (5662, 2),
 (2955, 9),
 (2755, 10),
 (11559, 8)]

In [27]:
# user_history_dict maps each user to the items they rated positively in training.
list(user_history_dict.items())[:1]  # user 0

[(0,
  [0,
   1669,
   1419,
   780,
   1687,
   1179,
   670,
   1696,
   2082,
   939,
   1093,
   1225,
   341,
   1624,
   1760,
   737,
   1889,
   1383,
   624,
   767])]

In [28]:
# Scratch check: range(2) yields 0 and 1.
for i in range(2):
    print(i)

0
1


In [29]:
# Positive training items of user 0 (same lookup as user_history_dict[user] with user = 0).
user_history_dict[0]

[0,
 1669,
 1419,
 780,
 1687,
 1179,
 670,
 1696,
 2082,
 939,
 1093,
 1225,
 341,
 1624,
 1760,
 737,
 1889,
 1383,
 624,
 767]

In [31]:
# Worked example: build the ripple set of a single user (user 0).
#
# ripple_set maps a user to one (heads, relations, tails) triple of lists per hop:
#   user -> [(hop_0_heads, hop_0_relations, hop_0_tails),
#            (hop_1_heads, hop_1_relations, hop_1_tails), ...]
# Essentially, the tails of one hop are the entities expanded in the next hop.
ripple_set = collections.defaultdict(list)
user = 0
n_hop = 2
n_memory = 32  # number of triples sampled per hop (per the original note, set in main.py)

# The full pipeline would wrap the loop below in: for user in user_history_dict
for h in range(n_hop):
    # Entities to expand in this hop.
    if h == 0:
        # first hop: start from all positively rated items of the user
        tails_of_last_hop = user_history_dict[user]
    else:
        # later hops: ripple_set[user][-1] is the previous hop's
        # (memories_h, memories_r, memories_t); index 2 selects its tails
        tails_of_last_hop = ripple_set[user][-1][2]

    # Gather every outgoing (head, relation, tail) triple of those entities.
    hop_triples = []
    for entity in tails_of_last_hop:
        for tail_and_relation in kg[entity]:
            hop_triples.append((entity, tail_and_relation[1], tail_and_relation[0]))

    # If this hop found no triples, reuse the ripple set of the previous hop.
    # The original note says this cannot happen for h == 0 (only items that
    # appear in the KG were selected) and occurs for 154 users of the
    # Book-Crossing dataset, where both the ratings and the KG are sparse.
    if not hop_triples:
        memories_h, memories_r, memories_t = [], [], []
        ripple_set[user].append(ripple_set[user][-1])
        continue

    # Sample a fixed-size memory of n_memory triples for this hop; with fewer
    # than n_memory candidates, sample with replacement (duplicates possible).
    replace = len(hop_triples) < n_memory
    indices = np.random.choice(len(hop_triples), size=n_memory, replace=replace)
    memories_h = [hop_triples[idx][0] for idx in indices]
    memories_r = [hop_triples[idx][1] for idx in indices]
    memories_t = [hop_triples[idx][2] for idx in indices]
    ripple_set[user].append((memories_h, memories_r, memories_t))

In [32]:
# Show the sampled (heads, relations, tails) lists per hop for user 0.
ripple_set

defaultdict(list,
            {0: [([1760,
                1760,
                0,
                624,
                780,
                767,
                1624,
                341,
                1419,
                737,
                1696,
                2082,
                341,
                2082,
                1419,
                767,
                767,
                1624,
                1225,
                1687,
                0,
                1383,
                939,
                1383,
                1225,
                1669,
                624,
                341,
                1093,
                1687,
                1383,
                1383],
               [8,
                6,
                7,
                2,
                7,
                3,
                2,
                3,
                7,
                2,
                8,
                6,
                7,
                2,
                8,
      