In [3]:
import numpy as np

In [4]:
# Dataset to preprocess; the per-dataset settings are looked up with this key.
DATASET = 'movie'  # or 'book'
# Seed NumPy's global RNG so the random negative sampling done later is repeatable.
np.random.seed(555)

In [5]:
# Per-dataset settings, all keyed by the DATASET name.
# The 'news' entries are placeholders: that dataset has not been included yet.

# name of the raw rating file inside data/<DATASET>/
RATING_FILE_NAME = {
    'movie': 'movie_ratings.dat',
    'book': 'book_ratings.csv',
    'news': 'ratings.txt',
}

# field separator used by each rating file
SEP = {
    'movie': '::',
    'book': ';',
    'news': '\t',
}

# a rating >= this value is treated as positive by convert_rating()
THRESHOLD = {
    'movie': 4,
    'book': 0,
    'news': 0,
}

# 1 when the rating file starts with a heading line that must be skipped, else 0
SKIP_LINE = {
    'movie': 0,
    'book': 1,
    'news': 0,
}

In [6]:
# Module-level lookup tables filled in by the functions below.
entity_id2index = {}      # kg entity id (string) -> contiguous integer index
item_index_old2new = {}   # item id in the rating file (string) -> contiguous integer index
relation_id2index = {}    # kg relation name (string) -> contiguous integer index

In [7]:
def read_item_index_to_entity_id_file():
    """Load the item-id -> kg-entity-id mapping for the current DATASET.

    Reads data/<DATASET>/item_index2entity_id_rehashed.txt, which maps the item
    id used in the movie/book rating dataset to the satori id used in the kg.
    Each line holds two TAB-separated fields: the item id, then the satori id.

    The n-th line (0-based) is given the new index n, and that index is stored
    in both module-level dicts:
        item_index_old2new[item id]  = n
        entity_id2index[satori id]   = n

    The dicts are updated in place; nothing is returned.
    """
    file = 'data/' + DATASET + '/item_index2entity_id_rehashed.txt'
    print('reading item index to entity id file: ' + file + ' ...')
    # Use a context manager so the handle is closed even if a line is malformed
    # (the previous open(...).readlines() never closed the file).
    with open(file, encoding='utf-8') as reader:
        for i, line in enumerate(reader):
            fields = line.strip().split('\t')  # split once instead of once per field
            item_index = fields[0]  # item id from the rating dataset
            satori_id = fields[1]  # satori id from the kg
            item_index_old2new[item_index] = i
            entity_id2index[satori_id] = i

In [8]:
# Fill item_index_old2new and entity_id2index from the mapping file of DATASET.
read_item_index_to_entity_id_file()

reading item index to entity id file: data/movie/item_index2entity_id_rehashed.txt ...


In [9]:
# Peek at the first 20 (kg entity id, new index) pairs.
list(entity_id2index.items())[:20]

[('0', 0),
 ('1', 1),
 ('2', 2),
 ('3', 3),
 ('4', 4),
 ('5', 5),
 ('6', 6),
 ('7', 7),
 ('8', 8),
 ('9', 9),
 ('10', 10),
 ('11', 11),
 ('12', 12),
 ('13', 13),
 ('14', 14),
 ('15', 15),
 ('16', 16),
 ('17', 17),
 ('18', 18),
 ('19', 19)]

In [10]:
# Peek at the first 20 (item id in the rating file, new index) pairs.
list(item_index_old2new.items())[:20]

[('1', 0),
 ('2', 1),
 ('3', 2),
 ('4', 3),
 ('5', 4),
 ('8', 5),
 ('10', 6),
 ('11', 7),
 ('12', 8),
 ('13', 9),
 ('14', 10),
 ('15', 11),
 ('17', 12),
 ('18', 13),
 ('19', 14),
 ('20', 15),
 ('21', 16),
 ('25', 17),
 ('27', 18),
 ('29', 19)]

In [11]:
# New indices of every item (movie or book) listed in the item -> kg mapping file.
item_set = {new_index for new_index in item_index_old2new.values()}
len(item_set)

2445

In [21]:
def convert_rating():
    """Convert the raw rating file of DATASET into binary-labelled samples.

    Reads data/<DATASET>/<RATING_FILE_NAME[DATASET]>, where each line starts with
    the fields user, item, rating separated by SEP[DATASET]. Only items present
    in item_index_old2new (i.e. items that have a kg entity) are kept. A rating
    >= THRESHOLD[DATASET] is recorded as positive, anything lower as negative.

    Writes data/<DATASET>/ratings_final.txt with one TAB-separated line per
    sample: new user index, new item index, label. For every user that has at
    least one positive item:
      * each positive item is written with label 1;
      * an equal number of items (fewer if not enough candidates remain) is
        drawn at random, without replacement, from the items the user rated
        neither positively nor negatively, and written with label 0.
    Users are re-indexed 0, 1, 2, ... in the order they first receive a
    positive rating. Users without any positive rating are not written.

    The sampling uses NumPy's global RNG, so the output depends on its state.
    Prints the number of users and items; returns nothing.
    """
    file = 'data/' + DATASET + '/' + RATING_FILE_NAME[DATASET]
    # 0-based index of the heading line to skip, or -1 (never matches) if none
    skip_line = SKIP_LINE[DATASET] - 1

    print('reading rating file ...')
    item_set = set(item_index_old2new.values())  # every item that has a kg entity

    # change scaled ratings to binary: old user id -> set of new item indices
    user_pos_ratings = dict()
    user_neg_ratings = dict()

    # The context manager closes the rating file (it used to be left open).
    with open(file, encoding='utf-8') as reader:
        for i, line in enumerate(reader):
            if i == skip_line:  # skip heading if needed
                continue
            array = line.strip().split(SEP[DATASET])  # user, item, rating, ...

            # remove prefix and suffix quotation marks for book rating dataset
            if DATASET == 'book':
                array = [field[1:-1] for field in array]

            item_index_old = array[1]  # item id as it appears in the rating file
            # only keep items that showed up in the kg mapping
            if item_index_old not in item_index_old2new:
                continue
            item_index = item_index_old2new[item_index_old]

            user_index_old = int(array[0])

            rating = float(array[2])
            if rating >= THRESHOLD[DATASET]:
                user_pos_ratings.setdefault(user_index_old, set()).add(item_index)
            else:
                user_neg_ratings.setdefault(user_index_old, set()).add(item_index)

    print('converting rating file ...')
    # The context manager guarantees the output is flushed and closed on error too.
    with open('data/' + DATASET + '/ratings_final.txt', 'w', encoding='utf-8') as writer:
        # dict keys are unique, so the enumeration index is the new user index
        for user_index, (user_index_old, pos_item_set) in enumerate(user_pos_ratings.items()):
            for item in pos_item_set:
                writer.write('%d\t%d\t1\n' % (user_index, item))  # user TAB item TAB 1

            # Candidate pool for label 0: items the user has not rated at all.
            # Items the user rated below the threshold are removed from the pool,
            # so they end up in the output neither as 1 nor as 0.
            # Background: https://github.com/hwwang55/RippleNet/issues/18
            unwatched_set = item_set - pos_item_set
            if user_index_old in user_neg_ratings:
                unwatched_set -= user_neg_ratings[user_index_old]

            # np.random.choice(..., replace=False) raises ValueError when asked for
            # more samples than candidates; cap the request instead of crashing.
            n_neg = min(len(pos_item_set), len(unwatched_set))
            if n_neg == 0:
                continue
            for item in np.random.choice(list(unwatched_set), size=n_neg, replace=False):
                writer.write('%d\t%d\t0\n' % (user_index, item))

    user_cnt = len(user_pos_ratings)
    print('number of users: %d' % user_cnt)
    print('number of items: %d' % len(item_set))

In [22]:
# Write data/<DATASET>/ratings_final.txt (uses the mappings loaded above).
convert_rating()

reading rating file ...
converting rating file ...
number of users: 6036
number of items: 2445


In [22]:
def convert_kg():
    """Re-index the knowledge-graph triples of DATASET and write kg_final.txt.

    Input: data/movie/kg_part1_rehashed.txt and kg_part2_rehashed.txt for the
    'movie' dataset, data/<DATASET>/kg_rehashed.txt otherwise. Each line is a
    TAB-separated triple: head, relation, tail (e.g. 2451, film.actor.film, 2452).

    Heads and tails are looked up in the module-level entity_id2index; an entity
    not seen before gets the next free index. Relations are handled the same way
    through relation_id2index. Both dicts are updated in place.

    Output: data/<DATASET>/kg_final.txt with one line per triple, holding the
    head index, relation index and tail index separated by TABs.

    Prints the total number of entities and relations; returns nothing.
    """
    print('converting kg file ...')
    entity_cnt = len(entity_id2index)
    # Continue after the relations already registered. Starting from 0 made a
    # re-run of this cell report 0 relations and could hand out an index that
    # is already taken in the persistent relation_id2index dict.
    relation_cnt = len(relation_id2index)

    if DATASET == 'movie':
        kg_paths = [
            'data/' + DATASET + '/kg_part1_rehashed.txt',
            'data/' + DATASET + '/kg_part2_rehashed.txt',
        ]
    else:
        kg_paths = ['data/' + DATASET + '/kg_rehashed.txt']

    # Context managers close the output and every input file, even on error
    # (the input handles used to be left open).
    with open('data/' + DATASET + '/kg_final.txt', 'w', encoding='utf-8') as writer:
        for kg_path in kg_paths:
            with open(kg_path, encoding='utf-8') as file:
                for line in file:
                    array = line.strip().split('\t')  # split by tab
                    head_old = array[0]  # head
                    relation_old = array[1]  # relation
                    tail_old = array[2]  # tail

                    if head_old not in entity_id2index:
                        entity_id2index[head_old] = entity_cnt
                        entity_cnt += 1
                    head = entity_id2index[head_old]

                    if tail_old not in entity_id2index:
                        entity_id2index[tail_old] = entity_cnt
                        entity_cnt += 1
                    tail = entity_id2index[tail_old]

                    if relation_old not in relation_id2index:
                        relation_id2index[relation_old] = relation_cnt
                        relation_cnt += 1
                    relation = relation_id2index[relation_old]

                    # the kg final: head-TAB-relationID-TAB-tail
                    writer.write('%d\t%d\t%d\n' % (head, relation, tail))

    print('number of entities (containing items): %d' % entity_cnt)
    print('number of relations: %d' % relation_cnt)

In [23]:
# Write data/<DATASET>/kg_final.txt and extend entity_id2index / relation_id2index.
convert_kg()

converting kg file ...
number of entities (containing items): 182011
number of relations: 12
done
