In [8]:
import pandas as pd
import gzip

In [9]:
# load_rating_df

rating_df = pd.read_csv('../data/games/ratings_Video_Games.csv', header=None)
rating_df.columns = ['uid', 'sid', 'rating', 'timestamp']
rating_df[:5]

Unnamed: 0,uid,sid,rating,timestamp
0,AB9S9279OZ3QO,0078764343,5.0,1373155200
1,A24SSUT5CSW8BH,0078764343,5.0,1377302400
2,AK3V0HEBJMQ7J,0078764343,4.0,1372896000
3,A10BECPH7W8HM7,043933702X,5.0,1404950400
4,A2PRV9OULX1TWP,043933702X,5.0,1386115200


In [10]:
# load_metadict

path = "../data/games/meta_Video_Games.json.gz"
meta_dict = {}

with gzip.open(path, 'rb') as f:
    for line in f:
        item = eval(line)
        if 'title' in item and len(item['title']) > 0:
            meta_dict[item['asin'].strip()] = item['title'].strip()


In [11]:
# 5개 출력
for i in list(meta_dict.keys())[:5]:
    print(f"{i} : '{meta_dict[i]}'")

0042000742 : 'Reversi Sensory Challenger'
0078764343 : 'Medal of Honor: Warfighter - Includes Battlefield 4 Beta - Limited Edition'
0276425316 : 'street fighter 2 II turbo super nintendo snes super nes video game'
0324411812 : 'Xbox 360 MAS STICK'
0439335310 : 'Phonics Alive! 3: The Speller'


In [12]:
df = rating_df.copy()
meta_raw = meta_dict

In [13]:
min_sc = 2
min_uc = 2

def filter_triplets(df):
    print('Filtering triplets')
    if min_sc > 1 or min_uc > 1:
        item_sizes = df.groupby('sid').size()
        good_items = item_sizes.index[item_sizes >= min_sc]
        user_sizes = df.groupby('uid').size()
        good_users = user_sizes.index[user_sizes >= min_uc]
        while len(good_items) < len(item_sizes) or len(good_users) < len(user_sizes):
            if min_sc > 1:
                item_sizes = df.groupby('sid').size()
                good_items = item_sizes.index[item_sizes >= min_sc]
                df = df[df['sid'].isin(good_items)]

            if min_uc > 1:
                user_sizes = df.groupby('uid').size()
                good_users = user_sizes.index[user_sizes >= min_uc]
                df = df[df['uid'].isin(good_users)]

            item_sizes = df.groupby('sid').size()
            good_items = item_sizes.index[item_sizes >= min_sc]
            user_sizes = df.groupby('uid').size()
            good_users = user_sizes.index[user_sizes >= min_uc]
            
    return df


In [14]:
df = df[df['sid'].isin(meta_raw)]  # filter items without meta info
df = filter_triplets(df)

df.head()

Filtering triplets


Unnamed: 0,uid,sid,rating,timestamp
1,A24SSUT5CSW8BH,0078764343,5.0,1377302400
2,AK3V0HEBJMQ7J,0078764343,4.0,1372896000
14,A14YVGE643TRJK,043940133X,1.0,1324944000
16,A3KO10N2ODLHBR,043940133X,5.0,1258329600
36,AFWPLXT2OD6H1,0439715571,4.0,1169769600


In [15]:
# 정수 값 밀집화
def densify_index(df):
    print('Densifying index')
    umap = {u: i for i, u in enumerate(set(df['uid']), start=1)}
    smap = {s: i for i, s in enumerate(set(df['sid']), start=1)}
    df['uid'] = df['uid'].map(umap)
    df['sid'] = df['sid'].map(smap)
    return df, umap, smap

# train, val, test 데이터 분리
def split_df(df, user_count):
    print('Splitting')
    user_group = df.groupby('uid')
    user2items = user_group.apply(
        lambda d: list(d.sort_values(by=['timestamp', 'sid'])['sid']))
    train, val, test = {}, {}, {}
    for i in range(user_count):
        user = i + 1
        items = user2items[user]
        train[user], val[user], test[user] = items[:-2], items[-2:-1], items[-1:]
    return train, val, test

In [16]:
df, umap, smap = densify_index(df)
train, val, test = split_df(df, len(umap))

Densifying index
Splitting


  user2items = user_group.apply(


In [17]:
meta = {smap[k]: v for k, v in meta_dict.items() if k in smap}

In [18]:
# 5개 출력
for i in list(meta.keys())[:5]:
    print(f"{i} : '{meta[i]}'")

20508 : 'Medal of Honor: Warfighter - Includes Battlefield 4 Beta - Limited Edition'
5612 : 'Star Wars Math: Jabba's Game Galaxy'
7998 : 'Mortal Kombat 4'
22828 : 'Video Game Tycoon Gold Edition'
7575 : 'Anno 2070'


In [19]:
dataset = {'train': train,
           'val': val,
           'test': test,
           'meta': meta,
           'umap': umap,
           'smap': smap}

In [20]:
for i in list(train.keys())[:5]:
    print(f"{i} : '{train[i]}'")

1 : '[]'
2 : '[]'
3 : '[12615, 23621, 2919, 13461]'
4 : '[19783, 13992, 17998, 21950, 6054, 22052, 612, 1880]'
5 : '[83, 3940, 8752, 19440]'


In [21]:
for i in list(val.keys())[:5]:
    print(f"{i} : '{val[i]}'")

1 : '[9941]'
2 : '[9855]'
3 : '[9583]'
4 : '[3844]'
5 : '[9971]'


In [22]:
for i in list(test.keys())[:5]:
    print(f"{i} : '{test[i]}'")

1 : '[10844]'
2 : '[2801]'
3 : '[18940]'
4 : '[22611]'
5 : '[9236]'


In [23]:
for i in list(meta.keys())[:5]:
    print(f"{i} : '{meta[i]}'")

20508 : 'Medal of Honor: Warfighter - Includes Battlefield 4 Beta - Limited Edition'
5612 : 'Star Wars Math: Jabba's Game Galaxy'
7998 : 'Mortal Kombat 4'
22828 : 'Video Game Tycoon Gold Edition'
7575 : 'Anno 2070'


In [24]:
for i in list(umap.keys())[:5]:
    print(f"{i} : '{umap[i]}'")

A3TS4JF2UUH3E7 : '1'
A3L5A35WZI3CE7 : '2'
A31VTQKR7OXD9E : '3'
A1FOACA8KHBA9W : '4'
A2GLTGOPI3Y3ER : '5'


In [25]:
for i in list(smap.keys())[:5]:
    print(f"{i} : '{smap[i]}'")

B00BIY3VEE : '1'
B005J3J3YW : '2'
B000038I9Q : '3'
B004LQSOGQ : '4'
B00007LVJA : '5'
