In [1]:
# Mount Google Drive in the Colab runtime; every dataset and model path used
# below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
from sklearn import svm, tree, linear_model
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [3]:
# Single source of truth for the Drive folder holding the CSV inputs, so the
# location is not repeated (and mistyped) in every read_csv call.
DATASET_DIR = '/content/drive/MyDrive/Colab Notebooks/dataset'

# Review table; later cells read its 'user_id', 'game_id', 'genres' and
# 'voted_up' columns.
reviews_df = pd.read_csv(os.path.join(DATASET_DIR, 'processed_reviews.csv'))
# NOTE(review): this frame is not referenced again in the visible notebook —
# confirm whether it is still needed.
reviews_genre_df = pd.read_csv(os.path.join(DATASET_DIR, 'processed_reviews+genre.csv'))
# Two-column genre table; a later cell reverses its columns to build a lookup.
genre_df = pd.read_csv(os.path.join(DATASET_DIR, 'genres.csv'))

In [4]:
def load_dataset(filepath):
    """Read a saved ``.npz`` dataset archive and return its contents as a tuple.

    The archive must contain the entries ``uid2idx``, ``iid2idx``, ``train_x``,
    ``train_y``, ``valid_x``, ``valid_y``, ``test_x`` and ``test_y``.

    Args:
        filepath: Path of the ``.npz`` file, passed straight to ``np.load``.

    Returns:
        An 8-tuple ``(uid2idx, iid2idx, train_x, train_y, valid_x, valid_y,
        test_x, test_y)``. The first two items are plain dicts built with
        ``dict()`` from the stored arrays, so each stored entry must be a
        sequence of (key, value) pairs; the remaining six are the arrays as
        stored. Presumably the dicts map raw user/item ids to embedding row
        indices — confirm against the code that wrote the archive.
    """
    split_keys = ('train_x', 'train_y', 'valid_x', 'valid_y', 'test_x', 'test_y')
    with np.load(filepath) as archive:
        user_index = dict(archive['uid2idx'])
        item_index = dict(archive['iid2idx'])
        # Materialise every array before the archive is closed.
        split_arrays = tuple(archive[key] for key in split_keys)
    return (user_index, item_index, *split_arrays)

In [5]:
# Load the previously trained genre model from its HDF5 checkpoint on Drive.
# NOTE(review): the number in the file name is presumably a validation score
# recorded at save time — confirm.
genre_model4 = keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/dataset/genre4_0.5246.hdf5')

In [6]:
# Load the genre dataset archive, keeping only the id->index dicts and the
# test split; the train/validation arrays are discarded via `_`.
# NOTE: `filepath`, `test_x` and `test_y` are rebound by later cells, so the
# evaluation cell that follows must run before those.
filepath = '/content/drive/MyDrive/Colab Notebooks/dataset/genre_dataset.npz'
genre_uid2idx, genre_iid2idx, _, _, _, _, test_x, test_y = load_dataset(filepath)

In [7]:
# Sanity-check the loaded genre model on its own test split. The model takes
# two inputs: index 0 and index 1 along axis 1 of test_x, each given a trailing
# axis by the `None` index.
# Presumably index 0 is the user index and index 1 the item index — TODO confirm.
genre_model4.evaluate(
    x=(test_x[:,0,None], test_x[:,1,None]),
    y=test_y
)



[0.534712553024292, 0.746999979019165]

In [8]:
# Load the previously trained game model from its HDF5 checkpoint on Drive.
# NOTE(review): the number in the file name is presumably a validation score
# recorded at save time — confirm.
game_model8 = keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/dataset/game8_0.6647.hdf5')

In [9]:
# Load the game dataset archive, keeping only the id->index dicts and the test
# split; the train/validation arrays are discarded via `_`.
# NOTE: this overwrites `filepath`, `test_x` and `test_y` from the genre cells.
filepath = '/content/drive/MyDrive/Colab Notebooks/dataset/game_dataset.npz'
game_uid2idx, game_iid2idx, _, _, _, _, test_x, test_y = load_dataset(filepath)

In [10]:
# Sanity-check the loaded game model on its own test split. The model takes
# two inputs: index 0 and index 1 along axis 1 of test_x, each given a trailing
# axis by the `None` index.
# Presumably index 0 is the user index and index 1 the item index — TODO confirm.
game_model8.evaluate(
    x=(test_x[:,0,None], test_x[:,1,None]),
    y=test_y
)



[0.6674506068229675, 0.7089999914169312]

In [11]:
# id -> index lookups; each is a bound dict.get, so unknown ids yield None.

# Users: taken directly from the mappings stored with each dataset.
get_game_uidx = game_uid2idx.get
get_genre_uidx = genre_uid2idx.get

# Items of the game model: taken from the mapping stored with the game dataset.
get_game_iidx = game_iid2idx.get

# Items of the genre model: built from genre_df with its two columns swapped,
# i.e. keyed by the second column and valued by the first.
# NOTE(review): genre_iid2idx is loaded earlier but is not used here — confirm
# that genre_df's first column already equals the embedding row index.
get_genre_iidx = dict(genre_df.values[:, ::-1]).get


def voctor_getter(model, layer, get_idx):
    """Create a function that looks up an id's row in one of the model's weight matrices.

    Args:
        model: Object exposing ``layers``; the selected layer must provide
            ``get_weights()``, whose first element is used as the matrix.
        layer: ``'user'`` selects ``model.layers[2]`` and ``'item'`` selects
            ``model.layers[3]`` (presumably the two embedding layers — TODO
            confirm against the model definition). Any other value raises
            ``KeyError``.
        get_idx: Callable mapping an id to a row index, or to ``None`` when
            the id is unknown.

    Returns:
        A one-argument function returning the matrix row for the given id, or
        ``None`` when ``get_idx`` returns ``None``.
    """
    layer_positions = {'user': 2, 'item': 3}
    # Fetch the weights once so each lookup is a plain array index.
    weight_matrix = model.layers[layer_positions[layer]].get_weights()[0]

    def lookup(raw_id):
        row = get_idx(raw_id)
        if row is None:
            return None
        return weight_matrix[row]

    return lookup


# Lookups into the genre model's weights (user side, then item side).
get_genre_user_vec, get_genre_item_vec = (
    voctor_getter(genre_model4, 'user', get_genre_uidx),
    voctor_getter(genre_model4, 'item', get_genre_iidx),
)

# Lookups into the game model's weights (user side, then item side).
get_game_user_vec, get_game_item_vec = (
    voctor_getter(game_model8, 'user', get_game_uidx),
    voctor_getter(game_model8, 'item', get_game_iidx),
)


def get_user_vec(user_id):
    """Return the combined feature vector for one user.

    The result is the user's genre-model vector followed by the user's
    game-model vector, joined with ``np.concatenate``.

    Args:
        user_id: Id passed to both ``get_genre_user_vec`` and
            ``get_game_user_vec``.

    Returns:
        A 1-D array: genre-model part first, game-model part second.

    Raises:
        ValueError: If either lookup returns ``None`` for ``user_id``. This
            used to surface as NumPy's "zero-dimensional arrays cannot be
            concatenated"; the exception type is unchanged but the message
            now names the missing user and model.
    """
    genre_part = get_genre_user_vec(user_id)
    game_part = get_game_user_vec(user_id)

    missing_in = [
        label
        for label, part in (('genre', genre_part), ('game', game_part))
        if part is None
    ]
    if missing_in:
        raise ValueError(
            f'no user vector for user_id {user_id!r} in model(s): {", ".join(missing_in)}'
        )

    return np.concatenate((genre_part, game_part))

def get_item_vec(genre_names, id):
    """Return the combined feature vector for one game.

    The result is the element-wise mean of the genre-model vectors of all
    ``genre_names`` followed by the game-model vector of ``id``.

    Args:
        genre_names: Iterable of genre names, each looked up with
            ``get_genre_item_vec``.
        id: Game id looked up with ``get_game_item_vec``. (The name shadows
            the builtin but is kept so existing keyword callers still work.)

    Returns:
        A 1-D array: mean genre vector first, game vector second.

    Raises:
        ValueError: If ``genre_names`` is empty, if any genre has no vector,
            or if the game has no vector. Previously these cases failed
            inside NumPy with messages that did not identify the bad input.
    """
    genre_names = list(genre_names)
    if not genre_names:
        raise ValueError(f'no genres given for game id {id!r}')

    genre_vectors = [get_genre_item_vec(genre) for genre in genre_names]
    unknown_genres = [
        genre for genre, vector in zip(genre_names, genre_vectors) if vector is None
    ]
    if unknown_genres:
        raise ValueError(f'no genre vector for genre(s): {unknown_genres!r}')

    game_vector = get_game_item_vec(id)
    if game_vector is None:
        raise ValueError(f'no game vector for game id {id!r}')

    mean_genre_vector = np.array(genre_vectors).mean(axis=0)
    return np.concatenate((mean_genre_vector, game_vector))

In [12]:
# Labels: one 'voted_up' value per review row.
data_y = reviews_df['voted_up'].values

# Iterate only the three columns that are needed. zip over columns avoids
# building a Series per row (as iterrows does), and `total` lets tqdm show
# real progress instead of an open-ended counter.
review_fields = zip(reviews_df['user_id'], reviews_df['genres'], reviews_df['game_id'])

# Each feature row is the user vector followed by the item vector; the
# 'genres' cell is a ';'-separated list of genre names.
feature_rows = [
    np.concatenate((
        get_user_vec(user_id),
        get_item_vec(genres.split(';'), game_id),
    ))
    for user_id, genres, game_id in tqdm(review_fields, total=len(reviews_df))
]

data_x = np.array(feature_rows)
data_x.shape, data_y.shape

0it [00:00, ?it/s]

((2375530, 24), (2375530,))

In [13]:
# Fixed seed so the split written to Drive can be regenerated exactly.
SPLIT_SEED = 42
HOLDOUT_SIZE = 8000  # rows removed from the training data
TEST_SIZE = 2000     # rows of the hold-out used as the test split

# Step 1: carve the hold-out rows off the full dataset.
train_x, holdout_x, train_y, holdout_y = train_test_split(
    data_x, data_y, test_size=HOLDOUT_SIZE, random_state=SPLIT_SEED,
)
# Step 2: divide the hold-out into validation (the remainder) and test.
valid_x, test_x, valid_y, test_y = train_test_split(
    holdout_x, holdout_y, test_size=TEST_SIZE, random_state=SPLIT_SEED,
)

# Persist all six arrays so later cells can reload them without recomputing
# the feature matrix.
np.savez_compressed(
    '/content/drive/MyDrive/Colab Notebooks/dataset/game+genre_dataset.npz',
    train_x=train_x,
    train_y=train_y,
    valid_x=valid_x,
    valid_y=valid_y,
    test_x=test_x,
    test_y=test_y,
)

In [14]:
# Reload the six saved split arrays from the combined game+genre archive.
with np.load('/content/drive/MyDrive/Colab Notebooks/dataset/game+genre_dataset.npz') as filedata:
    # Entries are read in the listed order while the archive is still open.
    train_x, train_y, valid_x, valid_y, test_x, test_y = (
        filedata[key]
        for key in ('train_x', 'train_y', 'valid_x', 'valid_y', 'test_x', 'test_y')
    )

In [39]:
def evaluate(model, x, y):
    """Print the predicted-positive ratio, the label-positive ratio and accuracy.

    Predictions are compared to labels with exact equality, so ``model.predict``
    must return class labels, not probabilities.

    Both predictions and labels are flattened to 1-D first. Without this, a
    ``predict`` result of shape (N, 1) compared against labels of shape (N,)
    broadcasts to an N x N matrix and yields a meaningless accuracy.

    Args:
        model: Object with a ``predict(x)`` method.
        x: Features passed unchanged to ``model.predict``.
        y: Ground-truth labels; any array-like accepted by ``np.asarray``.

    Returns:
        None. One line is printed in the form
        ``<pred_ratio>, <y_ratio>, acc = <accuracy>``.

    Raises:
        ValueError: If the number of predictions differs from the number of
            labels.
    """
    labels = np.asarray(y).ravel()
    pred = np.asarray(model.predict(x)).ravel()
    if pred.shape != labels.shape:
        raise ValueError(
            f'prediction count {pred.shape[0]} does not match label count {labels.shape[0]}'
        )

    count = len(labels)
    pred_ratio = pred.sum() / count
    y_ratio = labels.sum() / count
    acc = (labels == pred).sum() / count
    print(f'{pred_ratio}, {y_ratio}, acc = {acc}')

In [33]:
%%time
sample_size = 80000
rbf_svc = svm.SVC()
rbf_svc = rbf_svc.fit(train_x[:sample_size], train_y[:sample_size])

CPU times: user 8min 32s, sys: 1.96 s, total: 8min 34s
Wall time: 8min 34s


In [40]:
# Print predicted-positive ratio, label-positive ratio and accuracy of the SVC
# on the validation split.
evaluate(rbf_svc, valid_x, valid_y)

0.735, 0.6473333333333333, acc = 0.798


In [41]:
# Print predicted-positive ratio, label-positive ratio and accuracy of the SVC
# on the held-out test split.
evaluate(rbf_svc, test_x, test_y)

0.7315, 0.646, acc = 0.8025
