In [1]:
from mxnet.gluon import Block, nn, Trainer
from mxnet.gluon.loss import L2Loss
from mxnet import autograd, ndarray as F
import mxnet as mx

import numpy as np
import random
import logging
import re

__reference__ = 'https://www.endpoint.com/blog/2018/07/17/recommender-mxnet'

from mxnet.tools.bandwidth.measure import logger


class DataIter(mx.io.DataIter):
    def __init__(self, data, batch_size = 16):
        super(DataIter, self).__init__()
        self.batch_size = batch_size
        self.all_user_ids = set()
        self.data = data
        self.index = 0

        for user_id, item_id, _ in data:
            self.all_user_ids.add(user_id)

    @property
    def user_count(self):
        return len(self.all_user_ids)

    @property
    def item_count(self):
        # we just know the value even though 10 of them were
        # not voted
        return 150

    def next(self):
        index = self.index * self.batch_size
        endindex = index + self.batch_size

        if len(self.data) <= index:
            raise StopIteration
        else:
            user_ids = []
            item_ids = []
            ratings = []

            user_ids = self.data[index:endindex, 0]
            item_ids = self.data[index:endindex, 1]
            ratings   = self.data[index:endindex, 2]

            data_all = [mx.nd.array(user_ids), mx.nd.array(item_ids)]
            label_all = [mx.nd.array([r]) for r in ratings]

            self.index += 1

            return mx.io.DataBatch(data_all, label_all)

    def reset(self):
        self.index = 0
        random.shuffle(self.data)

def get_data(batch_size):
    user_ids = []
    item_ids = []
    ratings = []

    with open("ratings.dat", "r") as file:
        for line in file:
            user_id, item_id, rating, _ = line.strip().split("::")

            user_ids.append(int(user_id))
            item_ids.append(int(item_id))
            ratings.append(float(rating) / 10.0)

    all_raw = np.asarray(list(zip(user_ids, item_ids, ratings)), dtype='float32')

    return DataIter(all_raw, batch_size = batch_size)

train = get_data(64)

class Model(Block):
    def __init__(self, k, dataiter, **kwargs):
        super(Model, self).__init__(**kwargs)

        with self.name_scope():
            self.user_embedding = nn.Embedding(input_dim = dataiter.user_count, output_dim=k)
            self.item_embedding = nn.Embedding(input_dim = dataiter.item_count, output_dim=k)

    def forward(self, x):
        user = self.user_embedding(x[0] - 1)
        item = self.item_embedding(x[1] - 1)

        # the following is a dot product in essence
        # summing up of the element-wise multiplication
        pred = user * item
        return F.sum_axis(pred, axis = 1)

context = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()
model = Model(16, train)
model.collect_params().initialize(mx.init.Xavier(), ctx=context)

# model.load_params("model.mxnet", ctx=context)

def fit(model, train, num_epoch):
    trainer = Trainer(model.collect_params(), 'adam')

    for epoch_id in range(num_epoch):
        print(f'epoch {epoch_id}')
        batch_id = 0
        train.reset()

        for batch in train:
            with autograd.record():
                targets = F.concat(*batch.label, dim=0)
                predictions = model(batch.data)
                L = L2Loss()
                loss = L(predictions, targets)
                loss.backward()

            trainer.step(batch.data[0].shape[0])

            if (batch_id + 1) % 1000 == 0:
                mean_loss = F.mean(loss).asnumpy()[0]
                logger.info(f'Epoch {epoch_id + 1} / {num_epoch} | Batch {batch_id + 1} | Mean Loss: {mean_loss}')

            batch_id += 1

        logger.info('Saving model parameters')
        model.save_params("model.mxnet")

fit(model, train, num_epoch=10)


epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9




In [5]:
user_embed = model.collect_params().get('embedding0_weight').data()
movie_embed = model.collect_params().get('embedding1_weight').data()


In [64]:
len(user_embed)

6040

In [55]:
import random 
from scipy import spatial

sample_user = random.randint(0, len(user_embed))
print(sample_user)
print(user_embed[sample_user])
max_sim = 0
max_index = sample_user
for i in range(len(user_embed)):
    if i == sample_user:
        continue
    result = 1 - spatial.distance.cosine(user_embed[sample_user].asnumpy(), user_embed[i].asnumpy())
    if result > max_sim:
        max_sim = result
        max_index = i
        
print(max_index)
print(user_embed[max_index])
print(1 - spatial.distance.cosine(user_embed[sample_user].asnumpy(), user_embed[max_index].asnumpy()))

import pandas as pd
uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(r'users.dat', sep='::', header=None, names=uname, engine='python')

print(users[users['user_id']==sample_user-1])
print(users[users['user_id']==max_index-1])


330

[-0.2814759   0.47955954 -0.4163226   0.6014935   0.3488514  -0.22989404
 -0.479457   -0.4579066   0.4853755  -0.41702837  0.39829582  0.42214832
 -0.36678374  0.39721078 -0.45943242  0.349886  ]
<NDArray 16 @cpu(0)>
105

[-0.24133831  0.36556458 -0.46817294  0.47563738  0.40615803 -0.29820225
 -0.45854884 -0.4493409   0.44573393 -0.47168472  0.41772455  0.44076234
 -0.4080366   0.4263224  -0.41405335  0.3240807 ]
<NDArray 16 @cpu(0)>
0.9908048510551453
     user_id gender  age  occupation    zip
328      329      M   35           7  02115
     user_id gender  age  occupation    zip
103      104      M   25          12  00926


In [79]:
import random 
from scipy import spatial
K = 10
sample_user = random.randint(0, len(user_embed))
print(sample_user)
print(user_embed[sample_user])
max_sim = 0
max_index = sample_user

result_list = []
for i in range(len(user_embed)):
    if i == sample_user:
        continue
    result = 1 - spatial.distance.cosine(user_embed[sample_user].asnumpy(), user_embed[i].asnumpy())
    result_list.append((i, result))
    result_list.sort(key=lambda x:x[1], reverse=True)
    if len(result_list) > K:
        result_list = result_list[:-1]
        
max_index, max_result = result_list[0]
print(max_index)
print(user_embed[max_index])
print(max_result)

import pandas as pd
uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(r'users.dat', sep='::', header=None, names=uname, engine='python')

print(users.iloc[sample_user])

print(f'Top K({K})')
for user_index, sim in result_list:
    print(users.iloc[user_index])
    print(f'sim: {sim}')
    print()

3938

[-0.15021153  0.12661785 -0.09784684  0.0980696   0.11741301 -0.1277754
 -0.10838747 -0.10479416  0.10494723 -0.12359551  0.11032974  0.11505858
 -0.14985555  0.1257495  -0.10110646  0.14811824]
<NDArray 16 @cpu(0)>
3746

[-0.19810379  0.16476549 -0.14969537  0.15984833  0.16337422 -0.18765411
 -0.15384136 -0.13775812  0.14950493 -0.16030228  0.18077767  0.15279788
 -0.19694464  0.16902941 -0.16141011  0.17144312]
<NDArray 16 @cpu(0)>
0.995668888092041
user_id        3939
gender            M
age              45
occupation        7
zip           91405
Name: 3938, dtype: object
Top K(10)
user_id        3747
gender            M
age              45
occupation       13
zip           89109
Name: 3746, dtype: object
sim: 0.995668888092041

user_id        4735
gender            M
age              45
occupation        3
zip           50312
Name: 4734, dtype: object
sim: 0.9953199028968811

user_id        1402
gender            M
age              18
occupation        4
zip           90601


In [1]:
import pandas as pd
uname = ['user_id', 'gender', 'age', 'occupation', 'zip']
users = pd.read_table(r'users.dat', sep='::', header=None, names=uname, engine='python')

In [3]:
users.to_pickle('users.pkl')