In [1]:
%%file default_settings.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals

DEFAULT_SETTINGS = {
            # redis
            'expire': 3600 * 24 * 30,
            'redis': {
                'host': 'localhost',
                'port': 6379,
                'db': 0
            },
            # recommendation engine settings
            'recommendation_count': 10,
            'recommendation': {
                'update_interval_sec': 600,
                'search_depth': 100,
                'max_history': 1000,
            },
}

Overwriting default_settings.py


In [2]:
%%file timeit.py
# -*- coding: utf-8 -*-
from functools import wraps
import time


def timeit(fun=None):
    def _timeit(f):
        @wraps(f)
        def _inner(*args, **kwargs):
            ts = time.time()
            result = f(*args, **kwargs)
            te = time.time()
            print u'task:{} args:[{}, {}] took: {} sec\n' \
                  .format(f.__name__, args, kwargs, te-ts)
            return result

        return _inner

    return _timeit(fun) if fun else _timeit

Writing timeit.py


In [3]:
%%file mutex.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals


class Lock(object):
    def __init__(self, client, key, expire):
        self._r = client
        self.key = key
        self.expire = expire

    def lock(self):
        self._r.setex(self.key, 1, self.expire)

    def unlock(self):
        self._r.delete(self.key)

    def is_lock(self):
        return bool(self._r.get(self.key))

Writing mutex.py


In [4]:
%%file recommender.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from collections import defaultdict
from .timeit import timeit
from .repository import Repository
from .default_settings import DEFAULT_SETTINGS

DEFAULT_TAG = 'default'


class Recommender(object):
    _r = None

    def __init__(self, settings):
        DEFAULT_SETTINGS.update(settings)
        self.settings = DEFAULT_SETTINGS

    @property
    def repository(self):
        if self._r is None:
            self._r = Repository(self.settings)
        return self._r

    def get(self, goods_id, count=None):
        return self.repository.get(goods_id, count=count)

    def update(self, goods_id):
        """
        update recommendation list
        :param goods_id: str
        """
        self.repository.update_recommendation(goods_id)
        return

    def register(self, goods_id, tag=DEFAULT_TAG):
        """
        register goods_id
        :param goods_id: int
        :param tag: str
        :rtype : None
        """
        return self.repository.register(goods_id, tag)

    def like(self, user_id, goods_ids, realtime_update=True, enable_update_interval=True):
        """
        record user like history
        :param str or unicode user_id: user_id
        :param list[int] goods_ids: list of goods_id
        :param bool realtime_update: update recommendation
        :param bool enable_update_interval: will update recommendation list at a constant interval
        :rtype : None
        """
        if type(goods_ids) in [int, long, str, unicode]:
            goods_ids = list(goods_ids)
        assert type(goods_ids) == list

        # like
        self.repository.like(user_id, goods_ids)

        # update index
        self.repository.update_index(user_id, goods_ids)

        # update recommendation list
        if realtime_update:
            for goods_id in goods_ids:
                self.repository.update_recommendation(goods_id, enable_update_interval=enable_update_interval)  # RealTime update

        return

    def get_all_goods_ids(self):
        """
        all registered goods ids
        WARNING!! this is heavy method about 1-100sec
        :rtype : list[int]
        """
        return self.repository.get_all_goods_ids()

    def update_all(self, proc=1, scope=None):
        """
        update all recommendation
        :param int proc: Multiprocess thread count
        :param tuple(list[int, int]) scope: update scope [start, partition count]
        :rtype : None
        """
        all_goods_ids = self.get_all_goods_ids()
        targets = all_goods_ids
        if scope:
            targets = slice_list(all_goods_ids, scope)
        for goods_id in targets:
            self.repository.update_recommendation(goods_id)

    def recreate_all_index(self):
        """
        update all index
        WARNING!! this method use high memory
        100,000 user >> memory 100MByte
        1,000,000 user >> memory 1GByte
        10,000,000 user >> memory 10GByte
        :rtype : None
        """
        # get all goods ids
        all_goods_ids = self.get_all_goods_ids()

        # get all user's like history
        all_users_like_history = self.get_all_users_like_history()

        # marge user's like history by goods_id
        for tag in all_users_like_history:
            hist = defaultdict(list)
            users_like_history = all_users_like_history.get(tag)
            for user_id in users_like_history:
                for goods_id in users_like_history[user_id]:
                    hist[goods_id] += [user_id]

            # recreate index
            for goods_id in all_goods_ids:
                if goods_id in hist:
                    self.repository.recreate_index(goods_id, hist[goods_id])

    def get_all_users_like_history(self):
        """
        :rtype : dict{str: list[int]}
        :rtype dict{str: dict{str: list[str]}} : dict{user_id: dict{tag:list[goods_id]}}
        """
        # get all user like history keys
        all_user_keys = self.repository.get_all_user_ids()

        result = defaultdict(dict)
        for key in all_user_keys:
            tag, user_id = Repository.get_user_and_key_from_redis_key(key)
            result[tag].update({user_id: self.repository.get_user_like_history(user_id, tag)})
        return result

    def remove_goods(self, goods_id):
        self.repository.remove_goods(goods_id)

    def update_goods_tag(self, goods_id, new_tag):
        self.remove_goods(goods_id)
        self.register(goods_id, tag=new_tag)

    def remove_user(self, user_id):
        self.repository.remove_user(user_id)


def slice_list(l, scope):
    if scope is None:
        return l
    if scope[1] <= 1:
        raise ValueError
    if scope[0] == scope[1]:
        raise ValueError
    if len(scope) != 2:
        raise AssertionError
    l.sort()
    length = len(l)
    _base = length / scope[1]
    start = max((_base * scope[0]) - 10, 0)
    finish = _base * (scope[0] + 1) + 10

    return l[start:finish]

Writing recommender.py


In [5]:
%%file repository.py
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
from collections import defaultdict
import random
from redis import Redis
from .default_settings import DEFAULT_SETTINGS
from .mutex import Lock

# redis key
PREFIX = 'CF'
GOODS_TAG_BASE = '%s:GOODS:TAG:{}' % PREFIX
USER_LIKE_HISTORY_BASE = '%s:USER:LIKE-HIS:{}:{}' % PREFIX
INDEX_GOODS_USER_BASE = '%s:INDEX:GOODS-HIS:{}:{}' % PREFIX
GOODS_RECOMMENDATION = '%s:GOODS:RECO:{}:{}' % PREFIX
GOODS_MUTEX = '%s:GOODS:MUTEX:{}:{}' % PREFIX

# redis hash key
HASH_FIELD_GOODS_TAG = "TAG"


class Repository(object):
    _CACHE_GOODS_TAG = {}  # class cache
    _CLI = None

    def __init__(self, settings=DEFAULT_SETTINGS):
        DEFAULT_SETTINGS.update(settings)
        self.settings = DEFAULT_SETTINGS

    @classmethod
    def get_key_goods_tag(cls, goods_id):
        return GOODS_TAG_BASE.format(str(goods_id))

    @classmethod
    def get_key_user_like_history(cls, tag, user_id):
        return USER_LIKE_HISTORY_BASE.format(tag, user_id)

    @classmethod
    def get_key_index_goods_user_like_history(cls, tag, goods_id):
        return INDEX_GOODS_USER_BASE.format(tag, str(goods_id))

    @classmethod
    def get_key_goods_recommendation(cls, tag, goods_id):
        return GOODS_RECOMMENDATION.format(tag, str(goods_id))

    @classmethod
    def get_key_goods_mutex(cls, tag, goods_id):
        return GOODS_MUTEX.format(tag, str(goods_id))

    @classmethod
    def get_user_and_key_from_redis_key(cls, key):
        """
        >>> key = "CF_RECOMMENDER:USER:LIKE-HISTORY:BOOK:035A6959-B024-43CD-9FE9-5BCD4A0E5A92"
        >>> r = key.split(':')
        >>> r[3:]
        ['BOOK', '035A6959-B024-43CD-9FE9-5BCD4A0E5A92']
        :rtype : list[str]
        """
        r = key.split(':')
        return r[3:]

    @property
    def client(self):
        if Repository._CLI is None:
            Repository._CLI = Redis(host=self.settings.get('redis').get('host'),
                                    port=int(self.settings.get('redis').get('port')),
                                    db=int(self.settings.get('redis').get('db')), )
        return Repository._CLI

    @property
    def expire(self):
        return self.settings.get('expire')

    def touch(self, key):
        self.client.expire(key, self.expire)

    def get(self, goods_id, count=None):
        """
        get recommendation list
        :param goods_id: str
        :param count: int
        :rtype list[str]: list of recommendation goods
        """
        if not count:
            count = self.settings.get('recommendation_count')
        tag = self.get_tag(goods_id)
        key = Repository.get_key_goods_recommendation(tag, goods_id)
        self.touch(key)
        return self.client.zrevrange(key, 0, count - 1)

    def get_goods_tag(self, goods_id):
        tag = Repository._CACHE_GOODS_TAG.get(goods_id)
        if tag is None:
            tag = self.get_tag(goods_id)
            Repository._CACHE_GOODS_TAG[goods_id] = tag
        return tag

    def get_tag(self, goods_id):
        key = self.get_key_goods_tag(goods_id)
        return self.client.hget(key, HASH_FIELD_GOODS_TAG)

    def register(self, goods_id, tag):
        """
        register goods_id
        :param goods_id: str
        :param tag: str
        :rtype : None
        """
        key = Repository.get_key_goods_tag(goods_id)
        return self.client.hset(key, HASH_FIELD_GOODS_TAG, tag)

    def like(self, user_id, goods_ids):
        """
        record user like history
        :param user_id: str
        :param goods_ids: list[str]
        :rtype : None
        """
        goods_group = self.categorized(goods_ids)
        for tag in goods_group:
            key = Repository.get_key_user_like_history(tag, user_id)
            _goods_ids = goods_group[tag]
            if _goods_ids:
                self.client.rpush(key, *_goods_ids)
            self.touch(key)
            self.trim(key)
        return

    def categorized(self, goods_ids):
        """
        :param dict{str: list[str]} goods_ids: dict{tag: list[goods_id]}
        :return:
        """
        result = defaultdict(list)
        for goods_id in goods_ids:
            result[self.get_tag(str(goods_id))] += [str(goods_id)]
        return result

    def update_recommendation(self, goods_id, enable_update_interval=False):
        """
        Update goods recommendation list.
        If enable_update_interval is True, will update recommendation list at a constant interval
        :param goods_id: str
        :param enable_update_interval: bool
        """
        tag = self.get_tag(goods_id)
        if tag is None:
            return  # goods doesn't exist

        # will update at a constant interval
        if enable_update_interval:
            if self.is_lock(goods_id):
                return
            self.lock(goods_id)

        # get user
        users = self.get_goods_like_history(goods_id)

        # calc recommendation
        recommendation_list = []
        for user_id in users:
            recommendation_list += self.get_user_like_history(user_id, tag)

        result = defaultdict(int)
        for _tmp_goods_id in recommendation_list:
            tag = self.get_tag(_tmp_goods_id)
            if tag is None:
                continue

            if _tmp_goods_id == goods_id:
                continue
            result[_tmp_goods_id] += 1

        # set sorted set of redis
        key = Repository.get_key_goods_recommendation(tag, goods_id)
        self.client.delete(key)
        for _tmp_goods_id in result:
            self.push_recommendation(key, _tmp_goods_id, result[_tmp_goods_id])
        return

    def update_index(self, user_id, goods_ids):
        """
        update goods index
        :param user_id: str
        :param goods_ids: list[str]
        :rtype : None
        """
        for goods_id in goods_ids:
            tag = self.get_tag(goods_id)
            key = Repository.get_key_index_goods_user_like_history(tag, goods_id)
            self.client.rpush(key, user_id)
            self.trim(key)
        return

    def get_goods_like_history(self, goods_id, count=None):
        """
        :param goods_id: str
        :param count: int
        :rtype list[str]: liked users of goods
        """
        if not count:
            count = self.settings.get('recommendation').get('search_depth')
        tag = self.get_tag(goods_id)
        key = Repository.get_key_index_goods_user_like_history(tag, goods_id)
        return self.client.lrange(key, -1 * count, -1)

    def get_all_goods_ids(self):
        """
        all registered goods ids
        :rtype : list[str]
        """
        key = Repository.get_key_goods_tag('*')
        result = self.client.keys(key)
        del_word = GOODS_TAG_BASE[0:len(GOODS_TAG_BASE)-2]
        return map(lambda x: x.replace(del_word, ''), result)

    def get_all_user_ids(self):
        """
        all user ids
        :rtype : list[str]
        """
        key = Repository.get_key_user_like_history('*', '*')
        return self.client.keys(key)

    def get_user_like_history(self, user_id, tag, count=None):
        """
        :param user_id: str or unicode
        :rtype list[str]: goods_ids of user
        """
        if not count:
            count = self.settings.get('recommendation').get('search_depth')
        key = Repository.get_key_user_like_history(tag, user_id)
        result = self.client.lrange(key, -1 * count, -1)
        if not result:
            return []
        return result

    def push_recommendation(self, key, goods_id, value):
        """
        update recommendation sorted set
        :param str goods_id:
        :param str value: count
        """
        self.client.zadd(key, goods_id, int(value))
        self.touch(key)

    def recreate_index(self, goods_id, user_ids):
        """
        recreate goods_id liked users index
        :param goods_id: str
        :param user_ids: list[str or unicode]
        :rtype : None
        """
        if not user_ids:
            return
        tag = self.get_tag(goods_id)
        key = Repository.get_key_index_goods_user_like_history(tag, goods_id)
        self.client.delete(key)
        # update list
        self.client.rpush(key, *user_ids)
        return

    def get_all_goods_by_user(self, user_id):
        """
        get all liked goods by user
        :param user_id: str
        :rtype: list[goods_id]
        """
        keys_asterisk_pattern = Repository.get_key_user_like_history('*', user_id)
        keys = self.client.keys(keys_asterisk_pattern)
        goods_history = []
        for key in keys:
            goods_history += self.client.lrange(key, 0, -1)
        return goods_history

    def remove_goods(self, goods_id):
        # remove goods tag and recommendation
        tag = self.get_goods_tag(goods_id)
        key_tag = Repository.get_key_goods_tag(goods_id)
        key_recommendation = Repository.get_key_goods_recommendation(tag, goods_id)
        self.client.delete(key_tag, key_recommendation)
        self.client.delete(key_recommendation)

        # delete tag cache
        del Repository._CACHE_GOODS_TAG[goods_id]
        return

    def remove_user(self, user_id):
        """
        remove user
        :param user_id: str
        """
        # get user's redis key
        keys_asterisk_pattern = Repository.get_key_user_like_history('*', user_id)
        keys = self.client.keys(keys_asterisk_pattern)
        users_goods = self.get_all_goods_by_user(user_id)

        # delete user from index
        self.remove_user_from_index(user_id, users_goods)

        # delete user from history
        for key in keys:
            self.client.delete(key)

    def remove_user_from_index(self, user_id, goods_ids):
        """
        remove user from INDEX_GOODS_USER_BASE
        :param user_id: str
        :param goods_ids: list[str]
        """
        for goods_id in goods_ids:
            tag = self.get_goods_tag(goods_id)
            key = Repository.get_key_index_goods_user_like_history(tag, goods_id)
            self.client.lrem(key, user_id, 0)
        return

    def lock(self, goods_id, interval_sec=None):
        """
        When interval_sec is 0, not lock.
        :param goods_id: str
        :rtype : None
        """
        if interval_sec is None:
            interval_sec = self.settings.get('recommendation').get('update_interval_sec')
        if interval_sec == 0:
            return

        tag = self.get_goods_tag(goods_id)
        key = Repository.get_key_goods_mutex(tag, goods_id)
        self.get_lock(key, interval_sec).lock()
        return

    def is_lock(self, goods_id):
        """
        :param goods_id: str
        :rtype : bool
        """
        # When interval_sec is 0, not lock.
        if self.settings.get('recommendation').get('update_interval_sec') == 0:
            return False

        tag = self.get_goods_tag(goods_id)
        key = Repository.get_key_goods_mutex(tag, goods_id)
        return self.get_lock(key, 1).is_lock()

    def get_lock(self, key, interval_sec):
        """
        get Lock object
        :param key: str
        :param interval_sec: int
        :rtype : Lock
        """
        return Lock(self.client, key, interval_sec)

    def trim(self, key, _max=None, hardly_ever=True):
        """
        trim redis list data object
        :param str key:
        :param int _max: Trim Redis list If the max value is over
        :param bool hardly_ever: bool
        :rtype: None
        """
        if hardly_ever and random.randint(1, 20) != 1:
            return

        if _max is None:
            _max = self.settings.get('recommendation').get('max_history')

        if self.client.llen(key) < _max * 2:
            return
        self.client.ltrim(key, _max * -1, -1)
        return

Writing repository.py


In [3]:
# -*- coding: utf-8 -*-
#from __future__ import absolute_import, unicode_literals
from Arthur.core.recomeng.recommender.recommender import Recommender

cf_settings = {
    # redis
    'expire': 3600 * 24 * 30,
    'redis': {
        'host': 'localhost',
        'port': 6379,
        'db': 0
    },
    # recommendation engine settings
    'recommendation_count': 10,
    'recommendation': {
        'update_interval_sec': 600,
        'search_depth': 100,
        'max_history': 1000,
    },
}


# Get recommendation list
item_id = 'Item1'
recommendation = Recommender(cf_settings)
print recommendation.get(item_id, count=3)
#>>> ['Item10', 'Item3', 'Item2']

# register history
user_id = 'user-00001'
buy_items = ['Item10', 'Item10', 'Item10', 'Item3', 'Item3', 'Item1']
for item_id in buy_items:
    recommendation.register(item_id)
recommendation.like(user_id, buy_items)

[]


In [25]:
# -*- coding: utf-8 -*-
#from __future__ import absolute_import, unicode_literals
from Arthur.core.recomeng.recommender.recommender import Recommender
import random
from uuid import uuid4


settings = {
    'expire': 3600 * 24 * 100,
    # redis
    'redis': {
        'host': 'localhost',
        'port': 6379,
        'db': 11
    },
}

#_MAX = 10 * 10000
_MAX=100
# register new goods
r = Recommender(settings=settings)
for x in xrange(1, _MAX):
    r.register(x)

# like goods_ids
for x in xrange(1, 100):
    user_id = str(uuid4())
    like_goods_ids = [random.randint(1, _MAX) for _x in range(random.randint(1, 100))]
    r.like(user_id, like_goods_ids)
    if x % 100 == 0:
        print "{}/{}".format(str(x), str(_MAX))

In [26]:
r.get(2, count=3)

['47', '40', '31']

In [7]:
item_id = 'Item1'
recommendation = Recommender(cf_settings)
print recommendation.get(item_id, count=3)

['Item10', 'Item3']


In [9]:
item_id = 'book'
recommendation = Recommender(cf_settings)
print recommendation.get(item_id, count=3)

[]


In [32]:
len(r.get_all_goods_ids())

100002

In [34]:
settings = {
    'expire': 3600 * 24 * 100,
    # redis
    'redis': {
        'host': 'localhost',
        'port': 6379,
        'db': 33
    },
}

#_MAX = 10 * 10000
_MAX=100
# register new goods
rt = Recommender(settings=settings)
len(rt.get_all_goods_ids())

100002

In [37]:
from Arthur.core.recomeng.recommender.recommender import Recommender

settings = {
    'expire': 3600 * 24 * 100,
    # redis
    'redis': {
        'host': 'localhost',
        'port': 6379,
        'db': 10
    },
}

#_MAX = 10 * 10000
_MAX=100
# register new goods
rr = Recommender(settings=settings)
len(rr.get_all_goods_ids())

100002

In [17]:
[random.randint(1, 10000) for _x in range(random.randint(1, 10))]

[6770, 4630]

In [21]:
# -*- coding: utf-8 -*-
#from __future__ import absolute_import, unicode_literals
from Arthur.core.recomeng.recommender.recommender import Recommender
import random
from uuid import uuid4


settings = {
    'expire': 3600 * 24 * 100,
    # redis
    'redis': {
        'host': 'localhost',
        'port': 6379,
        'db': 11
    },
}

#_MAX = 10 * 10000
_MAX=100
# register new goods
tags = ['default', 'book', 'computer', 'dvd', 'camera', 'clothes', 'tag7', 'tag8', 'tag9', 'tag10']
r = Recommender(settings=settings)
for x in xrange(1, _MAX):
    r.register(x, tag=random.choice(tags))

# like goods_ids
for x in xrange(1, 100):
    user_id = str(uuid4())
    like_goods_ids = [random.randint(1, _MAX) for _x in range(random.randint(1, 100))]
    r.like(user_id, like_goods_ids, realtime_update=False)
    if x % 100 == 0:
        print "{}/{}".format(str(x), str(_MAX))