In [22]:

""" Models that use various Approximate Nearest Neighbours libraries in order to quickly
generate recommendations and lists of similar items.
See http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/
"""
import time
import sys
import numpy
import itertools
try:
    import annoy
except ImportError:
    print("The package 'annoy' is required to run this example.")
    sys.exit()

try:
    import nmslib
except ImportError:
    print("The package 'nmslib' is required to run this example.")
    sys.exit()

import numpy as np
from scipy.sparse import csr_matrix

from sklearn.base import BaseEstimator, TransformerMixin
import logging
log = logging.getLogger("recsys")



def augment_inner_product_matrix(factors):
    """ Pad a factor matrix with one extra column so that every row has the same norm.

    With equal-norm rows, an angular nearest neighbours search over the padded matrix
    ranks items by their inner product with the query, provided the query vector has a 0
    in the extra dimension. The construction follows the paper
    "Speeding Up the Xbox Recommender System Using a Euclidean Transformation for Inner-Product
    Spaces" https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf

    Returns a tuple ``(max_norm, augmented)`` where ``max_norm`` is the largest row norm of
    ``factors`` and ``augmented`` is ``factors`` with the extra column appended. """
    row_norms = numpy.linalg.norm(factors, axis=1)
    largest = row_norms.max()

    # the value each row needs in the new dimension to reach norm `largest`
    padding = numpy.sqrt(largest ** 2 - row_norms ** 2)
    padding_column = padding.reshape(row_norms.shape[0], 1)

    augmented = numpy.concatenate((factors, padding_column), axis=1)
    return largest, augmented


class AnnoyTransformer(TransformerMixin, BaseEstimator):
    """Annoy-backed k-neighbours transformer that can also serve recommendations.

    ``fit`` builds up to three ``annoy.AnnoyIndex`` objects over the rows of ``X``:

    * ``annoy_``: built with ``metric``; backs ``transform`` / ``fit_transform``.
    * ``similar_items_index``: angular index over the raw rows; backs
      ``similar_items``. Only built when ``approximate_similar_items`` is true.
    * ``recommend_index``: angular index over the rows augmented by
      ``augment_inner_product_matrix``; backs ``recommend``. Only built when
      ``approximate_recommend`` is true.

    Parameters
    ----------
    n_neighbors : int
        Neighbours per sample in the k-neighbours graph (one extra is always
        computed, see ``_transform``).
    metric : str
        Metric for ``annoy_``. ``'sqeuclidean'`` is indexed as ``'euclidean'`` and
        the returned distances are squared afterwards.
    n_trees : int
        Number of trees passed to ``AnnoyIndex.build`` for every index.
    search_k : int
        ``search_k`` value forwarded to every Annoy query.
    approximate_similar_items, approximate_recommend : bool
        Whether ``fit`` builds the index needed by ``similar_items`` /
        ``recommend``. Calling one of those methods on an instance whose index
        was never built raises ``AttributeError``.
    user_embeddings : indexable, optional
        ``user_embeddings[userid]`` is the query vector used by ``recommend``.
    item_embeddings : optional
        Stored on the instance but not read by this class; the item vectors that
        get indexed are the ``X`` passed to ``fit``.
    """

    def __init__(self, n_neighbors=5, metric='euclidean', n_trees=10,
                 search_k=-1, approximate_similar_items=True, approximate_recommend=True,
                 user_embeddings=None, item_embeddings=None):
        self.n_neighbors = n_neighbors
        self.n_trees = n_trees
        self.search_k = search_k
        self.metric = metric
        self.approximate_similar_items = approximate_similar_items
        self.approximate_recommend = approximate_recommend
        self.user_embeddings = user_embeddings
        self.item_embeddings = item_embeddings

    def fit(self, X, y=None):
        """Build the Annoy indexes over the rows of ``X`` and return ``self``.

        ``y`` is ignored; it is accepted so the estimator can be called as
        ``fit(X, y)``.
        """
        self.n_samples_fit_ = X.shape[0]
        # 'sqeuclidean' is indexed as euclidean; `_transform` squares the
        # distances afterwards.
        metric = self.metric if self.metric != 'sqeuclidean' else 'euclidean'
        self.annoy_ = annoy.AnnoyIndex(X.shape[1], metric=metric)
        for i, x in enumerate(X):
            self.annoy_.add_item(i, x.tolist())
        self.annoy_.build(self.n_trees)

        # Keep the raw factors; both optional indexes below are built from them.
        self.item_factors = X
        if self.approximate_similar_items:
            log.debug("Building annoy similar items index")
            self.similar_items_index = annoy.AnnoyIndex(
                self.item_factors.shape[1], 'angular')
            for i, row in enumerate(self.item_factors):
                self.similar_items_index.add_item(i, row)
            self.similar_items_index.build(self.n_trees)

        # build up a separate index for the inner product (for recommend
        # methods)
        if self.approximate_recommend:
            log.debug("Building annoy recommendation index")
            self.max_norm, extra = augment_inner_product_matrix(self.item_factors)
            self.recommend_index = annoy.AnnoyIndex(extra.shape[1], 'angular')
            for i, row in enumerate(extra):
                self.recommend_index.add_item(i, row)
            self.recommend_index.build(self.n_trees)

        return self

    def transform(self, X):
        """Return the sparse k-neighbours graph of the rows of ``X`` against the fitted items."""
        return self._transform(X)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the k-neighbours graph of the fitted items themselves."""
        return self.fit(X)._transform(X=None)

    def _transform(self, X):
        """As `transform`, but handles X is None for faster `fit_transform`.

        Returns a ``csr_matrix`` of shape ``(n_samples, n_samples_fit_)`` whose stored
        values are the distances to the ``n_neighbors + 1`` neighbours of each sample.
        """
        n_samples_transform = self.n_samples_fit_ if X is None else X.shape[0]

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        n_neighbors = self.n_neighbors + 1

        # `np.int` was only an alias of the builtin `int` and has been removed
        # from NumPy; the builtin yields the same dtype.
        indices = np.empty((n_samples_transform, n_neighbors), dtype=int)
        distances = np.empty((n_samples_transform, n_neighbors))

        if X is None:
            # Query by stored item id, which avoids passing the vectors again.
            for i in range(self.annoy_.get_n_items()):
                ind, dist = self.annoy_.get_nns_by_item(
                    i, n_neighbors, self.search_k, include_distances=True)

                indices[i], distances[i] = ind, dist
        else:
            for i, x in enumerate(X):
                indices[i], distances[i] = self.annoy_.get_nns_by_vector(
                    x.tolist(), n_neighbors, self.search_k,
                    include_distances=True)

        if self.metric == 'sqeuclidean':
            distances **= 2

        # Every row holds exactly `n_neighbors` entries, so the CSR row pointer
        # is a plain arithmetic progression.
        indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
                           n_neighbors)
        kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
                                       indptr), shape=(n_samples_transform,
                                                       self.n_samples_fit_))

        return kneighbors_graph

    def similar_items(self, itemid, N=10):
        """Return an iterator of ``(item_id, cosine_similarity)`` for the ``N`` nearest items.

        The query item itself is part of the result. Requires the index that ``fit``
        builds when ``approximate_similar_items`` is true.
        """
        neighbours, dist = self.similar_items_index.get_nns_by_item(
            itemid, N, search_k=self.search_k, include_distances=True)
        # transform the angular distances back to cosine similarity
        return zip(neighbours, 1 - (np.array(dist) ** 2) / 2)

    def recommend(self, userid, user_items, N=10, filter_already_liked_items=True,
                  filter_items=None, recalculate_user=False):
        """Return up to ``N`` ``(item_id, score)`` tuples for ``userid``, best first.

        Parameters
        ----------
        userid : int
            Row of ``user_embeddings`` (and of ``user_items``) to recommend for.
        user_items : scipy sparse matrix
            User-by-item interactions; only read when
            ``filter_already_liked_items`` is true.
        N : int
            Maximum number of recommendations returned.
        filter_already_liked_items : bool
            Drop items that have a stored entry in ``user_items[userid]``.
        filter_items : iterable of int, optional
            Additional item ids to drop from the result.
        recalculate_user : bool
            Accepted for signature compatibility; not used.

        Requires the index that ``fit`` builds when ``approximate_recommend`` is true.
        """
        # calculate the top N items, removing the users own liked items from
        # the results
        liked = set()
        if filter_already_liked_items:
            liked.update(user_items.tocsr()[userid].indices)
        # `is not None` rather than truthiness so that NumPy arrays are accepted.
        if filter_items is not None:
            liked.update(filter_items)
        # Over-fetch so that N results remain after the excluded ids are dropped.
        count = N + len(liked)

        user = self.user_embeddings[userid]
        # The query gets a 0 in the extra dimension added by
        # `augment_inner_product_matrix`.
        query = np.append(user, 0)
        ids, dist = self.recommend_index.get_nns_by_vector(query, count, include_distances=True,
                                                           search_k=self.search_k)

        # convert the distances from euclidean to cosine distance,
        # and then rescale the cosine distance to go back to inner product
        scaling = self.max_norm * np.linalg.norm(query)
        dist = scaling * (1 - (np.array(dist) ** 2) / 2)
        return list(itertools.islice((rec for rec in zip(ids, dist) if rec[0] not in liked), N))

In [None]:
# Inline version of the same steps as `augment_inner_product_matrix` defined above:
# append one column so that every row of `item_embeddings` has norm `max_norm`.
# NOTE(review): `item_embeddings` is only assigned in a later cell of this notebook,
# so this cell presumably fails on a fresh top-to-bottom run -- confirm it is still needed.
norms = np.linalg.norm(item_embeddings, axis=1)
max_norm = norms.max()
extra_dimension = np.sqrt(max_norm ** 2 - norms ** 2)
norm_data = np.append(item_embeddings, extra_dimension.reshape(norms.shape[0], 1), axis=1)

In [4]:
from lightfm.datasets import fetch_movielens

# Load MovieLens through LightFM's helper; the cells below read the
# 'train', 'test' and 'item_features' entries of the result.
movielens = fetch_movielens()

In [5]:
# Interaction matrices used below for fitting (`train`) and evaluation (`train`, `test`).
train = movielens['train']
test = movielens['test']

In [6]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# LightFM model with WARP loss and 64 latent components.
# NOTE(review): no `random_state` is passed, so results presumably differ
# between runs -- confirm whether reproducibility matters here.
model = LightFM(learning_rate=0.05, loss='warp', no_components=64, item_alpha=0.001)

# Train for 20 epochs on the training interactions with the item feature matrix.
model.fit_partial(train, item_features=movielens['item_features'], epochs=20 )

# Average the precision@10 values returned for the train and test interactions.
# NOTE(review): the evaluation calls pass neither `item_features` nor
# `train_interactions`; check against the LightFM evaluation docs that this is
# the intended protocol.
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()

# Same for the AUC score.
train_auc = auc_score(model, train).mean()
test_auc = auc_score(model, test).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.73, test 0.09.
AUC: train 0.97, test 0.91.


In [7]:
# Item vectors from the trained model; the first returned value is discarded
# (presumably the item biases -- see the LightFM docs).
_, item_embeddings = model.get_item_representations(movielens['item_features'])


In [8]:
# User vectors from the trained model; the first returned value is discarded.

_, user_embeddings = model.get_user_representations()

In [18]:
# Configure the Annoy recommender with the LightFM vectors; no index is built until fit().
ann_recom = AnnoyTransformer(user_embeddings=user_embeddings,item_embeddings=item_embeddings)

In [19]:
# Build the Annoy indexes over the item vectors. fit() returns self, so
# `ann_recom_fit` is the same object as `ann_recom`.
ann_recom_fit= ann_recom.fit(item_embeddings)

In [39]:
userid = 1
user_items=train
# Top-20 recommendations for `userid`, skipping items stored for that user in `train`.
# `recalculate_user` is accepted by AnnoyTransformer.recommend but not read by it.
ann_recom_fit.recommend(userid, user_items, N=20, filter_already_liked_items=True,
                        filter_items=None, recalculate_user=False)

[(136, 2.3265348067071696),
 (8, 2.1597858030083117),
 (327, 2.1514774636771485),
 (123, 2.0236341978902663),
 (689, 1.9506655407820166),
 (332, 1.8867702788260263),
 (311, 1.8791406596704099),
 (878, 1.838610719839612),
 (749, 1.836402241329505),
 (814, 1.8158444006150678),
 (244, 1.809224028242631),
 (247, 1.7970648869910224),
 (507, 1.771336694819363),
 (321, 1.7449642427173029),
 (470, 1.7429613719346697),
 (320, 1.706161021723867),
 (747, 1.7043134040777057),
 (14, 1.6970882932241744),
 (245, 1.68820445688647),
 (844, 1.6259218379671931)]

In [24]:
# Display the repr of the training interaction matrix (shape, dtype, storage format).
train

<943x1682 sparse matrix of type '<class 'numpy.float32'>'
	with 90570 stored elements in COOrdinate format>

In [27]:
%%timeit 
# Time one top-10 recommend() call; `userid` and `user_items` come from an earlier cell.
ann_recom_fit.recommend(userid, user_items, N=10, filter_already_liked_items=True,
                        filter_items=None, recalculate_user=False)

956 µs ± 11.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [40]:
# similar_items yields (item_id, cosine similarity) pairs; N is left at its default of 10.
for i in ann_recom_fit.similar_items(19):
    print(i)

(19, 0.9999998892660652)
(115, 0.6977784538841831)
(18, 0.6950377336570952)
(13, 0.6064464370531084)
(9, 0.582929925711694)
(15, 0.5719863354176322)
(693, 0.5397662146593358)
(220, 0.5388534283593724)
(56, 0.5366351592978997)
(935, 0.5288477023333868)


In [38]:
# maps command line model argument to class name
# NOTE(review): `LFMRecommender` is not defined or imported in any visible cell,
# so this presumably raises NameError on a fresh kernel -- confirm where it comes from.
MODELS = {"annoy_als": AnnoyTransformer,
          "lmf":LFMRecommender}


def get_model(model_name):
    """Look up the model class registered under ``model_name`` in ``MODELS``.

    Prints a progress line and returns the class object itself, not an instance;
    the caller instantiates it with whatever constructor arguments it needs.

    Raises:
        ValueError: if ``model_name`` has no (truthy) entry in ``MODELS``.
    """
    print("getting model %s" % model_name)
    model_class = MODELS.get(model_name)
    if not model_class:
        raise ValueError("Unknown Model '%s'" % model_name)

    # The per-model default parameters that used to be assembled here were never
    # passed anywhere (the class is returned uninstantiated), so they were dropped.
    return model_class

# Returns the registered class itself, not an instance.
get_model("annoy_als")

getting model annoy_als


__main__.AnnoyTransformer

In [2]:
from bolt4ds.recsys import get_model



In [11]:
# `get_model` here is the one imported from bolt4ds.recsys in the cell above;
# whatever it returns is called with the LightFM embeddings.
ann = get_model("annoy_als")(user_embeddings=user_embeddings,item_embeddings=item_embeddings)

getting model annoy_als


In [12]:
# Fit on the item vectors. This rebinds `ann_recom_fit`, which an earlier cell
# also assigns.
ann_recom_fit= ann.fit(X=item_embeddings)

In [13]:
userid = 1
user_items=train
# Same query as the earlier recommend cell, now against the model obtained
# through bolt4ds.recsys.get_model.
ann_recom_fit.recommend(userid, user_items, N=20, filter_already_liked_items=True,
                        filter_items=None, recalculate_user=False)

[(136, 2.2221838295875784),
 (8, 2.2198093418191345),
 (123, 2.1993432818068874),
 (311, 2.0600969948114045),
 (321, 2.042870147274102),
 (590, 2.034225611736514),
 (244, 1.984773234128349),
 (291, 1.9604686484739655),
 (327, 1.9571264655870952),
 (247, 1.9373006321210666),
 (332, 1.912855179686669),
 (749, 1.8979130893279526),
 (318, 1.8502370477031043),
 (470, 1.838754050824516),
 (689, 1.8362735034439268),
 (12, 1.7967761171468761),
 (507, 1.7864370687288935),
 (14, 1.7801899000298878),
 (339, 1.7297110270661964),
 (267, 1.7245583937412818)]

In [14]:
import bolt4ds.flow.pipes

Loading postgres module without psycopg2 installed. Will crash at runtime if postgres functionality is used.


Welcome to bolt4ds.flow!
Welcome to d6tpipe!


In [16]:
# auto save d6tflow data to d6tpipe repo
# Pipe name and profile handed to pipes.init below.
cfg_pipe = 'pipename'
cfg_profile = 'default'
bolt4ds.flow.pipes.init(cfg_pipe,profile=cfg_profile, local_pipe=True) # work in local mode first


Operating in local mode, use this to access local files, to run remote operations use `Pipe()`


In [20]:
import bolt4ds.flow.tasks

# Reference the class through the module imported in this cell: the `d6tflow`
# alias is only bound by a later cell (`import bolt4ds.flow as d6tflow`), so
# using it here fails on a fresh top-to-bottom run.
bolt4ds.flow.tasks.TaskPqPandas

bolt4ds.flow.tasks.TaskPqPandas

In [21]:
import bolt4ds.flow as d6tflow
import bolt4ds.flow.pipes
