In [9]:
from collections import Counter
import pandas as pd
import tensorflow as tf
import numpy as np
import tensorflow_recommenders as tfrs
from typing import Dict, Text

In [2]:
# setup HW
physical_devices = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)

# check HW availability
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))

Num GPUs Available:  1
Num CPUs Available:  1


In [3]:
# load data
dataset_df = pd.read_csv('data/hackprague_txs.csv', sep = ";")
dataset_df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,transaction_id,tx_date,client_id,client_gender,client_year_of_birth,amount,merchant_uid,merchant_category,shop_tags,merchant_category_id,shop_uid,shop_type,region,country
0,1,2020-07-02,146167,Female,1980.0,-594,X0noQaM22AHzkPN9erAb74,Professional Services,{Delivery Service},1483998.0,zYMe28Vm9LFEpAN18vaQvG,MERCHANT,Pardubice,CZ
1,2,2020-06-11,40161,Female,1980.0,-986,X0noQaM22AHzkPN9erAb74,Professional Services,{Delivery Service},1483998.0,zYMe28Vm9LFEpAN18vaQvG,MERCHANT,Pardubice,CZ
2,3,2020-08-14,200977,Female,1965.0,-102,g0DedJv2k5uqz5bYEjdQn8,Food And Drink,,147.0,MkBX1n9BjZFAGoly0Yq3Xy,MERCHANT,Kladno,CZ
3,4,2020-06-10,200977,Female,1965.0,-158,g0DedJv2k5uqz5bYEjdQn8,Food And Drink,,147.0,MkBX1n9BjZFAGoly0Yq3Xy,MERCHANT,Kladno,CZ
4,5,2020-05-27,200977,Female,1965.0,-126,g0DedJv2k5uqz5bYEjdQn8,Food And Drink,,147.0,MkBX1n9BjZFAGoly0Yq3Xy,MERCHANT,Kladno,CZ


In [4]:
# map categories

mapping = [
    {
        'category': 'Bakery',
        'type': 'Groceries',
        'tags': ['Bakery'],
        'places': ['bakery']
     },
    {
        'category': 'Alcohol',
        'type': 'Groceries',
        'tags': ['Alcohol'],
        'places': ['liquor store']
     },
    {
        'category': 'Restaurant',
        'type': 'Food And Drink',
        'tags': ['Restaurant', 'Cuisine', 'Fast Food', 'Bageterie', 'Steak House / Burgers', 'Sushi'],
        'places': ['restaurant']
     },
    {
        'category': 'Party',
        'type': 'Food And Drink',
        'tags': ['Bar', 'Music Club', 'Alcohol', 'Wine', 'Drinking', 'Bear', 'Pub'],
        'places': ['night_club', 'bar', 'pub']
     },
    {
        'category': 'Clothing',
        'type': 'Fashion',
        'tags': ['Fashion Chain - Mix', 'Fashion Accessories', 'Clothing - Other'],
        'places': ['clothing_store']
     },
    {
        'category': 'Pets',
        'type': 'Pets',
        'tags': ['Pet Supplies', 'Pets'],
        'places': ['pet store']
     }      
]

def map_category(x):
    cat: str = x[0]
    tags: str = x[1]

    if pd.isnull(tags):
        return None

    for item in mapping:
        if item['type'] == cat:
            if any(x in tags for x in item['tags']):
                return item['category']
    return None

dataset_filtered_df = dataset_df[dataset_df['merchant_category'].isin(list(map(lambda x: x['type'], mapping))) == True]
dataset_filtered_df['category'] = dataset_filtered_df[['merchant_category','shop_tags']].apply(map_category, axis=1)
dataset_filtered_df = dataset_filtered_df[~dataset_filtered_df['category'].isnull() & ~dataset_filtered_df['merchant_uid'].isnull()]
dataset_filtered_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_filtered_df['category'] = dataset_filtered_df[['merchant_category','shop_tags']].apply(map_category, axis=1)


Unnamed: 0,transaction_id,tx_date,client_id,client_gender,client_year_of_birth,amount,merchant_uid,merchant_category,shop_tags,merchant_category_id,shop_uid,shop_type,region,country,category
2749,2750,2020-06-14,200428,Male,1980.0,-105,beXJ9o0k3NSl9PaoEz4eR3,Food And Drink,"{Bar,Drinking}",147.0,EWAavyAo2GT5LYqkPPoEnj,MERCHANT,Hlavní město Praha,CZ,Party
2750,2751,2020-09-19,169335,Male,1995.0,-89,beXJ9o0k3NSl9PaoEz4eR3,Food And Drink,"{Bar,Drinking}",147.0,EWAavyAo2GT5LYqkPPoEnj,MERCHANT,Hlavní město Praha,CZ,Party
2751,2752,2020-09-04,137596,Male,1985.0,-950,beXJ9o0k3NSl9PaoEz4eR3,Food And Drink,"{Bar,Drinking}",147.0,EWAavyAo2GT5LYqkPPoEnj,MERCHANT,Hlavní město Praha,CZ,Party
2752,2753,2020-08-28,111554,Female,1990.0,-894,beXJ9o0k3NSl9PaoEz4eR3,Food And Drink,"{Bar,Drinking}",147.0,EWAavyAo2GT5LYqkPPoEnj,MERCHANT,Hlavní město Praha,CZ,Party
2753,2754,2020-08-19,40631,Male,1995.0,-644,beXJ9o0k3NSl9PaoEz4eR3,Food And Drink,"{Bar,Drinking}",147.0,EWAavyAo2GT5LYqkPPoEnj,MERCHANT,Hlavní město Praha,CZ,Party


In [5]:
# filter users from Prague
users_in_prague = Counter(dataset_filtered_df[dataset_filtered_df['region'].str.contains("Praha") == True]['client_id'])
users_transactions = Counter(dataset_filtered_df[dataset_filtered_df['client_id'].isin(list(users_in_prague.keys()))]['client_id'])
users_to_keep = [k for k, v in users_in_prague.items() if (v / users_transactions[k]) > 0.8]

# update dataset
dataset_filtered_df = dataset_filtered_df.where(dataset_filtered_df['client_id'].isin(users_to_keep))

In [6]:
# prepare data
visits_df = dataset_filtered_df[~dataset_filtered_df['category'].isnull()
                             & ~dataset_filtered_df['merchant_uid'].isnull()][['client_id', 'merchant_uid']]\
    .drop_duplicates()

In [7]:
# create TF dataset - places + visits
visits = tf.data.Dataset.from_tensor_slices(np.array([[x[0], str(x[1])] for x in visits_df.values])).map(lambda x: {
    "merchant_uid": x[1],
    "client_id": x[0],
})
places = tf.data.Dataset.from_tensor_slices(visits_df['merchant_uid'].unique().reshape(-1, 1))

In [8]:
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(visits.map(lambda x: x["client_id"]))

places_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
places_vocabulary.adapt(places)

In [10]:
# train model

class MovieLensModel(tfrs.Model):
  # We derive from a custom base class to help reduce boilerplate. Under the hood,
  # these are still plain Keras Models.

  def __init__(
      self,
      user_model: tf.keras.Model,
      place_model: tf.keras.Model,
      task: tfrs.tasks.Retrieval):
    super().__init__()

    # Set up user and movie representations.
    self.user_model = user_model
    self.place_model = place_model

    # Set up a retrieval task.
    self.task = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # Define how the loss is computed.

    user_embeddings = self.user_model(features["client_id"])
    place_embeddings = self.place_model(features["merchant_uid"])

    return self.task(user_embeddings, place_embeddings)

In [11]:
# Define user and movie models.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), 64)
])
place_model = tf.keras.Sequential([
    places_vocabulary,
    tf.keras.layers.Embedding(places_vocabulary.vocab_size(), 64)
])

# Define your objectives.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(
    places.batch(128).map(place_model)
  )
)

In [42]:
# Create a retrieval model.
model = MovieLensModel(user_model, place_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for 3 epochs.
model.fit(visits.batch(2048), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f329ff34220>

In [44]:
# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index(places.batch(100).map(model.place_model), places)

# Get some recommendations.
_, titles = index(tf.constant([str(users_to_keep[1])]))
print(users_to_keep[1])
print(f"Top N recommendations for user: {titles[0, :100]}")

111554
Top N recommendations for user: [b'qka7G41e3ZfEpPjENMNnZK' b'K70me03zXvH9N5rNyZondj'
 b'2MmN44emyoIJr1WaGYlQA5' b'geknJ4ajBpc9aWGVLPRen2'
 b'J9VENr0Q3ZSaZ7BoMqBZZK' b'09QVg32QZkcJEdk3bj7Zyv'
 b'ozpJL2ea3pI9WbVVKDQ8NB' b'DLd2JkVvb1h4lo1WDzyNrM'
 b'3ALPl5mQWoIQdJBb0agdry' b'LedJkgpompCydrPEVoPvk2']
