In [1]:
# Remember !!!
# First select the environment from Anaconda UI
# Then open the Jupyter notebook

In [2]:
import tensorflow as tf
print(tf.__version__)
# expected = 2.15.1


2.15.1


In [3]:
import tensorflow_ranking as tfr
print(tfr.__version__)
# expected 0.5.5.dev


0.5.5.dev


In [4]:
from typing import Dict, Tuple
import tensorflow_datasets as tfds

In [5]:

# Import our custom Dataloader
from data_loader import DataLoader

# Create an instance of DataLoader and pass the name of the dir that holds the dataset
data_loader = DataLoader(data_dir="data")


In [6]:

# 1. Load the CSV Data
CSV_PATH = "data/df_response_model_expression.csv"

def load_dataset():
    column_names = ['DRUG_ID',
                    'ARXSPAN_ID', 
                    'IC50_PUBLISHED', 
                    'OncotreeCode', 
                    'AgeCategory',
                    'Sex',
                    'ZMIZ1 (57178)', 
                    'ENG (2022)', 
                    'FGFR1 (2260)', 
                    'PAWR (5074)']

    column_defaults = {
        'DRUG_ID': tf.int32,
        'ARXSPAN_ID': tf.string,
        'IC50_PUBLISHED': tf.float32,
        'OncotreeCode': tf.string,
        'AgeCategory': tf.string,
        'Sex': tf.string,
        'ZMIZ1 (57178)': tf.float32,
        'ENG (2022)': tf.float32,
        'FGFR1 (2260)': tf.float32,
        'PAWR (5074)': tf.float32
    }
    
    # Load dataset from CSV file
    csvdataset = tf.data.experimental.make_csv_dataset(
        CSV_PATH,
        batch_size = 8,
        num_epochs = 1,  # Number of times to repeat the dataset
        shuffle = False,  # Shuffle the data
        column_names = column_names,  # Specify the column names
        column_defaults=column_defaults, # Specify the column types
        # label_name='IC50_PUBLISHED'  # Specify the target column
    )

    return csvdataset.unbatch()

In [7]:
ratings = tfds.load('movielens/100k-ratings', split="train")
movies = tfds.load('movielens/100k-movies', split="train")

print(type(ratings))
print(ratings)

<class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>
<_PrefetchDataset element_spec={'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'movie_genres': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'raw_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_gender': TensorSpec(shape=(), dtype=tf.bool, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_occupation_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None), 'user_zip_code': TensorSpec(shape=(), dtype=tf.string, name=None)}>
<class 'tensorflow.python.data.ops.unbatch_op._UnbatchDataset'>
<_UnbatchDataset element_spec=Or

In [None]:
prefetched = load_dataset()

print(type(prefetched))
print(prefetched)

In [8]:

# Select the basic features.

ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "user_rating": x["user_rating"]
})
print(ratings)

prefetched = prefetched.map(lambda x: {
    "DRUG_ID": x["DRUG_ID"],
    "ARXSPAN_ID": x["ARXSPAN_ID"],
    "IC50_PUBLISHED": tf.strings.to_number(x["IC50_PUBLISHED"], tf.float32)
})
print(prefetched)


<_MapDataset element_spec={'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>
<_MapDataset element_spec={'DRUG_ID': TensorSpec(shape=(), dtype=tf.string, name=None), 'ARXSPAN_ID': TensorSpec(shape=(), dtype=tf.string, name=None), 'IC50_PUBLISHED': TensorSpec(shape=(), dtype=tf.float32, name=None)}>


In [9]:

# Build vocabularies to convert all user ids and all movie titles into integer indices for embedding layers/:

users = ratings.map(lambda x: x["user_id"])
user_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(users.batch(1000))

movies = movies.map(lambda x: x["movie_title"])
movie_titles_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
movie_titles_vocabulary.adapt(movies.batch(1000))














In [10]:
cell_lines = prefetched.map(lambda x: x["ARXSPAN_ID"])
cell_line_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
cell_line_ids_vocabulary.adapt(cell_lines.batch(1000))

drugs = prefetched.map(lambda x: x["DRUG_ID"])
drug_ids_vocabulary = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)
drug_ids_vocabulary.adapt(drugs.batch(1000))

In [11]:
# Group by user_id
key_func = lambda x: user_ids_vocabulary(x["user_id"])
reduce_func = lambda key, dataset: dataset.batch(100)
ds_train = ratings.group_by_window(key_func=key_func, reduce_func=reduce_func, window_size=100)

# Group by ARXSPAN_ID
my_key_func = lambda x: cell_line_ids_vocabulary(x["ARXSPAN_ID"])
my_reduce_func = lambda key, dataset: dataset.batch(100)
my_ds_train = prefetched.group_by_window(key_func=my_key_func, reduce_func=my_reduce_func, window_size=100)

In [12]:
for x in ds_train.take(1):
  for key, value in x.items():
    print(f"Shape of {key}: {value.shape}")
    print(f"Example values of {key}: {value[:5].numpy()}")
    print()

Shape of movie_title: (100,)
Example values of movie_title: [b'Man Who Would Be King, The (1975)' b'Silence of the Lambs, The (1991)'
 b'Next Karate Kid, The (1994)' b'2001: A Space Odyssey (1968)'
 b'Usual Suspects, The (1995)']

Shape of user_id: (100,)
Example values of user_id: [b'405' b'405' b'405' b'405' b'405']

Shape of user_rating: (100,)
Example values of user_rating: [1. 4. 1. 5. 5.]



In [13]:
for x in my_ds_train.take(1):
  for key, value in x.items():
    print(f"Shape of {key}: {value.shape}")
    print(f"Example values of {key}: {value[:5].numpy()}")
    print()

# !!! At this point investigate if indeed we have records of the ACH-000910 being treated with these drugs 

Shape of DRUG_ID: (100,)
Example values of DRUG_ID: [b'1003' b'1004' b'1005' b'1006' b'1007']

Shape of ARXSPAN_ID: (100,)
Example values of ARXSPAN_ID: [b'ACH-000910' b'ACH-000910' b'ACH-000910' b'ACH-000910' b'ACH-000910']

Shape of IC50_PUBLISHED: (100,)
Example values of IC50_PUBLISHED: [1.4789245e-01 2.4882589e-02 8.6001694e+01 4.5610552e+00 8.6790435e-03]



In [14]:
# Generate batched features and labels

def _features_and_labels(x: Dict[str, tf.Tensor]) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
  labels = x.pop("user_rating")
  return x, labels

def _my_features_and_labels(x: Dict[str, tf.Tensor]) -> Tuple[Dict[str, tf.Tensor], tf.Tensor]:
  labels = x.pop("IC50_PUBLISHED")
  return x, labels

ds_train = ds_train.map(_features_and_labels)
ds_train = ds_train.apply(tf.data.experimental.dense_to_ragged_batch(batch_size=32))

my_ds_train = my_ds_train.map(_my_features_and_labels)
my_ds_train = my_ds_train.apply(tf.data.experimental.dense_to_ragged_batch(batch_size=32))


Instructions for updating:
Use `tf.data.Dataset.ragged_batch` instead.


Instructions for updating:
Use `tf.data.Dataset.ragged_batch` instead.


In [15]:
for x, label in ds_train.take(1):
  for key, value in x.items():
    print(f"Shape of {key}: {value.shape}")
    print(f"Example values of {key}: {value[:3, :3].numpy()}")
    print()
  print(f"Shape of label: {label.shape}")
  print(f"Example values of label: {label[:3, :3].numpy()}")

Shape of movie_title: (32, None)
Example values of movie_title: [[b'Man Who Would Be King, The (1975)'
  b'Silence of the Lambs, The (1991)' b'Next Karate Kid, The (1994)']
 [b'Flower of My Secret, The (Flor de mi secreto, La) (1995)'
  b'Little Princess, The (1939)' b'Time to Kill, A (1996)']
 [b'Kundun (1997)' b'Scream (1996)' b'Power 98 (1995)']]

Shape of user_id: (32, None)
Example values of user_id: [[b'405' b'405' b'405']
 [b'655' b'655' b'655']
 [b'13' b'13' b'13']]

Shape of label: (32, None)
Example values of label: [[1. 4. 1.]
 [3. 3. 3.]
 [5. 1. 1.]]


In [16]:
for x, label in my_ds_train.take(1):
  for key, value in x.items():
    print(f"Shape of {key}: {value.shape}")
    print(f"Example values of {key}: {value[:3, :3].numpy()}")
    print()
  print(f"Shape of label: {label.shape}")
  print(f"Example values of label: {label[:3, :3].numpy()}")

Shape of DRUG_ID: (32, None)
Example values of DRUG_ID: [[b'1003' b'1004' b'1005']
 [b'1003' b'1004' b'1005']
 [b'1003' b'1004' b'1005']]

Shape of ARXSPAN_ID: (32, None)
Example values of ARXSPAN_ID: [[b'ACH-000910' b'ACH-000910' b'ACH-000910']
 [b'ACH-000876' b'ACH-000876' b'ACH-000876']
 [b'ACH-000783' b'ACH-000783' b'ACH-000783']]

Shape of label: (32, None)
Example values of label: [[1.4789245e-01 2.4882589e-02 8.6001694e+01]
 [1.1501115e+01 2.6155075e-02 8.4742218e+02]
 [1.9450953e+00 1.7586000e-01 2.0785484e+02]]


In [17]:
# Define a model
class MovieLensRankingModel(tf.keras.Model):

  def __init__(self, user_vocab, movie_vocab):
    super().__init__()

    # Set up user and movie vocabulary and embedding.
    self.user_vocab = user_vocab
    self.movie_vocab = movie_vocab
    self.user_embed = tf.keras.layers.Embedding(user_vocab.vocabulary_size(), 64)
    self.movie_embed = tf.keras.layers.Embedding(movie_vocab.vocabulary_size(), 64)

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    # Define how the ranking scores are computed: 
    # Take the dot-product of the user embeddings with the movie embeddings.

    user_embeddings = self.user_embed(self.user_vocab(features["user_id"]))
    movie_embeddings = self.movie_embed(
        self.movie_vocab(features["movie_title"]))

    return tf.reduce_sum(user_embeddings * movie_embeddings, axis=2)

# Define a model
class DrugRankingModel(tf.keras.Model):

  def __init__(self, user_vocab, movie_vocab):
    super().__init__()

    # Set up user and movie vocabulary and embedding.
    self.user_vocab = user_vocab
    self.movie_vocab = movie_vocab
    self.user_embed = tf.keras.layers.Embedding(user_vocab.vocabulary_size(),64)
    self.movie_embed = tf.keras.layers.Embedding(movie_vocab.vocabulary_size(),64)

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    # Define how the ranking scores are computed: 
    # Take the dot-product of the user embeddings with the movie embeddings.

    user_embeddings = self.user_embed(self.user_vocab(features["ARXSPAN_ID"]))
    movie_embeddings = self.movie_embed(self.movie_vocab(features["DRUG_ID"]))

    return tf.reduce_sum(user_embeddings * movie_embeddings, axis=2)

In [18]:
# Create the ranking model, trained with a ranking loss and evaluated with
# ranking metrics.
model = MovieLensRankingModel(user_ids_vocabulary, movie_titles_vocabulary)
optimizer = tf.keras.optimizers.Adagrad(0.5)
loss = tfr.keras.losses.get(
    loss=tfr.keras.losses.RankingLossKey.SOFTMAX_LOSS, ragged=True)
eval_metrics = [
    tfr.keras.metrics.get(key="ndcg", name="metric/ndcg", ragged=True),
    tfr.keras.metrics.get(key="mrr", name="metric/mrr", ragged=True)
]
model.compile(optimizer=optimizer, loss=loss, metrics=eval_metrics)

In [19]:
# Create the ranking model, trained with a ranking loss and evaluated with
# ranking metrics.
my_model = DrugRankingModel(cell_line_ids_vocabulary, drug_ids_vocabulary)
my_optimizer = tf.keras.optimizers.Adagrad(0.5)
my_loss = tfr.keras.losses.get(loss=tfr.keras.losses.RankingLossKey.SOFTMAX_LOSS, ragged=True)
my_eval_metrics = [
    tfr.keras.metrics.get(key="ndcg", name="metric/ndcg", ragged=True),
    tfr.keras.metrics.get(key="mrr", name="metric/mrr", ragged=True)
]
my_model.compile(optimizer=my_optimizer, loss=my_loss, metrics=my_eval_metrics)

In [20]:
model.fit(ds_train, epochs=3)

Epoch 1/3






Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1b1e7558d30>

In [21]:
ds_train

<_BatchDataset element_spec=({'movie_title': RaggedTensorSpec(TensorShape([None, None]), tf.string, 1, tf.int64), 'user_id': RaggedTensorSpec(TensorShape([None, None]), tf.string, 1, tf.int64)}, RaggedTensorSpec(TensorShape([None, None]), tf.float32, 1, tf.int64))>

In [22]:
my_model.fit(my_ds_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1b1e75a5cc0>

In [23]:
# Get movie title candidate list.
for movie_titles in movies.batch(2000):
  break

# Generate the input for user 42.
inputs = {
    "user_id":
        tf.expand_dims(tf.repeat("42", repeats=movie_titles.shape[0]), axis=0),
    "movie_title":
        tf.expand_dims(movie_titles, axis=0)
}

# Get movie recommendations for user 42.
scores = model(inputs)
titles = tfr.utils.sort_by_scores(scores,
                                  [tf.expand_dims(movie_titles, axis=0)])[0]
print(f"Top 5 recommendations for user 42: {titles[0, :5]}")

Top 5 recommendations for user 42: [b'Star Wars (1977)' b'Empire Strikes Back, The (1980)' b'Titanic (1997)'
 b'Raiders of the Lost Ark (1981)' b'Return of the Jedi (1983)']


In [24]:
# Get movie title candidate list.
for drug in drugs.batch(2000):
  break

# Generate the input for user 42.
my_inputs = {
    "ARXSPAN_ID":
        tf.expand_dims(tf.repeat("ACH-000876", repeats=drug.shape[0]), axis=0),
    "DRUG_ID":
        tf.expand_dims(drug, axis=0)
}

# Get drug recommendations for user ACH-000876.
my_scores = my_model(my_inputs)
proposed_drugs = tfr.utils.sort_by_scores(my_scores,
                                  [tf.expand_dims(drug, axis=0)])[0]
print(f"Top 5 drug recommendations for user ACH-000876: {proposed_drugs[0, :]}")

# !!! To investigate:
# 1) Lower IC50 is better so maybe we should somehow pass this info to the model, maybe by inversing the values or with another way.

Top 5 drug recommendations for user ACH-000876: [b'1005' b'1005' b'1005' ... b'1004' b'1004' b'1004']
