In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os
import cv2
import numpy as np

from tqdm.auto import tqdm
from transformers import BertTokenizer, TFBertModel

In [6]:
# Module-level BERT tokenizer and encoder. Text_Dataset_Loader.load_data reads
# both of these globals, so this cell has to run before any text dataset is
# processed.
# padding_side / truncation_side request left-side padding and truncation.
# NOTE(review): load_data passes padding / truncation / max_length explicitly on
# every tokenizer call, so the padding=True / truncation=True kwargs given here
# look redundant -- confirm whether they have any effect at construction time.
tokenizer = BertTokenizer.from_pretrained(
	"bert-base-uncased",
	padding=True, padding_side='left', truncation=True, truncation_side='left'
)
# Pretrained TF BERT encoder; load_data uses its pooler_output as the text embedding.
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
def batched_bert_input(x:dict, batch_size):
	"""Cut a mapping of equally long sequences into consecutive mini-batches.

	Args:
		x: mapping whose values can all be sliced along their first axis. The
			number of samples is taken from the length of ``x['input_ids']``.
		batch_size: maximum number of samples per mini-batch.

	Returns:
		A list of dicts with the same keys as ``x``. Every dict holds the
		slice of each value belonging to one mini-batch; the last mini-batch
		may be shorter than ``batch_size``.
	"""
	total = len(x['input_ids'])
	return [
		{key: seq[start:min(start + batch_size, total)] for key, seq in x.items()}
		for start in range(0, total, batch_size)
	]


class FOOD101_Loader(object):
	"""Loads the 'food101' dataset through TensorFlow Datasets and downsizes every image.

	The dataset's 'train' split is returned as the training data and its
	'validation' split as the test data.
	"""
	def __init__(self):
		pass

	@staticmethod
	def _resize_split(ds, image_size):
		"""Resize every image of one split and stack images and labels into arrays."""
		images = []
		labels = []
		for image, label in tqdm(tfds.as_numpy(ds)):
			# INTER_AREA is the interpolation the original code used for shrinking
			images.append(cv2.resize(image, image_size, interpolation=cv2.INTER_AREA))
			labels.append(label)
		return np.stack(images), np.stack(labels)

	@staticmethod
	def load_data(image_size=(32, 32)):
		"""Load food101 and return resized images with their labels.

		Args:
			image_size: target size handed to ``cv2.resize`` as its ``dsize``
				argument. Defaults to (32, 32) because the food101 images
				are big.

		Returns:
			``(x_train, y_train), (x_test, y_test)`` as stacked numpy arrays.
		"""
		ds_train, ds_test = tfds.load('food101', split=['train', 'validation'], shuffle_files=False, as_supervised=True)

		train = FOOD101_Loader._resize_split(ds_train, image_size)
		test = FOOD101_Loader._resize_split(ds_test, image_size)
		return train, test


class Text_Dataset_Loader(object):
	"""Loads a supervised text dataset from TFDS and embeds it with BERT.

	Relies on the notebook-level globals ``tokenizer`` and ``bert_model`` and
	on the helper ``batched_bert_input``.

	Args:
		dset_name: TFDS dataset name; its 'train' and 'test' splits are used.
		num_classes: depth of the one-hot label encoding.
		max_length: token length every text is padded / truncated to. ``None``
			(default) keeps the previous behaviour: 200 for 'imdb_reviews',
			100 for every other dataset.
		batch_size: number of texts sent through BERT at once (default 64).
	"""
	def __init__(self, dset_name, num_classes, max_length=None, batch_size=64):
		self.dset_name = dset_name
		self.num_classes = num_classes
		self.max_length = max_length
		self.batch_size = batch_size

	def _resolve_max_length(self):
		"""Return the explicit max_length, or the per-dataset default when unset."""
		if self.max_length is not None:
			return self.max_length
		return 200 if self.dset_name == 'imdb_reviews' else 100

	@staticmethod
	def _read_split(ds):
		"""Collect the utf-8 decoded texts and the labels of one split."""
		texts = []
		labels = []
		for text, label in tfds.as_numpy(ds):
			texts.append(text.decode('utf-8'))
			labels.append(label)
		return texts, labels

	def _embed(self, encoded):
		"""Run tokenized text through BERT batch by batch and concatenate the pooled outputs."""
		pooled = []
		for batch in tqdm(batched_bert_input(encoded, batch_size=self.batch_size)):
			pooled.append(bert_model(**batch).pooler_output)
		return tf.concat(pooled, axis=0)

	def load_data(self):
		"""Load, tokenize and BERT-embed the train and test splits.

		Returns:
			``(x_train, y_train), (x_test, y_test)`` where the x values are
			the concatenated BERT ``pooler_output`` tensors and the y values
			are one-hot encoded labels of depth ``num_classes``.
		"""
		ds_train, ds_test = tfds.load(self.dset_name, split=['train', 'test'], shuffle_files=False, as_supervised=True)

		x_train_text, y_train = self._read_split(ds_train)
		x_test_text, y_test = self._read_split(ds_test)

		max_length = self._resolve_max_length()
		print("tokenizing text. This might take a while...")
		x_train_text_encoded = tokenizer(x_train_text, padding="max_length", truncation=True, max_length=max_length, return_tensors="np")
		x_test_text_encoded = tokenizer(x_test_text, padding="max_length", truncation=True, max_length=max_length, return_tensors="np")

		# perform BERT embedding
		print("encoding train text")
		x_train = self._embed(x_train_text_encoded)
		print("encoding test text")
		x_test = self._embed(x_test_text_encoded)

		# output
		y_train = tf.one_hot(tf.constant(y_train), depth=self.num_classes)
		y_test = tf.one_hot(tf.constant(y_test), depth=self.num_classes)
		return (x_train, y_train), (x_test, y_test)


class AwA2_Precomputed(object):
	"""Reader for the precomputed AwA2 feature and label text files.

	Args:
		root: directory holding the manually downloaded AwA2 dataset. It must
			contain ``classes.txt`` plus ``precomputed/AwA2-features.txt`` and
			``precomputed/AwA2-labels.txt``. Defaults to "datasets/raw/awa2".

	Attributes are set in ``__init__``: ``feature_path``, ``label_path`` and
	``idx_to_classname`` (0-based class index -> class name).
	"""
	def __init__(self, root="datasets/raw/awa2"):
		# first download the awa2 dataset to this directory "datasets/raw/awa2"
		self.feature_path = f"{root}/precomputed/AwA2-features.txt"
		self.label_path = f"{root}/precomputed/AwA2-labels.txt"

		classname_file = f"{root}/classes.txt"
		idx_to_classname = {}
		with open(classname_file, "rt") as f:
			for line in f:
				line = line.strip()
				if not line:
					# tolerate blank lines (e.g. an empty line at the end of the file)
					continue
				idx, classname = line.split("\t")
				# the file numbers classes from 1; store them 0-based
				idx_to_classname[int(idx)-1] = classname
		self.idx_to_classname = idx_to_classname

	def __read_txt(self, path):
		"""Parse a space-delimited numeric text file into a numpy array."""
		with open(path, "rt") as f:
			out = np.loadtxt(f, delimiter=" ")
		return out

	def load_all(self):
		"""Read all features and labels.

		Returns:
			``(X, Y)`` where ``X`` is the parsed feature file and ``Y`` the
			parsed label file cast to int64 and shifted to start at 0.
		"""
		X = self.__read_txt(self.feature_path)
		Y = self.__read_txt(self.label_path)
		Y = Y.astype(np.int64) - 1 # so we start with 0
		return X, Y


class AwA2_Normal_Precomputed(AwA2_Precomputed):
	"""AwA2 dataset with a Normal train/test split on precomputed features.

	The split is read from an ``.npz`` archive with the arrays ``train_idx``
	and ``test_idx``, which are used to index the full feature / label arrays.
	"""
	def __init__(self):
		super().__init__()
		# first download the awa2 dataset to this directory "datasets/raw/awa2"
		self.split_idx_file = "datasets/raw/awa2/precomputed/train_test_idx.npz"

		# np.load on an .npz returns an archive object that keeps the file
		# open; read both arrays inside the context manager so it gets closed
		with np.load(self.split_idx_file) as split_idx:
			self.X_train_idx = split_idx["train_idx"]
			self.X_test_idx = split_idx["test_idx"]

	def load_data(self):
		"""Return ``(X_train, Y_train), (X_test, Y_test)`` according to the stored split indices."""
		X, Y = self.load_all()
		X_train = X[self.X_train_idx]
		X_test = X[self.X_test_idx]
		Y_train = Y[self.X_train_idx]
		Y_test = Y[self.X_test_idx]
		return (X_train, Y_train), (X_test, Y_test)

In [8]:
class _LazyLoader(object):
	"""Defers building a dataset loader until it is first used.

	Some loaders open files in their constructor. Wrapping them keeps this
	cell runnable when the files of a dataset that is never requested are
	missing.
	"""
	def __init__(self, factory):
		self._factory = factory
		self._instance = None

	def _get(self):
		"""Build the wrapped loader on first use and reuse it afterwards."""
		if self._instance is None:
			self._instance = self._factory()
		return self._instance

	def load_data(self):
		"""Forward to the wrapped loader's ``load_data``."""
		return self._get().load_data()

	def __getattr__(self, name):
		# Only reached for attributes missing on the wrapper itself. Private
		# names are not forwarded so a half-initialised wrapper cannot recurse.
		if name.startswith("_"):
			raise AttributeError(name)
		return getattr(self._get(), name)


# Maps a dataset name to an object exposing load_data().
NAME_TO_DSET = {
	"mnist": tf.keras.datasets.mnist,
	"fashion_mnist": tf.keras.datasets.fashion_mnist,
	"cifar10": tf.keras.datasets.cifar10,
	"cifar100": tf.keras.datasets.cifar100,
	"food101": FOOD101_Loader(),
	# this requires first manually downloading the AwA2 dataset; built lazily
	# because its constructor reads files from datasets/raw/awa2
	"awa2_n_precomputed": _LazyLoader(AwA2_Normal_Precomputed),
	"imdb_reviews": Text_Dataset_Loader('imdb_reviews', 2),
	"yelp_polarity_reviews": Text_Dataset_Loader('yelp_polarity_reviews', 2),
}

In [9]:
def process_data(dset_name, num_classes, is_image_dset=True):
	"""Load a registered dataset and, for image datasets, normalise it.

	Args:
		dset_name: key into ``NAME_TO_DSET``.
		num_classes: depth of the one-hot label encoding (image datasets only).
		is_image_dset: when True, labels are flattened and one-hot encoded and
			the inputs are cast to float32 and divided by 255. When False,
			the loader's output is passed through unchanged.

	Returns:
		``x_train, y_train, x_test, y_test``.
	"""
	train_split, test_split = NAME_TO_DSET[dset_name].load_data()
	x_train, y_train = train_split
	x_test, y_test = test_split

	if not is_image_dset:
		return x_train, y_train, x_test, y_test

	def _scale_and_encode(images, labels):
		# flatten labels, rescale pixels, then one-hot encode the labels
		flat_labels = labels.reshape(-1)
		scaled = images.astype("float32")/ 255.
		return scaled, tf.one_hot(flat_labels, depth=num_classes)

	x_train, y_train = _scale_and_encode(x_train, y_train)
	x_test, y_test = _scale_and_encode(x_test, y_test)
	return x_train, y_train, x_test, y_test


def save(dset_name, num_classes, x_train, y_train, x_test, y_test):
	"""Write the train and test splits to ``.npz`` archives.

	Files are written to
	``datasets/{dset_name}/{train|test}/{num_classes}_cls_all_percls.npz``
	with the arrays stored under the keys ``x`` and ``y``. Missing
	directories are created.

	Args:
		dset_name: dataset name used as the output sub-directory.
		num_classes: number of classes, used in the file name.
		x_train, y_train: training inputs and labels.
		x_test, y_test: test inputs and labels.
	"""
	prefix = f"{num_classes}_cls_all_percls"

	for split, x, y in (("train", x_train, y_train), ("test", x_test, y_test)):
		split_dir = f"datasets/{dset_name}/{split}"
		save_path = f"{split_dir}/{prefix}.npz"
		print(f"Saving {split} data to {save_path}")
		# exist_ok avoids the check-then-create race of a separate exists() test
		os.makedirs(split_dir, exist_ok=True)
		# np.savez writes an uncompressed archive
		np.savez(save_path, x=x, y=y)
	return

In [9]:
# Example: export an image dataset. With is_image_dset=True, process_data casts
# the inputs to float32, divides them by 255 and one-hot encodes the labels.
dset_name = "cifar10"
num_classes = 10
x_train, y_train, x_test, y_test = process_data(dset_name, num_classes, is_image_dset=True)
save(dset_name, num_classes, x_train, y_train, x_test, y_test)

Saving train data to datasets/cifar10/train/10_cls_all_percls.npz
Saving test data to datasets/cifar10/test/10_cls_all_percls.npz


In [10]:
# Example: export a text dataset. is_image_dset=False skips the image
# preprocessing, because Text_Dataset_Loader.load_data already returns BERT
# embeddings and one-hot encoded labels.
dset_name = "imdb_reviews"
num_classes = 2
x_train, y_train, x_test, y_test = process_data(dset_name, num_classes, is_image_dset=False)
save(dset_name, num_classes, x_train, y_train, x_test, y_test)

tokenizing text
encoding train text


  0%|          | 0/391 [00:00<?, ?it/s]

encoding test text


  0%|          | 0/391 [00:00<?, ?it/s]

Saving train data to datasets/imdb_reviews/train/2_cls_all_percls.npz
Saving test data to datasets/imdb_reviews/test/2_cls_all_percls.npz
