In [None]:
!pip install transformers==4.31.0

Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed tokenizers-0.13.3 transformers-4.31.0


In [None]:
import os
from typing import Dict, List

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import tensorflow as tf
import transformers
from sklearn.model_selection import KFold

In [None]:
# --- Run configuration -------------------------------------------------------
MODEL = "microsoft/codebert-base"  # Hugging Face checkpoint loaded as the backbone
RNG = 17                           # seed shared by the K-fold split and dataset shuffling
N = 5                              # number of cross-validation folds
MAX_LEN = 512                      # token length of every serialized example
EPOCHS = 4                         # training epochs per fold
LR = 4e-5                          # peak learning rate reached at the end of warm-up
WARMUP_RATE = 0.05                 # fraction of all training steps spent warming up
BATCH_MULTIPLIER = 16              # per-replica batch size when running on TPU

# --- Accelerator detection ---------------------------------------------------
# Try to attach to a TPU; on any failure fall back to the default strategy and
# report the reason so that the fallback is not silent.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    # tf.distribute.TPUStrategy is the stable name of the deprecated
    # tf.distribute.experimental.TPUStrategy alias.
    STRATEGY = tf.distribute.TPUStrategy(TPU)
    BATCH_SIZE = BATCH_MULTIPLIER * STRATEGY.num_replicas_in_sync
except Exception as tpu_error:
    print("TPU unavailable, falling back to the default strategy:", tpu_error)
    TPU = None
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 4

if TPU is not None:
    print("Using TPU")
else:
    print("Using GPU/CPU")

print("Batch size:", BATCH_SIZE)



TensorFlow 2.12.0
Using TPU v3-8
Batch size: 128


In [None]:
def count_samples(filenames: List[str]) -> int:
    """Sum the sample counts encoded in a collection of file names.

    For each path, the base name is cut at its first ``.`` and the text after
    the last ``-`` of what remains is read as an integer, e.g.
    ``"dir/003-128.tfrec"`` contributes ``128``.

    Args:
        filenames: Paths whose base names end (before the first dot) in an
            integer sample count.

    Returns:
        The total of all encoded counts; ``0`` for an empty collection.

    Raises:
        ValueError: If the extracted piece of a name is not an integer.
    """
    total = 0
    for path in filenames:
        # Text before the first dot of the base name.
        stem = os.path.basename(path).partition(".")[0]
        # Text after the last dash (the whole stem when there is no dash).
        total += int(stem.rpartition("-")[2])
    return total


def read_tfrecord(record: tf.Tensor) -> Dict[str, tf.Tensor]:
    """Parse one serialized ``tf.train.Example`` into a dict of tensors.

    Args:
        record: Scalar string tensor holding a serialized example.

    Returns:
        A dict with the keys ``"input_ids"`` and ``"attention_mask"`` (int32,
        shape ``[MAX_LEN]``) and ``"feature"`` and ``"label"`` (float32
        scalars).
    """
    # tf.io.FixedLenFeature only accepts float32, int64 and string, so integer
    # features have to be parsed as int64; requesting int32 is rejected by the
    # parsing op.
    schema = {
        "input_ids": tf.io.FixedLenFeature([MAX_LEN], tf.int64),
        "attention_mask": tf.io.FixedLenFeature([MAX_LEN], tf.int64),
        "feature": tf.io.FixedLenFeature([], tf.float32),
        "label": tf.io.FixedLenFeature([], tf.float32),
    }
    example = tf.io.parse_single_example(record, schema)

    # Down-cast to int32 to match the dtype the model's token inputs declare.
    for key in ("input_ids", "attention_mask"):
        example[key] = tf.cast(example[key], tf.int32)
    return example



def get_dataset(
    filenames: List[str],
    ordered: bool = False,
    repeated: bool = True,
    cached: bool = False,
) -> tf.data.Dataset:
    """Build the batched, distributed input pipeline for a set of TFRecord files.

    Every element is an ``(inputs, label)`` pair, which is the structure
    ``model.fit`` needs to compute a loss: ``inputs`` is the parsed example
    without its ``"label"`` entry, and ``label`` is that entry.

    Args:
        filenames: TFRecord files to read.
        ordered: When False, allow non-deterministic element order and shuffle
            with a 2048-element buffer seeded by ``RNG``.
        repeated: When True, repeat the dataset indefinitely.
        cached: When True, cache the parsed examples.

    Returns:
        The dataset batched to ``BATCH_SIZE`` (incomplete final batch dropped),
        prefetched, and distributed with ``STRATEGY``.
    """
    auto = tf.data.AUTOTUNE

    def to_inputs_and_label(record: tf.Tensor):
        # Separate the regression target from the model inputs.
        example = read_tfrecord(record)
        inputs = {name: tensor for name, tensor in example.items() if name != "label"}
        return inputs, example["label"]

    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=auto)
    dataset = dataset.map(to_inputs_and_label, num_parallel_calls=auto)

    # Cache right after parsing: caching after repeat() would try to store an
    # infinite stream, and caching after shuffle() would freeze one ordering.
    if cached:
        dataset = dataset.cache()

    if not ordered:
        options = tf.data.Options()
        options.deterministic = False
        dataset = dataset.with_options(options)
        dataset = dataset.shuffle(2048, seed=RNG)

    if repeated:
        dataset = dataset.repeat()

    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    dataset = dataset.prefetch(auto)

    return STRATEGY.experimental_distribute_dataset(dataset)


def get_model() -> tf.keras.Model:
    """Build the regression model: pretrained backbone plus one scalar feature.

    The first-token vector of the backbone's first output is concatenated with
    the extra ``feature`` input and passed through a single linear unit.

    Returns:
        An uncompiled Keras model with the inputs ``input_ids``,
        ``attention_mask`` and ``feature`` and one float32 output per example.
    """
    backbone = transformers.TFAutoModel.from_pretrained(MODEL)

    # Token inputs use MAX_LEN, the same fixed length the TFRecord schema
    # declares (the previously referenced TOTAL_MAX_LEN is not defined).
    input_ids = tf.keras.layers.Input(
        shape=(MAX_LEN,),
        dtype=tf.int32,
        name="input_ids",
    )
    attention_mask = tf.keras.layers.Input(
        shape=(MAX_LEN,),
        dtype=tf.int32,
        name="attention_mask",
    )
    feature = tf.keras.layers.Input(
        shape=(1,),
        dtype=tf.float32,
        name="feature",
    )

    # First backbone output, indexed at sequence position 0.
    sequence_output = backbone(input_ids, attention_mask)[0]
    first_token = sequence_output[:, 0, :]

    x = tf.concat([first_token, feature], axis=1)
    outputs = tf.keras.layers.Dense(1, activation="linear", dtype="float32")(x)
    return tf.keras.Model(
        inputs=[input_ids, attention_mask, feature],
        outputs=outputs,
    )


class WarmupLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warm-up to a peak learning rate followed by linear decay to zero.

    The rate rises from 0 to ``base_learning_rate`` over ``warmup_steps`` steps,
    then falls linearly so that it reaches 0 at ``total_steps``. Past
    ``total_steps`` it stays at 0.
    """

    def __init__(
        self,
        base_learning_rate: float,
        warmup_steps: int,
        total_steps: int,
    ) -> None:
        """Store the schedule parameters.

        Args:
            base_learning_rate: Peak learning rate at the end of warm-up.
            warmup_steps: Number of steps over which the rate ramps up.
            total_steps: Step at which the rate has decayed to zero.
        """
        super().__init__()
        self._base_learning_rate = base_learning_rate
        self._warmup_steps = warmup_steps
        self._total_steps = total_steps

    def __call__(self, step: tf.Tensor) -> tf.Tensor:
        """Return the learning rate for ``step`` as a float32 tensor."""
        # Use the instance's own parameters (not same-named globals) and do the
        # arithmetic in float32: the optimizer's step counter is an integer
        # tensor, and dividing it directly would produce float64.
        step = tf.cast(step, tf.float32)
        warmup_steps = tf.cast(self._warmup_steps, tf.float32)
        total_steps = tf.cast(self._total_steps, tf.float32)

        # Denominators are floored at 1 so that warmup_steps == 0 or
        # total_steps == warmup_steps cannot divide by zero.
        warmup_factor = step / tf.maximum(warmup_steps, 1.0)
        decay_factor = (total_steps - step) / tf.maximum(total_steps - warmup_steps, 1.0)

        # Both branches equal 1 at step == warmup_steps; the strict comparison
        # lets a schedule without warm-up start at the full rate.
        factor = tf.where(step < warmup_steps, warmup_factor, decay_factor)

        # Never return a negative rate when training runs past total_steps.
        return self._base_learning_rate * tf.maximum(factor, 0.0)

    def get_config(self) -> dict:
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "base_learning_rate": self._base_learning_rate,
            "warmup_steps": self._warmup_steps,
            "total_steps": self._total_steps,
        }

In [None]:
# Authenticate the current Colab user with Google.
# NOTE(review): presumably required to read the gs:// bucket used for the
# TFRecord files later in this notebook; confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

checkpoint_path = "model_checkpoint.h5"
csv_log_file = "training_log.csv"

# Keep only the weights of the best epoch. The model in this notebook is
# compiled with an MAE loss and no extra metrics, so "val_loss" is the only
# validation quantity Keras reports; monitoring "val_accuracy" would never find
# a value and no checkpoint would be written. Lower loss is better, hence "min".
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True
)

# Writes the per-epoch metrics to a CSV file.
csv_logger_callback = CSVLogger(csv_log_file)

In [None]:
GCS_PATH = 'gs://ai-4-code/'
VERBOSE = 1  # Keras progress verbosity passed to model.fit

# K-fold cross-validation over the fold directories 0 .. N-1 found under
# GCS_PATH/tfrec/: each iteration trains on N-1 directories and validates on
# the remaining one.
kfold = KFold(n_splits=N, shuffle=True, random_state=RNG)

# Per-fold training histories (one dict of metric lists per fold).
fold_histories = []

for i, (train_index, val_index) in enumerate(kfold.split(range(N))):
    if TPU is not None:
        tf.tpu.experimental.initialize_tpu_system(TPU)

    # Flatten the per-directory glob results with a comprehension: np.ravel on
    # a list of lists only works when every directory holds the same number of
    # files.
    train_filenames = [
        path
        for x in train_index
        for path in tf.io.gfile.glob(os.path.join(GCS_PATH, "tfrec", str(x), "*.tfrec"))
    ]
    val_filenames = [
        path
        for x in val_index
        for path in tf.io.gfile.glob(os.path.join(GCS_PATH, "tfrec", str(x), "*.tfrec"))
    ]
    if not train_filenames or not val_filenames:
        raise FileNotFoundError(
            f"No .tfrec files found under {GCS_PATH} for fold {i} "
            f"(train dirs {list(train_index)}, val dirs {list(val_index)})"
        )

    steps_per_epoch = count_samples(train_filenames) // BATCH_SIZE
    train_dataset = get_dataset(train_filenames)

    validation_steps = count_samples(val_filenames) // BATCH_SIZE
    val_dataset = get_dataset(val_filenames, ordered=True, repeated=False, cached=True)

    with STRATEGY.scope():
        model = get_model()

        total_steps = steps_per_epoch * EPOCHS
        warmup_steps = int(WARMUP_RATE * total_steps)

        optimizer = transformers.AdamWeightDecay(
            learning_rate=WarmupLinearDecay(
                base_learning_rate=LR,
                warmup_steps=warmup_steps,
                total_steps=total_steps,
            ),
            weight_decay_rate=0.01,
            exclude_from_weight_decay=[
                "bias",
                "LayerNorm.bias",
                "LayerNorm.weight",
            ],
        )
        model.compile(loss="mae", optimizer=optimizer)

    # Fresh callbacks for every fold: a ModelCheckpoint instance remembers its
    # best value, so sharing one across folds would compare this fold against
    # the previous fold's best and overwrite the same file. "val_loss" is
    # monitored because the model is compiled with a loss only.
    fold_callbacks = [
        tf.keras.callbacks.ModelCheckpoint(
            filepath=f"model_checkpoint_{i}.h5",
            save_weights_only=True,
            monitor="val_loss",
            mode="min",
            save_best_only=True,
        ),
        tf.keras.callbacks.CSVLogger(f"training_log_{i}.csv"),
    ]

    metrics = model.fit(
        train_dataset,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_dataset,
        validation_steps=validation_steps,
        epochs=EPOCHS,
        verbose=VERBOSE,
        callbacks=fold_callbacks,
    ).history
    fold_histories.append(metrics)

    # Weights after the final epoch (the best-epoch weights are in the
    # per-fold checkpoint file written by ModelCheckpoint).
    model.save_weights(f"model_{i}.h5")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/499M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10