In [None]:
# Reference notebook: https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb#scrollTo=hsdBQBpDAX5X

In [None]:
import torch
from psutil import virtual_memory

# --- Compute device ---------------------------------------------------------
# Guard clause first: fall back to the CPU when CUDA is not available,
# otherwise select the GPU and report how many there are and the first one's name.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

# --- System memory ----------------------------------------------------------
# The value read is the `.total` field of virtual_memory(), scaled by 1e9 and
# reported as gigabytes (the message says "available", but `.total` is what is used).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 is the cut-off this cell uses to call the runtime "high-RAM".
print('Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!')

There are 1 GPU(s) available.
Device name: A100-SXM4-40GB
Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:
!pip install tensorflow
!pip install transformers

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import pickle
import time

In [None]:
# Read sample_df.csv from the sentiment_analysis GitHub repository into a
# DataFrame. The file is fetched over HTTPS every time this cell runs.
df = pd.read_csv("https://raw.githubusercontent.com/katrinmisel/sentiment_analysis/master/sample_df.csv")

In [None]:
# Keep only the "tweet" and "target" columns and expose them as "text" and
# "labels", the names the following cells read.
# Gotcha: `df` is rebound here, so re-running this cell on its own raises a
# KeyError ("tweet"/"target" no longer exist) -- re-run the loading cell first.
cols_to_keep = ["tweet", "target"]
df = df[cols_to_keep].rename(columns={"tweet": "text", "target": "labels"})

In [None]:
# x: the "text" column as a plain Python list (this is what gets tokenized).
# y: the "labels" column, kept as a pandas Series.
x = df['text'].tolist()
y = df['labels']

In [None]:
# Checkpoint identifier: used here to load the tokenizer and, in the training
# cell, to load the classification model.
MODEL_NAME = 'distilbert-base-uncased'
# Passed as `max_length` when the texts are encoded.
MAX_LEN = 100

# Tokenizer loaded from the same checkpoint name as the model.
tkzr = DistilBertTokenizer.from_pretrained(MODEL_NAME)

def construct_encodings(x, tkzr, max_len, trucation=True, padding=True):
    """Encode ``x`` by calling the tokenizer with length/truncation/padding options.

    Args:
        x: The texts to encode; handed to ``tkzr`` unchanged.
        tkzr: Callable tokenizer; invoked as
            ``tkzr(x, max_length=..., truncation=..., padding=...)``.
        max_len: Value forwarded as ``max_length``.
        trucation: Value forwarded as ``truncation``. The parameter name is
            misspelled but kept as-is so existing keyword callers keep working.
        padding: Value forwarded as ``padding``.

    Returns:
        Whatever ``tkzr`` returns for these arguments.
    """
    tokenizer_options = {
        "max_length": max_len,
        "truncation": trucation,
        "padding": padding,
    }
    return tkzr(x, **tokenizer_options)
    
# Encode every text in x with max_length=MAX_LEN; truncation and padding are
# left at the function's defaults (both True).
encodings = construct_encodings(x, tkzr, max_len=MAX_LEN)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
def construct_tfdataset(encodings, y=None):
    """Wrap tokenizer encodings (and optionally labels) in a ``tf.data.Dataset``.

    Args:
        encodings: Mapping-like tokenizer output; converted with ``dict(...)``
            before slicing.
        y: Optional labels aligned with the encodings. When omitted (``None``)
            a features-only dataset is built instead of pairing every example
            with a ``None`` label, so the default is usable for unlabelled data.

    Returns:
        A dataset of ``(features, label)`` pairs when ``y`` is given, otherwise
        a dataset of feature dicts only.
    """
    features = dict(encodings)
    # Compare against None explicitly: `y` may be a pandas Series or an array,
    # whose truth value is ambiguous and would raise in a bare `if y:`.
    if y is None:
        return tf.data.Dataset.from_tensor_slices(features)
    return tf.data.Dataset.from_tensor_slices((features, y))
    
# Build the dataset from the encodings together with the labels y.
tfdataset = construct_tfdataset(encodings, y=y)

In [None]:
TEST_SPLIT = 0.2
BATCH_SIZE = 2
SHUFFLE_SEED = 42  # fixed seed so the train/test partition is reproducible

# Number of examples that go to the training split (the rest are the test split).
train_size = int(len(x) * (1-TEST_SPLIT))

# Shuffle exactly once, in a fixed order, BEFORE splitting.
# With the default reshuffle_each_iteration=True the dataset is re-shuffled on
# every pass, so take()/skip() would select a *different* partition for each
# training epoch and again at evaluation time -- test examples would be seen
# during training and the reported test metrics would be inflated.
tfdataset = tfdataset.shuffle(
    len(x), seed=SHUFFLE_SEED, reshuffle_each_iteration=False
)
tfdataset_train = tfdataset.take(train_size)
tfdataset_test = tfdataset.skip(train_size)

# Per-epoch shuffling is still useful for training, so apply it to the training
# split only (after the split is fixed). max(..., 1) keeps the buffer size valid
# even if the training split is empty.
tfdataset_train = tfdataset_train.shuffle(max(train_size, 1)).batch(BATCH_SIZE)
tfdataset_test = tfdataset_test.batch(BATCH_SIZE)

In [None]:
N_EPOCHS = 3

# Load the sequence-classification model from the same checkpoint as the tokenizer.
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
optimizer = optimizers.Adam(learning_rate=3e-5)
# from_logits=True: the loss is configured to receive unnormalized scores.
loss = losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Time the training run with a monotonic clock; only the difference
# `end - start` (seconds) is meaningful.
start = time.perf_counter()
# tfdataset_train is already batched with BATCH_SIZE, so no `batch_size` is
# passed here: Keras documents that it must not be specified for dataset inputs.
model.fit(tfdataset_train, epochs=N_EPOCHS)
end = time.perf_counter()

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_projector', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate on the held-out split and get the metrics back as a dict.
# tfdataset_test is already batched, so `batch_size` is not passed: Keras
# documents that it must not be specified for dataset inputs.
benchmarks = model.evaluate(tfdataset_test, return_dict=True)
print(benchmarks)

{'loss': 0.16972993314266205, 'accuracy': 0.9416249990463257}


In [None]:
# Report the training duration (end - start, from the training cell) together
# with the loss and accuracy returned by model.evaluate, one per line.
print(
    "Train time: " + str(end - start),
    "Loss: " + str(benchmarks['loss']),
    "Accuracy: " + str(benchmarks['accuracy']),
    sep="\n",
)

Train time: 1426.951797246933
Loss: 0.16972993314266205
Accuracy: 0.9416249990463257


In [None]:
# Save the trained model into the "clf" folder.
# NOTE(review): the path is under /content/drive -- presumably a mounted Google
# Drive; no mount call is visible in this notebook, so confirm it is mounted.
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/model/clf')

# Next to it, pickle the (checkpoint name, max length) pair so the matching
# tokenizer settings can be recovered when the model is reloaded.
model_info = (MODEL_NAME, MAX_LEN)
with open('/content/drive/MyDrive/Colab Notebooks/model/info.pkl', 'wb') as info_file:
    pickle.dump(model_info, info_file)