In [1]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer and every model loaded in this notebook.
model_ckpt = "bert-base-cased"

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
)

In [2]:
def tokenize(examples):
    """Tokenize a batch of examples or raw text with the notebook's tokenizer.

    Args:
        examples: Either a mapping with a ``"text"`` entry (the form passed
            in by ``Dataset.map(..., batched=True)``), or raw text given as a
            string or a list/tuple of strings.

    Returns:
        The tokenizer output, padded to ``max_length`` and truncated.
    """
    # Raw text must be routed before the membership test below: on a str,
    # `'text' in examples` is a substring search, and on a list it is an
    # element search, so either could wrongly take the `examples["text"]`
    # branch and raise a TypeError.
    if isinstance(examples, (str, list, tuple)):
        return tokenizer(examples, padding="max_length", truncation=True)

    if 'text' in examples:
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # Anything else is handed to the tokenizer unchanged, as before.
    return tokenizer(examples, padding='max_length', truncation=True)

In [3]:
import torch
from transformers import AutoModel

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Base encoder (no task head), used below for hidden-state extraction.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [4]:
# Display the selected device as the cell output.
device

device(type='cuda')

In [4]:
def extract_hidden_states(batch):
    """Return the first-token ([CLS]) hidden state for every row in ``batch``.

    Reads the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: Mapping of column name to tensor; only the columns listed in
            ``tokenizer.model_input_names`` are fed to the model.

    Returns:
        ``{"hidden_state": array}`` where ``array`` is the position-0 slice of
        the model's last hidden state, moved to the CPU as a NumPy array.
    """
    # Keep only the model's input columns and move them to the target device.
    accepted = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in accepted:
            inputs[name] = tensor.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state

    # Drop the input tensors and release cached GPU memory between batches.
    del inputs
    torch.cuda.empty_cache()

    # Position 0 along the sequence axis is the [CLS] token.
    cls_vectors = last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [5]:
from datasets import load_dataset
# Labelled data: load the CSV, then hold out 20% as a validation split.
# The split is seeded so that a fresh "Restart & Run All" yields the same
# train/validation partition (and therefore comparable metrics).
train_set = load_dataset("csv", data_files='train_emoji.csv')
train_set = train_set['train'].train_test_split(test_size=0.2, seed=42)

# Test data. NOTE: this is a DatasetDict whose only split is named "train",
# which is why later cells index it as test_*["train"].
test_set = load_dataset("csv", data_files='test_emoji.csv')

In [6]:
# batch_size=None hands each split to `tokenize` as one single batch.
train_encoded = train_set.map(
    tokenize,
    batched=True,
    batch_size=None,
)
test_encoded = test_set.map(
    tokenize,
    batched=True,
    batch_size=None,
)

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

In [7]:
# Expose the model inputs as torch tensors; the labelled data also keeps its
# "target" column, which is not requested for the test data.
train_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "target"],
)
test_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask"],
)

In [8]:
# Add a "hidden_state" column to every split, running the model 8 rows at a time.
train_hidden = train_encoded.map(
    extract_hidden_states,
    batched=True,
    batch_size=8,
)
test_hidden = test_encoded.map(
    extract_hidden_states,
    batched=True,
    batch_size=8,
)
# test_hidden
# train_hidden

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

## Plot

In [None]:
import numpy as np

# Features and targets for the training split ...
X_train, y_train = (
    np.array(train_hidden["train"][column])
    for column in ("hidden_state", "target")
)
# ... and for the held-out validation split (stored under the "test" key).
X_valid, y_valid = (
    np.array(train_hidden["test"][column])
    for column in ("hidden_state", "target")
)
# Test features only; no target column is read for the test data.
X_test = np.array(test_hidden["train"]["hidden_state"])

# Cell output: shapes of the two feature matrices.
X_train.shape, X_valid.shape

In [None]:
from umap.umap_ import UMAP
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Rescale every feature into the [0, 1] interval.
X_scaled = MinMaxScaler().fit_transform(X_train)

# Project the scaled features to 2-D with UMAP using cosine distance.
mapper = UMAP(n_components=2, metric="cosine")
mapper.fit(X_scaled)

# Collect the 2-D coordinates together with each row's target.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"]).assign(label=y_train)
df_emb.head()

In [None]:
import matplotlib.pyplot as plt

# One hexbin panel per class that actually occurs in the embedded data.
# (Previously only two panels were drawn, the panel title came from a
# hard-coded [1, 0] list while the rows were selected by the loop index, so
# titles and data did not match, and an earlier `labels` assignment was dead.)
labels = sorted(int(value) for value in df_emb["label"].unique())

# One colormap per panel; cycled if there are more classes than colormaps.
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples"]

# Keep the original 2.5 inches of height per panel.
fig, axes = plt.subplots(len(labels), 1, figsize=(5, 2.5 * len(labels)),
                         squeeze=False)
axes = axes.flatten()

for i, label in enumerate(labels):
    # Select rows by the class value itself so the title always matches the data.
    df_emb_sub = df_emb[df_emb["label"] == label]
    axes[i].hexbin(df_emb_sub["X"], df_emb_sub["Y"], cmap=cmaps[i % len(cmaps)],
                   gridsize=20, linewidths=(0,))
    axes[i].set_title(str(label))
    axes[i].set_xticks([])
    axes[i].set_yticks([])

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Class ids to show on the confusion-matrix axes, in display order.
labels = [1, 0]
def plot_confusion_matrix(y_preds, y_true, labels):
    """Plot a row-normalized confusion matrix.

    Args:
        y_preds: Predicted class ids.
        y_true: Ground-truth class ids.
        labels: Axis labels. When every entry is an integer they are treated
            as class ids and also fix the row/column order of the matrix, so
            the tick labels always match the data. Any other labels (e.g.
            class names) are display-only and are applied in the default
            sorted-class order, as before. ``None`` uses the defaults.
    """
    import numbers

    label_list = None if labels is None else list(labels)
    # Integer labels are class ids: pass them through so that an order such as
    # [1, 0] reorders the matrix itself instead of merely relabeling the ticks
    # of a matrix that is sorted ascending.
    labels_are_class_ids = bool(label_list) and all(
        isinstance(item, numbers.Integral) for item in label_list
    )
    cm = confusion_matrix(
        y_true,
        y_preds,
        labels=label_list if labels_are_class_ids else None,
        normalize="true",
    )
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()

# y_preds = lr_clf.predict(X_valid)
# plot_confusion_matrix(y_preds, y_valid, labels)

## Train

In [9]:
from transformers import AutoModelForSequenceClassification

# Number of target classes for the classification head.
num_labels = 5

# NOTE: this rebinds the global `model`. `extract_hidden_states` reads that
# same global, so it must not be re-run after this cell.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
)
model = model.to(device)

# model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; the training arguments below set
# push_to_hub=True.
notebook_login()

In [11]:
from transformers import Trainer, TrainingArguments

batch_size = 8
# Log roughly once per epoch. Clamp to 1 so that a training split smaller
# than one batch does not produce logging_steps == 0.
logging_steps = max(1, len(train_encoded["train"]) // batch_size)
model_name = f"./{model_ckpt}_"
# The deprecated per_gpu_*_batch_size arguments were dropped: they duplicated
# the per_device_* values below and only triggered deprecation warnings.
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=20,
                                  learning_rate=1e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True,
                                  log_level="error")

2024-06-30 11:35:19.436506: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-30 11:35:19.845532: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-30 11:35:20.813302: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.8/lib64:
2024-06-30 11:35:20.813496: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: ca

In [12]:
# Rename the target column to "label" for the Trainer cell below.
# Guarded so the cell is idempotent: re-running it after the rename has
# already happened would otherwise raise because "target" no longer exists.
if "target" in train_encoded["train"].column_names:
    train_encoded = train_encoded.rename_column("target", "label")

In [13]:
from transformers import Trainer


# Fine-tune on the training split and evaluate on the held-out split; the
# schedule and hyper-parameters come from `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded["train"],
    eval_dataset=train_encoded["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()
# try:
#     trainer.train()
# except:
#     del trainer
#     torch.cuda.empty_cache()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5767,1.555327,0.296296,0.211412
2,1.4922,1.475727,0.444444,0.346979
3,1.4169,1.43827,0.444444,0.336626
4,1.3061,1.374992,0.444444,0.336626
5,1.1467,1.302121,0.481481,0.40823
6,1.0784,1.228915,0.592593,0.57672
7,0.9278,1.13432,0.592593,0.577499
8,0.8355,1.040183,0.592593,0.58682
9,0.6831,0.968089,0.666667,0.667615
10,0.5515,0.922418,0.62963,0.621388


TrainOutput(global_step=280, training_loss=0.6409281034554754, metrics={'train_runtime': 148.1566, 'train_samples_per_second': 14.174, 'train_steps_per_second': 1.89, 'total_flos': 552548099174400.0, 'train_loss': 0.6409281034554754, 'epoch': 20.0})

In [15]:
import numpy as np

# Predict on the test data (stored under the "train" key of its DatasetDict)
# and take the highest-scoring class for every row.
preds_output = trainer.predict(test_encoded['train'])
y_preds = np.argmax(preds_output.predictions, axis=1)

# Cell output: how often each class was predicted. Without this final
# expression the cell ends in an assignment and displays nothing, so the
# (values, counts) output recorded below could not be reproduced.
np.unique(y_preds, return_counts=True)

(array([0, 1, 2, 3, 4]), array([ 9,  6, 17, 19,  5]))

In [23]:
from transformers import TextClassificationPipeline

# Wrap the fine-tuned model for inference. return_all_scores=True makes the
# pipeline return a score for every class, which the demo cell below relies on.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)



In [41]:
import emoji as emoji

# Class id (as a string) -> emoji, given either as a literal code point
# sequence or as a shortcode that `emoji.emojize` understands.
emoji_dictionary = {
    "0": "\u2764\uFE0F",
    "1": ":baseball:",
    "2": ":beaming_face_with_smiling_eyes:",
    "3": ":downcast_face_with_sweat:",
    "4": ":fork_and_knife:",
}

# Sanity check: render every entry, one per line.
print(*(emoji.emojize(code) for code in emoji_dictionary.values()), sep="\n")

❤️
⚾
😁
😓
🍴


In [49]:
text = 'I am the shit'

# Per-class score dicts for the single input sentence.
props = pipe(text)[0]
scores = [entry['score'] for entry in props]

# The position of the best score is used as the class id.
# NOTE(review): assumes the entries are ordered by class index - confirm.
argmax = np.argmax(scores)
emoji_text = emoji_dictionary[str(argmax)]

print(f'{text}: {emoji.emojize(emoji_text)}')

I am the shit: 😓
