## Let's build a Transformer classifier using BERT as a feature extractor

In [None]:
# Install the Hugging Face libraries that the import cell of this notebook uses.
# NOTE(review): versions are unpinned; the recorded output resolved
# transformers 4.25.1 and evaluate 0.4.0 - consider pinning for reproducibility.
!pip install transformers
!pip install evaluate 
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 84.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 83.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 4.1 MB/s 


In [None]:
from transformers import AutoTokenizer
from transformers import AutoModel, TrainingArguments, Trainer, AutoConfig
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
import os
import pandas as pd
from datasets import Dataset
import torch

In [None]:
# Checkpoint name passed to both AutoTokenizer and AutoModel later in this notebook.
weights = "bert-base-german-cased"

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [None]:
# Create the folder the split CSVs are read from. Pure-Python replacement for
# `mkdir -p`: works on any OS, and exist_ok=True keeps the cell safe to re-run.
os.makedirs("data/processed/raw", exist_ok=True)

In [None]:
# Folder that holds one CSV per dataset split (<split>.csv)
BASE_PATH = "./data/processed/raw"


def get_split(split, base_path=BASE_PATH):
    """Load one dataset split as a DataFrame.

    Args:
        split: Name of the split; the file read is ``<split>.csv``.
        base_path: Directory containing the split files.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_path = os.path.join(base_path, f"{split}.csv")
    return pd.read_csv(csv_path)

In [None]:
# Class index <-> class name lookups; the reverse map is derived from the
# forward one so the two cannot drift apart.
idx_to_label = dict(enumerate(("tech", "soft", "none")))
label_to_idx = {name: idx for idx, name in idx_to_label.items()}

In [None]:
# Load the training split and build the frame in one non-mutating chain
# (no inplace rename on a column subset, so no SettingWithCopyWarning risk).
# NOTE: despite its name, `label_id` keeps the string label from the CSV;
# `label` is the integer class index looked up in `label_to_idx`.
train_df = (
    get_split(split="training")[['text', 'label']]
    .rename(columns={'label': 'label_id'})
    # An unknown label raises KeyError here instead of silently becoming NaN.
    .assign(label=lambda frame: frame['label_id'].map(lambda name: label_to_idx[name]))
)
train_df.head()

Unnamed: 0,text,label_id,label
0,Wenn Ihnen eine angenehme Arbeitsatmosphäre mi...,none,2
1,***Abitur / Fachhochschulreife,none,2
2,Routinierter Umgang mit MS Office-Anwendungen,tech,0
3,"Standort univativ GmbH Lavesstr. 80, 30159 Han...",none,2
4,"Führerschein der Klasse B, wünschenswert BE, C...",soft,1


In [None]:
# Load the dev split and build the frame in one non-mutating chain
# (no inplace rename on a column subset, so no SettingWithCopyWarning risk).
# NOTE: despite its name, `label_id` keeps the string label from the CSV;
# `label` is the integer class index looked up in `label_to_idx`.
dev_df = (
    get_split(split="dev")[['text', 'label']]
    .rename(columns={'label': 'label_id'})
    # An unknown label raises KeyError here instead of silently becoming NaN.
    .assign(label=lambda frame: frame['label_id'].map(lambda name: label_to_idx[name]))
)
dev_df.head()

Unnamed: 0,text,label_id,label
0,Assistenz der Geschäftsführung m/w,none,2
1,Detaillierte Kenntnisse des Java Memory Models.,tech,0
2,Umzugsbereitschaft in Richtung Osnabrück (Wohn...,soft,1
3,Ihr Profil:,none,2
4,"Kommunikationsstärke, selbstbewusstes Auftrete...",soft,1


In [None]:
# Load the test split and build the frame in one non-mutating chain
# (no inplace rename on a column subset, so no SettingWithCopyWarning risk).
# NOTE: despite its name, `label_id` keeps the string label from the CSV;
# `label` is the integer class index looked up in `label_to_idx`.
test_df = (
    get_split(split="test")[['text', 'label']]
    .rename(columns={'label': 'label_id'})
    # An unknown label raises KeyError here instead of silently becoming NaN.
    .assign(label=lambda frame: frame['label_id'].map(lambda name: label_to_idx[name]))
)
test_df.head()

Unnamed: 0,text,label_id,label
0,Umfangreiche Kenntnisse mind. eines der CAD-Sy...,tech,0
1,Anschreiben:,none,2
2,Sehr gute Kenntnisse der Office-Anwendungen.,tech,0
3,Sie verfügen über ein hohes Maß an Leistungsfä...,soft,1
4,Wenn Sie für Ihre effiziente und umsichtige Ar...,none,2


In [None]:
# Wrap each pandas split in a Hugging Face Dataset (order: train, dev, test).
train_data, dev_data, test_data = [
    Dataset.from_pandas(split_df) for split_df in (train_df, dev_df, test_df)
]

In [None]:
# Peek at the first training example to confirm which columns the Dataset carries.
train_data[0]

{'text': 'Wenn Ihnen eine angenehme Arbeitsatmosphäre mit viel Enthusiasmus viel Wert ist, freuen wir uns auf Ihre Bewerbung per E-Mail an recruiting@valyue.de und das persönliche Gespräch mit Ihnen. Frau Asimina Kafida beantwortet Ihre Fragen gerne vorab unter Telefon +49.711.627676-13. Valyue Consulting GmbH www.valyue.de/karriere',
 'label_id': 'none',
 'label': 2}

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` column of a batch with the notebook's tokenizer.

    Relies on the module-level ``tokenizer``, which must exist by the time the
    function is called. Truncation and padding are both enabled, so every
    sequence returned for one call has the same length.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Load the tokenizer that belongs to the checkpoint named in `weights`.
tokenizer = AutoTokenizer.from_pretrained(weights)

In [None]:
# Input names this tokenizer declares; get_hidden_states uses this list to
# decide which batch entries are forwarded to the model.
tokenizer.model_input_names

['input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Load the encoder through AutoModel and move it to `device`. The recorded
# warning for this cell lists the `cls.*` checkpoint weights that BertModel
# does not use.
model = AutoModel.from_pretrained(weights).to(device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Turn a tokenized batch into fixed-size feature vectors
def get_hidden_states(batch):
    """Run the encoder on a batch and return the first-token embedding.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: Mapping of column name to tensor. Only entries whose name is in
            ``tokenizer.model_input_names`` are sent to the model.

    Returns:
        ``{"hidden_state": array}`` where the array is the last hidden state at
        sequence position 0 (the CLS token), moved to the CPU as NumPy.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        encoder_output = model(**model_inputs)

    cls_embedding = encoder_output.last_hidden_state[:, 0]
    return {"hidden_state": cls_embedding.cpu().numpy()}

In [None]:
# Tokenize every split batch-wise (order: train, dev, test).
train_tokenized, dev_tokenized, test_tokenized = [
    split_data.map(preprocess_function, batched=True)
    for split_data in (train_data, dev_data, test_data)
]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Expose only these columns, as torch tensors, on every tokenized split.
TENSOR_COLUMNS = ("input_ids", "attention_mask", "label")
for tokenized_split in (train_tokenized, dev_tokenized, test_tokenized):
    # A fresh list per call so the splits never share one columns object.
    tokenized_split.set_format("torch", columns=list(TENSOR_COLUMNS))

In [None]:
def _embed(tokenized_split):
    """Add a `hidden_state` column by running get_hidden_states batch-wise."""
    return tokenized_split.map(get_hidden_states, batched=True)


train_hidden = _embed(train_tokenized)
dev_hidden = _embed(dev_tokenized)
test_hidden = _embed(test_tokenized)

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## Create Feature Matrix to train the model

In [None]:
def _features_and_labels(hidden_split):
    """Return (features, labels) NumPy arrays for one split."""
    features = np.array(hidden_split["hidden_state"])
    labels = np.array(hidden_split["label"])
    return features, labels


X_train, y_train = _features_and_labels(train_hidden)
X_valid, y_valid = _features_and_labels(dev_hidden)
X_test, y_test = _features_and_labels(test_hidden)
X_train.shape, X_valid.shape, X_test.shape
     

((6975, 768), (1532, 768), (1502, 768))

In [None]:
# Persist the features and labels of each split so they can be re-used later.
# Both arrays go into the same <split>.npy file, features first and labels
# second, so they must be read back through an open file handle in that order.
arrays_by_split = {
    "training": (X_train, y_train),
    "test": (X_test, y_test),
    "dev": (X_valid, y_valid),
}
for split, (features, labels) in arrays_by_split.items():
    out_path = f'{split}.npy'
    with open(out_path, 'wb') as sink:
        for array in (features, labels):
            np.save(sink, array)
    # Round-trip check: reload both arrays and show their shapes.
    with open(out_path, 'rb') as source:
        restored_features = np.load(source)
        restored_labels = np.load(source)
        print(restored_features.shape, restored_labels.shape)

(6975, 768) (6975,)
(1502, 768) (1502,)
(1532, 768) (1532,)


In [None]:
# Bundle the saved arrays into one gzip-compressed archive.
# NOTE: `*.npy` matches every .npy file in the working directory, not only
# the three files written by the previous cell.
!tar -czvf dataset_nparray.tar.gz *.npy

dev.npy
test.npy
training.npy
