In [1]:
! pip install transformers
! pip install datasets
! pip install adapter-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m90.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [3]:
import transformers
import torch

In [4]:
# Quick sanity check: does PyTorch see a CUDA device in this runtime?
torch.cuda.is_available()

True

In [5]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if gpu_present else "cpu")

# Report what was selected so the choice is visible in the cell output.
if not gpu_present:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [6]:
import pandas as pd
from google.colab import files
# Open Colab's upload dialog. The result is indexed by file name in the next
# cell to obtain the uploaded file's contents.
uploaded = files.upload()

Saving naics_desc_data_2022.xlsx to naics_desc_data_2022.xlsx


In [7]:
import io
# Wrap the uploaded file contents in an in-memory buffer and parse it as an
# Excel workbook.
workbook_buffer = io.BytesIO(uploaded['naics_desc_data_2022.xlsx'])
dataset = pd.read_excel(workbook_buffer)

In [8]:
# Preview the first rows to confirm the expected columns were loaded.
dataset.head()

Unnamed: 0,naics,examples
0,482112,Beltline railroads
1,482112,"Freight railways, short-line or beltline"
2,482112,Logging railroads
3,482112,"Railroad transportation, short-line or beltline"
4,482112,"Railroads, short-line or beltline"


In [9]:
# Pull the model inputs (free-text examples) and the targets (NAICS codes)
# out of the frame as plain arrays.
sentences = dataset["examples"].to_numpy()
labels = dataset["naics"].to_numpy()

In [10]:
from sklearn import preprocessing
# Fit an encoder that maps each distinct NAICS code to a contiguous integer id.
# `le` is reused later to recover the original codes via `le.classes_`.
le = preprocessing.LabelEncoder()
le.fit(labels)

In [11]:
# Encode every label with the fitted encoder; these ids are the training targets.
y_enc = le.transform(labels)

In [12]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the examples for testing; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(
    sentences,
    y_enc,
    test_size=0.1,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [18]:
from transformers import BertTokenizer

# Fetch the tokenizer that matches the uncased BERT-large checkpoint used for
# the model further down.
print('Loading BERT tokenizer...')
tokenizer_options = {'do_lower_case': True}
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', **tokenizer_options)

Loading BERT tokenizer...


In [19]:
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset

# Training split as a two-column frame: raw text and its encoded class id.
df1 = pd.DataFrame({'sentences': list(X_train), 'labels': list(y_train)})

# Convert to a Hugging Face dataset.
# The previous intermediate `dataset = ds.dataset(...)` assignment was never
# read and overwrote the source DataFrame that is also called `dataset`, which
# broke re-running the earlier cells; it has been removed.
hg_dataset1 = Dataset.from_pandas(df1)

In [20]:
# Held-out split as a two-column frame: raw text and its encoded class id.
df2 = pd.DataFrame({'sentences': list(X_test), 'labels': list(y_test)})

# Convert to a Hugging Face dataset.
# The previous intermediate `dataset = ds.dataset(...)` assignment was never
# read and overwrote the source DataFrame that is also called `dataset`, which
# broke re-running the earlier cells; it has been removed.
hg_dataset2 = Dataset.from_pandas(df2)

In [21]:
import datasets
# Bundle both splits into one DatasetDict so they can be tokenized and
# formatted together under the "train" / "test" keys.
dd = datasets.DatasetDict({"train":hg_dataset1,"test":hg_dataset2})

In [22]:
def encode_batch(batch):
  """Tokenize the "sentences" field of a batch with the notebook's tokenizer.

  Each sequence is truncated and padded to exactly 256 tokens, so every
  encoded example has the same length.
  """
  texts = batch["sentences"]
  tokenizer_options = dict(max_length=256, truncation=True, padding="max_length")
  return tokenizer(texts, **tokenizer_options)

# Tokenize both splits, processing the examples in batches.
dd = dd.map(encode_batch, batched=True)
# The target column is already called "labels", so no renaming is needed.
# Expose only the columns the model consumes, returned as PyTorch tensors.
model_columns = ["input_ids", "attention_mask", "labels"]
dd.set_format(type="torch", columns=model_columns)

Map:   0%|          | 0/17930 [00:00<?, ? examples/s]

Map:   0%|          | 0/1993 [00:00<?, ? examples/s]

In [23]:
from transformers import BertConfig, BertModelWithHeads

# Map each encoded class id back to its original NAICS code (as a string),
# and build the inverse map so the two stay consistent in the saved config.
id2label = {class_id: str(code) for class_id, code in enumerate(le.classes_)}
label2id = {name: class_id for class_id, name in id2label.items()}

# Start from the pretrained BERT-large configuration and attach both label
# maps; passing only `id2label` would leave `label2id` at its default.
config = BertConfig.from_pretrained(
    "bert-large-uncased",
    id2label=id2label,
    label2id=label2id,
)
# Load the pretrained encoder weights; prediction heads are added separately.
model = BertModelWithHeads.from_pretrained(
    "bert-large-uncased",
    config=config,
)




Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Adapter setup is intentionally left disabled below, so the whole model is
# fine-tuned (the training log reports ~337M trainable parameters).
# from transformers.adapters import AdapterConfig,PfeifferConfig
# config = PfeifferConfig()
# config = AdapterConfig(mh_adapter=True, output_adapter=True, reduction_factor=16, non_linearity="relu")
# model.add_adapter("naics", config=PfeifferConfig())
# Move the model to the device selected earlier; as the last expression of the
# cell, the returned module is also displayed.
model.to(device)

BertModelWithHeads(
  (shared_parameters): ModuleDict()
  (bert): BertModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(
                in_features=1024, out_features=1024, bias=True
                (loras): ModuleDict()
              )
              (key): Linear(
                in_features=1024, out_features=1024, bias=True
                (loras): ModuleDict()
              )
              (value): Linear(
                in_features=1024

In [25]:
# Attach a classification head named "cb" with one output per NAICS class.
# NOTE(review): this head is created after `model.to(device)` ran in the
# previous cell — presumably the Trainer moves the full model to the device
# again before training; confirm.
model.add_classification_head("cb", num_labels=len(id2label))
# Adapter-only training stays disabled (see the commented-out adapter setup).
# model.train_adapter("naics")
# model.to(device)

In [26]:
import numpy as np
from transformers import TrainingArguments, Trainer, EvalPrediction

# Hyper-parameters for the fine-tuning run.
# NOTE(review): the evaluate() output in this notebook contains no eval_loss or
# eval_acc, which suggests the Trainer is not detecting the "labels" column for
# this model class. Passing label_names=["labels"] here may fix that, but it
# would also change the structure of trainer.predict() output that later cells
# index with [1] — confirm and update those cells together.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./training_output",
    # overwrite_output_dir=True,
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=5e-5,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log the training loss every 200 steps.
    logging_steps=200,
    # Keep all dataset columns so the labels reach the model's forward pass.
    remove_unused_columns=False,
)

def compute_accuracy(p: "EvalPrediction"):
  """Compute top-1 accuracy for a batch of evaluation predictions.

  Args:
    p: Object exposing `predictions` (per-class scores, or a tuple whose
      first element holds them) and `label_ids` (true class ids).

  Returns:
    A dict with a single "acc" entry: the fraction of rows whose
    highest-scoring class equals the true label.
  """
  # Predictions can arrive as a tuple of arrays when the model emits extra
  # outputs; take the first element, as the Hugging Face example scripts do,
  # instead of failing inside argmax.
  scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
  preds = np.argmax(scores, axis=1)
  return {"acc": (preds == p.label_ids).mean()}

# Wire the model, hyper-parameters, data splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dd["train"],
    # NOTE(review): the held-out "test" split doubles as the evaluation set.
    eval_dataset=dd["test"],
    compute_metrics=compute_accuracy,
)

In [27]:
# Run the full fine-tuning loop. This is the long-running cell: the recorded
# run reports a train_runtime of about 12,088 seconds.
trainer.train()

***** Running training *****
  Num examples = 17930
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5605
  Number of trainable parameters = 337226738


Step,Training Loss
200,6.5566
400,6.0515
600,5.4344
800,4.6873
1000,4.071
1200,3.5253
1400,3.0773


Saving model checkpoint to ./training_output/checkpoint-500
Configuration saved in ./training_output/checkpoint-500/config.json
Configuration saved in ./training_output/checkpoint-500/generation_config.json
Model weights saved in ./training_output/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./training_output/checkpoint-1000
Configuration saved in ./training_output/checkpoint-1000/config.json
Configuration saved in ./training_output/checkpoint-1000/generation_config.json
Model weights saved in ./training_output/checkpoint-1000/pytorch_model.bin


Step,Training Loss
200,6.5566
400,6.0515
600,5.4344
800,4.6873
1000,4.071
1200,3.5253
1400,3.0773
1600,2.7967
1800,2.5948
2000,2.3187


Saving model checkpoint to ./training_output/checkpoint-1500
Configuration saved in ./training_output/checkpoint-1500/config.json
Configuration saved in ./training_output/checkpoint-1500/generation_config.json
Model weights saved in ./training_output/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./training_output/checkpoint-2000
Configuration saved in ./training_output/checkpoint-2000/config.json
Configuration saved in ./training_output/checkpoint-2000/generation_config.json
Model weights saved in ./training_output/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./training_output/checkpoint-2500
Configuration saved in ./training_output/checkpoint-2500/config.json
Configuration saved in ./training_output/checkpoint-2500/generation_config.json
Model weights saved in ./training_output/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./training_output/checkpoint-3000
Configuration saved in ./training_output/checkpoint-3000/config.json
Configuration s

TrainOutput(global_step=5605, training_loss=2.1676409488698396, metrics={'train_runtime': 12087.693, 'train_samples_per_second': 7.417, 'train_steps_per_second': 0.464, 'total_flos': 4.20606299198976e+16, 'train_loss': 2.1676409488698396, 'epoch': 5.0})

In [37]:
# Evaluate the fine-tuned model on the held-out split.
trainer.evaluate(eval_dataset=dd["test"])

***** Running Evaluation *****
  Num examples = 1993
  Batch size = 16


{'eval_runtime': 90.2955,
 'eval_samples_per_second': 22.072,
 'eval_steps_per_second': 1.384,
 'epoch': 5.0}

In [54]:
# Peek at the true (encoded) class ids of the held-out split.
dd["test"]["labels"]

tensor([873, 772, 484,  ..., 197, 777, 515])

In [69]:
# Run inference over the held-out split and keep the raw prediction output.
preds = trainer.predict(test_dataset=dd["test"])

***** Running Prediction *****
  Num examples = 1993
  Batch size = 16


In [90]:
# NOTE(review): assumes preds.predictions is a sequence whose element at
# index 1 holds the per-class scores — confirm against the Trainer setup.
class_scores = preds.predictions[1]
np.argmax(class_scores, axis=1)

array([760, 772, 482, ..., 197, 777, 515])

In [89]:
# Top-1 accuracy on the held-out split: fraction of rows whose highest-scoring
# class matches the true encoded label.
predicted_ids = np.argmax(preds.predictions[1], axis=1)
true_ids = dd["test"]["labels"].numpy()
(predicted_ids == true_ids).mean()


0.7471149021575514

In [92]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split.
# Many classes have no true or no predicted samples here, which makes some
# metrics undefined. zero_division=0 reports those as 0 (the value already
# shown in the table) without emitting one warning per undefined metric.
y_true = dd["test"]["labels"].numpy()
y_pred = np.argmax(preds.predictions[1], axis=1)
print(classification_report(y_true, y_pred, zero_division=0))

              precision    recall  f1-score   support

           2       1.00      0.50      0.67         2
           7       0.00      0.00      0.00         1
           9       0.60      1.00      0.75         3
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         2
          18       0.33      1.00      0.50         3
          20       1.00      0.80      0.89         5
          21       0.00      0.00      0.00         2
          22       0.50      1.00      0.67         1
          26       1.00      1.00      1.00         1
          29       0.25      0.25      0.25         4
          30       0.25      1.00      0.40         1
          31       0.00      0.00      0.00         2
          33       0.00      0.00      0.00         1
          37       0.00      0.00      0.00         1
          38       0.00      0.00      0.00         1
          39       0.00      0.00      0.00         0
          41       0.50    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
# Bundle one saved checkpoint directory into model.zip.
# NOTE(review): the training log reports 5605 optimization steps, so
# checkpoint-5500 is not the final set of weights — confirm this is the
# checkpoint intended for export.
!zip -r model.zip training_output/checkpoint-5500

updating: training_output/checkpoint-5500/ (stored 0%)
  adding: training_output/checkpoint-5500/scheduler.pt (deflated 49%)
  adding: training_output/checkpoint-5500/generation_config.json (deflated 8%)
  adding: training_output/checkpoint-5500/rng_state.pth (deflated 28%)
  adding: training_output/checkpoint-5500/trainer_state.json (deflated 76%)
  adding: training_output/checkpoint-5500/pytorch_model.bin (deflated 7%)
  adding: training_output/checkpoint-5500/config.json (deflated 80%)
  adding: training_output/checkpoint-5500/training_args.bin (deflated 48%)
  adding: training_output/checkpoint-5500/optimizer.pt (deflated 15%)
