In [1]:
from datasets import load_dataset

dataset = load_dataset("papluca/language-identification")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from model import LlamaClassificationModel

model_name = "meta-llama/Llama-2-7b-hf"
access_token = "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"
output_dir = 'OUTPUT'
llama_model = LlamaClassificationModel(model_name, '4b', access_token)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 70000
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 10000
    })
})

In [4]:
amazon_languages = ['en', 'de', 'fr', 'es', 'ja', 'zh']
xnli_languages = ['ar', 'el', 'hi', 'ru', 'th', 'tr', 'vi', 'bg', 'sw', 'ur']
stsb_languages = ['it', 'nl', 'pl', 'pt']

all_langs = sorted(list(set(amazon_languages + xnli_languages + stsb_languages)))

In [5]:
id2label = {idx: all_langs[idx] for idx in range(len(all_langs))}
label2id = {v: k for k, v in id2label.items()}
label2id

{'ar': 0,
 'bg': 1,
 'de': 2,
 'el': 3,
 'en': 4,
 'es': 5,
 'fr': 6,
 'hi': 7,
 'it': 8,
 'ja': 9,
 'nl': 10,
 'pl': 11,
 'pt': 12,
 'ru': 13,
 'sw': 14,
 'th': 15,
 'tr': 16,
 'ur': 17,
 'vi': 18,
 'zh': 19}

In [6]:
id2label

{0: 'ar',
 1: 'bg',
 2: 'de',
 3: 'el',
 4: 'en',
 5: 'es',
 6: 'fr',
 7: 'hi',
 8: 'it',
 9: 'ja',
 10: 'nl',
 11: 'pl',
 12: 'pt',
 13: 'ru',
 14: 'sw',
 15: 'th',
 16: 'tr',
 17: 'ur',
 18: 'vi',
 19: 'zh'}

In [7]:
dataset['train'][0]

{'labels': 'pt',
 'text': 'os chefes de defesa da estónia, letónia, lituânia, alemanha, itália, espanha e eslováquia assinarão o acordo para fornecer pessoal e financiamento para o centro.'}

In [8]:
def tokenize_text(sequence):
    """Tokenize input sequence."""
    return llama_model.tokenizer(sequence["text"], truncation=True, max_length=128)

In [9]:
tok_train = dataset['train'].map(tokenize_text, batched=True)
tok_valid = dataset['validation'].map(tokenize_text, batched=True)
tok_test = dataset['test'].map(tokenize_text, batched=True)

In [10]:
def encode_labels(example):
    """Map string labels to integers."""
    example["labels"] = label2id[example["labels"]]
    return example

In [11]:
tok_train = tok_train.map(encode_labels, batched=False)
tok_valid = tok_valid.map(encode_labels, batched=False)
tok_test = tok_test.map(encode_labels, batched=False)

Map: 100%|██████████| 10000/10000 [00:00<00:00, 21741.33 examples/s]


In [12]:
from statistics import mean, stdev

_len = [len(sample) for sample in tok_train['input_ids']]
avg_len, std_len = mean(_len), stdev(_len)
min_len, max_len = min(_len), max(_len)

print('-'*10 + ' Corpus statistics ' + '-'*10)
print(f'\nAvg. length: {avg_len:.1f} (std. {std_len:.1f})')
print('Min. length:', min_len)
print('Max. length:', max_len)

---------- Corpus statistics ----------

Avg. length: 55.9 (std. 39.0)
Min. length: 3
Max. length: 128


In [13]:
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification
# Use dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=llama_model.tokenizer)

In [14]:
llama_model.set_model(id2label, label2id, len(all_langs))

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from sklearn.metrics import f1_score, accuracy_score, classification_report

def compute_metrics(pred):
    """Custom metric to be used during training."""
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)  # Accuracy
    f1 = f1_score(labels, preds, average="weighted")  # F1-score
    return {
        "accuracy": acc,
        "f1": f1
        }

In [16]:
from transformers import TrainingArguments, Trainer

epochs = 2
lr = 2e-5
train_bs = 64
eval_bs = train_bs * 2

# Log training loss at each epoch
logging_steps = len(tok_train) // train_bs
# Out dir

training_args = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=epochs,
  learning_rate=lr,
  per_device_train_batch_size=train_bs,
  per_device_eval_batch_size=eval_bs,
  evaluation_strategy="epoch",
  logging_steps=logging_steps,
  fp16=True,  # Remove if GPU doesn't support it
)

In [17]:
llama_model.prepare_model()

from peft import get_peft_model, LoraConfig


def get_lora_config():
    lora_config = LoraConfig(r = 256, # attention heads
                    lora_alpha = 512, # alpha scaling
                    lora_dropout = 0.05,
                    bias = "all",
                    task_type = "CAUSAL_LM",
                     # set this for CLM or Seq2Seq
    )
    return lora_config
peft_model = get_peft_model(llama_model.model, get_lora_config())

In [18]:
trainer = Trainer(
  peft_model,
  training_args,
  compute_metrics=compute_metrics,
  train_dataset=tok_train,
  eval_dataset=tok_valid,
  data_collator=data_collator,
  tokenizer=llama_model.tokenizer,
)

In [19]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 344.00 MiB. GPU 0 has a total capacty of 11.75 GiB of which 212.31 MiB is free. Including non-PyTorch memory, this process has 11.43 GiB memory in use. Of the allocated memory 9.26 GiB is allocated by PyTorch, and 2.03 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
llama_model.model