In [1]:
# set run environment (local/colab), if colab move proper dir
import os
from pathlib import Path

if os.getenv("COLAB_RELEASE_TAG"):
    colab = True
    
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/Othercomputers/My computer/EQILLM/
    
    !pip install -r requirements.txt -q --exists-action i
    !pip install transformers[torch] -q --exists-action i
    !pip install accelerate -U -q --exists-action i
else:
    colab = False

import csv
import datetime
import evaluate
import glob
import itertools
import numpy as np
import openai
import pandas as pd
import torch, gc
import wandb

from datasets import Dataset, load_dataset
from dotenv import load_dotenv, dotenv_values
from huggingface_hub import login
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from tqdm.auto import tqdm
from tqdm.notebook import tqdm_notebook
from transformers import AutoModelForSequenceClassification, Trainer, BitsAndBytesConfig, AutoTokenizer, DataCollatorWithPadding

from eqillm import finetune, yeelight_eow_notification, param_combinations, load_PolarIs, split_ds, encode_labels, init_model

# Load key/value settings from the .env file in the working directory.
dotenv_config = dotenv_values('.env')

# Values read from .env are strings (or None for a key without a value), so a plain
# truth test would treat "False" or "0" as enabled. Interpret the flag explicitly;
# any other non-empty value still enables the notification, as it did before.
# The notification is only ever enabled when running on Colab.
_yeelight_flag = str(dotenv_config.get('YEELIGHT_NOTIFY') or '').strip().lower()
yeelight_notify = bool(colab) and _yeelight_flag not in {'', '0', 'false', 'no', 'off', 'n', 'f', 'none'}

# Debugging aid: makes CUDA kernel launches synchronous so that errors are reported
# at the call that caused them. NOTE(review): this slows training; consider removing
# it once the runs are stable.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Authenticate against the Hugging Face hub; raises KeyError if HF_TOKEN is not in .env.
login(token=dotenv_config['HF_TOKEN'])

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\Jakub\.cache\huggingface\token
Login successful


In [2]:
# models = ['albert', 'bart', 'bert', 'big_bird', 'bigbird_pegasus', 'biogpt', 'bloom', 'camembert', 'canine', 'code_llama', 'convbert', 'ctrl', 'data2vec-text', 'deberta', 'deberta-v2', 'distilbert', 'electra', 'ernie', 'ernie_m', 'esm', 'falcon', 'flaubert', 'fnet', 'funnel', 'gemma', 'gpt-sw3', 'gpt2', 'gpt_bigcode', 'gpt_neo', 'gpt_neox', 'gptj', 'ibert', 'layoutlm', 'layoutlmv2', 'layoutlmv3', 'led', 'lilt', 'llama', 'longformer', 'luke', 'markuplm', 'mbart', 'mega', 'megatron-bert', 'mistral', 'mixtral', 'mobilebert', 'mpnet', 'mpt', 'mra', 'mt5', 'mvp', 'nezha', 'nystromformer', 'open-llama', 'openai-gpt', 'opt', 'perceiver', 'persimmon', 'phi', 'plbart', 'qdqbert', 'qwen2', 'reformer', 'rembert', 'roberta', 'roberta-prelayernorm', 'roc_bert', 'roformer', 'squeezebert', 'stablelm', 'starcoder2', 't5', 'tapas', 'transfo-xl', 'umt5', 'xlm', 'xlm-roberta', 'xlm-roberta-xl', 'xlnet', 'xmod', 'yoso',]
# =============================================
# works = [']
# ----------
# too weak PC (model cannot be loaded, or runs at < 1 it/s): 'albert-xlarge-v1', 't5-11B', 'LongformerForSequenceClassification', 'xlm-roberta-base', 'allenai/longformer-base-4096', 'facebook/bart-large', 'flaubert/flaubert_large_cased' (stopped at 0.5 epoch)
# ----------
# needs changes to run (check error logs): 'ProsusAI/finbert' (still fails despite adding the padding proposed in the error message)
# =============================================
# 'xlnet-base-cased' - needs data collator

# , 'camembert-base', 'ctrl', 'xlnet-base-cased', 'roberta-base', 'distilroberta-base', 'flaubert/flaubert_base_cased'

# deprecated: transfo-xl-wt103


# ['michellejieli/emotion_text_classifier', 'xlnet-base-cased', 'roberta-base', 'distilroberta-base', 'flaubert/flaubert_base_cased']



# to_solve


In [5]:
MAX_NEW_TOKENS = 256

# Hyper-parameter grid handed to param_combinations(). Judging by the printed result,
# list-valued entries are expanded into one run per element, while scalar and tuple
# values are copied unchanged into every run.
params_tested = {
    # Pre-trained model names from the Hugging Face hub used for fine-tuning.
    'model_name': [
        'microsoft/phi-2',
        'google/gemma-2b',
        'mistralai/Mistral-7B-v0.1',
        # Currently disabled candidates (duplicates of the original list removed):
        # 'cardiffnlp/twitter-roberta-base-irony',
        # 'nlptown/bert-base-multilingual-uncased-sentiment',
        # 'michellejieli/emotion_text_classifier',
        # 'j-hartmann/emotion-english-distilroberta-base',
        # 'roberta-base',
        # 'lxyuan/distilbert-base-multilingual-cased-sentiments-student',
        # 'ProsusAI/finbert',
        # 'cardiffnlp/twitter-roberta-base-sentiment-latest',
        # 'distilbert/distilbert-base-uncased-finetuned-sst-2-english',
        # 'joeddav/distilbert-base-uncased-go-emotions-student',
        # 'camembert-base',
        # 'ctrl',
        # 'papluca/xlm-roberta-base-language-detection',
        # 'meta-llama/Meta-Llama-3-8B',
        # 'arpanghoshal/EmoRoBERTa',
        # 'xlnet-base-cased',
        # 'distilroberta-base',
        # 'flaubert/flaubert_base_cased',
        # 'celine98/canine-s-finetuned-sst2',
        # 'lytang/MiniCheck-Flan-T5-Large',
        # 'michelecafagna26/t5-base-finetuned-sst2-sentiment',
        # 'facebook/tart-full-flan-t5-xl',
        # 'cardiffnlp/twitter-xlm-roberta-base-sentiment',
    ],
    # Number of passes over the entire training dataset.
    'num_train_epochs': 10,
    # When model checkpoints are saved ('epoch' or 'no').
    'save_strategy': 'epoch',
    # Samples per training step (8 or 16 worked best for the author; 16 is faster,
    # but a linear drop in speed during fine-tuning may be observed).
    'per_device_train_batch_size': 16,
    # Samples per evaluation step.
    'per_device_eval_batch_size': 16,
    # Proportions of the training, testing and (optionally) validation sets.
    # Use 'balanced' for equal class representation in the validation set.
    # Examples: (90, 10) -> train/test; (80, 10, 10) -> train/test/validate.
    'split': (0.8, 0.1, 0.1),
    # Whether the task is binary (two classes) or multi-class classification.
    'binary': [True],
    # When enabled, the labels used for training are split evenly, fitted to the lowest
    # label count: n samples (80% of the least represented label) are taken from each
    # label and the rest is used for testing.
    'balanced': [False],
    'learning_rate': [
        # 1e-3,
        1e-4,
        # 1e-5,
    ],
}

# Controls whether to save logs during a process. When set to False, logging is disabled.
save_logs = True

# Expand the grid into the list of concrete runs and show them, one per line.
train_params_looped = param_combinations(params_tested)
print(len(train_params_looped))
print('\n'.join(str(run_params) for run_params in train_params_looped))

3
{'model_name': 'microsoft/phi-2', 'num_train_epochs': 10, 'save_strategy': 'epoch', 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'split': (0.8, 0.1, 0.1), 'binary': True, 'balanced': False, 'learning_rate': 0.0001}
{'model_name': 'google/gemma-2b', 'num_train_epochs': 10, 'save_strategy': 'epoch', 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'split': (0.8, 0.1, 0.1), 'binary': True, 'balanced': False, 'learning_rate': 0.0001}
{'model_name': 'mistralai/Mistral-7B-v0.1', 'num_train_epochs': 10, 'save_strategy': 'epoch', 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'split': (0.8, 0.1, 0.1), 'binary': True, 'balanced': False, 'learning_rate': 0.0001}


In [6]:
from transformers import TrainingArguments

data_path = 'data/PolarIs-Pathos.xlsx'

# The W&B project name does not change between runs, so set it once before the sweep.
os.environ["WANDB_PROJECT"] = "pathos-recognition-binary"

# Fine-tune once per parameter combination.
for train_params in tqdm(train_params_looped):
    try:
        # Load the dataset (binary or multi-class labels), encode the labels and split it.
        df = load_PolarIs(data_path, train_params['binary'])
        df, target_map = encode_labels(df)
        ds = Dataset.from_pandas(df)
        # NOTE(review): these sizes are hard-coded and merely coincide with the current
        # train_params['split'] value; changing 'split' in the grid has no effect on this
        # call. Confirm split_ds' contract before wiring the two together.
        ds = split_ds(ds, train_size=0.8, val_size=0.1)

        finetune(ds, train_params, target_map, colab)
    finally:
        # Free memory left over from this run before the next (possibly larger) model is
        # loaded; leftover GPU allocations can push the next model onto CPU/disk offload.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

# Optional end-of-work notification, sent only after every run has finished successfully.
if yeelight_notify:
    yeelight_eow_notification(dotenv_config['YEELIGHT_PORT'])

  0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/12470 [00:00<?, ? examples/s]

Map:   0%|          | 0/1558 [00:00<?, ? examples/s]

Map:   0%|          | 0/1560 [00:00<?, ? examples/s]

torch.float16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model prepared
Model perfed
trainable params: 2,298,880 || all params: 2,650,864,640 || trainable%: 0.0867218931254068
cuda:0
Model to cuda




Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.7052,0.636493,0.479491,0.686235,0.56453,0.664313
2,0.6114,0.627146,0.497592,0.62753,0.555058,0.681001
3,0.5839,0.627561,0.486172,0.676113,0.565622,0.670732
4,0.5749,0.628731,0.530142,0.605263,0.565217,0.70475
5,0.5656,0.627373,0.503704,0.688259,0.581694,0.686136
6,0.5515,0.653796,0.572104,0.489879,0.527808,0.72208
7,0.5436,0.625058,0.520868,0.631579,0.570906,0.698973
8,0.5329,0.640612,0.528626,0.560729,0.544204,0.702182
9,0.5359,0.630436,0.493469,0.688259,0.57481,0.67715
10,0.5246,0.627328,0.531624,0.629555,0.57646,0.706675




VBox(children=(Label(value='0.032 MB of 0.067 MB uploaded\r'), FloatProgress(value=0.47331156993259, max=1.0))…

0,1
eval/accuracy,█▁▃▂▆▄█▅▆▃▆
eval/f1-score,▁▇▇▇▇█▆█▇██
eval/loss,█▁▁▁▁▁▂▁▁▁▁
eval/precision,█▁▂▁▃▂▅▃▃▂▃
eval/recall,▁█▇█▇█▅▇▆█▇
eval/runtime,█▁▅▆▃▆▆▅▃▃▅
eval/samples_per_second,▁█▃▃▆▃▃▄▆▆▄
eval/steps_per_second,▁█▃▃▆▃▃▄▆▆▄
train/epoch,▁▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇███

0,1
eval/accuracy,0.70668
eval/f1-score,0.57646
eval/loss,0.62733
eval/precision,0.53162
eval/recall,0.62955
eval/runtime,46.6817
eval/samples_per_second,33.375
eval/steps_per_second,2.099
total_flos,9.007685183195136e+16
train/epoch,10.0


Map:   0%|          | 0/12470 [00:00<?, ? examples/s]

Map:   0%|          | 0/1558 [00:00<?, ? examples/s]

Map:   0%|          | 0/1560 [00:00<?, ? examples/s]

torch.float16


Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model prepared
Model perfed
trainable params: 234,496 || all params: 2,506,411,008 || trainable%: 0.009355847833876095
cuda:0
Model to cuda


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.6677,0.598248,0.566735,0.558704,0.562691,0.724647
2,0.5456,0.596088,0.508499,0.726721,0.598333,0.690629
3,0.5043,0.618189,0.474138,0.779352,0.589587,0.655969
4,0.426,0.65098,0.545769,0.639676,0.589003,0.716945
5,0.3577,0.770778,0.550201,0.554656,0.552419,0.715019
6,0.2396,1.298458,0.577215,0.461538,0.512936,0.72208
7,0.1869,1.336848,0.505068,0.605263,0.550645,0.686778
8,0.136,2.207862,0.564286,0.479757,0.5186,0.717587
9,0.143,2.832149,0.559242,0.477733,0.515284,0.715019
10,0.1033,3.541602,0.504604,0.554656,0.528447,0.686136




VBox(children=(Label(value='0.032 MB of 0.053 MB uploaded (0.004 MB deduped)\r'), FloatProgress(value=0.604796…

0,1
eval/accuracy,█▅▁▇▇█▄▇▇▄
eval/f1-score,▅█▇▇▄▁▄▁▁▂
eval/loss,▁▁▁▁▁▃▃▅▆█
eval/precision,▇▃▁▆▆█▃▇▇▃
eval/recall,▃▇█▅▃▁▄▁▁▃
eval/runtime,▁▅▄▄▄▄██▇▇
eval/samples_per_second,█▄▅▅▅▅▁▁▂▂
eval/steps_per_second,█▄▅▅▅▅▁▁▂▂
train/epoch,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇███
train/global_step,▁▁▁▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▇▇▇▇███

0,1
eval/accuracy,0.68614
eval/f1-score,0.52845
eval/loss,3.5416
eval/precision,0.5046
eval/recall,0.55466
eval/runtime,35.0134
eval/samples_per_second,44.497
eval/steps_per_second,2.799
total_flos,7.004989208604672e+16
train/epoch,10.0


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


Map:   0%|          | 0/12470 [00:00<?, ? examples/s]

Map:   0%|          | 0/1558 [00:00<?, ? examples/s]

Map:   0%|          | 0/1560 [00:00<?, ? examples/s]

torch.float16




ValueError: 
                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules
                    in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to
                    `from_pretrained`. Check
                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                    for more details.
                    

In [None]:
# import datasets
# from transformers import pipeline
# from transformers.pipelines.pt_utils import KeyDataset
# from tqdm.auto import tqdm
#
# pipe = pipeline("text-classification", model="output/models/google/gemma-2b_2024-04-27_00-13/checkpoints/checkpoint-140290", device=0)
# valds = df_to_ds(polarIs_df)[0]['validate']
# predicted = [i['label'] for i in pipe(valds['text'])]