In [1]:
import pandas as pd
import pickle
import evaluate
import numpy as np
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from functions_variables import *
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
from huggingface_hub import HfApi, HfFolder
from datasets import load_dataset
from transformers import pipeline

## Enhancing Model Using Transfer Learning

<p>The Hugging Face Trainer would not run unless I provided an API key, so the next few cells read the key from an environment variable and register it with the Hugging Face Hub.</p>

In [2]:
# Read the Hugging Face token from the environment so it never appears in the notebook.
API_KEY = os.getenv("HUG_FACE_API_KEY")

# Report only whether a token was found, never the token itself.
print("API key successfully retrieved." if API_KEY else "API key not found.")

API key successfully retrieved.


In [3]:
# Fail fast when the token is missing; the cells below pass it to the Hugging Face Hub.
# The message names the variable that is actually read (HUG_FACE_API_KEY).
if not API_KEY:
    raise ValueError("API key not found. Please set the HUG_FACE_API_KEY environment variable.")

In [4]:
# Store the token in the local Hugging Face credential file for later Hub calls.
HfFolder.save_token(API_KEY)

# Initialize the API with the token
api = HfApi()

# Load the dataset.
# `token=` replaces the deprecated `use_auth_token=` keyword of `load_dataset`.
data = load_dataset("imdb", token=API_KEY)
data



DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
# Use the tokenizer published with the checkpoint we fine-tune, so the text is
# encoded the way the pretrained model expects.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


def tokenize_batch(batch):
    """Run preprocess_function (from functions_variables) on one batch of examples."""
    return preprocess_function(tokenizer, batch)


# batched=True hands groups of examples to the function instead of single rows.
tokenized_imdb = data.map(tokenize_batch, batched=True)

In [6]:
# Collator built from the same tokenizer used above.
# Per the transformers documentation it pads each batch dynamically when batches are formed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# Load the "accuracy" metric from the `evaluate` library.
# NOTE(review): compute_metrics is not defined in this notebook (presumably it comes
# from functions_variables) — confirm whether it actually uses this object.
accuracy = evaluate.load("accuracy")

In [8]:
# Readable names for the two classes; the reverse map is derived from the
# forward map so the two cannot drift apart.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained DistilBERT checkpoint with a sequence-classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Baseline configuration: a single epoch, checkpoints written locally only.
training_args = TrainingArguments(
    # where checkpoints are written
    output_dir="../data/my_model",
    # optimisation schedule
    num_train_epochs=1,
    learning_rate=2e-5,
    # batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # keep this run off the Hub
    push_to_hub=False,
)

In [10]:
# Display the tokenized training split to check the columns produced by the map step.
train_dataset = tokenized_imdb['train']
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 25000
})

In [11]:
# Baseline Trainer: fine-tune on the IMDB train split, evaluate on the test split.
# The collator created earlier is passed explicitly so the padding strategy is
# visible here rather than left to the Trainer's default.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb['train'],
    eval_dataset=tokenized_imdb['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [14]:
# Run the baseline fine-tuning. This is a long-running cell: the recorded run
# reports a train_runtime of roughly 1674 seconds.
trainer.train()

  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.3151, 'grad_norm': 13.494571685791016, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.32}
{'loss': 0.2416, 'grad_norm': 3.902318000793457, 'learning_rate': 7.204094689699297e-06, 'epoch': 0.64}
{'loss': 0.2126, 'grad_norm': 9.443507194519043, 'learning_rate': 8.061420345489445e-07, 'epoch': 0.96}
{'train_runtime': 1673.6006, 'train_samples_per_second': 14.938, 'train_steps_per_second': 0.934, 'train_loss': 0.2544141605887288, 'epoch': 1.0}


TrainOutput(global_step=1563, training_loss=0.2544141605887288, metrics={'train_runtime': 1673.6006, 'train_samples_per_second': 14.938, 'train_steps_per_second': 0.934, 'total_flos': 3311684966400000.0, 'train_loss': 0.2544141605887288, 'epoch': 1.0})

In [16]:
# Serialize the whole Trainer object to disk.
# NOTE(review): pickling a Trainer is fragile (it depends on the library versions and
# class definitions available at load time); trainer.save_model() is the usual route.
# Confirm before relying on this file.
with open('../data/trainer.pkl', 'wb') as f:
    pickle.dump(trainer, f)

In [12]:
# Restore the Trainer object from the pickle file, replacing the `trainer`
# variable defined earlier in the notebook.
# SECURITY: pickle.load can execute arbitrary code; only load files you created yourself.
# NOTE(review): the execution counts suggest this cell was run in a later session than
# the training cell — confirm the notebook still works with Restart & Run All.
with open('../data/trainer.pkl', 'rb') as f:
    trainer = pickle.load(f)

  return torch.load(io.BytesIO(b))


In [13]:
# Evaluate the baseline Trainer on its eval_dataset and show the metrics dict.
evaluation_results = trainer.evaluate()
print(evaluation_results)

  0%|          | 0/1563 [00:00<?, ?it/s]

{'eval_loss': 0.1969287395477295, 'eval_accuracy': 0.9274632117722329, 'eval_runtime': 530.317, 'eval_samples_per_second': 47.142, 'eval_steps_per_second': 2.947, 'epoch': 1.0}


In [14]:
# Write the metrics dict from the evaluation cell to disk.
with open('../data/trainer_evaluation_results.pkl', 'wb') as results_file:
    pickle.dump(evaluation_results, results_file)

## Evaluating and Optimizing Project Model

In [17]:
# Second configuration: lower learning rate, larger batches, three epochs,
# and the result is published to the Hub.
training_args = TrainingArguments(
    # where checkpoints are written
    output_dir="../data/optimized_model",
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=1e-5,
    # batch sizes (per device)
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # upload the trained model
    push_to_hub=True,
)

In [18]:
# Second Trainer, using the new hyperparameters.
# The model is loaded fresh from the pretrained checkpoint instead of reusing the
# `model` object the baseline Trainer already fine-tuned. Otherwise a top-to-bottom
# run would continue training the baseline weights, and the two runs would not be
# comparable.
trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(
        "distilbert/distilbert-base-uncased",
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
    ),
    args=training_args,
    train_dataset=tokenized_imdb['train'],
    eval_dataset=tokenized_imdb['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [19]:
# Run the three-epoch fine-tuning. This is a long-running cell: the recorded run
# reports a train_runtime of roughly 4903 seconds.
trainer.train()

  0%|          | 0/2346 [00:00<?, ?it/s]

{'loss': 0.3097, 'grad_norm': 3.8327584266662598, 'learning_rate': 7.868712702472295e-06, 'epoch': 0.64}
{'loss': 0.2053, 'grad_norm': 7.533845901489258, 'learning_rate': 5.737425404944586e-06, 'epoch': 1.28}
{'loss': 0.1716, 'grad_norm': 3.1163320541381836, 'learning_rate': 3.60613810741688e-06, 'epoch': 1.92}
{'loss': 0.1393, 'grad_norm': 5.9714674949646, 'learning_rate': 1.474850809889173e-06, 'epoch': 2.56}
{'train_runtime': 4903.1758, 'train_samples_per_second': 15.296, 'train_steps_per_second': 0.478, 'train_loss': 0.19591549696926366, 'epoch': 3.0}


TrainOutput(global_step=2346, training_loss=0.19591549696926366, metrics={'train_runtime': 4903.1758, 'train_samples_per_second': 15.296, 'train_steps_per_second': 0.478, 'total_flos': 9935054899200000.0, 'train_loss': 0.19591549696926366, 'epoch': 3.0})

### Save the Model to Disk Because Pickling Would Not Work

In [24]:
# Collect everything needed to reload the fine-tuned model in one directory.
output_dir = "../data/optimized_model"
os.makedirs(output_dir, exist_ok=True)

# Model first, then the tokenizer that goes with it.
trainer.save_model(output_dir)
trainer.tokenizer.save_pretrained(output_dir)

# The training arguments are stored as JSON next to the model files.
args_path = os.path.join(output_dir, "training_args.json")
with open(args_path, "w") as args_file:
    args_file.write(trainer.args.to_json_string())

No files have been modified since last commit. Skipping to prevent empty commit.


In [25]:
# Evaluate the second Trainer on its eval_dataset and show the metrics dict.
# This overwrites the `evaluation_results` value produced by the baseline evaluation.
evaluation_results = trainer.evaluate()
print(evaluation_results)

  0%|          | 0/782 [00:00<?, ?it/s]

{'eval_loss': 0.20846539735794067, 'eval_accuracy': 0.92756, 'eval_runtime': 524.299, 'eval_samples_per_second': 47.683, 'eval_steps_per_second': 1.492, 'epoch': 3.0}


In [26]:
# Upload the trained model to the Hugging Face Hub; the call returns the commit info.
trainer.push_to_hub()

events.out.tfevents.1722694625.Kenneths-MacBook-Pro.local.21333.2:   0%|          | 0.00/411 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kenwallmisc/optimized_model/commit/14f6a60f48c4d3d381a7aa2accf29b7ed6755a78', commit_message='End of training', commit_description='', oid='14f6a60f48c4d3d381a7aa2accf29b7ed6755a78', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
# Smoke test: load the published model from the Hub and classify one sentence.
# NOTE(review): device=0 selects the first accelerator; this presumably fails on a
# CPU-only machine — confirm, or use device=-1 there.
text = "I hate this terrible movie"
my_model = pipeline(model="kenwallmisc/optimized_model", task='sentiment-analysis', device=0)
my_model(text)


[{'label': 'NEGATIVE', 'score': 0.972745418548584}]