# Install Comet and Dependencies

## Step 1: Install Dependencies
We first install the required libraries, including:
- `comet-ml`: For experiment tracking.
- `datasets`: Provides the IMDb dataset.
- `transformers`: Implements the pre-trained model.
- `scikit-learn`: Used for evaluation metrics.

In [None]:
!pip install comet-ml datasets transformers scikit-learn




# Initialize Comet

## Step 2: Initialize Comet ML
We initialize Comet ML to track training progress and log metrics.

In [None]:
import comet_ml

# Configure Comet so that runs from this notebook are logged to the project below.
# NOTE(review): the project name says "distilbart" while the model used in this
# notebook is DistilBERT — confirm the name is intended.
comet_ml.init(project_name="imdb-distilbart")



# Set Model Type

## Step 3: Define Model Type
We use `distilbert-base-uncased`, a lightweight version of BERT for text classification.

In [None]:
# Checkpoint name passed to both the tokenizer and the classification model.
PRE_TRAINED_MODEL_NAME: str = "distilbert-base-uncased"

# Seed reused for dataset shuffling and for the training arguments.
SEED: int = 20


# Load Data

## Step 4: Load the IMDb Dataset
We use the Hugging Face `datasets` library to load the IMDb dataset, which contains positive and negative movie reviews.

In [None]:
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Fetch the IMDb dataset through the Hugging Face `datasets` library.
raw_datasets = load_dataset("imdb")

# Show the available splits with their columns and row counts.
print(raw_datasets)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


# Setup Tokenizer

## Step 5: Set Up the Tokenizer
We load the tokenizer that matches `distilbert-base-uncased` and use it to tokenize the review text of every split.

In [None]:
# Load the tokenizer that belongs to the chosen pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of IMDb reviews.

    Args:
        examples: A batch handed over by ``Dataset.map(..., batched=True)``;
            its ``"text"`` entry holds the review strings.

    Returns:
        The tokenizer output for the batch, with over-long reviews truncated.
    """
    # No padding here: the `DataCollatorWithPadding` passed to the Trainer pads
    # each batch at collation time, so padding every review to the maximum
    # length up front only adds wasted computation.
    return tokenizer(examples["text"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch with the tokenizer's padding token.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create Sample Datasets

To keep this guide fast, we sample only 200 examples from each of the train and test splits.

In [None]:
# Draw a fixed-size, seeded random subset from each split.
sample_indices = range(200)
train_dataset = tokenized_datasets["train"].shuffle(seed=SEED).select(sample_indices)
eval_dataset = tokenized_datasets["test"].shuffle(seed=SEED).select(sample_indices)

# Setup Transformer Model


## Load the Pre-Trained Model
We load `distilbert-base-uncased` with a two-label sequence-classification head (negative / positive).

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained checkpoint for sequence classification with two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Setup Evaluation Function

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def get_example(index):
    """Return the review text of the evaluation example at position ``index``."""
    example = eval_dataset[index]
    return example["text"]

def compute_metrics(pred):
    """Compute evaluation metrics and log diagnostics to Comet when available.

    Args:
        pred: Evaluation output supplied by the ``Trainer``. ``pred.label_ids``
            holds the true labels and ``pred.predictions`` the per-class scores.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; precision, recall and F1 are macro-averaged.
    """
    experiment = comet_ml.get_global_experiment()

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="macro")
    acc = accuracy_score(labels, preds)

    if experiment:
        epoch = int(experiment.curr_epoch) if experiment.curr_epoch is not None else 0
        experiment.set_epoch(epoch)
        experiment.log_confusion_matrix(
            y_true=labels,
            y_predicted=preds,
            filename=f"confusion-matrix-epoch-{epoch}.json",
            labels=["Negative", "Positive"],
            index_to_example_function=get_example,
            step=epoch,
        )
        # Log a few evaluation texts together with their true label. The count
        # is capped so evaluation sets with fewer than 20 examples do not fail.
        for i in range(min(20, len(labels))):
            experiment.log_text(get_example(i), metadata={"label": labels[i].item()})

    # Return the metrics unconditionally: the Trainer needs them even when no
    # Comet experiment is active.
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Run Training

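In order to enable logging from the Hugging Face Trainer, you will need to set the `COMET_MODE` environment variable to `ONLINE`. The cell below also sets `COMET_LOG_ASSETS=TRUE` so that training assets are logged to Comet as well.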

## Step 6: Set Up the Trainer
We use Hugging Face's `Trainer` API to handle training, evaluation, and optimization.

In [None]:
%env COMET_MODE=ONLINE
%env COMET_LOG_ASSETS=TRUE

training_args = TrainingArguments(
    seed = SEED,
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=25,
    save_strategy="steps",
    save_total_limit=10,
    save_steps=25,
    per_device_train_batch_size=8,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

trainer.train()



env: COMET_MODE=ONLINE
env: COMET_LOG_ASSETS=TRUE


[1;38;5;39mCOMET INFO:[0m An experiment with the same configuration options is already running and will be reused.


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
25,No log,0.664281,0.7,0.689955,0.718887,0.695226




TrainOutput(global_step=25, training_loss=0.6839591217041016, metrics={'train_runtime': 899.2083, 'train_samples_per_second': 0.222, 'train_steps_per_second': 0.028, 'total_flos': 26493479731200.0, 'train_loss': 0.6839591217041016, 'epoch': 1.0})