In [None]:
'''
Imports
'''

# General
import json
import numpy as np
from collections import Counter
import pandas as pd

try:
  import wandb
except:
  ! pip install wandb
  import wandb

try:
  from datasets import Dataset
except:
  ! pip install datasets
  from datasets import Dataset

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score as sk_precision_score
from sklearn.metrics import recall_score as sk_recall_score
from sklearn.metrics import f1_score as sk_f1_score

try:
  from seqeval.metrics import classification_report as seqeval_classification_report
  from seqeval.metrics import f1_score, precision_score, recall_score
except:
  ! pip install seqeval
  from seqeval.metrics import classification_report as seqeval_classification_report
  from seqeval.metrics import f1_score, precision_score, recall_score

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset

# Transformers
try:
    from transformers import (
        DataCollatorForTokenClassification,
        DistilBertConfig,
        DistilBertForTokenClassification,
        DistilBertTokenizerFast,
        Trainer,
        TrainingArguments,
    )
except:
    ! pip install transformers
    from transformers import (
        DataCollatorForTokenClassification,
        DistilBertConfig,
        DistilBertForTokenClassification,
        DistilBertTokenizerFast,
        Trainer,
        TrainingArguments,
    )

! pip install accelerate



In [None]:
import os

# Optional: uncomment to run Weights & Biases in offline mode.
#os.environ["WANDB_MODE"] = "offline"

# The notebook treats the presence of COLAB_GPU as the signal that it is running on Colab.
# The google.colab import lives inside the guard so the cell also runs elsewhere,
# where that package is not installed.
if "COLAB_GPU" in os.environ:
    from google.colab import drive
    drive.mount('/content/drive')
    output_dir = "/content/drive/MyDrive/distilbert_outputs"
else:
    # No Drive to mount: keep prediction files next to the notebook instead.
    output_dir = "distilbert_outputs"

os.makedirs(output_dir, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def preprocess_for_token_classification(data, tokenizer, label2id, max_length=512):
    """
    Preprocess a list of email chains for token classification with BIO tagging.

    For each email chain, this function:
    - Concatenates all email bodies into a single string (newlines become spaces).
    - Extracts entity values from the tenant profile.
    - Tokenizes the email text, padded/truncated to ``max_length``.
    - Tags the tokens overlapping the FIRST case-insensitive occurrence of each
      entity value with B-/I- labels; everything else stays "O".
    - Converts labels to IDs using ``label2id`` (tags missing from the mapping
      fall back to the "O" id).
    - Masks padding token positions with -100 for loss computation.

    Missing profile data is tolerated: a section or value that is absent or
    ``None`` (JSON null) simply contributes no entity.

    Args:
        data (list): List of email chains, each a dictionary with an
            ``email_chain`` (list of dicts with a ``body``) and a ``tenant_profile``.
        tokenizer: Callable tokenizer that accepts ``return_offsets_mapping=True``
            and exposes ``pad_token_id``.
        label2id (dict): Mapping from BIO label strings to integer IDs; must contain "O".
        max_length (int, optional): Maximum sequence length for tokenization. Defaults to 512.

    Returns:
        A tuple of two lists:
            - List of tokenized inputs (the tokenizer output without ``offset_mapping``)
            - List of corresponding label ID sequences
    """

    def _section(mapping, key):
        # A section stored as JSON null comes back as None; treat it as empty
        # instead of failing on the chained .get() calls below.
        value = mapping.get(key)
        return value if isinstance(value, dict) else {}

    inputs, all_labels = [], []

    for item in data:
        email_text = " ".join(email["body"].replace("\n", " ") for email in item["email_chain"])
        email_text_lower = email_text.lower()

        profile = _section(item, "tenant_profile")
        prefs = _section(profile, "Property Preferences")
        representative = _section(profile, "Tenant Representative Details")
        company = _section(profile, "Company Details")
        timing = _section(prefs, "Moving Timing")

        # str(None) would yield the truthy text "None" and tag the word "none"
        # in the email as an urgency score, so a missing score maps to "".
        urgency = profile.get("Urgency Score", "")
        urgency_text = "" if urgency is None else str(urgency)

        # Get all entities. Insertion order matters: when two values overlap in
        # the text, the later entry overwrites the labels of the earlier one.
        raw_entities = {
            "FirstName": representative.get("First Name", ""),
            "LastName": representative.get("Last Name", ""),
            "Email": representative.get("Email", ""),
            "Phone": representative.get("Phone", ""),
            "CompanyName": company.get("Company Name", ""),
            "Industry": company.get("Industry", ""),
            "CompanySize": company.get("Company Size", ""),
            "GrowthStage": company.get("Growth Stage", ""),
            "CurrentNeighborhood": company.get("Current Neighborhood", []),
            "FirstInteraction": profile.get("First Interaction", ""),
            "LastInteraction": profile.get("Last Interaction", ""),
            "DecisionMakerRole": profile.get("Decision-Maker Role", ""),
            "PropertyType": prefs.get("Property Type", ""),
            "PreferredNeighborhood": prefs.get("Preferred Neighborhood", []),
            "Budget": prefs.get("Estimated or Stated Budget", ""),
            "MustHaves": prefs.get("Must-Haves", []),
            "NiceToHaves": prefs.get("Nice-to-Haves", []),
            "SpaceSize": prefs.get("Space Size", ""),
            "PreferredLeaseTerm": prefs.get("Preferred Lease Term", ""),
            "MovingTerm": prefs.get("Moving Term", ""),
            "min_months": timing.get("min_months", ""),
            "max_months": timing.get("max_months", ""),
            "MovingTimeline": profile.get("Moving Timeline", ""),
            "PainPoint": profile.get("Pain Points", []),
            "UrgencyScore": urgency_text,
            "Outcome": profile.get("Outcome", ""),
            "Personality": profile.get("Tenant Personality", "")
        }

        # Normalize all values to a list of stripped strings (falsy values are dropped)
        entities = {}
        for key, val in raw_entities.items():
            if isinstance(val, list):
                entities[key] = [str(v).strip() for v in val if v]
            elif val:
                entities[key] = [str(val).strip()]
            else:
                entities[key] = []

        # Tokenize input
        encoding = tokenizer(
            email_text,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_offsets_mapping=True
        )
        # Offsets are only needed for labelling; they are removed from the model inputs.
        offset_mapping = encoding.pop("offset_mapping")
        input_ids = encoding["input_ids"]

        # Set default label to "O"
        label_list = ["O"] * len(input_ids)

        # Tag tokens with BIO
        # NOTE(review): matching is a plain substring search, so short values
        # (e.g. a one-digit score) can match inside unrelated words or numbers.
        for entity_type, values in entities.items():
            for value in values:
                start = email_text_lower.find(value.lower())
                if start == -1:
                    continue
                end = start + len(value)

                inside = False
                for i, (token_start, token_end) in enumerate(offset_mapping):
                    # (0, 0) offsets are skipped; presumably special/padding tokens.
                    if token_start == 0 and token_end == 0:
                        continue
                    # Skip tokens that do not overlap the [start, end) character span.
                    if token_start >= end or token_end <= start:
                        continue
                    tag = f"I-{entity_type}" if inside else f"B-{entity_type}"
                    label_list[i] = tag
                    inside = True

        # Convert tags to IDs and mask padding
        label_ids = [label2id.get(tag, label2id["O"]) for tag in label_list]
        label_ids = [
            label_id if input_id != tokenizer.pad_token_id else -100
            for label_id, input_id in zip(label_ids, input_ids)
        ]

        inputs.append(encoding)
        all_labels.append(label_ids)

    return inputs, all_labels


In [None]:
class EmailNERDataset(Dataset):
    """
    PyTorch Dataset over pre-tokenized input encodings and their label sequences.

    Args:
        encodings (list of dict): One dictionary of tokenized inputs per example.
        labels (list of list): One label ID sequence per example, aligned with
            the tokenized inputs.

    Indexing returns a dictionary holding every encoding field as a tensor plus
    a ``labels`` tensor; ``len()`` reports the number of label sequences.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Tensors are built lazily, one example at a time.
        sample = dict(
            (field, torch.tensor(values))
            for field, values in self.encodings[index].items()
        )
        sample["labels"] = torch.tensor(self.labels[index])
        return sample


In [None]:
'''
Load the pre-trained DistilBERT tokenizer (the model itself is loaded in a later cell)
'''
# The preprocessing function asks the tokenizer for offset mappings
# (return_offsets_mapping=True); presumably that is why the *Fast* tokenizer class is used.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
'''
Build the BIO tag vocabulary: "O" is the default tag, followed by a
B-/I- pair for every entity field.
'''

# Entity fields in label-id order; the position in this list fixes the label ids.
ENTITY_FIELDS = [
    "FirstName", "LastName", "Email", "Phone",
    "CompanyName", "Industry", "CompanySize", "GrowthStage",
    "CurrentNeighborhood", "FirstInteraction", "LastInteraction",
    "DecisionMakerRole", "PropertyType", "PreferredNeighborhood",
    "Budget", "SpaceSize", "PreferredLeaseTerm", "MovingTerm",
    "min_months", "max_months", "MovingTimeline",
    "MustHaves", "NiceToHaves", "PainPoint",
    "UrgencyScore", "Outcome", "Personality",
]

ENTITY_TAGS = ["O"] + [
    f"{prefix}-{field}" for field in ENTITY_FIELDS for prefix in ("B", "I")
]

# Define label <-> id mappings (id = position in ENTITY_TAGS)
label2id = dict(zip(ENTITY_TAGS, range(len(ENTITY_TAGS))))
id2label = dict(enumerate(ENTITY_TAGS))

# Set initial label weights: "O" is down-weighted to 0.05, every entity tag weighs 1.0
label_weights = torch.ones(len(ENTITY_TAGS), dtype=torch.float)
label_weights[0] = 0.05

In [None]:
'''
Initialize model
'''

# Plain DistilBERT token-classification head sized to the BIO tag set.
# NOTE(review): `model` is assigned again further down using the weighted subclass, and
# none of the cells shown in between read this instance — this load looks redundant;
# confirm before removing.
model = DistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(ENTITY_TAGS),
    id2label=id2label,
    label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
'''
Import data and preprocess it
'''

def _load_json(path):
    """Read one JSON file; UTF-8 is explicit so decoding does not depend on the OS locale."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)

# Change the paths according to the file location.
# When running from the GitHub repo, the splits live in "../3. Data Split/"
# (e.g. "../3. Data Split/train.json").
train_records = _load_json("train.json")
val_records = _load_json("val.json")
test_records = _load_json("test.json")

# Tokenize
train_encodings, train_labels = preprocess_for_token_classification(train_records, tokenizer, label2id)
val_encodings, val_labels = preprocess_for_token_classification(val_records, tokenizer, label2id)
test_encodings, test_labels = preprocess_for_token_classification(test_records, tokenizer, label2id)

# Create Datasets (the raw records keep their own names so each name holds one kind of object)
train_data = EmailNERDataset(train_encodings, train_labels)
val_data = EmailNERDataset(val_encodings, val_labels)
test_data = EmailNERDataset(test_encodings, test_labels)


In [None]:
# Check tokenized outputs: print how often each label occurs in every split.
for name, dataset in zip(["Validation", "Test", "Train"], [val_data, test_data, train_data]):
  print(f"\nLabel distribution for {name} dataset:")
  # Count incrementally; concatenating every label list with sum(..., []) copies
  # the growing list on each step (quadratic in the number of examples).
  label_counts = Counter()
  for item in dataset:
    label_counts.update(item['labels'].tolist())
  # -100 marks positions excluded from the loss; there is no id2label entry for it.
  print({(id2label[k] if k != -100 else 'IGNORED'): v for k, v in label_counts.items()})


Label distribution for Validation dataset:
{'O': 12598, 'B-FirstName': 30, 'B-CompanyName': 25, 'I-CompanyName': 37, 'B-MustHaves': 60, 'I-MustHaves': 60, 'B-UrgencyScore': 21, 'IGNORED': 2406, 'B-LastName': 12, 'B-Industry': 13, 'B-CurrentNeighborhood': 2, 'I-CurrentNeighborhood': 2, 'B-PropertyType': 27, 'B-PreferredNeighborhood': 9, 'I-PreferredNeighborhood': 8, 'B-NiceToHaves': 10, 'I-NiceToHaves': 13, 'B-PreferredLeaseTerm': 4, 'I-PreferredLeaseTerm': 5, 'B-Personality': 2, 'B-DecisionMakerRole': 7, 'I-FirstName': 2, 'B-Outcome': 2, 'B-SpaceSize': 1, 'I-SpaceSize': 4}

Label distribution for Test dataset:
{'O': 12209, 'B-FirstName': 30, 'B-CompanyName': 26, 'I-CompanyName': 38, 'B-PreferredNeighborhood': 13, 'I-PreferredNeighborhood': 12, 'B-PropertyType': 26, 'B-MustHaves': 49, 'I-MustHaves': 60, 'B-UrgencyScore': 20, 'IGNORED': 2818, 'B-PreferredLeaseTerm': 3, 'I-PreferredLeaseTerm': 3, 'B-LastName': 8, 'B-NiceToHaves': 10, 'I-NiceToHaves': 10, 'B-Industry': 12, 'B-DecisionMake

In [None]:
class WeightedDistilBertForTokenClassification(DistilBertForTokenClassification):
    """
    A subclass of DistilBertForTokenClassification that incorporates class weights into the loss function.

    This model is designed for token classification tasks where class imbalance may exist.
    It uses a cross-entropy loss function with class weights and `ignore_index=-100`,
    so positions labelled -100 do not contribute to the loss.

    Args:
        - config: Model configuration.
        - class_weights (torch.Tensor, optional): 1D tensor of weights for each class.
          If None, standard (unweighted) loss is used.
    """

    def __init__(self, config, class_weights=None):
        super().__init__(config)
        self.class_weights = class_weights

    def forward(self, input_ids=None, attention_mask=None, num_items_in_batch=None, labels=None, **kwargs):
        '''
          Forward Args:
              - input_ids (torch.LongTensor): Token IDs with shape (batch_size, sequence_length).
              - attention_mask (torch.LongTensor): Mask to avoid performing attention on padding tokens.
              - labels (torch.LongTensor, optional): Token-level labels for computing the loss.
              - num_items_in_batch (optional): Accepted but not used by this method.
              - **kwargs: Additional keyword arguments for base model.

          Returns:
              - dict: A dictionary with:
                  - "loss" (torch.FloatTensor, optional): The weighted cross-entropy loss, if labels are provided.
                  - "logits" (torch.FloatTensor): The predicted logits of shape (batch_size, sequence_length, num_labels).
        '''
        # The parent is called without labels so that only this method computes the loss.
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=None, **kwargs)
        logits = outputs.logits
        if labels is None:
            return {"logits": logits}

        # class_weights=None is a documented option: fall back to the unweighted
        # loss instead of calling .to() on None.
        weight = None
        if self.class_weights is not None:
            weight = self.class_weights.to(logits.dtype).to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weight, ignore_index=-100)
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return {"loss": loss, "logits": logits}


In [None]:
'''
Initialize model
'''
# Load configuration with entities, labels and label ids
config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(ENTITY_TAGS),
    id2label=id2label,
    label2id=label2id
)

# Load model using weighted class
# class_weights is not a standard from_pretrained option; presumably it is forwarded
# to WeightedDistilBertForTokenClassification.__init__ — TODO confirm.
model = WeightedDistilBertForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    config=config,
    class_weights=label_weights
)

Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def compute_metrics(p):
    '''
    Define metrics for model evaluation.

    Args:
        p: A (predictions, labels) pair; predictions are per-token scores whose
           last axis indexes the label ids, labels are the gold label ids.

    Returns:
        dict: "precision", "recall" and "f1" computed on the label-name sequences.
    '''
    scores, gold_ids = p
    predicted_ids = scores.argmax(axis=-1)

    true_labels, pred_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not -100 (ignored positions).
        kept = [(gold, guess) for guess, gold in zip(predicted_row, gold_row) if gold != -100]
        true_labels.append([id2label[gold] for gold, _ in kept])
        pred_labels.append([id2label[guess] for _, guess in kept])

    # NOTE(review): zero_division=1 presumably reports 1.0 when a score is
    # undefined (e.g. nothing predicted yet) — confirm this is intended.
    scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    return {
        metric: scorer(true_labels, pred_labels, zero_division=1)
        for metric, scorer in scorers
    }


In [None]:
def extract_entities_from_tokens(tokens, labels):
    """
    Extract contiguous entities from BIO-tagged tokens.

    Only the first span found for each field is kept; keys are the lower-cased
    field names. Uses the notebook-level ``tokenizer`` to rebuild the text.

    Args:
        tokens (List[str]): List of tokens.
        labels (List[str]): Corresponding BIO labels (e.g., B-Budget, I-Budget, O).

    Returns:
        Dict[str, str]: A mapping from field name to extracted string value.
    """
    entity_dict = {}

    def _store(field_name, pieces):
        # Close a span: join its word pieces and record it unless the field already has a value.
        if field_name:
            text = tokenizer.convert_tokens_to_string(pieces).replace(" ##", "")
            entity_dict.setdefault(field_name.lower(), text.strip())

    current_entity, current_tokens = None, []

    for token, label in zip(tokens, labels):
        if label != "O" and token not in tokenizer.all_special_tokens:
            tag, field = label.split("-", 1)

            # Continuation of the open span.
            if tag == "I" and field == current_entity:
                current_tokens.append(token)
                continue

            # Anything else ends the open span; only a B- tag opens a new one.
            _store(current_entity, current_tokens)
            if tag == "B":
                current_entity, current_tokens = field, [token]
            else:
                current_entity, current_tokens = None, []
        else:
            # "O" labels and special tokens terminate whatever span is open.
            _store(current_entity, current_tokens)
            current_entity, current_tokens = None, []

    # A span still open at the end of the sequence is stored as well.
    _store(current_entity, current_tokens)

    return entity_dict


In [None]:
def convert_predictions_to_profiles(dataset, predictions, id2label, tokenizer):
    """
    Convert BERT predictions on token-level to structured tenant profiles.

    Args:
        dataset: EmailNERDataset instance
        predictions: raw logits from trainer.predict()
        id2label: mapping from label id to BIO label
        tokenizer: tokenizer used for encoding

    Returns:
        List[Dict]: structured tenant profiles, one per dataset item, with every
        expected field present (None when nothing was extracted).
    """
    # Every profile exposes these fields, lower-cased.
    expected_fields = [
        "FirstName", "LastName", "Email", "Phone",
        "CompanyName", "Industry", "CompanySize", "GrowthStage",
        "CurrentNeighborhood", "FirstInteraction", "LastInteraction",
        "DecisionMakerRole", "PropertyType", "PreferredNeighborhood",
        "Budget", "SpaceSize", "PreferredLeaseTerm", "MovingTerm",
        "min_months", "max_months", "MovingTimeline",
        "MustHaves", "NiceToHaves", "PainPoint",
        "UrgencyScore", "Outcome", "Personality"
    ]

    predicted_ids = predictions.argmax(axis=-1)
    profiles = []

    for idx in range(len(dataset)):
        tokens = tokenizer.convert_ids_to_tokens(dataset[idx]["input_ids"])

        # Drop padding positions before decoding the label ids.
        kept = [
            (tok, id2label[lab])
            for tok, lab in zip(tokens, predicted_ids[idx])
            if not (lab == -100 or tok == tokenizer.pad_token)
        ]
        tokens_filtered = [tok for tok, _ in kept]
        labels_filtered = [tag for _, tag in kept]

        entities = extract_entities_from_tokens(tokens_filtered, labels_filtered)

        # Fill with all expected keys
        profiles.append(
            {field.lower(): entities.get(field.lower(), None) for field in expected_fields}
        )

    return profiles


In [None]:
def get_entity_metrics(df, reference_df=None):
    """
    Compute precision, recall and F1 for each entity field (column), plus macro averages.

    Values are compared row by row after stripping whitespace and lower-casing.
    For every field:
      - true positive: the reference value is non-empty and the prediction equals it;
      - false positive: the prediction is non-empty and differs from the reference;
      - false negative: the reference value is non-empty and the prediction differs.
    Fields whose reference column has no non-empty value are skipped.

    Args:
        - df (pandas.DataFrame): Predicted values, one column per entity field.
        - reference_df (pandas.DataFrame, optional): Ground-truth values with the
          same row order as ``df``. Only columns present in both frames are scored.
          When omitted, ``df`` is compared with itself, so every populated field
          scores 1.0; pass the ground truth to get meaningful numbers.

    Returns:
        - dict: "<field>_precision", "<field>_recall", "<field>_f1" for every scored
          field and "macro_precision", "macro_recall", "macro_f1" (0.0 when no field
          was scored). All values are rounded to 4 decimals.

    Raises:
        - ValueError: If ``reference_df`` has a different number of rows than ``df``.
    """
    if reference_df is None:
        reference_df = df
    elif len(reference_df) != len(df):
        raise ValueError(
            f"reference_df has {len(reference_df)} rows but df has {len(df)}"
        )

    def _normalized(frame, column):
        # Missing cells become ""; astype(str) keeps non-string cells comparable.
        return frame[column].fillna("").astype(str).str.strip().str.lower().tolist()

    results = {}
    macro_p, macro_r, macro_f1 = [], [], []

    for field in df.columns.tolist():
        if field not in reference_df.columns:
            continue

        y_pred = _normalized(df, field)
        y_true = _normalized(reference_df, field)

        # Nothing to find for this field: leave it out of the report and the macro average.
        if not any(y_true):
            continue

        tp = sum(1 for pred, true in zip(y_pred, y_true) if true and pred == true)
        fp = sum(1 for pred, true in zip(y_pred, y_true) if pred and pred != true)
        fn = sum(1 for pred, true in zip(y_pred, y_true) if true and pred != true)

        p = tp / (tp + fp) if tp + fp else 0.0
        r = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * p * r / (p + r) if p + r else 0.0

        results[f"{field}_precision"] = round(p, 4)
        results[f"{field}_recall"] = round(r, 4)
        results[f"{field}_f1"] = round(f1, 4)
        macro_p.append(p)
        macro_r.append(r)
        macro_f1.append(f1)

    results["macro_precision"] = round(float(np.mean(macro_p)), 4) if macro_p else 0.0
    results["macro_recall"] = round(float(np.mean(macro_r)), 4) if macro_r else 0.0
    results["macro_f1"] = round(float(np.mean(macro_f1)), 4) if macro_f1 else 0.0

    print("Structured Field-Level Metrics:", results)
    return results


In [None]:
def evaluate_model(trainer, dataset, id2label, tokenizer):
    """
    Evaluates a token classification model on the given dataset and prints a classification report.

    Runs ``trainer.predict`` on ``dataset``, prints the seqeval classification
    report for the positions whose gold label is not -100, and converts the
    predictions for that same dataset into structured profiles.

    Args:
        - trainer: Trainer used to run prediction.
        - dataset: Dataset to evaluate; also the source of the tokens used to
                   build the profiles.
        - id2label (dict): Mapping from label id to BIO label string.
        - tokenizer: Tokenizer used to encode the dataset.

    Returns:
        - pandas.DataFrame: One row per dataset item and one column per entity
                            field, holding the predicted value (or None).
    """
    predictions, labels, _ = trainer.predict(dataset)
    preds = predictions.argmax(axis=-1)

    true_labels = []
    pred_labels = []

    for pred, label in zip(preds, labels):
        true_seq = []
        pred_seq = []
        for p, l in zip(pred, label):
            if l != -100:  # skip positions excluded from the loss
                true_seq.append(id2label[l])
                pred_seq.append(id2label[p])
        true_labels.append(true_seq)
        pred_labels.append(pred_seq)

    # Build the profiles from the dataset that was actually predicted on, so the
    # tokens and the predictions line up for any dataset passed in.
    profiles = convert_predictions_to_profiles(dataset, predictions, id2label, tokenizer)
    df_profiles = pd.DataFrame(profiles)

    print(seqeval_classification_report(true_labels, pred_labels))

    return df_profiles


In [None]:
'''
Initialize wandb
'''

# NOTE(review): the sweep training function calls wandb.init() itself for every run, so
# this call presumably only serves to log in / start a stand-alone run — confirm whether
# wandb.login() was intended here.
wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33mlidianeh[0m ([33mlidianeh-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
'''
Define sweep configurations and set up sweep agent
'''

sweep_config = {
    "method": "bayes",
    "metric": {
        # The sweep metric has to match the key as it is logged to W&B. The run
        # summaries show the validation F1 logged as "eval/f1"; "eval_f1" is only
        # the name used on the Trainer side (e.g. metric_for_best_model).
        "name": "eval/f1",
        "goal": "maximize"
    },
    "parameters": {
        "learning_rate": {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 4e-5
        },
        "per_device_train_batch_size": {
            "values": [8, 16]
        },
        "num_train_epochs": {
            "values": [6, 8, 10]
        },
        "weight_decay": {
            "distribution": "uniform",
            "min": 0.05,
            "max": 0.2
        },
        "warmup_ratio": {
            "distribution": "uniform",
            "min": 0.05,
            "max": 0.2
        },
        "adam_epsilon": {
            "values": [1e-8, 1e-6]
        },
        "seed": {
            "values": [42, 2024]
        }
    }
}


# Register the sweep; the returned id is what the agent cell uses to pull configurations.
sweep_id = wandb.sweep(sweep_config, project="distilbert-final")

Create sweep with ID: 6yae4880
Sweep URL: https://wandb.ai/lidianeh-none/distilbert-final/sweeps/6yae4880


In [None]:
def train_with_wandb_sweep(config=None):
    """
    Trains and evaluates DistilBERT-based token classification model using a W&B sweep configuration.

    This function initializes wandb, builds a model and training pipeline using the
    hyperparameters of the current run, trains on ``train_data`` with ``val_data``
    for per-epoch evaluation, then evaluates on ``test_data`` and logs the results
    to W&B. The predicted profiles are also written as JSON into ``output_dir``.

    Args:
        - config (dict, optional): Training hyperparameters passed to ``wandb.init``.
          If None, wandb supplies the configuration of the current sweep run.

    Returns:
        None. If training or evaluation raises, the error is printed and the
        function returns early; the ``with`` block still closes the W&B run.
    """

    with wandb.init(config=config) as run:
        config = wandb.config

        model_config = DistilBertConfig.from_pretrained(
            "distilbert-base-uncased",
            num_labels=len(ENTITY_TAGS),
            id2label=id2label,
            label2id=label2id
        )

        model = WeightedDistilBertForTokenClassification.from_pretrained(
            "distilbert-base-uncased",
            config=model_config,
            class_weights=label_weights
        )

        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=config.num_train_epochs,
            per_device_train_batch_size=config.per_device_train_batch_size,
            per_device_eval_batch_size=config.per_device_train_batch_size,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            warmup_ratio=config.warmup_ratio,
            adam_epsilon=config.adam_epsilon,
            seed=config.seed,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_dir="./logs",
            logging_strategy="epoch",
            logging_steps=10,
            report_to="wandb",
            load_best_model_at_end=True,
            metric_for_best_model="eval_f1",
            greater_is_better=True
        )

        # Initialize trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=val_data,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        # Train and evaluate
        try:
            trainer.train()

            # Validation pass after training (load_best_model_at_end is enabled above).
            trainer.evaluate()

            # Report the test split under its own "test" prefix. With the default
            # prefix the test scores are logged as eval_* and overwrite the
            # validation metrics of the run.
            metrics = trainer.evaluate(test_data, metric_key_prefix="test")
            print(metrics)

            df = evaluate_model(trainer, test_data, id2label, tokenizer)
            print(df)
            wandb.log({f"{run.name} Predictions": wandb.Table(dataframe=df)})

            # NOTE(review): only the predicted profiles are passed here, so no
            # ground-truth profile takes part in these field-level metrics.
            entity_metrics = get_entity_metrics(df)
            wandb.log({f"test_{k}": float(v) for k, v in entity_metrics.items()})

            # Save all predictions to Google Drive
            pred_path = os.path.join(output_dir, f"{run.name}_predictions.json")
            df.to_json(pred_path, orient="records", indent=2)
            print(f"Predictions saved to: {pred_path}")

        except Exception as e:
            # Best effort: one failing configuration must not stop the sweep agent.
            print(f"Training failed: {e}")
            return


In [None]:
''' Launch the sweep agent '''
# Runs train_with_wandb_sweep once per configuration handed out by the sweep; count=25 caps the number of runs.
wandb.agent(sweep_id, function=train_with_wandb_sweep, count=25)

[34m[1mwandb[0m: Agent Starting Run: o0qra5bp with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.652952326169707e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.07452962363800385
[34m[1mwandb[0m: 	weight_decay: 0.11292689409200424


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.0566,1.847368,1.0,0.0,0.0
2,1.4828,1.168944,0.301075,0.246696,0.271186
3,0.9799,0.812521,0.24466,0.555066,0.339623
4,0.7264,0.646151,0.237013,0.643172,0.346382
5,0.5814,0.566431,0.25,0.682819,0.365998
6,0.4892,0.520067,0.229137,0.713656,0.346895
7,0.4478,0.498501,0.269737,0.722467,0.392814
8,0.415,0.493678,0.273345,0.709251,0.394608


{'eval_loss': 0.42069491744041443, 'eval_precision': 0.30326295585412666, 'eval_recall': 0.7632850241545893, 'eval_f1': 0.4340659340659341, 'eval_runtime': 0.2116, 'eval_samples_per_second': 141.81, 'eval_steps_per_second': 18.908, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.40      0.74      0.52        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.33      0.17      0.22         6
            FirstName       0.45      0.97      0.61        30
             Industry       0.31      0.67      0.42        12
             LastName       0.35      0.88      0.50         8
            MustHaves       0.24      0.86      0.38        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.22      0.67      0.33         3
PreferredNeighborhood       0.75      0.92      0.83        13
         PropertyType       0.60      0.92      0.73        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.12      0.60      0.21        20

            micro avg       0.30      0.76      0.43 

0,1
eval/f1,▁▅▆▇▇▇▇▇▇█
eval/loss,█▅▃▂▂▁▁▁▁▁
eval/precision,█▂▁▁▁▁▁▁▁▂
eval/recall,▁▃▆▇▇█████
eval/runtime,▂▂▁▂▁▂▁▁██
eval/samples_per_second,▇▇█▇█▇██▁▁
eval/steps_per_second,▇▇█▇█▇██▁▁
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.43407
eval/loss,0.42069
eval/precision,0.30326
eval/recall,0.76329
eval/runtime,0.2116
eval/samples_per_second,141.81
eval/steps_per_second,18.908
test/f1,0.43407
test/loss,0.42069
test/precision,0.30326


[34m[1mwandb[0m: Agent Starting Run: 3xgwd3px with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.1641820645150784e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.16831289580189984
[34m[1mwandb[0m: 	weight_decay: 0.17494773851259987


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.5689,2.472898,1.0,0.0,0.0
2,1.8373,1.480993,1.0,0.0,0.0
3,1.2549,1.039377,0.268456,0.352423,0.304762
4,0.918,0.803044,0.219931,0.563877,0.31644
5,0.7288,0.678912,0.212219,0.581498,0.310954
6,0.6183,0.621917,0.267658,0.634361,0.376471
7,0.5635,0.595477,0.25784,0.651982,0.369538
8,0.534,0.584327,0.261324,0.660793,0.374532


{'eval_loss': 0.5357958674430847, 'eval_precision': 0.3008298755186722, 'eval_recall': 0.7004830917874396, 'eval_f1': 0.420899854862119, 'eval_runtime': 0.1936, 'eval_samples_per_second': 154.96, 'eval_steps_per_second': 20.661, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.36      0.67      0.47        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.44      0.93      0.60        30
             Industry       0.67      0.17      0.27        12
             LastName       0.33      0.75      0.46         8
            MustHaves       0.21      0.86      0.34        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.75      0.92      0.83        13
         PropertyType       0.63      0.92      0.75        26
         UrgencyScore       0.13      0.60      0.21        20

            micro avg       0.30      0.70      0.42       207
            macro avg       0.27      0.45      0.30 

0,1
eval/f1,▁▁▆▆▆▇▇▇▇█
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/precision,██▁▁▁▁▁▁▁▂
eval/recall,▁▁▅▇▇▇██▇█
eval/runtime,▆▁▅▅▃▄▅▃█▇
eval/samples_per_second,▃█▄▄▆▅▄▆▁▂
eval/steps_per_second,▃█▄▄▆▅▄▆▁▂
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.4209
eval/loss,0.5358
eval/precision,0.30083
eval/recall,0.70048
eval/runtime,0.1936
eval/samples_per_second,154.96
eval/steps_per_second,20.661
test/f1,0.4209
test/loss,0.5358
test/precision,0.30083


[34m[1mwandb[0m: Agent Starting Run: sx8fssan with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 2.4018086235892944e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.10133492200825173
[34m[1mwandb[0m: 	weight_decay: 0.11867715277521929


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.8056,2.437096,1.0,0.0,0.0
2,1.7607,1.381711,0.0,0.0,0.0
3,1.1279,0.91294,0.270396,0.511013,0.353659
4,0.793,0.708407,0.19708,0.594714,0.296053
5,0.6178,0.600435,0.250438,0.629956,0.358396
6,0.5002,0.543308,0.272727,0.647577,0.383812
7,0.4367,0.508794,0.286232,0.696035,0.405648
8,0.391,0.498798,0.278261,0.704846,0.399002
9,0.3697,0.496916,0.302326,0.687225,0.419919
10,0.3548,0.484197,0.292505,0.704846,0.413437


{'eval_loss': 0.4104796051979065, 'eval_precision': 0.3269230769230769, 'eval_recall': 0.7391304347826086, 'eval_f1': 0.4533333333333333, 'eval_runtime': 0.1912, 'eval_samples_per_second': 156.903, 'eval_steps_per_second': 20.92, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.49      0.78      0.60        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.44      0.93      0.60        30
             Industry       0.35      0.67      0.46        12
             LastName       0.35      0.88      0.50         8
            MustHaves       0.28      0.82      0.41        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.17      0.67      0.27         3
PreferredNeighborhood       0.80      0.92      0.86        13
         PropertyType       0.59      0.92      0.72        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.11      0.50      0.18        20

            micro avg       0.33      0.74      0.45 

0,1
eval/f1,▁▁▆▆▇▇▇▇▇▇▇█
eval/loss,█▄▃▂▂▁▁▁▁▁▁▁
eval/precision,█▁▃▂▃▃▃▃▃▃▃▃
eval/recall,▁▁▆▇▇▇██████
eval/runtime,▁▁▂▂▁▂▂▂▂▂█▂
eval/samples_per_second,██▇▇█▇▇▇▇▇▁▇
eval/steps_per_second,██▇▇█▇▇▇▇▇▁▇
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.45333
eval/loss,0.41048
eval/precision,0.32692
eval/recall,0.73913
eval/runtime,0.1912
eval/samples_per_second,156.903
eval/steps_per_second,20.92
test/f1,0.45333
test/loss,0.41048
test/precision,0.32692


[34m[1mwandb[0m: Agent Starting Run: ancgjdhp with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 3.5114310508988655e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.10916818624206806
[34m[1mwandb[0m: 	weight_decay: 0.11386470674631534


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6308,2.517639,1.0,0.0,0.0
2,2.0391,1.75628,1.0,0.0,0.0
3,1.5439,1.358725,0.0,0.0,0.0
4,1.2214,1.08339,0.162413,0.30837,0.212766
5,0.9899,0.913266,0.247768,0.488987,0.328889
6,0.8386,0.811927,0.247917,0.524229,0.336634
7,0.7439,0.763362,0.265849,0.572687,0.363128
8,0.7039,0.743134,0.257692,0.590308,0.358768


{'eval_loss': 0.6860792636871338, 'eval_precision': 0.29411764705882354, 'eval_recall': 0.6521739130434783, 'eval_f1': 0.4054054054054054, 'eval_runtime': 0.1925, 'eval_samples_per_second': 155.855, 'eval_steps_per_second': 10.39, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.36      0.74      0.48        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.31      0.93      0.47        30
             Industry       1.00      0.08      0.15        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.20      0.86      0.33        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.80      0.92      0.86        13
         PropertyType       0.77      0.92      0.84        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.14      0.35      0.20        20

            micro avg       0.29      0.65      0.41 

0,1
eval/f1,▁▁▁▅▇▇▇▇▇█
eval/loss,█▅▄▃▂▁▁▁▁▁
eval/precision,██▁▂▃▃▃▃▃▃
eval/recall,▁▁▁▄▆▇▇▇▇█
eval/runtime,▁▁▁▁▁▂▁▃█▄
eval/samples_per_second,█████▆█▆▁▄
eval/steps_per_second,█████▆█▆▁▄
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.40541
eval/loss,0.68608
eval/precision,0.29412
eval/recall,0.65217
eval/runtime,0.1925
eval/samples_per_second,155.855
eval/steps_per_second,10.39
test/f1,0.40541
test/loss,0.68608
test/precision,0.29412


[34m[1mwandb[0m: Agent Starting Run: qussyuaf with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 2.8880267958389663e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.09958715560020488
[34m[1mwandb[0m: 	weight_decay: 0.12577085354759657


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.9504,2.826158,0.0,0.0,0.0
2,2.1355,1.748997,1.0,0.0,0.0
3,1.5779,1.429992,1.0,0.0,0.0
4,1.3112,1.20994,0.240741,0.057269,0.092527
5,1.1398,1.086138,0.349515,0.317181,0.332564
6,1.046,1.044065,0.315985,0.374449,0.342742


{'eval_loss': 0.9761345982551575, 'eval_precision': 0.3471698113207547, 'eval_recall': 0.4444444444444444, 'eval_f1': 0.38983050847457623, 'eval_runtime': 0.1842, 'eval_samples_per_second': 162.834, 'eval_steps_per_second': 10.856, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.21      0.33      0.26        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.45      0.80      0.58        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.21      0.54      0.30        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.67      0.77      0.71        13
         PropertyType       0.95      0.81      0.88        26
         UrgencyScore       0.25      0.05      0.08        20

            micro avg       0.35      0.44      0.39       207
            macro avg       0.21      0.25      0.22 

0,1
eval/f1,▁▁▁▃▇▇▇█
eval/loss,█▄▃▂▁▁▁▁
eval/precision,▁██▃▃▃▃▃
eval/recall,▁▁▁▂▆▇▇█
eval/runtime,▁▁▁▁▂▂█▃
eval/samples_per_second,████▇▇▁▆
eval/steps_per_second,████▇▇▁▆
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.38983
eval/loss,0.97613
eval/precision,0.34717
eval/recall,0.44444
eval/runtime,0.1842
eval/samples_per_second,162.834
eval/steps_per_second,10.856
test/f1,0.38983
test/loss,0.97613
test/precision,0.34717


[34m[1mwandb[0m: Agent Starting Run: w34dppic with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 1.662658764808312e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.08828829830770812
[34m[1mwandb[0m: 	weight_decay: 0.1472663594066858


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6528,2.39243,1.0,0.0,0.0
2,1.8188,1.545832,1.0,0.0,0.0
3,1.3388,1.145743,0.290323,0.15859,0.205128
4,1.0337,0.905422,0.28066,0.524229,0.365591
5,0.8539,0.795736,0.216216,0.599119,0.317757
6,0.7521,0.732282,0.203514,0.612335,0.305495
7,0.6988,0.691513,0.24386,0.612335,0.348808
8,0.6607,0.682073,0.231148,0.621145,0.336918


{'eval_loss': 0.8341310024261475, 'eval_precision': 0.29207920792079206, 'eval_recall': 0.5700483091787439, 'eval_f1': 0.3862520458265139, 'eval_runtime': 0.1896, 'eval_samples_per_second': 158.247, 'eval_steps_per_second': 21.1, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.28      0.74      0.41        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.33      0.93      0.49        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.20      0.78      0.32        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.53      0.62      0.57        13
         PropertyType       0.73      0.85      0.79        26
         UrgencyScore       0.08      0.05      0.06        20

            micro avg       0.29      0.57      0.39       207
            macro avg       0.17      0.31      0.20 

0,1
eval/f1,▁▁▅█▇▇▇▇██
eval/loss,█▅▃▂▁▁▁▁▂▂
eval/precision,██▂▂▁▁▁▁▂▂
eval/recall,▁▁▃▇████▇▇
eval/runtime,▂▁▁▁▂▁▁▂█▁
eval/samples_per_second,▇███▇██▇▁█
eval/steps_per_second,▇███▇██▇▁█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.38625
eval/loss,0.83413
eval/precision,0.29208
eval/recall,0.57005
eval/runtime,0.1896
eval/samples_per_second,158.247
eval/steps_per_second,21.1
test/f1,0.38625
test/loss,0.83413
test/precision,0.29208


[34m[1mwandb[0m: Agent Starting Run: gi5aquan with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 1.2557770678988033e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.05447096984109032
[34m[1mwandb[0m: 	weight_decay: 0.10887050513865498


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6841,3.099876,0.0,0.0,0.0
2,2.5484,2.063484,1.0,0.0,0.0
3,1.897,1.809234,1.0,0.0,0.0
4,1.7214,1.664777,1.0,0.0,0.0
5,1.6045,1.584529,1.0,0.0,0.0
6,1.5485,1.555294,1.0,0.0,0.0


{'eval_loss': 3.090292453765869, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 0.1759, 'eval_samples_per_second': 170.587, 'eval_steps_per_second': 11.372, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.00      0.00      0.00        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.00      0.00      0.00        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.00      0.00      0.00        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.00      0.00      0.00        13
         PropertyType       0.00      0.00      0.00        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.00      0.00      0.00       207
            macro avg       0.00      0.00      0.00 

0,1
eval/f1,▁▁▁▁▁▁▁▁
eval/loss,█▃▂▁▁▁██
eval/precision,▁█████▁▁
eval/recall,▁▁▁▁▁▁▁▁
eval/runtime,█▄▆▆▄▃▆▁
eval/samples_per_second,▁▅▃▃▅▅▃█
eval/steps_per_second,▁▅▃▃▅▅▃█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.0
eval/loss,3.09029
eval/precision,0.0
eval/recall,0.0
eval/runtime,0.1759
eval/samples_per_second,170.587
eval/steps_per_second,11.372
test/f1,0.0
test/loss,3.09029
test/precision,0.0


[34m[1mwandb[0m: Agent Starting Run: n1p80o5x with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.998565313689423e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.11072242994588884
[34m[1mwandb[0m: 	weight_decay: 0.08164553658184762


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,4.0129,3.370392,0.0,0.0,0.0
2,2.3999,1.877022,1.0,0.0,0.0
3,1.6765,1.500938,1.0,0.0,0.0
4,1.3566,1.190296,0.183673,0.079295,0.110769
5,1.0781,0.95194,0.288952,0.449339,0.351724
6,0.8847,0.814849,0.226168,0.53304,0.317585
7,0.7619,0.727622,0.272537,0.572687,0.369318
8,0.6797,0.67733,0.271287,0.603524,0.374317
9,0.6272,0.646012,0.27381,0.60793,0.377565
10,0.5945,0.636691,0.279678,0.612335,0.383978


{'eval_loss': 0.5629532933235168, 'eval_precision': 0.3050847457627119, 'eval_recall': 0.6956521739130435, 'eval_f1': 0.42415316642120765, 'eval_runtime': 0.1834, 'eval_samples_per_second': 163.559, 'eval_steps_per_second': 10.904, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.35      0.70      0.46        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.37      0.93      0.53        30
             Industry       0.40      0.33      0.36        12
             LastName       0.32      0.75      0.44         8
            MustHaves       0.21      0.80      0.33        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.71      0.92      0.80        13
         PropertyType       0.77      0.92      0.84        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.16      0.55      0.25        20

            micro avg       0.31      0.70      0.42 

0,1
eval/f1,▁▁▁▃▇▆▇▇▇▇▇█
eval/loss,█▄▃▃▂▂▁▁▁▁▁▁
eval/precision,▁██▂▃▃▃▃▃▃▃▃
eval/recall,▁▁▁▂▆▆▇▇▇▇▇█
eval/runtime,▁▁▁▁▂▁▁▁▂▂█▂
eval/samples_per_second,▇███▇▇█▇▇▇▁▇
eval/steps_per_second,▇███▇▇█▇▇▇▁▇
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.42415
eval/loss,0.56295
eval/precision,0.30508
eval/recall,0.69565
eval/runtime,0.1834
eval/samples_per_second,163.559
eval/steps_per_second,10.904
test/f1,0.42415
test/loss,0.56295
test/precision,0.30508


[34m[1mwandb[0m: Agent Starting Run: 0xs4aya7 with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 3.73733386548396e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.1979512776258529
[34m[1mwandb[0m: 	weight_decay: 0.17452599929951376


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.4281,1.980274,1.0,0.0,0.0
2,1.5279,1.137235,0.2,0.061674,0.094276
3,0.939,0.75639,0.213689,0.563877,0.309927
4,0.6576,0.595784,0.199719,0.625551,0.302772
5,0.5274,0.529695,0.209141,0.665198,0.31823
6,0.4559,0.512717,0.254098,0.682819,0.37037


{'eval_loss': 0.4335229992866516, 'eval_precision': 0.27175843694493784, 'eval_recall': 0.7391304347826086, 'eval_f1': 0.3974025974025973, 'eval_runtime': 0.1886, 'eval_samples_per_second': 159.101, 'eval_steps_per_second': 21.213, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.36      0.67      0.47        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.46      0.93      0.62        30
             Industry       0.38      0.42      0.40        12
             LastName       0.35      0.88      0.50         8
            MustHaves       0.22      0.86      0.35        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.12      0.33      0.18         3
PreferredNeighborhood       0.67      0.92      0.77        13
         PropertyType       0.56      0.92      0.70        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.11      0.75      0.20        20

            micro avg       0.27      0.74      0.40 

0,1
eval/f1,▁▃▆▆▇███
eval/loss,█▄▂▂▁▁▁▁
eval/precision,█▁▁▁▁▁▁▂
eval/recall,▁▂▆▇▇▇▇█
eval/runtime,▁▁▂▂▁▂█▁
eval/samples_per_second,██▇▇█▇▁█
eval/steps_per_second,██▇▇█▇▁█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.3974
eval/loss,0.43352
eval/precision,0.27176
eval/recall,0.73913
eval/runtime,0.1886
eval/samples_per_second,159.101
eval/steps_per_second,21.213
test/f1,0.3974
test/loss,0.43352
test/precision,0.27176


[34m[1mwandb[0m: Agent Starting Run: kglsctp6 with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.90022867164286e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.06740521023032645
[34m[1mwandb[0m: 	weight_decay: 0.08554009050599434


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.842,2.915444,1.0,0.0,0.0
2,2.1142,1.741963,1.0,0.0,0.0
3,1.5531,1.384127,0.153846,0.008811,0.016667
4,1.2448,1.087119,0.291391,0.193833,0.232804
5,1.0,0.902282,0.237736,0.555066,0.332893
6,0.8418,0.78228,0.239714,0.590308,0.340967
7,0.7419,0.714377,0.234506,0.61674,0.339806
8,0.6703,0.67117,0.225397,0.625551,0.331389
9,0.6273,0.642434,0.245645,0.621145,0.35206
10,0.6006,0.6331,0.254054,0.621145,0.360614


{'eval_loss': 0.5611169934272766, 'eval_precision': 0.27920792079207923, 'eval_recall': 0.6811594202898551, 'eval_f1': 0.3960674157303371, 'eval_runtime': 0.1789, 'eval_samples_per_second': 167.729, 'eval_steps_per_second': 11.182, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.43      0.74      0.54        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.31      0.93      0.46        30
             Industry       0.44      0.33      0.38        12
             LastName       0.50      0.25      0.33         8
            MustHaves       0.21      0.86      0.34        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.63      0.92      0.75        13
         PropertyType       0.55      0.92      0.69        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.11      0.40      0.17        20

            micro avg       0.28      0.68      0.40 

0,1
eval/f1,▁▁▁▅▇▇▇▇▇▇▇█
eval/loss,█▅▃▃▂▂▁▁▁▁▁▁
eval/precision,██▁▂▂▂▂▂▂▂▂▂
eval/recall,▁▁▁▃▇▇▇▇▇▇▇█
eval/runtime,▁▂▁▂▂▃▂▂▂▂█▁
eval/samples_per_second,█▇█▇▇▆▇▇▆▇▁█
eval/steps_per_second,█▇█▇▇▆▇▇▆▇▁█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.39607
eval/loss,0.56112
eval/precision,0.27921
eval/recall,0.68116
eval/runtime,0.1789
eval/samples_per_second,167.729
eval/steps_per_second,11.182
test/f1,0.39607
test/loss,0.56112
test/precision,0.27921


[34m[1mwandb[0m: Agent Starting Run: 3fhkwut8 with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 3.685217715775454e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.06554292766090644
[34m[1mwandb[0m: 	weight_decay: 0.1366542653968309


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6398,2.253623,1.0,0.0,0.0
2,1.8374,1.540029,1.0,0.0,0.0
3,1.3615,1.167056,0.188889,0.07489,0.107256
4,1.0484,0.913696,0.212569,0.506608,0.299479
5,0.8361,0.759918,0.201893,0.563877,0.297329
6,0.6927,0.666948,0.193948,0.621145,0.295597
7,0.5946,0.604296,0.238579,0.621145,0.344743
8,0.5373,0.568701,0.23825,0.647577,0.348341
9,0.4995,0.55379,0.23493,0.669604,0.347826
10,0.4755,0.545298,0.248333,0.656388,0.360339


{'eval_loss': 0.4603821337223053, 'eval_precision': 0.27387387387387385, 'eval_recall': 0.7342995169082126, 'eval_f1': 0.3989501312335958, 'eval_runtime': 0.1839, 'eval_samples_per_second': 163.108, 'eval_steps_per_second': 10.874, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.37      0.70      0.49        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.39      0.93      0.55        30
             Industry       0.33      0.25      0.29        12
             LastName       0.35      0.88      0.50         8
            MustHaves       0.23      0.88      0.36        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.14      0.33      0.20         3
PreferredNeighborhood       0.67      0.92      0.77        13
         PropertyType       0.52      0.92      0.67        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.11      0.70      0.19        20

            micro avg       0.27      0.73      0.40 

0,1
eval/f1,▁▁▃▆▆▆▇▇▇▇▇█
eval/loss,█▅▄▃▂▂▂▁▁▁▁▁
eval/precision,██▁▁▁▁▁▁▁▂▂▂
eval/recall,▁▁▂▆▆▇▇▇▇▇▇█
eval/runtime,▃▁▄▃▅▂▂█▁▆▆▅
eval/samples_per_second,▆█▅▆▄▇▇▁█▃▃▄
eval/steps_per_second,▆█▅▆▄▇▇▁█▃▃▄
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.39895
eval/loss,0.46038
eval/precision,0.27387
eval/recall,0.7343
eval/runtime,0.1839
eval/samples_per_second,163.108
eval/steps_per_second,10.874
test/f1,0.39895
test/loss,0.46038
test/precision,0.27387


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: nqyyovyy with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 3.818662923264096e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.08584422328219288
[34m[1mwandb[0m: 	weight_decay: 0.09700009289287068


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.5861,2.412189,1.0,0.0,0.0
2,1.979,1.693933,1.0,0.0,0.0
3,1.4732,1.269024,0.166667,0.048458,0.075085
4,1.1259,0.983335,0.2175,0.38326,0.277512
5,0.8731,0.80384,0.221207,0.53304,0.312661
6,0.707,0.689144,0.233503,0.60793,0.337408
7,0.5942,0.630744,0.241546,0.660793,0.353774
8,0.5313,0.59655,0.264755,0.69163,0.382927
9,0.4886,0.581389,0.283859,0.674009,0.399478
10,0.4673,0.572488,0.28,0.678414,0.396396


{'eval_loss': 0.5020738840103149, 'eval_precision': 0.30495049504950494, 'eval_recall': 0.7439613526570048, 'eval_f1': 0.4325842696629213, 'eval_runtime': 0.1805, 'eval_samples_per_second': 166.161, 'eval_steps_per_second': 11.077, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.47      0.78      0.58        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.38      0.93      0.54        30
             Industry       0.41      0.58      0.48        12
             LastName       0.35      0.88      0.50         8
            MustHaves       0.24      0.84      0.37        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.17      0.67      0.27         3
PreferredNeighborhood       0.80      0.92      0.86        13
         PropertyType       0.62      0.92      0.74        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.12      0.55      0.19        20

            micro avg       0.30      0.74      0.43 

0,1
eval/f1,▁▁▂▅▆▆▇▇▇▇▇█
eval/loss,█▅▄▃▂▂▁▁▁▁▁▁
eval/precision,██▁▁▁▂▂▂▂▂▂▂
eval/recall,▁▁▁▅▆▇▇█▇▇▇█
eval/runtime,▂▁▁▁▂▂▂▂▃▂█▂
eval/samples_per_second,▇███▇▆▇▇▆▇▁▇
eval/steps_per_second,▇███▇▆▇▇▆▇▁▇
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.43258
eval/loss,0.50207
eval/precision,0.30495
eval/recall,0.74396
eval/runtime,0.1805
eval/samples_per_second,166.161
eval/steps_per_second,11.077
test/f1,0.43258
test/loss,0.50207
test/precision,0.30495


[34m[1mwandb[0m: Agent Starting Run: o2445qbl with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.791276222771067e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.1041864430213464
[34m[1mwandb[0m: 	weight_decay: 0.08287537290740327


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6067,2.537847,1.0,0.0,0.0
2,2.0705,1.819223,1.0,0.0,0.0
3,1.644,1.499149,1.0,0.0,0.0
4,1.3806,1.283248,0.238095,0.044053,0.074349
5,1.2062,1.153052,0.187266,0.220264,0.202429
6,1.1115,1.108884,0.225806,0.277533,0.249012


{'eval_loss': 1.0522377490997314, 'eval_precision': 0.23461538461538461, 'eval_recall': 0.2946859903381642, 'eval_f1': 0.2612419700214132, 'eval_runtime': 0.1786, 'eval_samples_per_second': 167.972, 'eval_steps_per_second': 11.198, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.21      0.26      0.23        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.45      0.83      0.59        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.18      0.58      0.27        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.00      0.00      0.00        13
         PropertyType       0.00      0.00      0.00        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.23      0.29      0.26       207
            macro avg       0.06      0.13      0.08 

0,1
eval/f1,▁▁▁▃▆███
eval/loss,█▅▃▂▁▁▁▁
eval/precision,███▁▁▁▁▁
eval/recall,▁▁▁▂▆███
eval/runtime,▂▂▂▁▁▃█▁
eval/samples_per_second,▇▇▇██▆▁█
eval/steps_per_second,▇▇▇██▆▁█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.26124
eval/loss,1.05224
eval/precision,0.23462
eval/recall,0.29469
eval/runtime,0.1786
eval/samples_per_second,167.972
eval/steps_per_second,11.198
test/f1,0.26124
test/loss,1.05224
test/precision,0.23462


[34m[1mwandb[0m: Agent Starting Run: 0fom09w6 with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 2.8847783730441056e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.1274301632716811
[34m[1mwandb[0m: 	weight_decay: 0.1498608020433817


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.8465,2.951985,1.0,0.0,0.0
2,2.1565,1.78764,1.0,0.0,0.0
3,1.5975,1.462809,1.0,0.0,0.0
4,1.3569,1.239013,0.37037,0.088106,0.142349
5,1.169,1.105938,0.292517,0.189427,0.229947
6,1.0796,1.058767,0.277512,0.255507,0.266055


{'eval_loss': 0.989685595035553, 'eval_precision': 0.3471502590673575, 'eval_recall': 0.32367149758454106, 'eval_f1': 0.335, 'eval_runtime': 0.1779, 'eval_samples_per_second': 168.592, 'eval_steps_per_second': 11.239, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.28      0.74      0.41        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.88      0.23      0.37        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.22      0.36      0.27        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.00      0.00      0.00        13
         PropertyType       0.88      0.85      0.86        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.35      0.32      0.34       207
            macro avg       0.17      0.17      0.15 

0,1
eval/f1,▁▁▁▄▆▇▇█
eval/loss,█▄▃▂▁▁▁▁
eval/precision,███▂▁▁▁▂
eval/recall,▁▁▁▃▅▇▇█
eval/runtime,▄▂▂▂▅▂█▁
eval/samples_per_second,▅▇▇▇▄▇▁█
eval/steps_per_second,▅▇▇▇▄▇▁█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.335
eval/loss,0.98969
eval/precision,0.34715
eval/recall,0.32367
eval/runtime,0.1779
eval/samples_per_second,168.592
eval/steps_per_second,11.239
test/f1,0.335
test/loss,0.98969
test/precision,0.34715


[34m[1mwandb[0m: Agent Starting Run: p21y10c7 with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 2.0196671213787284e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.07929368868700969
[34m[1mwandb[0m: 	weight_decay: 0.06468597547011538


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.7205,2.331753,1.0,0.0,0.0
2,1.8227,1.541244,1.0,0.0,0.0
3,1.3145,1.087859,0.285047,0.268722,0.276644
4,0.9557,0.821755,0.257143,0.515419,0.343109
5,0.739,0.681347,0.265504,0.603524,0.368775
6,0.6043,0.613418,0.197581,0.647577,0.302781
7,0.5451,0.563645,0.251678,0.660793,0.36452
8,0.484,0.543328,0.288168,0.665198,0.40213
9,0.4527,0.531513,0.278777,0.682819,0.395913
10,0.435,0.527991,0.283054,0.669604,0.397906


{'eval_loss': 0.46156227588653564, 'eval_precision': 0.310838445807771, 'eval_recall': 0.7342995169082126, 'eval_f1': 0.4367816091954023, 'eval_runtime': 0.1958, 'eval_samples_per_second': 153.21, 'eval_steps_per_second': 20.428, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.40      0.78      0.53        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.38      0.93      0.54        30
             Industry       0.47      0.58      0.52        12
             LastName       0.35      0.88      0.50         8
            MustHaves       0.22      0.80      0.35        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.25      0.33      0.29         3
PreferredNeighborhood       0.71      0.92      0.80        13
         PropertyType       0.65      0.92      0.76        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.15      0.60      0.24        20

            micro avg       0.31      0.73      0.44 

0,1
eval/f1,▁▁▅▆▇▆▇▇▇▇▇█
eval/loss,█▅▃▂▂▂▁▁▁▁▁▁
eval/precision,██▂▂▂▁▁▂▂▂▂▂
eval/recall,▁▁▄▆▇▇▇▇█▇▇█
eval/runtime,▂▁▂▁▃▁▁▅▁▃█▂
eval/samples_per_second,▇█▇█▆██▄█▆▁▆
eval/steps_per_second,▇█▇█▆██▄█▆▁▆
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.43678
eval/loss,0.46156
eval/precision,0.31084
eval/recall,0.7343
eval/runtime,0.1958
eval/samples_per_second,153.21
eval/steps_per_second,20.428
test/f1,0.43678
test/loss,0.46156
test/precision,0.31084


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: mla5mf0k with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 1.1887462672600453e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.07103037657767161
[34m[1mwandb[0m: 	weight_decay: 0.10425742983771658


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,4.0376,3.612924,0.0,0.0,0.0
2,3.0754,2.460662,1.0,0.0,0.0
3,2.1352,1.917529,1.0,0.0,0.0
4,1.7819,1.691248,1.0,0.0,0.0
5,1.6061,1.54804,1.0,0.0,0.0
6,1.4892,1.455692,1.0,0.0,0.0
7,1.4237,1.400598,1.0,0.0,0.0
8,1.3818,1.380021,1.0,0.0,0.0


{'eval_loss': 3.6063168048858643, 'eval_precision': 0.00038211692777990065, 'eval_recall': 0.00966183574879227, 'eval_f1': 0.0007351589781290205, 'eval_runtime': 0.1907, 'eval_samples_per_second': 157.31, 'eval_steps_per_second': 10.487, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

               Budget       0.00      0.00      0.00         0
          CompanyName       0.00      0.00      0.00        27
          CompanySize       0.00      0.00      0.00         0
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.33      0.01         6
                Email       0.00      0.00      0.00         0
     FirstInteraction       0.00      0.00      0.00         0
            FirstName       0.00      0.00      0.00        30
          GrowthStage       0.00      0.00      0.00         0
             Industry       0.00      0.00      0.00        12
      LastInteraction       0.00      0.00      0.00         0
             LastName       0.00      0.00      0.00         8
           MovingTerm       0.00      0.00      0.00         0
       MovingTimeline       0.00      0.00      0.00         0
            MustHaves       0.00      0.00      0.00  

0,1
eval/f1,▁▁▁▁▁▁▁▁▁█
eval/loss,█▄▃▂▂▁▁▁██
eval/precision,▁███████▁▁
eval/recall,▁▁▁▁▁▁▁▁▁█
eval/runtime,▄▁▁▂▂▁▁▂█▃
eval/samples_per_second,▅██▇▇██▇▁▅
eval/steps_per_second,▅██▇▇██▇▁▅
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.00074
eval/loss,3.60632
eval/precision,0.00038
eval/recall,0.00966
eval/runtime,0.1907
eval/samples_per_second,157.31
eval/steps_per_second,10.487
test/f1,0.00074
test/loss,3.60632
test/precision,0.00038


[34m[1mwandb[0m: Agent Starting Run: 5s0wutum with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.7269100045520024e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.12933862217137568
[34m[1mwandb[0m: 	weight_decay: 0.09599373053343042


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6085,2.515304,1.0,0.0,0.0
2,1.9767,1.713029,1.0,0.0,0.0
3,1.5613,1.424658,1.0,0.0,0.0
4,1.3297,1.234674,0.071429,0.008811,0.015686
5,1.1728,1.121809,0.226721,0.246696,0.236287
6,1.0889,1.081716,0.230483,0.273128,0.25


{'eval_loss': 1.010423183441162, 'eval_precision': 0.3008474576271186, 'eval_recall': 0.34299516908212563, 'eval_f1': 0.32054176072234764, 'eval_runtime': 0.1907, 'eval_samples_per_second': 157.345, 'eval_steps_per_second': 10.49, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.38      0.67      0.49        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.41      0.90      0.56        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.18      0.42      0.25        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.00      0.00      0.00        13
         PropertyType       1.00      0.19      0.32        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.30      0.34      0.32       207
            macro avg       0.15      0.17      0.12 

0,1
eval/f1,▁▁▁▁▆▆▆█
eval/loss,█▄▃▂▂▁▁▁
eval/precision,███▁▂▂▂▃
eval/recall,▁▁▁▁▆▇▇█
eval/runtime,▂▄▁▁▁▂█▇
eval/samples_per_second,▇▅███▇▁▂
eval/steps_per_second,▇▅███▇▁▂
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.32054
eval/loss,1.01042
eval/precision,0.30085
eval/recall,0.343
eval/runtime,0.1907
eval/samples_per_second,157.345
eval/steps_per_second,10.49
test/f1,0.32054
test/loss,1.01042
test/precision,0.30085


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: kxgyusan with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 3.89704952470483e-05
[34m[1mwandb[0m: 	num_train_epochs: 10
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.16922267105807787
[34m[1mwandb[0m: 	weight_decay: 0.15180827596024266


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.9777,3.432479,0.005964,0.013216,0.008219
2,2.4765,1.826392,1.0,0.0,0.0
3,1.5818,1.354357,0.229167,0.048458,0.08
4,1.1873,0.987483,0.324042,0.409692,0.361868
5,0.8905,0.783145,0.211599,0.594714,0.312139
6,0.7145,0.670274,0.179894,0.599119,0.276704
7,0.605,0.594737,0.239669,0.638767,0.348558
8,0.5285,0.557463,0.255537,0.660793,0.36855
9,0.4848,0.532288,0.266102,0.69163,0.384333
10,0.4566,0.527022,0.277487,0.700441,0.3975


{'eval_loss': 0.448280394077301, 'eval_precision': 0.29902912621359223, 'eval_recall': 0.7439613526570048, 'eval_f1': 0.4265927977839335, 'eval_runtime': 0.1904, 'eval_samples_per_second': 157.593, 'eval_steps_per_second': 10.506, 'epoch': 10.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.47      0.78      0.58        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.41      0.93      0.57        30
             Industry       0.47      0.67      0.55        12
             LastName       0.33      0.75      0.46         8
            MustHaves       0.23      0.84      0.36        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.25      0.67      0.36         3
PreferredNeighborhood       0.71      0.92      0.80        13
         PropertyType       0.53      0.92      0.68        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.12      0.55      0.20        20

            micro avg       0.30      0.74      0.43 

0,1
eval/f1,▁▁▂▇▆▆▇▇▇███
eval/loss,█▄▃▂▂▂▁▁▁▁▁▁
eval/precision,▁█▃▃▂▂▃▃▃▃▃▃
eval/recall,▁▁▁▅▇▇▇▇████
eval/runtime,▂▁▁▂▂▂▂▂▁▂█▄
eval/samples_per_second,▇██▇▇▇▇▇█▇▁▄
eval/steps_per_second,▇██▇▇▇▇▇█▇▁▄
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.42659
eval/loss,0.44828
eval/precision,0.29903
eval/recall,0.74396
eval/runtime,0.1904
eval/samples_per_second,157.593
eval/steps_per_second,10.506
test/f1,0.42659
test/loss,0.44828
test/precision,0.29903


[34m[1mwandb[0m: Agent Starting Run: 0hog3a3w with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 1.3658396993314802e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.10113018586437938
[34m[1mwandb[0m: 	weight_decay: 0.09180623043220992


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6555,2.464374,1.0,0.0,0.0
2,1.9082,1.601706,1.0,0.0,0.0
3,1.4331,1.286296,0.0,0.0,0.0
4,1.1975,1.088578,0.303965,0.303965,0.303965
5,1.055,0.996702,0.255,0.449339,0.325359
6,0.9784,0.964129,0.274112,0.475771,0.347826


{'eval_loss': 0.8959845900535583, 'eval_precision': 0.2845303867403315, 'eval_recall': 0.4975845410628019, 'eval_f1': 0.36203866432337434, 'eval_runtime': 0.19, 'eval_samples_per_second': 157.915, 'eval_steps_per_second': 21.055, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.32      0.70      0.44        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.39      0.93      0.55        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.18      0.70      0.28        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.20      0.08      0.11        13
         PropertyType       0.69      0.77      0.73        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.28      0.50      0.36       207
            macro avg       0.14      0.24      0.16 

0,1
eval/f1,▁▁▁▇▇███
eval/loss,█▄▃▂▁▁▁▁
eval/precision,██▁▃▃▃▃▃
eval/recall,▁▁▁▅▇███
eval/runtime,▂▃▁▂▂█▁▁
eval/samples_per_second,▇▆█▇▇▁██
eval/steps_per_second,▇▆█▇▇▁██
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.36204
eval/loss,0.89598
eval/precision,0.28453
eval/recall,0.49758
eval/runtime,0.19
eval/samples_per_second,157.915
eval/steps_per_second,21.055
test/f1,0.36204
test/loss,0.89598
test/precision,0.28453


[34m[1mwandb[0m: Agent Starting Run: tocn7h7g with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 1.790378047578456e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.1618188442658644
[34m[1mwandb[0m: 	weight_decay: 0.1089280688898064


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,4.0165,3.58146,0.001156,0.013216,0.002126
2,2.883,2.070069,1.0,0.0,0.0
3,1.8693,1.747147,1.0,0.0,0.0
4,1.6297,1.554653,1.0,0.0,0.0
5,1.4886,1.444985,1.0,0.0,0.0
6,1.4039,1.400702,0.0,0.0,0.0


{'eval_loss': 3.570537567138672, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 0.1872, 'eval_samples_per_second': 160.249, 'eval_steps_per_second': 10.683, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

               Budget       0.00      0.00      0.00         0
          CompanyName       0.00      0.00      0.00        27
          CompanySize       0.00      0.00      0.00         0
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
                Email       0.00      0.00      0.00         0
     FirstInteraction       0.00      0.00      0.00         0
            FirstName       0.00      0.00      0.00        30
          GrowthStage       0.00      0.00      0.00         0
             Industry       0.00      0.00      0.00        12
      LastInteraction       0.00      0.00      0.00         0
             LastName       0.00      0.00      0.00         8
           MovingTerm       0.00      0.00      0.00         0
       MovingTimeline       0.00      0.00      0.00         0
            MustHaves       0.00      0.00      0.00  

0,1
eval/f1,█▁▁▁▁▁█▁
eval/loss,█▃▂▁▁▁██
eval/precision,▁████▁▁▁
eval/recall,█▁▁▁▁▁█▁
eval/runtime,▆▂▂▁▂▁█▅
eval/samples_per_second,▃▇▇█▇█▁▄
eval/steps_per_second,▃▇▇█▇█▁▄
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.0
eval/loss,3.57054
eval/precision,0.0
eval/recall,0.0
eval/runtime,0.1872
eval/samples_per_second,160.249
eval/steps_per_second,10.683
test/f1,0.0
test/loss,3.57054
test/precision,0.0


[34m[1mwandb[0m: Agent Starting Run: bfa7a5w0 with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 1.7381869753526496e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 42
[34m[1mwandb[0m: 	warmup_ratio: 0.05497732539023774
[34m[1mwandb[0m: 	weight_decay: 0.1337149404446114


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.8299,3.190325,0.0,0.0,0.0
2,2.5241,1.94206,1.0,0.0,0.0
3,1.8059,1.710446,1.0,0.0,0.0
4,1.6092,1.545092,1.0,0.0,0.0
5,1.4784,1.446247,1.0,0.0,0.0
6,1.4123,1.407818,0.0,0.0,0.0


{'eval_loss': 3.174240827560425, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 0.1768, 'eval_samples_per_second': 169.643, 'eval_steps_per_second': 11.31, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.00      0.00      0.00        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.00      0.00      0.00        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.00      0.00      0.00        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.00      0.00      0.00        13
         PropertyType       0.00      0.00      0.00        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.00      0.00      0.00       207
            macro avg       0.00      0.00      0.00 

0,1
eval/f1,▁▁▁▁▁▁▁▁
eval/loss,█▃▂▂▁▁██
eval/precision,▁████▁▁▁
eval/recall,▁▁▁▁▁▁▁▁
eval/runtime,▂▂▁▂▂▁█▁
eval/samples_per_second,▇▇█▇▇█▁█
eval/steps_per_second,▇▇█▇▇█▁█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.0
eval/loss,3.17424
eval/precision,0.0
eval/recall,0.0
eval/runtime,0.1768
eval/samples_per_second,169.643
eval/steps_per_second,11.31
test/f1,0.0
test/loss,3.17424
test/precision,0.0


[34m[1mwandb[0m: Agent Starting Run: zgurtt5n with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.0473532964088585e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.11883239631743574
[34m[1mwandb[0m: 	weight_decay: 0.11046032671086913


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,4.0673,3.58423,0.000664,0.008811,0.001236
2,2.7675,2.018287,1.0,0.0,0.0
3,1.8584,1.720939,1.0,0.0,0.0
4,1.6007,1.501794,1.0,0.0,0.0
5,1.4096,1.319821,0.227273,0.022026,0.040161
6,1.2488,1.189204,0.22619,0.0837,0.122186
7,1.1478,1.111193,0.272727,0.211454,0.238213
8,1.0916,1.084343,0.298077,0.273128,0.285057


{'eval_loss': 1.0181485414505005, 'eval_precision': 0.29842931937172773, 'eval_recall': 0.2753623188405797, 'eval_f1': 0.2864321608040201, 'eval_runtime': 0.1784, 'eval_samples_per_second': 168.189, 'eval_steps_per_second': 11.213, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.29      0.63      0.40        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.83      0.17      0.28        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.22      0.48      0.31        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.00      0.00      0.00        13
         PropertyType       1.00      0.42      0.59        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.30      0.28      0.29       207
            macro avg       0.18      0.13      0.12 

0,1
eval/f1,▁▁▁▁▂▄▇███
eval/loss,█▄▃▂▂▁▁▁▁▁
eval/precision,▁███▃▃▃▃▃▃
eval/recall,▁▁▁▁▂▃▆███
eval/runtime,▄▁▂▁▃▁▃▄█▁
eval/samples_per_second,▅▇▇█▅▇▆▅▁█
eval/steps_per_second,▅▇▇█▅▇▆▅▁█
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.28643
eval/loss,1.01815
eval/precision,0.29843
eval/recall,0.27536
eval/runtime,0.1784
eval/samples_per_second,168.189
eval/steps_per_second,11.213
test/f1,0.28643
test/loss,1.01815
test/precision,0.29843


[34m[1mwandb[0m: Agent Starting Run: ehmuiv8o with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.255144336221304e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 8
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.14692367295531478
[34m[1mwandb[0m: 	weight_decay: 0.06326732597381508


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.4749,2.225406,1.0,0.0,0.0
2,1.7347,1.413702,1.0,0.0,0.0
3,1.1932,1.001495,0.237288,0.493392,0.320458
4,0.8909,0.77128,0.224561,0.563877,0.321205
5,0.7034,0.653493,0.220758,0.590308,0.321343
6,0.5888,0.601132,0.191816,0.660793,0.297324
7,0.5424,0.568642,0.244767,0.669604,0.358491
8,0.5056,0.565445,0.263158,0.660793,0.376412


{'eval_loss': 0.4824983477592468, 'eval_precision': 0.291015625, 'eval_recall': 0.7198067632850241, 'eval_f1': 0.41446453407510425, 'eval_runtime': 0.1938, 'eval_samples_per_second': 154.835, 'eval_steps_per_second': 20.645, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.37      0.67      0.47        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.43      0.97      0.59        30
             Industry       0.50      0.42      0.45        12
             LastName       0.35      0.88      0.50         8
            MustHaves       0.22      0.84      0.35        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.20      0.33      0.25         3
PreferredNeighborhood       0.71      0.92      0.80        13
         PropertyType       0.57      0.92      0.71        26
            SpaceSize       0.00      0.00      0.00         0
         UrgencyScore       0.11      0.55      0.19        20

            micro avg       0.29      0.72      0.41 

0,1
eval/f1,▁▁▆▆▆▆▇▇▇█
eval/loss,█▅▃▂▂▁▁▁▁▁
eval/precision,██▁▁▁▁▁▂▂▂
eval/recall,▁▁▆▆▇▇█▇▇█
eval/runtime,▅█▇▂▂▄▃▁▆▄
eval/samples_per_second,▄▁▂▇▇▅▆█▃▅
eval/steps_per_second,▄▁▂▇▇▅▆█▃▅
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.41446
eval/loss,0.4825
eval/precision,0.29102
eval/recall,0.71981
eval/runtime,0.1938
eval/samples_per_second,154.835
eval/steps_per_second,20.645
test/f1,0.41446
test/loss,0.4825
test/precision,0.29102


[34m[1mwandb[0m: Agent Starting Run: urnba9n5 with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-08
[34m[1mwandb[0m: 	learning_rate: 2.038221046426919e-05
[34m[1mwandb[0m: 	num_train_epochs: 8
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.10165667838056204
[34m[1mwandb[0m: 	weight_decay: 0.1098727733489712


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.7443,3.018028,1.0,0.0,0.0
2,2.2636,1.852144,1.0,0.0,0.0
3,1.6915,1.551706,1.0,0.0,0.0
4,1.4486,1.346835,1.0,0.0,0.0
5,1.268,1.19157,0.194444,0.123348,0.150943
6,1.1351,1.089383,0.233463,0.264317,0.247934
7,1.0566,1.035074,0.255102,0.440529,0.323102
8,1.0171,1.01388,0.267677,0.46696,0.340289


{'eval_loss': 0.9401419162750244, 'eval_precision': 0.30346820809248554, 'eval_recall': 0.5072463768115942, 'eval_f1': 0.37974683544303794, 'eval_runtime': 0.1819, 'eval_samples_per_second': 164.889, 'eval_steps_per_second': 10.993, 'epoch': 8.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.38      0.70      0.49        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.34      0.93      0.50        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.19      0.72      0.30        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.57      0.31      0.40        13
         PropertyType       0.95      0.69      0.80        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.30      0.51      0.38       207
            macro avg       0.19      0.26      0.19 

0,1
eval/f1,▁▁▁▁▄▆▇▇▇█
eval/loss,█▄▃▂▂▂▁▁▁▁
eval/precision,████▁▁▂▂▂▂
eval/recall,▁▁▁▁▃▅▇▇▇█
eval/runtime,▃▁▁█▂▅▄▃▄▃
eval/samples_per_second,▆██▁▇▄▅▆▅▆
eval/steps_per_second,▆██▁▇▄▅▆▅▆
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.37975
eval/loss,0.94014
eval/precision,0.30347
eval/recall,0.50725
eval/runtime,0.1819
eval/samples_per_second,164.889
eval/steps_per_second,10.993
test/f1,0.37975
test/loss,0.94014
test/precision,0.30347


[34m[1mwandb[0m: Agent Starting Run: ykpq0hhi with config:
[34m[1mwandb[0m: 	adam_epsilon: 1e-06
[34m[1mwandb[0m: 	learning_rate: 2.937363415135835e-05
[34m[1mwandb[0m: 	num_train_epochs: 6
[34m[1mwandb[0m: 	per_device_train_batch_size: 16
[34m[1mwandb[0m: 	seed: 2024
[34m[1mwandb[0m: 	warmup_ratio: 0.1300715751459401
[34m[1mwandb[0m: 	weight_decay: 0.16895242624596368


Some weights of WeightedDistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,3.6391,2.582971,1.0,0.0,0.0
2,1.9905,1.709658,1.0,0.0,0.0
3,1.5521,1.410122,1.0,0.0,0.0
4,1.3106,1.209551,0.134615,0.030837,0.050179
5,1.1466,1.093116,0.224561,0.281938,0.25
6,1.06,1.051925,0.263492,0.365639,0.306273


{'eval_loss': 0.9791063070297241, 'eval_precision': 0.3074074074074074, 'eval_recall': 0.40096618357487923, 'eval_f1': 0.34800838574423487, 'eval_runtime': 0.1835, 'eval_samples_per_second': 163.517, 'eval_steps_per_second': 10.901, 'epoch': 6.0}


  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

          CompanyName       0.38      0.67      0.48        27
  CurrentNeighborhood       0.00      0.00      0.00         1
    DecisionMakerRole       0.00      0.00      0.00         6
            FirstName       0.37      0.90      0.52        30
             Industry       0.00      0.00      0.00        12
             LastName       0.00      0.00      0.00         8
            MustHaves       0.20      0.56      0.30        50
          NiceToHaves       0.00      0.00      0.00        10
            PainPoint       0.00      0.00      0.00         1
   PreferredLeaseTerm       0.00      0.00      0.00         3
PreferredNeighborhood       0.00      0.00      0.00        13
         PropertyType       1.00      0.38      0.56        26
         UrgencyScore       0.00      0.00      0.00        20

            micro avg       0.31      0.40      0.35       207
            macro avg       0.15      0.19      0.14 

0,1
eval/f1,▁▁▁▂▆▇▇█
eval/loss,█▄▃▂▁▁▁▁
eval/precision,███▁▂▂▂▂
eval/recall,▁▁▁▂▆▇▇█
eval/runtime,▇▁▂▁▄██▆
eval/samples_per_second,▂█▇█▅▁▁▃
eval/steps_per_second,▂█▇█▅▁▁▃
test/f1,▁
test/loss,▁
test/precision,▁

0,1
eval/f1,0.34801
eval/loss,0.97911
eval/precision,0.30741
eval/recall,0.40097
eval/runtime,0.1835
eval/samples_per_second,163.517
eval/steps_per_second,10.901
test/f1,0.34801
test/loss,0.97911
test/precision,0.30741


Error in callback <bound method _WandbInit._post_run_cell_hook of <wandb.sdk.wandb_init._WandbInit object at 0x7cf499a5bf10>> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
'''
Predict and Evaluate results on DistilBERT model without training
'''

def run_base_model():
    """Evaluate the `distilbert-base-uncased` checkpoint on the test set without any training.

    Opens a wandb run named "distilbert_base" in project "distilbert-final",
    loads `DistilBertForTokenClassification` with `len(ENTITY_TAGS)` labels,
    evaluates it on `test_data` through a `Trainer`, and then:

    * prints the metrics dict returned by `Trainer.evaluate`,
    * prints the prediction table returned by `evaluate_model` and logs it to
      wandb as a `wandb.Table`,
    * logs every entry of `get_entity_metrics(df)` to wandb under a
      `test_` prefix (values are cast to `float`),
    * writes the prediction table as JSON records to
      `<output_dir>/<run name>_predictions.json`.

    Relies on names defined elsewhere in the notebook: `ENTITY_TAGS`,
    `id2label`, `label2id`, `test_data`, `tokenizer`, `compute_metrics`,
    `evaluate_model`, `get_entity_metrics` and `output_dir`.

    Returns:
        None. All results are printed, logged to wandb, or written to disk.
    """
    print("Running base DistilBERT model without training...")

    with wandb.init(project="distilbert-final", name="distilbert_base", reinit=True) as run:

        # Load pre-trained base model
        device = "cuda" if torch.cuda.is_available() else "cpu"
        base_model = DistilBertForTokenClassification.from_pretrained(
            "distilbert-base-uncased",
            num_labels=len(ENTITY_TAGS),
            id2label=id2label,
            label2id=label2id
        ).to(device)

        # Setup trainer (evaluation only; `train` is never called here)
        eval_args = TrainingArguments(
            output_dir="./base_model_results",
            per_device_eval_batch_size=8,
            report_to="wandb",
            logging_dir="./logs",
            seed=42
        )

        # Initiate trainer
        trainer = Trainer(
            model=base_model,
            args=eval_args,
            eval_dataset=test_data,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        # Evaluate exactly once. `eval_dataset` is already `test_data`, so an
        # extra no-argument `trainer.evaluate()` would only repeat the same
        # pass and log a duplicate set of eval metrics to wandb.
        metrics = trainer.evaluate(test_data)
        print(metrics)

        # Per-example predictions; `df` is presumably a pandas DataFrame
        # (it is passed to `wandb.Table(dataframe=...)` and `.to_json`).
        df = evaluate_model(trainer, test_data, id2label, tokenizer)
        print(df)
        wandb_table = wandb.Table(dataframe=df)
        wandb.log({f"{run.name} Predictions": wandb_table})

        # Log entity-level metrics under a "test_" prefix as plain floats.
        entity_metrics = get_entity_metrics(df)
        wandb.log({f"test_{k}": float(v) for k, v in entity_metrics.items()})

        # Save all predictions to Google Drive
        pred_path = os.path.join(output_dir, f"{run.name}_predictions.json")
        df.to_json(pred_path, orient="records", indent=2)
        print(f"Predictions saved to: {pred_path}")

#run_base_model()