# Load the Menstruation-Based Emotion Dataset

In [36]:

import torch
import pandas as pd
from datasets import Dataset
import ast

# Use the MPS backend when torch reports it as available; otherwise run on CPU.
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")


df = pd.read_csv("../../dataset/menstrual_emotion/synthetic_data_womens_health.csv")  # Update path as needed

# If the first "emotions" entry is still a string, parse every entry back into
# a Python object with ast.literal_eval.
first_emotions = df["emotions"].iloc[0]
if isinstance(first_emotions, str):
    df["emotions"] = df["emotions"].map(ast.literal_eval)

# Spread each emotions entry over one column per key; the resulting column
# names are the label names used by the rest of the notebook.
emotion_df = df["emotions"].apply(pd.Series)
label_columns = list(emotion_df.columns)

# Text column next to the expanded emotion columns, with missing values set to 0.
df_ready = pd.concat([df["text"], emotion_df], axis=1).fillna(0)

# Wrap as a Hugging Face Dataset and hold out 10% (fixed seed) for validation.
dataset = Dataset.from_pandas(df_ready).train_test_split(test_size=0.1, seed=42)
train_dataset_raw = dataset["train"]
val_dataset_raw = dataset["test"]

print(f"Label columns: {label_columns}")

print(f"Train size: {len(train_dataset_raw)}")
print(f"Validation size: {len(val_dataset_raw)}")

Using device: mps
Label columns: ['Improved mood', 'Hopefulness', 'Renewed energy', 'Optimism', 'Productivity', 'Clarity', 'Confidence', 'High energy', 'Sociability', 'Empowerment', 'Motivation', 'Sadness', 'Tearfulness', 'Low self-esteem', 'Loneliness or Isolation', 'Feeling overwhelmed', 'Anger or frustration', 'Irritability', 'Mood swings', 'Anxiety', 'Sensitivity to rejection', 'Restlessness', 'Emotional sensitivity', 'Physical discomfort', 'Attractiveness', 'Sexual drive', 'Feeling in control', 'Gratitude', 'Relief']
Train size: 4500
Validation size: 500


Label Preparation

In [37]:

print(train_dataset_raw[1])

{'text': "Starting this new job has me feeling anxious but alquite hopeful. It's scary and exciting at the same time.", 'Improved mood': 0.0, 'Hopefulness': 0.0, 'Renewed energy': 0.0, 'Optimism': 1.0, 'Productivity': 0.0, 'Clarity': 1.0, 'Confidence': 1.0, 'High energy': 0.0, 'Sociability': 0.0, 'Empowerment': 0.0, 'Motivation': 1.0, 'Sadness': 0.0, 'Tearfulness': 0.0, 'Low self-esteem': 0.0, 'Loneliness or Isolation': 0.0, 'Feeling overwhelmed': 0.0, 'Anger or frustration': 0.0, 'Irritability': 0.0, 'Mood swings': 0.0, 'Anxiety': 0.0, 'Sensitivity to rejection': 0.0, 'Restlessness': 0.0, 'Emotional sensitivity': 0.0, 'Physical discomfort': 0.0, 'Attractiveness': 1.0, 'Sexual drive': 0.0, 'Feeling in control': 1.0, 'Gratitude': 0.0, 'Relief': 0.0}


In [38]:
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss
from collections import Counter


num_labels = len(label_columns)


def preprocess(example):
    """Build the ``labels`` list for one dataset row.

    Reads one value per name in the module-level ``label_columns`` (in that
    order) from ``example`` and returns it together with the row's text.
    """
    label_vector = []
    for label in label_columns:
        label_vector.append(example[label])
    return {"text": example["text"], "labels": label_vector}


# Apply the row-wise transform to both splits.
train_dataset, val_dataset = (
    split.map(preprocess) for split in (train_dataset_raw, val_dataset_raw)
)


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [39]:
# Drop "neutral" rows, i.e. rows whose label values sum to zero. Rationale from
# the original notebook: empty samples add noise and teach the model that
# predicting nothing is normal.
def _has_any_label(example):
    """Return True when the row's label values sum to more than zero."""
    return sum(example['labels']) > 0


train_dataset = train_dataset.filter(_has_any_label)
val_dataset = val_dataset.filter(_has_any_label)

print(f"Filtered train size: {len(train_dataset)}")
print(f"Filtered validation size: {len(val_dataset)}")

Filter:   0%|          | 0/4500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Filtered train size: 4500
Filtered validation size: 500


In [40]:
# Stack every training label vector once; all statistics below are derived from it.
labels_array = np.array([x['labels'] for x in train_dataset])

# Number of active labels per sample.
label_counts = labels_array.sum(axis=1)
print("Average labels/sample:", np.mean(label_counts))

# Inspect the values across the whole training set. Looking only at the first
# sample (as before) can hide unexpected values that occur in other rows.
print("Unique label values:", np.unique(labels_array))


print("\n\nUnique label vectors in training set:", np.unique(labels_array, axis=0).shape[0])

# Show a few texts from each split side by side, without indexing past the end
# of a split that has fewer than 5 rows.
preview_rows = min(5, len(train_dataset), len(val_dataset))
for i in range(preview_rows):
    print(f"Train sample {i}: {train_dataset[i]['text']}")
    print(f"Val sample {i}:   {val_dataset[i]['text']}")

Average labels/sample: 5.2651111111111115
Unique label values: [0. 1.]


Unique label vectors in training set: 2626
Train sample 0: Feeling incredibly confident and attractive today. Is this what they mean by ovulation glow?
Val sample 0:   Took a mental health day and already experiencing more in control. Sometimes you just need to pause.
Train sample 1: Starting this new job has me feeling anxious but alquite hopeful. It's scary and exciting at the same time.
Val sample 1:   Started exercising again and the endorphins are real! Feeling motivated and strong.
Train sample 2: My best friend just gets me. After our talk, I feel supported and understood.
Val sample 2:   Becoming a mom has brought so many emotions - joy, fear, overwhelming love, and complete exhaustion.
Train sample 3: Had a fight with my partner and feel so hurt and misunderstood. Why is communication so hard?
Val sample 3:   Took a mental health day and already feeling more in control. Sometimes you just need to pause.
T

# Tokenize the Text with DistilBertTokenizerFast

In [41]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the batch's ``text`` entries, padded/truncated to a max length of 128."""
    texts = batch["text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)


train_dataset_tk = train_dataset.map(tokenize, batched=True)
val_dataset_tk = val_dataset.map(tokenize, batched=True)

# Return only the model inputs and the targets, formatted as torch tensors.
for tokenized_split in (train_dataset_tk, val_dataset_tk):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print("Head of tokenized train dataset:")
print(train_dataset_tk[1])


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Head of tokenized train dataset:
{'labels': tensor([0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.]), 'input_ids': tensor([  101,  3225,  2023,  2047,  3105,  2038,  2033,  3110, 11480,  2021,
         2632, 15549,  2618, 17772,  1012,  2009,  1005,  1055, 12459,  1998,
        10990,  2012,  1996,  2168,  2051,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [42]:
from transformers import Trainer
from sklearn.metrics import f1_score, accuracy_score

# Per-label positive-class weights for BCEWithLogitsLoss:
# (number of negatives) / (number of positives), computed on the training split.
labels_matrix = np.array(train_dataset["labels"])
label_freq = labels_matrix.sum(axis=0)
num_samples = labels_matrix.shape[0]

# The small epsilon only prevents a division by zero.
pos_weights = (num_samples - label_freq) / (label_freq + 1e-5)

# A label without any positive example has no meaningful ratio: the epsilon
# form above turns it into a weight of roughly 1e5 * num_samples. Give such
# labels a neutral weight of 1.0 instead.
pos_weights[label_freq == 0] = 1.0

# Floor at 1.0 so that frequent labels are never down-weighted. Note that no
# upper bound is applied to the weights of rare labels.
pos_weights = np.clip(pos_weights, a_min=1.0, a_max=None)

class_weights_tensor = torch.tensor(pos_weights, dtype=torch.float).to(device)


class MultiLabelTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits using the module-level ``class_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted multi-label loss for one batch.

        Removes ``"labels"`` from ``inputs``, runs the model on the remaining
        entries, and scores ``outputs.logits`` against the float-cast labels.
        ``num_items_in_batch`` is accepted but not used here.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        weighted_bce = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights_tensor)
        loss = weighted_bce(outputs.logits, targets.float())
        if return_outputs:
            return loss, outputs
        return loss


def compute_metrics(pred):
    """Turn raw predictions into thresholded multi-label metrics.

    Applies an element-wise sigmoid to ``pred.predictions``, marks a label as
    predicted when its probability is at least 0.7, prints the first true and
    predicted rows, and scores the result against ``pred.label_ids``.

    Returns:
        dict with ``"accuracy"`` (sklearn ``accuracy_score``) and ``"f1"``
        (micro-averaged ``f1_score``).
    """
    logits = pred.predictions
    labels = pred.label_ids

    optimal_threshold = 0.7  # You can try different values later
    probs = 1 / (1 + np.exp(-logits))  # element-wise sigmoid
    y_pred = (probs >= optimal_threshold).astype(int)

    print("Sample true:", labels[0])
    print("Sample pred:", y_pred[0])

    return {
        "accuracy": accuracy_score(labels, y_pred),
        "f1": f1_score(labels, y_pred, average='micro'),
    }

In [43]:
from transformers import DistilBertForSequenceClassification
from transformers import TrainerCallback

# DistilBERT with a sequence-classification head sized to `num_labels`,
# configured for multi-label classification.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification",
)
# Place the model on the selected device.
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [44]:
from transformers import TrainingArguments, Trainer

# Hyperparameters gathered in one mapping, grouped by concern.
training_config = dict(
    # Where outputs and logs go
    output_dir="./lotuso",
    logging_dir='./logs',
    logging_steps=50,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation schedule
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    lr_scheduler_type="cosine",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Model selection: reload the checkpoint with the highest "f1" at the end
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
training_args = TrainingArguments(**training_config)
class BestF1Callback(TrainerCallback):
    """Callback that requests a save only when ``eval_f1`` improves on the best seen so far."""

    def __init__(self):
        # Highest eval F1 observed so far; 0 means "nothing better than zero yet".
        self.best_f1 = 0

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Set ``control.should_save`` according to whether this evaluation is a new best.

        Args:
            args, state: Passed through by the trainer; not used here.
            control: Object whose ``should_save`` flag is set and which is returned.
            metrics: Mapping of evaluation metrics; ``"eval_f1"`` is read from it
                (treated as 0 when the key is absent).

        Returns:
            The ``control`` object.
        """
        if metrics is None:
            # No metrics were supplied, so there is nothing to compare against;
            # leave the save decision untouched instead of failing on `.get`.
            return control

        f1 = metrics.get("eval_f1", 0)
        is_new_best = f1 > self.best_f1
        if is_new_best:
            print(f"\nNew best F1: {f1:.4f}")
            self.best_f1 = f1
        control.should_save = is_new_best
        return control

In [59]:
# from transformers import AdamW
from torch.optim import AdamW
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from tqdm import tqdm

# Unweighted multi-label BCE on raw logits, optimised with AdamW.
# NOTE(review): this criterion does not use `class_weights_tensor`, unlike
# MultiLabelTrainer.compute_loss; confirm that is intentional.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = AdamW(params=model.parameters(), lr=2e-5)

epochs = 16  # Number of epochs

# Both loaders share batch size and worker settings; only the training loader shuffles.
loader_kwargs = dict(batch_size=16, num_workers=0)
train_loader = DataLoader(train_dataset_tk, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset_tk, **loader_kwargs)

def train_model(train_loader, val_loader, model, training_args):
    """Run the manual train/evaluate loop and return per-epoch metrics.

    Uses the module-level ``epochs``, ``criterion``, ``optimizer`` and
    ``device`` defined in the surrounding notebook cells.

    Args:
        train_loader: Iterable of batches (dicts of tensors including ``"labels"``).
        val_loader: Iterable of validation batches with the same layout.
        model: Model called as ``model(**inputs)``; its ``.logits`` are used.
        training_args: Accepted for interface compatibility; not used by this loop.

    Returns:
        dict with keys ``"train_loss"`` and ``"val_f1"``, each a list holding
        one value per epoch (mean training loss, micro-F1 at threshold 0.5).
    """
    history = {"train_loss": [], "val_f1": []}

    for epoch in range(epochs):
        # ---- Training pass ----
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device).float()

            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} - Training Loss: {avg_loss:.4f}")

        # ---- Evaluation pass ----
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
                labels = batch["labels"].cpu().numpy()
                logits = model(**inputs).logits
                probs = torch.sigmoid(logits).cpu().numpy()
                # A label counts as predicted when its probability reaches 0.5.
                preds = (probs >= 0.5).astype(int)

                all_preds.extend(preds)
                all_labels.extend(labels)

        f1 = f1_score(all_labels, all_preds, average="micro")
        print(f"Epoch {epoch + 1} - Validation F1: {f1:.4f}")

        history["train_loss"].append(avg_loss)
        history["val_f1"].append(f1)

    # Previously nothing was returned, so callers assigning the result got None.
    return history


In [60]:
# Run the manual training loop; `hist` holds whatever train_model returns.
hist = train_model(
    train_loader=train_loader,
    val_loader=val_loader,
    model=model,
    training_args=training_args,
)


Training Epoch 1: 100%|██████████| 282/282 [00:43<00:00,  6.54it/s]


Epoch 1 - Training Loss: 0.4103
Epoch 1 - Validation F1: 0.2672


Training Epoch 2: 100%|██████████| 282/282 [00:43<00:00,  6.45it/s]


Epoch 2 - Training Loss: 0.4101
Epoch 2 - Validation F1: 0.2669


Training Epoch 3: 100%|██████████| 282/282 [00:44<00:00,  6.40it/s]


Epoch 3 - Training Loss: 0.4097
Epoch 3 - Validation F1: 0.2732


Training Epoch 4: 100%|██████████| 282/282 [00:43<00:00,  6.45it/s]


Epoch 4 - Training Loss: 0.4096
Epoch 4 - Validation F1: 0.2698


Training Epoch 5: 100%|██████████| 282/282 [00:42<00:00,  6.61it/s]


Epoch 5 - Training Loss: 0.4094
Epoch 5 - Validation F1: 0.2670


Training Epoch 6: 100%|██████████| 282/282 [00:43<00:00,  6.52it/s]


Epoch 6 - Training Loss: 0.4095
Epoch 6 - Validation F1: 0.2703


Training Epoch 7: 100%|██████████| 282/282 [00:42<00:00,  6.58it/s]


Epoch 7 - Training Loss: 0.4088
Epoch 7 - Validation F1: 0.2720


Training Epoch 8: 100%|██████████| 282/282 [00:44<00:00,  6.39it/s]


Epoch 8 - Training Loss: 0.4090
Epoch 8 - Validation F1: 0.2726


Training Epoch 9: 100%|██████████| 282/282 [00:45<00:00,  6.18it/s]


Epoch 9 - Training Loss: 0.4083
Epoch 9 - Validation F1: 0.2714


Training Epoch 10: 100%|██████████| 282/282 [00:46<00:00,  6.06it/s]


Epoch 10 - Training Loss: 0.4086
Epoch 10 - Validation F1: 0.2687


Training Epoch 11: 100%|██████████| 282/282 [00:46<00:00,  6.03it/s]


Epoch 11 - Training Loss: 0.4083
Epoch 11 - Validation F1: 0.2685


Training Epoch 12: 100%|██████████| 282/282 [00:47<00:00,  5.95it/s]


Epoch 12 - Training Loss: 0.4074
Epoch 12 - Validation F1: 0.2763


Training Epoch 13: 100%|██████████| 282/282 [00:47<00:00,  5.96it/s]


Epoch 13 - Training Loss: 0.4076
Epoch 13 - Validation F1: 0.2732


Training Epoch 14: 100%|██████████| 282/282 [00:47<00:00,  5.93it/s]


Epoch 14 - Training Loss: 0.4067
Epoch 14 - Validation F1: 0.2782


Training Epoch 15: 100%|██████████| 282/282 [00:47<00:00,  5.90it/s]


Epoch 15 - Training Loss: 0.4064
Epoch 15 - Validation F1: 0.2665


Training Epoch 16: 100%|██████████| 282/282 [00:47<00:00,  5.88it/s]


Epoch 16 - Training Loss: 0.4066
Epoch 16 - Validation F1: 0.2656


In [55]:
# Save the model weights (state dict only)
from pathlib import Path

state_dict_path = Path('../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_classifier_v1.pt')
# torch.save does not create missing directories, so make sure the target folder exists.
state_dict_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), state_dict_path)

In [56]:

# The emotion label column names come from the data-loading cell.
label_list = label_columns

# Index <-> name lookups; a label's position in label_list is its class id.
id2label = dict(enumerate(label_list))
label2id = {label: i for i, label in id2label.items()}

# Update model config.
# id2label keeps integer keys: Transformers configs use integer ids for this
# mapping, so code that indexes config.id2label by class index keeps working
# on this in-memory model. String keys would raise KeyError on such lookups.
model.config.id2label = id2label
model.config.label2id = label2id
model.config.num_labels = len(label_list)
model.config.problem_type = "multi_label_classification"


# Save model and tokenizer in Hugging Face format
model.save_pretrained("../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_model_v1")
tokenizer.save_pretrained("../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_model_v1")


('../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_model_v1/tokenizer_config.json',
 '../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_model_v1/special_tokens_map.json',
 '../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_model_v1/vocab.txt',
 '../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_model_v1/added_tokens.json',
 '../../models/lotus_menstrual_emotion/lotus_menstrual_emotion_model_v1/tokenizer.json')

In [54]:
import numpy as np
import torch

def sigmoid(x):
    """Element-wise logistic function; used because each label is scored independently."""
    return 1 / (1 + np.exp(-x))


# Relies on objects defined in earlier cells:
# tokenizer, model, device, label_columns

id2label = dict(enumerate(label_columns))


def predict_emotions(text, threshold=0.5):
    """Return the labels whose predicted probability reaches ``threshold``.

    Args:
        text: Input passed to the tokenizer; only the first row of the
            resulting logits is used.
        threshold: Minimum sigmoid probability for a label to be reported.

    Returns:
        List of ``(label_name, probability)`` tuples in label-index order.
    """
    # Put the model in evaluation mode so training-only behaviour such as
    # dropout cannot affect predictions, whatever mode the model was left in.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    probs = sigmoid(logits.cpu().numpy()[0])

    return [(id2label[i], float(p)) for i, p in enumerate(probs) if p >= threshold]


# Example
#print(predict_emotions("I am scared and angry, but also a bit hopeful."))

# Run the predictor on a few hand-written examples and print each result.
sample_texts = [
    "I am so happy today! The sun is shining and I feel great.",
    "I am in great pain",
    "I never ever got cramps, ever. Just some acne. And then one time when I was...21? 22? I wound up spending day 1 vomiting and ever since I've gotten 2-3 days of horrible, horrible cramps.",
]
for sample_text in sample_texts:
    print(predict_emotions(sample_text))


[('Feeling in control', 0.6071305274963379)]
[('Feeling overwhelmed', 0.5021209120750427)]
[('Tearfulness', 0.6932561993598938), ('Low self-esteem', 0.6783528923988342), ('Feeling overwhelmed', 0.6270961165428162), ('Physical discomfort', 0.6934010982513428), ('Relief', 0.7682949304580688)]
