# Load Data Set

In [59]:
# Menstruation the dataset structure
# load CSV dataset file and split into train and validation sets
from datasets import load_dataset 
# Load the dataset from a CSV file
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

#Rawdataset = load_dataset("csv", data_files={"dataset": "../../dataset/menstrual_emotion/menstruation-emotionDSclean.csv"})
Rawdataset = load_dataset("csv", data_files={"dataset": "../../dataset/menstrual_emotion/synthetic_data_womens_health.csv"})
# Display the first few rows of the dataset 
#print(dataset["dataset"][:5])  # Display the first 5 rows

print(f"Column names: {Rawdataset['dataset'].column_names}") 
columns_to_keep = ['text', 'emotions']

# Remove all other columns
Rawdataset = Rawdataset.remove_columns([col for col in Rawdataset['dataset'].column_names if col not in columns_to_keep])

print(Rawdataset["dataset"][:5])  # Display the first 5 rows

Using device: mps
Column names: ['Unnamed: 0', 'text', 'emotions']
{'text': ["I'm feel so energetic right now, like I could conquer the world! My skin is glowing and I actually want to go out tonight.", 'Dating in my 30s is exhausting. Feeling lonely but trying to stay hopeful.', 'Dating in my 30s is exhausting. Feeling lonely but trying to stay hopeful.', 'My best friend just gets me. After our talk, I feel supported and understood.', 'Feeling incredibly confident and attractive right now. Is this what they mean by ovulation glow?'], 'emotions': ["{'Improved mood': 1, 'Hopefulness': 0, 'Renewed energy': 0, 'Optimism': 0, 'Productivity': 0, 'Clarity': 0, 'Confidence': 0, 'High energy': 0, 'Sociability': 0, 'Empowerment': 0, 'Motivation': 0, 'Sadness': 0, 'Tearfulness': 0, 'Low self-esteem': 1, 'Loneliness or Isolation': 0, 'Feeling overwhelmed': 0, 'Anger or frustration': 0, 'Irritability': 0, 'Mood swings': 0, 'Anxiety': 0, 'Sensitivity to rejection': 0, 'Restlessness': 0, 'Emotional 

In [38]:
import random
import pandas as pd

class MenstrualSyntheticDataGenerator:
    def __init__(self):
        self.scenarios = {
            "menstrual_phase": {
                "follicular": [
                    "I'm feeling so energetic today, like I could conquer the world! My skin is glowing and I actually want to go out tonight.",
                    "Finally feeling clear-headed after last week. I'm ready to tackle that project I've been putting off.",
                    "Woke up feeling optimistic about everything. Maybe it's just my hormones, but I'll take it!"
                ],
                "ovulation": [
                    "Feeling incredibly confident and attractive today. Is this what they mean by ovulation glow?",
                    "My energy is through the roof and I feel so social. Called three friends just to chat!",
                    "Everything feels possible today. Started two new projects and signed up for that class."
                ],
                "luteal": [
                    "Why am I crying at this commercial? I know it's just PMS but these emotions feel so real.",
                    "Feeling irritable and bloated. Just want to hide under a blanket with chocolate.",
                    "My anxiety is through the roof today. Everything feels overwhelming and I can't focus."
                ],
                "menstrual": [
                    "Cramps are killing me and I feel so emotional. Called in sick because I just can't today.",
                    "Feeling drained but also weirdly relieved that my period finally came. Time for self-care.",
                    "The pain is manageable today but I'm exhausted. Grateful for a quiet day at home."
                ]
            },
            "life_transitions": [
                "Starting this new job has me feeling anxious but also hopeful. It's scary and exciting at the same time.",
                "Becoming a mom has brought so many emotions - joy, fear, overwhelming love, and complete exhaustion.",
                "Going through menopause and these mood swings are intense. One moment I'm fine, the next I'm furious."
            ],
            "relationships": [
                "Had a fight with my partner and feeling so hurt and misunderstood. Why is communication so hard?",
                "My best friend just gets me. After our talk, I feel supported and understood.",
                "Dating in my 30s is exhausting. Feeling lonely but trying to stay hopeful."
            ],
            "self_care": [
                "Took a mental health day and already feeling more in control. Sometimes you just need to pause.",
                "Finally went to therapy and feeling vulnerable but proud of myself for taking this step.",
                "Started exercising again and the endorphins are real! Feeling motivated and strong."
            ]
        }
        self.emotion_combinations = {
            "menstrual_mixed": [
                "Physical discomfort", "Tearfulness", "Relief", 
                "Feeling overwhelmed", "Low self-esteem"
            ]
        }

    def _add_variations(self, text):
        variations = [
            ("feeling", random.choice(["feeling", "feel", "experiencing", "going through"])),
            ("today", random.choice(["today", "right now", "at the moment", "lately"])),
            ("so", random.choice(["so", "really", "incredibly", "quite", "very"]))
        ]
        for original, replacement in variations:
            if original in text and random.random() > 0.5:
                text = text.replace(original, replacement, 1)
        return text

    def _assign_emotions(self, emotion_combo):
        all_emotions = sorted(list(set([
            e for emotions in self.emotion_combinations.values() for e in emotions
        ])))
        emotion_labels = {emotion: 0 for emotion in all_emotions}
        base_emotions = self.emotion_combinations[emotion_combo]
        for emotion in base_emotions:
            if random.random() > 0.2:
                emotion_labels[emotion] = 1
        for emotion in all_emotions:
            if emotion not in base_emotions and random.random() > 0.95:
                emotion_labels[emotion] = 1
        return emotion_labels

    def generate_menstrual_samples(self, n_samples=100):
        samples = []
        for _ in range(n_samples):
            base_text = random.choice(self.scenarios["menstrual_phase"]["menstrual"])
            text = self._add_variations(base_text)
            emotions = self._assign_emotions("menstrual_mixed")
            samples.append({'text': text, **emotions})
        return samples


In [None]:
# Reverse emotion mapping for consistency
ObjGenerator = MenstrualSyntheticDataGenerator()
samples = ObjGenerator.generate_menstrual_samples(500)
dataset_men = pd.DataFrame(samples)


#menstrual_only = [s for s in samples if "period" in s["text"].lower() or "cramps" in s["text"].lower()]
print(f"The length of synthetic data: {len(dataset_men)}.")
#dataset_men.to_csv("synt_menstrual_data.csv")
dataset_men.head()

The length of synthetic data: 500.


Unnamed: 0,text,Feeling overwhelmed,Low self-esteem,Physical discomfort,Relief,Tearfulness
0,The pain is manageable today but I'm exhausted...,0,0,1,1,1
1,Feeling drained but also weirdly relieved that...,1,1,1,0,1
2,Cramps are killing me and I feel so emotional....,1,1,1,1,0
3,Cramps are killing me and I feel so emotional....,1,1,1,1,1
4,Feeling drained but also weirdly relieved that...,0,1,1,1,0


In [53]:
def filter_unlabeled(dataset):
    filtered = []
    for row in dataset:
        label = row.get('label')
        if label is not None and label.strip().lower() not in ['', 'none', 'unlabeled']:  # remove None, '', 'None'
            filtered.append(row)
    return filtered

dataset = filter_unlabeled(Rawdataset['dataset'])  # or dataset['train'] etc.
print(f"Filtered dataset size: {len(dataset)}")

Filtered dataset size: 91


In [55]:
# Get all unique labels across dataset
def get_unique_labels(dataset):
    unique = set()
    for row in dataset:
        labels = [label.strip() for label in row['label'].split(',')]
        unique.update(labels)
    return sorted(unique)

all_labels = get_unique_labels(dataset)  # or combine all splits if needed
label2id = {label: i for i, label in enumerate(all_labels)}
id2label = {i: label for label, i in label2id.items()}

num_labels = len(label2id)
print(f"Number of unique labels: {num_labels}")
print("Display Labels", label2id)

print(dataset)

Number of unique labels: 32
Display Labels {'Anxiety': 0, 'Confidence': 1, 'Emotional sensitivity': 2, 'Emotional sensitivity (PMS)': 3, 'Emotional sensitivity (gratitude': 4, 'Empathy': 5, 'Empowerment (validation from shared experience)': 6, 'Fatigue': 7, 'Feeling in control': 8, 'Feeling overwhelmed': 9, 'Feeling overwhelmed (variability of severity implied)': 10, 'Hopefulness': 11, 'Improved mood': 12, 'Informational': 13, 'Irritability': 14, 'Loneliness': 15, 'Loneliness or Isolation': 16, 'Low self-esteem': 17, 'Mild emotional sensitivity': 18, 'Mild physical discomfort': 19, 'Mood swings': 20, 'Motivation': 21, "Optimism (positive acknowledgment of others' experience)": 22, 'Physical discomfort': 23, 'Physical discomfort (back pains)': 24, 'Physical discomfort (mild)': 25, 'Realization': 26, 'Restlessness': 27, 'Sadness': 28, 'Tearfulness': 29, 'earfulness': 30, 'empathy)': 31}
[{'comment': 'Constant bloating and fatigue, punctuated by bouts of sharp stomach pain.\nPeriods are a

In [60]:
from datasets import Dataset

dataset = Dataset.from_list(dataset)  # Convert the list of dictionaries to a Dataset object 


# Split the dataset into train and validation sets
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

train_dataset_raw = split_dataset['train']
val_dataset_raw = split_dataset['test']  #



# Confirm counts
print(f"Number of training samples: {len(train_dataset_raw)}")
print(f"Number of validation samples: {len(val_dataset_raw)}")
#print(f"Number of test samples: {len(test_dataset_raw)}")


# Display the number of samples in each split
print(f"Number of training samples: {len(train_dataset_raw)}")
print(f"Number of validation samples: {len(val_dataset_raw)}")
#print(f"Number of test samples: {len(test_dataset_raw)}")

# Display the first few rows of the training set
print("\n\nTraining set sample:")
print(train_dataset_raw[:1])  # Display the first 5 rows of the training set
# Display the first few rows of the validation set
print("Validation set sample:")
print(val_dataset_raw[:1])  # Display the first 5 rows of the validation set


Number of training samples: 81
Number of validation samples: 10
Number of training samples: 81
Number of validation samples: 10


Training set sample:
{'comment': ['Before I got an iud mine were painful with a lot of bloating, heavy bleeding, and dull constant pain from cramps. The pain felt like a boa constrictor was gripping my lower abdomen and upper thighs and squeezing me on and off for 3-4 days. I used to miss class and call off work when it was especially painful.  The IUD helps so much I barely get cramps now. \n\nI did go to the obgyn and they found cysts in my vagina which were removed, but that did not change the period pain level.'], 'label': ['Physical discomfort, Fatigue, Feeling overwhelmed']}
Validation set sample:
{'comment': ["Well, you can always use tampons instead, which is what I do, because I'm a hygiene freak and I feel too dirty with pads.\n\nBut there are a lot of issues with tampons as well. I have to buy the ultra sized ones, because I bleed a lot an certain

Label Pereparation

In [6]:
import numpy as np
import torch
from torch.nn import BCEWithLogitsLoss
from collections import Counter

def encode_labels(example):
    labels = [0] * len(label2id)
    for lbl in example['label'].split(','):
        lbl = lbl.strip()
        if lbl in label2id:
            labels[label2id[lbl]] = 1
    example["labels"] = labels
    return example

# Apply to all splits
train_dataset = train_dataset_raw.map(encode_labels)
val_dataset = val_dataset_raw.map(encode_labels)

print("Head of train dataset:")
print(train_dataset[1])
print("\nHead of validation dataset:")
print(val_dataset[0])


Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Head of train dataset:
{'comment': "It feels like my body is trying to give birth to a baby that doesn't exist", 'label': 'Physical discomfort', 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]}

Head of validation dataset:
{'comment': "Well, you can always use tampons instead, which is what I do, because I'm a hygiene freak and I feel too dirty with pads.\n\nBut there are a lot of issues with tampons as well. I have to buy the ultra sized ones, because I bleed a lot an certain days. But they're uncomfortable, because you can feel them in there, especially when they're filled. And I personally have to go change them every two hours or so.\n\nPlus there's that always-looming threat of Toxic Shock Syndrome, though I don't know anyone who's ever gotten it.", 'label': 'Physical discomfort, Anxiety', 'labels': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]}


In [7]:
# Filter Neutral Samples as empty samples add noise and teach the model that predicting nothing is normal
# Filter training samples that have at least one label
train_dataset = train_dataset.filter(lambda x: sum(x['labels']) > 0)
val_dataset = val_dataset.filter(lambda x: sum(x['labels']) > 0)

print(f"Filtered train size: {len(train_dataset)}")
print(f"Filtered validation size: {len(val_dataset)}")

Filter:   0%|          | 0/81 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

Filtered train size: 81
Filtered validation size: 10


In [8]:
label_counts = [sum(example['labels']) for example in train_dataset]
print("Average labels/sample:", np.mean(label_counts))
print("Unique label values:", np.unique(train_dataset[0]['labels']))



Average labels/sample: 2.493827160493827
Unique label values: [0 1]


#Tokenize the Text with DistilBertTokenizer Fast

In [9]:
from transformers import DistilBertTokenizerFast

# Initialize the tokenizer
# DistilBertTokenizerFast is a fast tokenizer for DistilBERT that uses the WordPiece algorithm. 
#distilbert-base-uncased is a pre-trained DistilBERT model that has been fine-tuned on the English language.


tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["comment"], padding="max_length", truncation=True, max_length=128)

train_dataset_tk = train_dataset.map(tokenize, batched=True)
val_dataset_tk = val_dataset.map(tokenize, batched=True)
# test_dataset_tk = test_dataset.map(tokenize, batched=True)

# This ensures your dataset is ready for PyTorch training
train_dataset_tk.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset_tk.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# test_dataset_tk.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

print("Head of tokenized train dataset:")
print(train_dataset_tk[1])


Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Head of tokenized train dataset:
{'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0]), 'input_ids': tensor([ 101, 2009, 5683, 2066, 2026, 2303, 2003, 2667, 2000, 2507, 4182, 2000,
        1037, 3336, 2008, 2987, 1005, 1056, 4839,  102,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    

In [10]:
from transformers import Trainer
from sklearn.metrics import f1_score, accuracy_score


In [11]:
from transformers import DistilBertForSequenceClassification
from transformers import TrainerCallback

# use DistilBERT for multi-label classification with distilbert-base-uncased
# DistilBERT is a smaller, faster, cheaper, and lighter version of BERT
# It retains 97% of BERT's language understanding while being 60% faster and 40% smaller.

# DistilBertForSequenceClassification is a DistilBERT model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks.
# distilbert-base-uncased is a pre-trained DistilBERT model that has been fine-tuned on the English language.


model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification"
).to(device)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import AdamW
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score
from tqdm import tqdm

# Loss and optimizer
criterion = torch.nn.BCEWithLogitsLoss() # BCEWithLogitsLoss is suitable for multi-label classification tasks 
optimizer = AdamW(model.parameters(), lr=2e-5)
# DataLoader
train_loader = DataLoader(train_dataset_tk, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset_tk, batch_size=16)

def train_model(train_loader, val_loader, model, num_epoc=10, training_args=None):
    for epoch in range(num_epoc):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            labels = batch["labels"].to(device).float()

            outputs = model(**inputs)
            loss = criterion(outputs.logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)

        # Evaluation
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
                labels = batch["labels"].cpu().numpy()
                logits = model(**inputs).logits
                probs = torch.sigmoid(logits).cpu().numpy()
                preds = (probs >= 0.5).astype(int)

                all_preds.extend(preds)
                all_labels.extend(labels)

        f1 = f1_score(all_labels, all_preds, average="micro")

        # 
        print(f"Epoch {epoch + 1}: Train Loss = {avg_loss:.4f} | Val F1 = {f1:.4f}")




In [13]:
hist = train_model(train_loader, val_loader, model)
torch.save(model.state_dict(), './saved_models/lotus_menstrual_emotion_classifier_v1.pt')

Training Epoch 1: 100%|██████████| 6/6 [00:04<00:00,  1.46it/s]


Epoch 1: Train Loss = 0.6721 | Val F1 = 0.2524


Training Epoch 2: 100%|██████████| 6/6 [00:00<00:00,  7.54it/s]


Epoch 2: Train Loss = 0.6410 | Val F1 = 0.3846


Training Epoch 3: 100%|██████████| 6/6 [00:00<00:00,  7.55it/s]


Epoch 3: Train Loss = 0.5943 | Val F1 = 0.4211


Training Epoch 4: 100%|██████████| 6/6 [00:00<00:00,  7.54it/s]


Epoch 4: Train Loss = 0.5521 | Val F1 = 0.4571


Training Epoch 5: 100%|██████████| 6/6 [00:00<00:00,  7.54it/s]


Epoch 5: Train Loss = 0.5003 | Val F1 = 0.4242


Training Epoch 6: 100%|██████████| 6/6 [00:00<00:00,  7.55it/s]


Epoch 6: Train Loss = 0.4603 | Val F1 = 0.4242


Training Epoch 7: 100%|██████████| 6/6 [00:00<00:00,  7.55it/s]


Epoch 7: Train Loss = 0.4364 | Val F1 = 0.4571


Training Epoch 8: 100%|██████████| 6/6 [00:00<00:00,  7.57it/s]


Epoch 8: Train Loss = 0.4029 | Val F1 = 0.4571


Training Epoch 9: 100%|██████████| 6/6 [00:00<00:00,  7.54it/s]


Epoch 9: Train Loss = 0.3729 | Val F1 = 0.4242


Training Epoch 10: 100%|██████████| 6/6 [00:00<00:00,  7.63it/s]


Epoch 10: Train Loss = 0.3469 | Val F1 = 0.4242


In [14]:

# Get all unique labels
label_list = sorted(set(example['label'] for example in dataset))

# S Build label <-> id mappings
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

#  Update model config
model.config.label2id = {label: str(i) for i, label in enumerate(label_list)}
model.config.id2label = {str(i): label for i, label in enumerate(label_list)}
model.save_pretrained("./saved_models/lotus_menstrual_emotion_model_v1")
tokenizer.save_pretrained("./saved_models/lotus_menstrual_emotion_model_v1")

('./saved_models/lotus_menstrual_emotion_model_v1/tokenizer_config.json',
 './saved_models/lotus_menstrual_emotion_model_v1/special_tokens_map.json',
 './saved_models/lotus_menstrual_emotion_model_v1/vocab.txt',
 './saved_models/lotus_menstrual_emotion_model_v1/added_tokens.json',
 './saved_models/lotus_menstrual_emotion_model_v1/tokenizer.json')

In [15]:

# Sigmoid function for multi-label output
sigmoid = lambda x: 1 / (1 + np.exp(-x))

# Get label index mapping
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {label: i for i, label in enumerate(label_list)}

def predict_emotions(text, threshold=0.5):
    # Tokenize and move input to correct device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = sigmoid(logits.cpu().numpy()[0])  # move to CPU before numpy

    # Return all emotions with prob >= threshold
    return [(id2label(i), float(p)) for i, p in enumerate(probs) if p >= threshold]

# Example
#print(predict_emotions("I am scared and angry, but also a bit hopeful."))

print(predict_emotions("loved"))
print(predict_emotions("i am in period and the pain is unbearable."))


TypeError: 'dict' object is not callable