In [1]:
import pandas as pd

# Define the community data
data = {
    "CommunityName": [
        "BodybuildersUnited", "CardioGoats", "YogaFlow", "75Hard", "WeightLossWarriors",
        "HealthyLiving", "CyclingBuddies", "StrengthSquad", "RunRunners", "FlexibilityFreaks",
        "EnduranceElite", "LiftLegends", "FitFam", "WellnessJourney", "FitGoalGetters",
        "CardioWarriors", "YogaTribe", "MuscleMinds", "StretchMasters", "HIITHeroes",
        "ShredSquad", "ZenFlex", "CardioKings", "PowerPilates", "RunStrong",
        "BarbellBros", "SweatSquad", "BalanceBoosters", "IronWarriors", "MorningMovers",
        "ActiveMoms"
    ],
    "FitnessGoal": [
        "Muscle Gain", "Weight Loss", "Flexibility", "Weight Loss", "Weight Loss",
        "General Fitness", "Endurance", "Muscle Gain", "Endurance", "Flexibility",
        "Endurance", "Muscle Gain", "General Fitness", "General Fitness", "General Fitness",
        "Endurance", "Flexibility", "Muscle Gain", "Flexibility", "Weight Loss",
        "Weight Loss", "Flexibility", "Endurance", "Flexibility", "Endurance",
        "Muscle Gain", "Weight Loss", "General Fitness", "Muscle Gain", "General Fitness",
        "General Fitness"
    ],
    "FitnessType": [
        "Strength Training", "Cardio", "Yoga & Wellness", "Cardio", "Cardio",
        "Yoga & Wellness", "Cardio", "Strength Training", "Cardio", "Yoga & Wellness",
        "HIIT", "Strength Training", "Yoga & Wellness", "Yoga & Wellness", "Cardio",
        "Cardio", "Yoga & Wellness", "Strength Training", "Yoga & Wellness", "HIIT",
        "Strength Training", "Yoga & Wellness", "Cardio", "Yoga & Wellness", "Cardio",
        "Strength Training", "Cardio", "Yoga & Wellness", "Strength Training", "Cardio",
        "Yoga & Wellness"
    ]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save as CSV
df.to_csv("fitness_communities.csv", index=False)

print("CSV file created successfully!")


CSV file created successfully!


In [4]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Load preprocessed dataset
df_text = pd.read_csv("processed_fitness_text_data.csv")

# Ensure required columns exist
required_columns = ["cleaned_text"]
missing_columns = [col for col in required_columns if col not in df_text.columns]

if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")

# Handle missing values
df_text["cleaned_text"] = df_text["cleaned_text"].fillna("No content available")

# Prepare dataset for fine-tuning
df_text["input_text"] = "Summarize: " + df_text["cleaned_text"]
df_text["target_text"] = df_text["cleaned_text"]

# Save modified dataset
df_text.to_csv("processed_fitness_text_data_modified.csv", index=False)

# Load Tokenizer & Model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Load dataset using Hugging Face datasets library
dataset = load_dataset("csv", data_files="processed_fitness_text_data_modified.csv")["train"]

# Select 500 random samples from the dataset
dataset = dataset.shuffle(seed=42).select([i for i in range(500)])

# Ensure dataset contains the correct columns
print("Dataset columns:", dataset.column_names)
print("Example row:", dataset[0])

# Tokenization function with input and label processing
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)

    # Tokenize target text (labels) and shift left by 1 for decoder
    labels = tokenizer(text_target=examples["target_text"], padding="max_length", truncation=True, max_length=512)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization (keeping only necessary columns)
dataset = dataset.map(tokenize_function, batched=True, remove_columns=["title", "score", "id", "url", "comms_num", "created", "body", "timestamp", "tokens", "sentiment_score", "engagement_score", "input_text", "target_text"])

# Ensure the dataset has 'input_ids' and 'labels'
print("Dataset after tokenization:", dataset.column_names)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # ✅ Updated
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    fp16=False,
    no_cuda=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset
)

# Train Model
trainer.train()

# Save Fine-Tuned Model
model.save_pretrained("fine_tuned_t5_fitness")
tokenizer.save_pretrained("fine_tuned_t5_fitness")

print("Training complete! Model saved to 'fine_tuned_t5_fitness'")


Generating train split: 1874 examples [00:00, 13833.74 examples/s]


Dataset columns: ['title', 'score', 'id', 'url', 'comms_num', 'created', 'body', 'timestamp', 'cleaned_text', 'tokens', 'sentiment_score', 'engagement_score', 'input_text', 'target_text']
Example row: {'title': 'Daily Simple Questions Thread - December 11, 2021', 'score': 38, 'id': 'rdwnpn', 'url': 'https://www.reddit.com/r/Fitness/comments/rdwnpn/daily_simple_questions_thread_december_11_2021/', 'comms_num': 553, 'created': 1639216816.0, 'body': 'Welcome to the /r/Fitness Daily Simple Questions Thread - Our daily thread to ask about all things fitness. Post your questions here related to your diet and nutrition or your training routine and exercises. Anyone can post a question and the community as a whole is invited and encouraged to provide an answer. \n        \n# As always, be sure to [read the wiki](https://thefitness.wiki) first. Like, all of it. Rule #0 still applies in this thread.\n        \nAlso, there\'s a [handy search function](https://www.reddit.com/r/Fitness/search?&rest

Map: 100%|██████████| 500/500 [00:00<00:00, 1236.92 examples/s]


Dataset after tokenization: ['cleaned_text', 'input_ids', 'attention_mask', 'labels']


Epoch,Training Loss,Validation Loss
1,6.3346,0.304719
2,1.3306,0.127438
3,0.7667,0.111966


Training complete! Model saved to 'fine_tuned_t5_fitness'


In [5]:
# 📦 Imports
import os
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
import shutil

# 📌 Paths
model_output_dir = "fitness_t5_model_v2"

# 🔄 Remove old model directory if exists
if os.path.exists(model_output_dir):
    shutil.rmtree(model_output_dir)

# 📄 Load datasets
df_content = pd.read_excel("fitness_content_generation_pairs2.xlsx")
df_profile = pd.read_excel("processed_fitness_data.xlsx")

# 📝 Create 'community_type' by combining 'Fitness Goal' and 'Fitness Type'
df_profile['community_type'] = df_profile['Fitness Goal'] + " | " + df_profile['Fitness Type']

# 📊 Assign random community type to each content sample (since no direct mapping)
df_content['community_type'] = df_profile['community_type'].sample(n=len(df_content), replace=True, random_state=42).values

# 🔍 Sample 500 records for quick fine-tuning (or adjust if needed)
df_sampled_content = df_content.sample(n=500, random_state=42).reset_index(drop=True)

# 📚 Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df_sampled_content)

# 🔠 Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# 📊 Load ROUGE evaluation metric
rouge = evaluate.load("rouge")

# ✂️ Preprocessing function: tokenize inputs and targets
def preprocess_function(examples):
    inputs = tokenizer(
        [f"Community: {community} | {text}" for community, text in zip(examples['community_type'], examples['input_text'])],
        truncation=True, padding="max_length", max_length=256
    )
    targets = tokenizer(
        examples['target_text'],
        truncation=True, padding="max_length", max_length=128
    )
    inputs["labels"] = targets["input_ids"]
    return inputs

# 🔄 Apply preprocessing
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 📊 Compute ROUGE metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

# ⚙️ Training configuration
training_args = TrainingArguments(
    output_dir=model_output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    fp16=False,
    no_cuda=True,  # CPU mode for macOS M3
    overwrite_output_dir=True,
    seed=42
)

# 🏋️ Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    compute_metrics=compute_metrics
)

# 🚀 Train the model
trainer.train()

# 💾 Save model and tokenizer
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)

print("✅ Model 1 fine-tuning complete and saved at:", model_output_dir)


Map: 100%|██████████| 500/500 [00:00<00:00, 1212.35 examples/s]


Step,Training Loss
50,3.321
100,1.9356
150,1.3455


✅ Model 1 fine-tuning complete and saved at: fitness_t5_model_v2
