In [5]:
!pip install vaderSentiment




In [6]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Hand-maintained English stopword set (lower-case), grouped by word class.
stop_words = {
    # personal / possessive / reflexive pronouns
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # forms of be / have / do
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions and direction words
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs of time / place / manner
    "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
    # quantifiers, negations and degree words
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # modal verbs and fragments left over after splitting contractions on "'"
    "s", "t", "can", "will", "just", "don", "should", "now",
    "d", "ll", "m", "o", "re", "ve", "y", "ain",
    "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn", "ma",
    "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn",
}

# Read the raw posts (path is relative to the notebook) and keep only the rows
# that have a post body; the text column of this file is "body", not "text".
df_text = pd.read_csv("Fitness.csv").dropna(subset=["body"])

# Text Cleaning Function
def clean_text(text, stopwords=None):
    """Normalise a post body into lower-case, stopword-free, space-separated words.

    Steps, in order: remove ``<...>`` tags, collapse every run of non-word
    characters into one space, lower-case, then drop stopwords.

    Args:
        text: Raw text to clean (a ``str``).
        stopwords: Optional collection of lower-case words to drop. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        The cleaned text as a single string; empty if nothing survives.
    """
    if stopwords is None:
        stopwords = stop_words

    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'\W+', ' ', text)  # Remove special characters

    # Lower-case BEFORE the membership test: the stopword set is all lower-case,
    # so testing the original casing would let "The", "I", "And", ... slip through.
    kept = [word for word in text.lower().split() if word not in stopwords]
    return " ".join(kept)

# Apply Cleaning Function (Use "body" instead of "text")
df_text["cleaned_text"] = df_text["body"].apply(clean_text)

# Tokenization: cleaned_text is already space-separated, so a whitespace split
# is sufficient (no nltk "punkt" download needed).
df_text["tokens"] = df_text["cleaned_text"].str.split()

# Sentiment Analysis
# Score the ORIGINAL body, not cleaned_text. The cleaning step strips punctuation,
# lower-cases everything and removes words such as "not", "no", "nor", "but",
# "very" and "too"; VADER's rules rely on exactly those cues, so scoring the
# cleaned text would e.g. turn "not good" into "good".
analyzer = SentimentIntensityAnalyzer()
df_text["sentiment_score"] = df_text["body"].apply(
    lambda raw: analyzer.polarity_scores(str(raw))["compound"]
)

# Extract Engagement Features: post score plus comment count weighted x2
df_text["engagement_score"] = df_text["score"] + (df_text["comms_num"] * 2)

# Save Preprocessed Data
df_text.to_csv("processed_fitness_text_data.csv", index=False)

print("✅ Text preprocessing complete! Cleaned data saved.")


✅ Text preprocessing complete! Cleaned data saved.


In [35]:
# Quick inspection of the preprocessed frame. Only the value of a cell's last
# expression is rendered, so the output of this cell is df_text.columns; the
# head() result is computed but not displayed.
df_text.head()
df_text.columns

Index(['title', 'score', 'id', 'url', 'comms_num', 'created', 'body',
       'timestamp', 'cleaned_text', 'tokens', 'sentiment_score',
       'engagement_score'],
      dtype='object')

In [29]:
!pip install torch torchvision torchaudio 

Collecting torchvision
  Downloading torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Downloading torchvision-0.21.0-cp312-cp312-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading torchaudio-2.6.0-cp312-cp312-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: torchvision, torchaudio
Successfully installed torchaudio-2.6.0 torchvision-0.21.0


In [30]:
!pip install datasets




In [31]:
from datasets import load_dataset

# Reload the preprocessed CSV through Hugging Face `datasets`; the rows are
# read back from the "train" split below.
dataset = load_dataset(path="csv", data_files="processed_fitness_text_data.csv")

# Sanity check: list the columns that survived the CSV round trip
train_columns = dataset["train"].column_names
print(train_columns)


['title', 'score', 'id', 'url', 'comms_num', 'created', 'body', 'timestamp', 'cleaned_text', 'tokens', 'sentiment_score', 'engagement_score']


In [32]:
!pip install transformers torch accelerate




In [None]:
import os
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

# Load new datasets
df_content = pd.read_excel("fitness_content_generation_pairs2.xlsx")
df_profile = pd.read_excel("processed_fitness_data.xlsx")

# Create 'community_type' as "<Fitness Goal> | <Fitness Type>".
# NOTE(review): assumes both columns hold strings with no missing values; a NaN in
# either column would propagate into community_type - confirm against the data.
df_profile['community_type'] = df_profile['Fitness Goal'] + " | " + df_profile['Fitness Type']

# Assign a community type to each content sample randomly (since no direct mapping).
# Sampling with replacement lets df_profile be smaller than df_content; the fixed
# random_state keeps the assignment reproducible.
df_content['community_type'] = df_profile['community_type'].sample(n=len(df_content), replace=True, random_state=42).values

# 🔹 Sample up to 500 random records for quick fine-tuning. The cap is required:
# DataFrame.sample raises ValueError when n exceeds the number of rows (without
# replacement), so a small file falls back to using the full set.
n_finetune_samples = min(500, len(df_content))
df_sampled_content = df_content.sample(n=n_finetune_samples, random_state=42).reset_index(drop=True)

# Convert sampled DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df_sampled_content)

# The tokenizer and the model must come from the same pretrained checkpoint
checkpoint_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint_name)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

# ROUGE scorer, used by compute_metrics
rouge = evaluate.load("rouge")

# Preprocessing function (tokenization + add community context)
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Each source string is built as ``"Community: <community_type> | <input_text>"``
    and padded/truncated to 256 tokens; each ``target_text`` is padded/truncated
    to 128 tokens and becomes the labels.

    Args:
        examples: A batch (mapping of column name -> list) containing the
            ``community_type``, ``input_text`` and ``target_text`` columns.

    Returns:
        The tokenizer output for the sources with an added ``"labels"`` entry.
    """
    prompts = [
        f"Community: {community} | {text}"
        for community, text in zip(examples['community_type'], examples['input_text'])
    ]
    inputs = tokenizer(
        prompts,
        truncation=True, padding="max_length", max_length=256
    )
    targets = tokenizer(
        examples['target_text'],
        truncation=True, padding="max_length", max_length=128
    )

    # Targets are padded to a fixed length. Replace pad ids with -100 (the index
    # ignored by the model's cross-entropy loss) so the loss is computed only on
    # real target tokens rather than on padding.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return inputs

# Tokenize the whole dataset; batched=True hands preprocess_function a batch of
# rows (column name -> list of values) per call instead of a single row.
tokenized_datasets = dataset.map(
    function=preprocess_function,
    batched=True,
)

# Compute ROUGE metrics
def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair as passed by ``Trainer``.
            NOTE(review): with a plain ``Trainer`` the predictions are expected to
            be numpy arrays of raw logits (possibly wrapped in a tuple whose first
            item is the logits) rather than token ids - confirm for the installed
            transformers version.

    Returns:
        The dict produced by ``rouge.compute`` (stemming enabled).
    """
    predictions, labels = eval_pred

    # Model outputs may arrive as a tuple (logits, extra tensors...): keep the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # (batch, seq, vocab) logits -> (batch, seq) token ids via greedy argmax.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks ignored positions and is not a valid token id; map it back to
    # the pad token before decoding so batch_decode does not fail on it.
    pad_id = tokenizer.pad_token_id
    predictions = predictions.copy()
    predictions[predictions == -100] = pad_id
    labels = labels.copy()
    labels[labels == -100] = pad_id

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return result

# Hyper-parameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
    # optimisation schedule
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # hardware: full precision on CPU to avoid MPS precision issues (Mac)
    # NOTE(review): recent transformers releases deprecate `no_cuda` in favour of
    # `use_cpu` - confirm against the installed version before switching.
    fp16=False,
    no_cuda=True,
)

# Wire model, data and metric function together.
# NOTE(review): the evaluation set is the same object as the training set, so any
# metric computed on it measures fit to the training data, not generalisation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
)

# Run fine-tuning
trainer.train()

# Write the fine-tuned model first, then the tokenizer, into the same directory
# so the pair can be reloaded together from one path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./results/fitness_t5_model_v2")

print("✅ Fine-tuning complete and model saved.")
