In [1]:
# Step 0 - Create a new conda environment and install dependencies
# conda create -n rl_ft_gpu python=3.11 -y
# conda activate rl_ft_gpu
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# pip install transformers==4.44.2 datasets accelerate trl==0.9.6 sentencepiece
# pip install scikit-learn wandb jupyter notebook
# python -m ipykernel install --user --name=rl_ft_gpu --display-name "RL Finetune (GPU)"


In [2]:
# ============================================
# üìò Step 1 ‚Äî Environment Setup & GPU Check
# ============================================

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os

os.environ["JAVA_HOME"] = "C:\\Program Files\\Eclipse Adoptium\\jdk-21.0.8.9-hotspot"
os.environ["PATH"] += os.pathsep + os.path.join(os.environ["JAVA_HOME"], "bin")
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))
print("PATH:", os.environ.get("PATH"))

# Print environment info
print("Torch version:", torch.__version__)
print("CUDA version:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
    print("Memory allocated:", round(torch.cuda.memory_allocated(0)/1024**2, 1), "MB")
    print("Memory reserved:", round(torch.cuda.memory_reserved(0)/1024**2, 1), "MB")
else:
    print("‚ùå CUDA not available ‚Äî check environment")

# Make sure transformers & TRL are importable
import transformers
import datasets
import trl

print("\nTransformers version:", transformers.__version__)
print("TRL version:", trl.__version__)
print("Datasets version:", datasets.__version__)


JAVA_HOME: C:\Program Files\Eclipse Adoptium\jdk-21.0.8.9-hotspot
PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\libnvvp;;C:\Program Files\Microsoft SDKs\Azure\CLI2\wbin;C:\WINDOWS\system32;C:\WINDOWS;C:\WINDOWS\System32\Wbem;C:\WINDOWS\System32\WindowsPowerShell\v1.0\;C:\WINDOWS\System32\OpenSSH\;C:\Program Files\NVIDIA Corporation\NVIDIA app\NvDLISR;C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common;C:\Program Files\LINQPad8;C:\Program Files\dotnet\;C:\Program Files\Microsoft SQL Server\150\Tools\Binn\;C:\Program Files\Microsoft SQL Server\Client SDK\ODBC\170\Tools\Binn\;C:\Program Files\Microsoft Service Fabric\bin\Fabric\Fabric.Code;C:\Program Files\Microsoft SDKs\Service Fabric\Tools\ServiceFabricLocalClusterManager;C:\Program Files\nodejs\;C:\Program Files\NVIDIA Corporation\Nsight Compute 2025.3.0\;C:\Program Files\Git\cmd;C:\Program Files\CMake\bin;C:\Users\moidhassan\AppData\Local\Microsoft\Window

In [3]:
# ============================================
# üìò Cell 2A ‚Äî Load & Prepare Synthetic Seller Email Dataset (Simple List)
# ============================================
import os
import json
import pandas as pd

def load_seller_emails(file_path="data/seller_emails.json"):
    """
    Load the synthetic seller emails dataset where each element is a full email body (string).
    Example:
        [
            "Hi John, I wanted to share some details about our new Surface lineup...",
            "Dear Priya, our latest enterprise offers might interest your company..."
        ]
    Returns a pandas DataFrame with one column: email_text
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"‚ùå File not found: {file_path}")

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError("‚ùå Expected JSON to be a list of email strings.")
    if not all(isinstance(e, str) for e in data):
        raise ValueError("‚ùå Each element in the JSON list must be a string (email body).")

    df = pd.DataFrame({"email_text": data})
    df.loc[:,"len_email_text"] = df["email_text"].str.len()

    # Clean and normalize text
    df["email_text"] = df["email_text"].str.replace(r"\s+", " ", regex=True).str.strip()
    df = df.drop_duplicates(subset=["email_text"]).reset_index(drop=True)

    print(f"‚úÖ Loaded {len(df)} seller emails from {file_path}")
    return df


def save_cleaned_dataset(df, output_path="data/seller_emails_clean.csv"):
    """
    Save the cleaned dataset to CSV for use in fine-tuning.
    """
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"üìÅ Cleaned dataset saved at: {output_path}")


In [4]:
# ============================================
# üìò Step 2B ‚Äî Execute & Inspect Synthetic Seller Dataset
# ============================================
DATA_PATH = "data/seller_emails_v3.json"
OUTPUT_PATH = "data/seller_emails_clean.csv"

df_seller = load_seller_emails(DATA_PATH)
save_cleaned_dataset(df_seller, OUTPUT_PATH)

print("\n‚úÖ Sample Cleaned Emails:\n")
for i in range(min(3, len(df_seller))):
    print(f"--- Email #{i+1} ---")
    print(df_seller.iloc[i]["email_text"])
    print()


‚úÖ Loaded 38 seller emails from data/seller_emails_v3.json
üìÅ Cleaned dataset saved at: data/seller_emails_clean.csv

‚úÖ Sample Cleaned Emails:

--- Email #1 ---
Biodegradable packaging: lower footprint, same cost. Talk?

--- Email #2 ---
Dear HR Director, I'm Jennifer from HealthFirst Wellness, and I wanted to share an opportunity that could significantly impact your employee satisfaction and retention. Our corporate wellness programs have helped over 200 companies reduce healthcare costs by an average of $450 per employee annually while boosting morale. Can we schedule a brief demo to show you how easy implementation can be? Warm regards, Jennifer Park Corporate Wellness Consultant HealthFirst Wellness

--- Email #3 ---
Learning platform lifts test scores 18%. Pilot?



In [5]:
df_seller

Unnamed: 0,email_text,len_email_text
0,"Biodegradable packaging: lower footprint, same...",58
1,"Dear HR Director, I'm Jennifer from HealthFirs...",478
2,Learning platform lifts test scores 18%. Pilot?,47
3,"Hello, I work with boutique hotels to enhance ...",432
4,Cut cloud spend 30‚Äì40%. 15‚Äëmin chat next week?...,57
5,"Hello, I'm Jake from RetailAnalytics Plus, and...",436
6,Logistics costs down 20‚Äì30%. Explore fulfillment?,49
7,"Hi, I'm reaching out from DataSync Solutions b...",427
8,"Dental no‚Äëshows down 40%, throughput up 15%. D...",50
9,"Good morning, As consumers increasingly prefer...",478


In [6]:
# ============================================
# üìò Step 3A ‚Äî Load Base Model & Tokenize Dataset
# ============================================
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch

def load_base_model(model_name="distilgpt2"):
    """
    Load the base causal language model and tokenizer.
    Uses GPU if available.
    """
    print(f"üöÄ Loading base model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(model_name)
    model = model.to("cuda" if torch.cuda.is_available() else "cpu")

    print(f"‚úÖ Model loaded on: {'cuda' if torch.cuda.is_available() else 'cpu'}")
    return model, tokenizer


def tokenize_seller_dataset(df, tokenizer, max_length=256):
    """
    Tokenize the seller email dataset for RL training.
    Converts each email into input_ids and attention masks.
    """
    print("üîÑ Tokenizing seller emails...")
    dataset = Dataset.from_pandas(df)

    def tokenize_fn(example):
        return tokenizer(
            example["email_text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    tokenized_dataset = dataset.map(tokenize_fn, batched=True)
    tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    print(f"‚úÖ Tokenized {len(tokenized_dataset)} samples")
    return tokenized_dataset


In [7]:
# ============================================
# üìò Step 3B ‚Äî Execute Model & Tokenizer Setup
# ============================================
MODEL_NAME = "distilgpt2"

model, tokenizer = load_base_model(MODEL_NAME)
tokenized_seller_dataset = tokenize_seller_dataset(df_seller, tokenizer)

# Inspect one sample
sample = tokenized_seller_dataset[0]
print("\nüìù Example Tokenized Sample:")
print("Input IDs:", sample["input_ids"][:40])
print("Decoded text:", tokenizer.decode(sample["input_ids"], skip_special_tokens=True)[:200])


üöÄ Loading base model: distilgpt2




‚úÖ Model loaded on: cuda
üîÑ Tokenizing seller emails...


Map:   0%|          | 0/38 [00:00<?, ? examples/s]

‚úÖ Tokenized 38 samples

üìù Example Tokenized Sample:
Input IDs: tensor([23286,  1098,  9744,   540, 16846,    25,  2793, 24713,    11,   976,
         1575,    13, 12167,    30, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])
Decoded text: Biodegradable packaging: lower footprint, same cost. Talk?


In [8]:
# ============================================
# üß© Step 4A ‚Äî Define PPO Model and Trainer Functions
# ============================================
import torch
from trl import AutoModelForCausalLMWithValueHead, PPOTrainer, PPOConfig

def prepare_model_for_ppo(base_model_name: str):
    """
    Load a pretrained causal LM and wrap it with a value head for PPO training.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"üß© Loading and wrapping '{base_model_name}' on {device}...")

    ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(base_model_name)
    ppo_model = ppo_model.to(device)

    print("‚úÖ PPO-ready model created.")
    return ppo_model


def create_ppo_trainer(model, tokenizer, learning_rate=1e-5, batch_size=2, log_with=None):
    """
    Initialize PPO Trainer with config.
    """
    print("‚öôÔ∏è Initializing PPO Trainer...")
    config = PPOConfig(
        model_name=None,  # already loaded
        learning_rate=learning_rate,
        batch_size=batch_size,
        mini_batch_size=batch_size,
        optimize_cuda_cache=True,
        log_with=log_with
    )
    trainer = PPOTrainer(config=config, model=model, tokenizer=tokenizer)
    print("‚úÖ PPO Trainer initialized.")
    return trainer


In [9]:
# ============================================
# ‚öôÔ∏è Step 4B ‚Äî Execute PPO Setup
# ============================================

MODEL_NAME = "distilgpt2"
learning_rate = 1e-5
batch_size = 2

ppo_model = prepare_model_for_ppo(MODEL_NAME)
ppo_trainer = create_ppo_trainer(ppo_model, tokenizer, learning_rate=learning_rate, batch_size=batch_size)

print("\n‚úÖ Model and PPO Trainer are ready for fine-tuning.")


üß© Loading and wrapping 'distilgpt2' on cuda...


  state_dict = loading_func(filename if not use_safe else safe_filename, **load_kwargs)


‚úÖ PPO-ready model created.
‚öôÔ∏è Initializing PPO Trainer...




‚úÖ PPO Trainer initialized.

‚úÖ Model and PPO Trainer are ready for fine-tuning.


In [12]:
df_seller.head()

Unnamed: 0,email_text,len_email_text
0,"Biodegradable packaging: lower footprint, same...",58
1,"Dear HR Director, I'm Jennifer from HealthFirs...",478
2,Learning platform lifts test scores 18%. Pilot?,47
3,"Hello, I work with boutique hotels to enhance ...",432
4,Cut cloud spend 30‚Äì40%. 15‚Äëmin chat next week?...,57


In [None]:
# ============================================
# üß© Step 5A ‚Äî Reward Modeling Functions (Fully Explained)
# ============================================
from transformers import pipeline
import numpy as np
import torch
import re
import pandas as pd

def load_sentiment_analyzer(use_gpu=True):
    """
    Load sentiment analyzer (DistilBERT fine-tuned on SST-2).
    """
    device = 0 if use_gpu and torch.cuda.is_available() else -1
    print(f"üîç Loading sentiment analyzer on {'GPU' if device == 0 else 'CPU'}...")
    analyzer = pipeline("sentiment-analysis",
                        model="distilbert-base-uncased-finetuned-sst-2-english",
                        device=device)
    return analyzer

def has_cta_phrase(txt):
    """
    Detect CTA (Call-to-Action) phrases in text.
    Returns (bool, list_of_matched_phrases)
    """
    txt_lower = txt.lower()
    
    # Define CTAs (mix of single and multiword)
    cta_phrases = [
        "call", "reply", "schedule", "meet",
        "connect", "reach out", "get in touch", "book a demo",
        "set up a meeting", "schedule a call", "contact us"
    ]
    
    matched_phrases = []

    for phrase in cta_phrases:
        # Handle multiword phrases directly
        if " " in phrase:
            if phrase in txt_lower:
                matched_phrases.append(phrase)
        else:
            # Match whole word only (e.g., "call" ‚â† "calling")
            if re.search(rf"\b{re.escape(phrase)}\b", txt_lower):
                matched_phrases.append(phrase)

    has_match = len(matched_phrases) > 0
    return has_match, matched_phrases

def compute_reward(text: str, sentiment_analyzer=None, tool=None, weights=None, detailed=False):
    """
    Compute and explain the reward breakdown for a given email.
    Categories:
      1. Length Reward
      2. Politeness Reward
      3. Sentiment Reward
      4. Clarity
      5. CTA (Call-to-Action)
      6. Personalization
      7. Grammar
      8. Value Proposition
      9. Spam Avoidance
      10. Structure
    """
    txt = (text or "").strip()
    txt_lower = txt.lower()
    weights = weights or {"length": 1.0, "politeness": 1.0, "sentiment": 1.0}

    # --- 1Ô∏è‚É£ Length ---
    length = len(txt)
    if 100 <= length <= 300:
        length_r = 1.0
        length_reason = f"‚úÖ Ideal length ({length} chars between 100‚Äì300)."
    elif length > 300 and length <= 450:
        length_r = 0.2
        length_reason = f"‚ö†Ô∏è Slightly long ({length} chars > 300)."
    elif length > 450:
        length_r = -0.7
        length_reason = f"‚ö†Ô∏è Too long ({length} chars > 450)."
    else:
        length_r = -0.7
        length_reason = f"‚ö†Ô∏è Too short ({length} chars < 100)."

    # --- 2Ô∏è‚É£ Politeness ---
    polite_terms = ["thank", "appreciate", "please", "hope", "kindly", "regards", "grateful", "welcome", "would you", "could you"]
    found_terms = [term for term in polite_terms if re.search(rf"\b{term}\b", txt_lower)]
    polite_r = 0.5 * len(found_terms)
    if len(found_terms) > 0:
        polite_reason = f"‚úÖ Found polite terms: {', '.join(found_terms)} (+{polite_r:.1f})."
    else:
        polite_reason = "‚ö†Ô∏è No polite words detected."

    # --- 3Ô∏è‚É£ Sentiment ---
    sentiment_r = 0.0
    sentiment_reason = ""
    if sentiment_analyzer is not None:
        try:
            out = sentiment_analyzer(txt[:512])
            if out and isinstance(out, list) and "label" in out[0]:
                label = out[0]["label"].upper()
                score = out[0].get("score", 0)
                if label.startswith("POS"):
                    sentiment_r = 1.0
                    sentiment_reason = f"‚úÖ Positive tone detected (score={score:.2f})."
                elif label.startswith("NEG"):
                    sentiment_r = -0.3
                    sentiment_reason = f"‚ö†Ô∏è Negative tone detected (score={score:.2f})."
                else:
                    sentiment_reason = f"üòê Neutral tone detected (score={score:.2f})."
        except Exception as e:
            sentiment_reason = f"‚ö†Ô∏è Sentiment analysis failed: {e}"
            sentiment_r = 0.0

    # --- 4Ô∏è‚É£ Clarity ---
    unclear_terms = ["utilize", "leverage", "synergy", "paradigm", "bandwidth", "ecosystem", "turnkey", "disruptive"]
    found_terms = [term for term in unclear_terms if re.search(rf"\b{term}\b", txt_lower)]
    clarity_r = -0.3 * len(found_terms) if found_terms else 0.7
    clarity_reason = f"‚ö†Ô∏è Found unclear terms: {', '.join(found_terms)}" if found_terms else "‚úÖ Clear and simple language."

    # --- 5Ô∏è‚É£ CTA (Call-to-Action) ---
    cta_phrases = ["schedule a call", "book a demo", "let‚Äôs connect", "reply", "meet", "call", "let me know", "get started", "sign up", "try it now"]
    has_cta = any(p in txt_lower for p in cta_phrases)
    #has_cta = has_cta_phrase(txt)
    has_cta, matched_ctas = has_cta_phrase(txt)
    cta_r = 1.0 if has_cta else -0.8
    cta_reason = "‚úÖ Contains clear call-to-action words like " + ", ".join(matched_ctas) if has_cta else "‚ö†Ô∏è No clear call-to-action."

    # --- 6Ô∏è‚É£ Personalization ---
    personalization_terms = ["you", "your team", "your company", "dear", "hello"]
    personalized = any(t in txt_lower for t in personalization_terms)
    personalization_r = 0.7 if personalized else -0.2
    personalization_reason = "‚úÖ Personalized tone with terms like " + ", ".join([t for t in personalization_terms if t in txt_lower]) if personalized else "‚ö†Ô∏è No personalization detected."

    # --- 7Ô∏è‚É£ Grammar ---
    grammar_r, grammar_reason = 0.0, ""
    if tool is not None:
        try:
            correction = tool.check(txt)
            n_errors = len(correction)
            if n_errors == 0:
                grammar_r = 1.0
                grammar_reason = "‚úÖ No grammatical errors detected."
            elif n_errors < 4:
                grammar_r = 0.5
                grammar_reason = f"‚ö†Ô∏è Few grammatical errors detected ({n_errors} issues)."
            else:
                grammar_r = -0.3
                grammar_reason = f"‚ùå Many grammatical errors detected ({n_errors} issues)."
        except Exception as e:
            grammar_reason = f"‚ö†Ô∏è Grammar check failed: {e}"
            grammar_r = 0.0

    # --- 8Ô∏è‚É£ Value Proposition ---
    value_terms = re.findall(r"\b(save|reduce|increase|boost|improve|growth|roi|cost|revenue|profit)\b", txt, re.I)
    value_r = min(len(value_terms) * 0.5, 1.0)
    value_reason = f"‚úÖ Value terms found: {', '.join(set(value_terms))}." if value_terms else "‚ö†Ô∏è No value proposition terms."

    # --- 9Ô∏è‚É£ Spam Avoidance ---
    spam_terms = ["free", "winner", "click here", "urgent", "act now", "limited time", "guarantee"]
    found_spam = [term for term in spam_terms if re.search(rf"\b{term}\b", txt_lower)]
    spam_r = -0.8 * len(found_spam) if found_spam else 0.5
    spam_reason = f"‚ùå Spammy terms found: {', '.join(found_spam)}." if found_spam else "‚úÖ No spammy terms detected."

    # --- üîü Structure ---
    has_greeting = bool(re.search(r"\b(dear|hi|hello|greetings|to whom it may concern)\b", txt_lower))
    has_closing = bool(re.search(r"(regards|sincerely|best)", txt_lower))
    if has_greeting and has_closing:
        structure_r, structure_reason = 1.0, "‚úÖ Proper greeting and closing."
    elif has_greeting or has_closing:
        structure_r, structure_reason = 0.5, "‚ö†Ô∏è Missing either greeting or closing."
    else:
        structure_r, structure_reason = -0.5, "‚ö†Ô∏è Missing both greeting and closing."

    # --- Weighted total ---
    total_reward = (
        weights["length"] * length_r
        + weights["politeness"] * polite_r
        + weights["sentiment"] * sentiment_r
        + weights["clarity"] * clarity_r
        + weights["cta"] * cta_r
        + weights["personalization"] * personalization_r
        + weights["grammar"] * grammar_r
        + weights["value"] * value_r
        + weights["spam"] * spam_r
        + weights["structure"] * structure_r
    )
    
    print(f"total_reward before clipping: {total_reward}")
    total_reward = float(np.clip(total_reward, -4.0, 6.0))
    print(f"total_reward after clipping: {total_reward}")

    if detailed:
        return total_reward, {
            "length": length_r,
            "politeness": polite_r,
            "sentiment": sentiment_r,
            "clarity": clarity_r,
            "cta": cta_r,
            "personalization": personalization_r,
            "grammar": grammar_r,
            "value": value_r,
            "spam": spam_r,
            "structure": structure_r,
            "reasons": {
                "length": length_reason,
                "politeness": polite_reason,
                "sentiment": sentiment_reason,
                "clarity": clarity_reason,
                "cta": cta_reason,
                "personalization": personalization_reason,
                "grammar": grammar_reason,
                "value": value_reason,
                "spam": spam_reason,
                "structure": structure_reason
            }
        }
    return total_reward


def test_reward_function(df, reward_fn, n_samples=3, **kwargs):
    """
    Evaluate reward on sample seller emails with complete reasoning.
    """
    print("\nüßÆ Testing Reward Function with Explanations:\n")
    sample_emails = df["email_text"].sample(n_samples, random_state=42)

    weights = kwargs.get("weights", {"length": 1.0, "politeness": 1.0, "sentiment": 1.0, "clarity":1.0, "cta":1.0, "personalization":1.0, "grammar":1.0, "value":1.0, "spam":1.0, "structure":1.0})

    for i, email in enumerate(sample_emails, 1):
        
        print(f"üìß Email #{i}")
        print(f"Excerpt: {email[:180]}...\n")
        
        total, breakdown = reward_fn(email, detailed=True, **kwargs)
        reasons = breakdown["reasons"]

        print("üßæ Reward Components & Explanations:")
        print(f"  ‚îú‚îÄ Length Reward:     {breakdown['length']:+.2f} - {reasons['length']}")
        print(f"  ‚îú‚îÄ Politeness Reward: {breakdown['politeness']:+.2f} - {reasons['politeness']}")
        print(f"  ‚îî‚îÄ Sentiment Reward:  {breakdown['sentiment']:+.2f} - {reasons['sentiment']}")
        print(f"  ‚îú‚îÄ Clarity Reward:    {breakdown['clarity']:+.2f} - {reasons['clarity']}")
        print(f"  ‚îú‚îÄ CTA Reward:        {breakdown['cta']:+.2f} - {reasons['cta']}")
        print(f"  ‚îú‚îÄ Personalization:   {breakdown['personalization']:+.2f} - {reasons['personalization']}")
        print(f"  ‚îú‚îÄ Grammar Reward:    {breakdown['grammar']:+.2f} - {reasons['grammar']}")
        print(f"  ‚îú‚îÄ Value Prop Reward: {breakdown['value']:+.2f} - {reasons['value']}")
        print(f"  ‚îú‚îÄ Spam Avoidance:    {breakdown['spam']:+.2f} - {reasons['spam']}")
        print(f"  ‚îî‚îÄ Structure Reward:  {breakdown['structure']:+.2f} - {reasons['structure']}")
        print(f"  ‚Ä¢ Weights Used:      {weights}")

        # Explicit calculation formula
        calc_str = (
            f"({weights['length']}√ó{breakdown['length']:.2f}) + "
            f"({weights['politeness']}√ó{breakdown['politeness']:.2f}) + "
            f"({weights['sentiment']}√ó{breakdown['sentiment']:.2f}) + "
            f"({weights['clarity']}√ó{breakdown['clarity']:.2f}) + "
            f"({weights['cta']}√ó{breakdown['cta']:.2f}) + "
            f"({weights['personalization']}√ó{breakdown['personalization']:.2f}) + "
            f"({weights['grammar']}√ó{breakdown['grammar']:.2f}) + "
            f"({weights['value']}√ó{breakdown['value']:.2f}) + "
            f"({weights['spam']}√ó{breakdown['spam']:.2f}) + "
            f"({weights['structure']}√ó{breakdown['structure']:.2f})"
        )
        intermediate_sum = (
            weights["length"] * breakdown["length"]
            + weights["politeness"] * breakdown["politeness"]
            + weights["sentiment"] * breakdown["sentiment"]
            + weights["clarity"] * breakdown["clarity"]
            + weights["cta"] * breakdown["cta"]
            + weights["personalization"] * breakdown["personalization"]
            + weights["grammar"] * breakdown["grammar"]
            + weights["value"] * breakdown["value"]
            + weights["spam"] * breakdown["spam"]
            + weights["structure"] * breakdown["structure"]
        )

        print("\nüßÆ Calculation:")
        print(f"  = {calc_str}")
        print(f"  = {intermediate_sum:.2f}")
        print(f"  ‚Üí Final Clipped Reward: {total:.2f}")
        print("-" * 70)


In [17]:
# ============================================
# ‚öôÔ∏è Step 5B ‚Äî Run Reward Evaluation (Extended)
# ============================================
from language_tool_python import LanguageTool

# ‚úÖ Load sentiment model once (cached)
if "sentiment_analyzer" not in globals():
    sentiment_analyzer = load_sentiment_analyzer(use_gpu=True)

# ‚úÖ Load LanguageTool only once (cached)
if "tool" not in globals():
    print("üß† Initializing LanguageTool (cached once)...")
    tool = LanguageTool('en-US')

# Define reward weights
reward_weights = {
    "length": 1.2,
    "politeness": 1.2,
    "sentiment": 0.7,
    "clarity": 0.6,
    "cta": 1.4,
    "personalization": 0.7,
    "grammar": 0.8,
    "value": 1.1,
    "spam": 0.8,
    "structure": 0.8
}
# Evaluate on sample emails (now with extended components and explanations)
test_reward_function(
    df_seller,
    compute_reward,
    n_samples=5,
    sentiment_analyzer=sentiment_analyzer,
    tool=tool,
    weights=reward_weights
)


üßÆ Testing Reward Function with Explanations:

total_reward before clipping: 1.98
total_reward after clipping: 1.98
üìß Email #1
Excerpt: Fleet telematics can save ‚âà$85K annually. Brief call?...

üßæ Reward Components & Explanations:
  ‚îú‚îÄ Length Reward:     -0.70 - ‚ö†Ô∏è Too short (53 chars < 100).
  ‚îú‚îÄ Politeness Reward: +0.00 - ‚ö†Ô∏è No polite words detected.
  ‚îî‚îÄ Sentiment Reward:  -0.30 - ‚ö†Ô∏è Negative tone detected (score=1.00).
  ‚îú‚îÄ Clarity Reward:    +0.70 - ‚úÖ Clear and simple language.
  ‚îú‚îÄ CTA Reward:        +1.00 - ‚úÖ Contains clear call-to-action words like call
  ‚îú‚îÄ Personalization:   -0.20 - ‚ö†Ô∏è No personalization detected.
  ‚îú‚îÄ Grammar Reward:    +1.00 - ‚úÖ No grammatical errors detected.
  ‚îú‚îÄ Value Prop Reward: +0.50 - ‚úÖ Value terms found: save.
  ‚îú‚îÄ Spam Avoidance:    +0.50 - ‚úÖ No spammy terms detected.
  ‚îî‚îÄ Structure Reward:  -0.50 - ‚ö†Ô∏è Missing both greeting and closing.
  ‚Ä¢ Weights Used:      {'length

Downloading LanguageTool latest: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 254M/254M [00:52<00:00, 4.84MB/s] 
Unzipping C:\Users\MOIDHA~1\AppData\Local\Temp\tmpur1grx_3.zip to C:\Users\moidhassan\.cache\language_tool_python.
Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to C:\Users\moidhassan\.cache\language_tool_python.


total_reward before clipping: 6.960000000000001
total_reward after clipping: 6.0
üìß Email #4
Excerpt: Hi, We optimize e-commerce shipping and fulfillment. Clients see 20-30% logistics cost reduction through our warehouse network and carrier partnerships. Brief call about your fulfi...

üßæ Reward Components & Explanations:
  ‚îú‚îÄ Length Reward:     +1.00 - ‚úÖ Ideal length (242 chars between 100‚Äì300).
  ‚îú‚îÄ Politeness Reward: +0.50 - ‚úÖ Found polite terms: regards (+0.5).
  ‚îî‚îÄ Sentiment Reward:  +1.00 - ‚úÖ Positive tone detected (score=0.85).
  ‚îú‚îÄ Clarity Reward:    +0.70 - ‚úÖ Clear and simple language.
  ‚îú‚îÄ CTA Reward:        +1.00 - ‚úÖ Contains clear call-to-action words like call
  ‚îú‚îÄ Personalization:   +0.70 - ‚úÖ Personalized tone with terms like you
  ‚îú‚îÄ Grammar Reward:    +0.50 - ‚ö†Ô∏è Few grammatical errors detected (1 issues).
  ‚îú‚îÄ Value Prop Reward: +0.50 - ‚úÖ Value terms found: cost.
  ‚îú‚îÄ Spam Avoidance:    +0.50 - ‚úÖ No spammy te

In [14]:
# ============================================
# üß© Step 6A ‚Äî RL Environment for Email Task
# ============================================

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import random

class EmailEnv:
    """
    Minimal RL environment for email generation.
    The model generates an email ‚Üí we compute its reward.
    """
    def __init__(self, model_name="gpt2", sentiment_analyzer=None, tool=None, weights=None):
        print(f"üöÄ Initializing model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.sentiment_analyzer = sentiment_analyzer
        self.tool = tool
        self.weights = weights or {}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        self.model.eval()

    def generate_email(self, prompt, max_new_tokens=150, temperature=0.7):
        """
        Generate a sample email given a short prompt.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=self.tokenizer.eos_token_id
        )
        generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated

    def compute_reward(self, email_text):
        """
        Compute reward for a generated email.
        """
        total, details = compute_reward(
            email_text,
            detailed=True,
            sentiment_analyzer=self.sentiment_analyzer,
            tool=self.tool,
            weights=self.weights
        )
        return total, details


In [18]:
print(reward_weights)

{'length': 1.2, 'politeness': 1.2, 'sentiment': 0.7, 'clarity': 0.6, 'cta': 1.4, 'personalization': 0.7, 'grammar': 0.8, 'value': 1.1, 'spam': 0.8, 'structure': 0.8}


In [26]:
# ============================================
# ‚öôÔ∏è Step 6B ‚Äî Test Environment + Reward
# ============================================

# Initialize environment
env = EmailEnv(
    model_name="gpt2",
    sentiment_analyzer=sentiment_analyzer,
    tool=tool,
    weights=reward_weights
)

prompt = "Write a professional email to introduce a new Surface device to a potential enterprise buyer."
generated_email = env.generate_email(prompt)
reward, details = env.compute_reward(generated_email)

print("\nüìß Generated Email:\n", generated_email)
print("\nüèÜ Reward:", reward)
print("Breakdown:", details)
for key, val in details['reasons'].items():
    print(f"{key}: {val}")


üöÄ Initializing model: gpt2




total_reward before clipping: 3.3699999999999997
total_reward after clipping: 3.3699999999999997

üìß Generated Email:
 Write a professional email to introduce a new Surface device to a potential enterprise buyer.

1. Choose a Product

If you are an enterprise user, you will be able to purchase a Surface Pro 3. The Surface Pro 3 is the best choice. The price of the Surface Pro 3 is around $200. The Pro 3 is a tablet-friendly device (and the easiest to use). It has a built-in keyboard, and it's built-in camera.

The Surface Pro 3 is also a great choice for businesses looking to have a full-sized tablet. Many companies use the Surface Pro 3 as a desktop environment. They have large, powerful screens and the Pro 3 comes with a built-in camera with a built-in video and audio recorder.

2. Choose a Product



üèÜ Reward: 3.3699999999999997
Breakdown: {'length': -0.7, 'politeness': 0.0, 'sentiment': 1.0, 'clarity': 0.7, 'cta': 1.0, 'personalization': 0.7, 'grammar': 0.5, 'value': 0.0, 'spa