In [1]:
# %% [HUGGING FACE LOGIN]
# Hugging Face login cell — paste your token directly here.
# ⚠️ Only do this on your personal machine. Do NOT share or push notebooks with your token visible.
from huggingface_hub import login


hf_token = "" # <-- paste your actual Hugging Face token here
login(token=hf_token, add_to_git_credential=True)
print("✅ Logged into Hugging Face successfully!")

✅ Logged into Hugging Face successfully!


In [2]:
# %%
# CELL 1: Install dependencies
# Run in Anaconda or WSL2 if possible for stability.


import sys
print('Python', sys.version)


# Uncomment if needed:
# !pip install -q transformers accelerate datasets bitsandbytes trl peft auto-gptq sentence-transformers scikit-learn wandb


# %%
# CELL 2: Check GPU and library setup
import torch
print('Torch', torch.__version__, 'CUDA available:', torch.cuda.is_available())


from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
AutoModelForSequenceClassification,
BitsAndBytesConfig,
TrainingArguments,
Trainer,
)


try:
    from trl import PPOTrainer, PPOConfig
except Exception as e:
    print('TRL not installed or incompatible:', e)

Python 3.11.14 | packaged by Anaconda, Inc. | (main, Oct 21 2025, 18:30:03) [MSC v.1929 64 bit (AMD64)]
Torch 2.5.1+cu121 CUDA available: True


In [3]:
# %%
# CELL 3: Setup paths and directories
from pathlib import Path
PROJECT_DIR = Path('rlft_project')
PROJECT_DIR.mkdir(exist_ok=True)
MODEL_DIR = PROJECT_DIR / 'models'
MODEL_DIR.mkdir(exist_ok=True)
DATA_PATH = PROJECT_DIR / 'emails.csv'

In [6]:
import random
import pandas as pd

random.seed(42)

industries = [
    "Retail", "Finance", "IT Services", "Manufacturing", 
    "Travel", "Healthcare", "Education", "E-commerce", "Real Estate"
]

products = {
    "Retail": ["POS system", "inventory app", "billing software"],
    "Finance": ["payment gateway", "investment dashboard", "fraud detection API"],
    "IT Services": ["AI chatbot", "CRM integration", "cloud migration service"],
    "Manufacturing": ["supply chain tool", "equipment monitoring system", "IoT sensors"],
    "Travel": ["booking automation platform", "hotel CRM", "analytics dashboard"],
    "Healthcare": ["patient management system", "appointment scheduler", "telehealth app"],
    "Education": ["learning management system", "student CRM", "assessment platform"],
    "E-commerce": ["email automation tool", "conversion tracking app", "inventory sync system"],
    "Real Estate": ["lead management software", "property analytics tool", "CRM suite"]
}

def generate_context(industry, product):
    role = random.choice([
        "procurement head", "operations manager", "founder", "marketing director",
        "CTO", "CFO", "sales head", "admin manager"
    ])
    intent = random.choice([
        "looking for solutions", "asked about pricing", "requested demo", "cold prospect",
        "was referred by a partner", "attended a webinar", "existing customer"
    ])
    return f"Customer: {role} at a {industry} firm {intent} for a {product}."

def generate_email(industry, product, outcome):
    openings = [
        "Hi there,", "Hello [Name],", "Greetings,", "Hey [Name],", "Good morning,"
    ]
    pitches = [
        f"Our {product} has helped several {industry.lower()} companies optimize their workflow.",
        f"We designed our {product} specifically for {industry.lower()} businesses like yours.",
        f"I noticed your company operates in {industry.lower()} — our {product} could be a great fit.",
        f"With our {product}, teams in {industry.lower()} are saving hours every week."
    ]
    closings = [
        "Would you be open to a quick call this week?",
        "Can I send over a short demo video?",
        "Happy to share a case study if interested.",
        "Would love to show how this could help your team."
    ]

    if outcome == 0:
        # No reply: generic, slightly spammy, less personalized
        tone = random.choice([
            "We offer best-in-class pricing for your needs. Let's connect soon.",
            "Don’t miss this opportunity to upgrade your business solutions.",
            "We are reaching out to introduce our services. Please respond for more info.",
            "Exclusive offer valid till end of this week!"
        ])
    elif outcome == 1:
        # Reply but no conversion: decent personalization, informative but not compelling
        tone = random.choice([
            f"Thanks for your interest earlier! Our {product} can streamline your operations easily.",
            f"I’m following up regarding your query on {product}. We’d be happy to provide more details.",
            f"Our {product} integrates seamlessly — perhaps a short demo could help?",
            f"Following up from our last email — any feedback on the proposal?"
        ])
    else:
        # Conversion: warm, clear CTA, value-oriented
        tone = random.choice([
            f"Appreciate your earlier response — many {industry.lower()} firms achieved 25% growth with our {product}.",
            f"Glad you found the demo useful! Let's schedule onboarding for your team next week.",
            f"Delighted to know you’re proceeding with our {product}. We'll ensure a smooth rollout.",
            f"Excited to start the partnership! Our support team will reach out to get you set up."
        ])

    return f"{random.choice(openings)} {random.choice(pitches)} {tone} {random.choice(closings)}"

# Create data
data = []
for _ in range(100):
    industry = random.choice(industries)
    product = random.choice(products[industry])
    outcome = random.choices([0,1,2], weights=[60,25,15])[0]
    context = generate_context(industry, product)
    email = generate_email(industry, product, outcome)
    data.append({
        "context": context,
        "email_text": email,
        "outcome": outcome
    })

df = pd.DataFrame(data)
df.to_csv("rlft_project/emails.csv", index=False)
print("✅ Created 100 synthetic email examples with realistic distribution and diversity.")
print(df['outcome'].value_counts(normalize=True))
df.head(5)


✅ Created 100 synthetic email examples with realistic distribution and diversity.
outcome
0    0.66
1    0.21
2    0.13
Name: proportion, dtype: float64


Unnamed: 0,context,email_text,outcome
0,Customer: marketing director at a Finance firm...,"Hi there, Our payment gateway has helped sever...",1
1,Customer: marketing director at a Retail firm ...,"Good morning, We designed our POS system speci...",0
2,Customer: procurement head at a Manufacturing ...,"Hey [Name], I noticed your company operates in...",0
3,Customer: CFO at a IT Services firm looking fo...,"Hey [Name], Our AI chatbot has helped several ...",2
4,Customer: procurement head at a Healthcare fir...,"Good morning, Our telehealth app has helped se...",0


In [4]:
# %%
# CELL 4: Load dataset
import pandas as pd

df = pd.read_csv(DATA_PATH)
print('Data loaded:', len(df), 'rows')

Data loaded: 100 rows


In [5]:
# %%
# CELL 5: Create prompt templates
PROMPT_TEMPLATE = (
"You are a salesperson writing an outreach or reply email.\n"
"Context:\n{context}\n\n"
"Write a persuasive, concise email that a seller would write to a prospect or a customer.\n\n"
"Email:\n"
)

df['prompt'] = df['context'].apply(lambda c: PROMPT_TEMPLATE.format(context=c))
df['input_text'] = df['prompt']
df['target_text'] = df['email_text']
print(df.shape)
df.head(3)

(100, 6)


Unnamed: 0,context,email_text,outcome,prompt,input_text,target_text
0,Customer: marketing director at a Finance firm...,"Hi there, Our payment gateway has helped sever...",1,You are a salesperson writing an outreach or r...,You are a salesperson writing an outreach or r...,"Hi there, Our payment gateway has helped sever..."
1,Customer: marketing director at a Retail firm ...,"Good morning, We designed our POS system speci...",0,You are a salesperson writing an outreach or r...,You are a salesperson writing an outreach or r...,"Good morning, We designed our POS system speci..."
2,Customer: procurement head at a Manufacturing ...,"Hey [Name], I noticed your company operates in...",0,You are a salesperson writing an outreach or r...,You are a salesperson writing an outreach or r...,"Hey [Name], I noticed your company operates in..."


In [6]:
print(df.loc[0,"input_text"])
print("\n=====================\n")
print(df.loc[0,"target_text"])

You are a salesperson writing an outreach or reply email.
Context:
Customer: marketing director at a Finance firm asked about pricing for a payment gateway.

Write a persuasive, concise email that a seller would write to a prospect or a customer.

Email:



Hi there, Our payment gateway has helped several finance companies optimize their workflow. I’m following up regarding your query on payment gateway. We’d be happy to provide more details. Would love to show how this could help your team.


In [7]:
# %%
# CELL 6: Reward Model (DistilBERT-based classifier)
from sklearn.model_selection import train_test_split


train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['outcome'], random_state=42)


from datasets import Dataset
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)


rm_model_name = 'distilbert-base-uncased'
rm_tokenizer = AutoTokenizer.from_pretrained(rm_model_name)


def preprocess_rm(batch):
    texts = [c + "\n" + e for c, e in zip(batch["context"], batch["email_text"])]
    enc = rm_tokenizer(texts, truncation=True, padding='max_length', max_length=256)
    enc["labels"] = batch["outcome"]  # <-- crucial fix
    return enc


train_ds = train_ds.map(preprocess_rm, batched=True)
val_ds = val_ds.map(preprocess_rm, batched=True)

train_ds = train_ds.remove_columns(['context', 'email_text', 'outcome'])
val_ds = val_ds.remove_columns(['context', 'email_text', 'outcome'])

train_ds.set_format(type="torch")
val_ds.set_format(type="torch")

rm_model = AutoModelForSequenceClassification.from_pretrained(rm_model_name, num_labels=3)


training_args = TrainingArguments(
output_dir=str(MODEL_DIR / 'rm'),
num_train_epochs=5,
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
learning_rate=2e-5,
fp16=torch.cuda.is_available(),
logging_steps=10,
)


rm_trainer = Trainer(
model=rm_model,
args=training_args,
train_dataset=train_ds,
eval_dataset=val_ds,
)


rm_trainer.train() # Uncomment to train


print('Reward Model setup complete (train step commented).')



Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
wandb: Currently logged in as: moid (moid-microsoft) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


  0%|          | 0/50 [00:00<?, ?it/s]

{'loss': 1.0006, 'grad_norm': 2.3375067710876465, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}
{'loss': 0.8494, 'grad_norm': 4.1476263999938965, 'learning_rate': 1.2e-05, 'epoch': 2.0}
{'loss': 0.7654, 'grad_norm': 4.163939952850342, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}
{'loss': 0.693, 'grad_norm': 2.1181089878082275, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}
{'loss': 0.6398, 'grad_norm': 2.9335880279541016, 'learning_rate': 0.0, 'epoch': 5.0}
{'train_runtime': 36.2278, 'train_samples_per_second': 11.041, 'train_steps_per_second': 1.38, 'train_loss': 0.789647216796875, 'epoch': 5.0}
Reward Model setup complete (train step commented).


In [9]:
# %%
# CELL 7: Reward function
import torch.nn.functional as F


def reward_fn(prompts, responses):
    texts = [p + '\n' + r for p,r in zip(prompts, responses)]
    print(f"texts - {texts}")
    enc = rm_tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors='pt')
    if torch.cuda.is_available():
        enc = {k:v.to('cuda') for k,v in enc.items()}
    with torch.no_grad():
        out = rm_model(**enc)
        probs = F.softmax(out.logits, dim=-1)
        score = probs[:,2]*1.0 + probs[:,1]*0.5 # map 2→1.0, 1→0.5
    return score.detach().cpu().tolist()

In [34]:
# CELL 8: Load Mistral-7B-Instruct without quantization (standard FP16/BF16)
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# --- Model (no bitsandbytes) ---
# If you want faster performance with smaller VRAM footprint, use torch_dtype=torch.float16
# or torch.bfloat16 if your GPU supports it (your RTX 4060 does)
policy_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,   # or torch.bfloat16 for Ampere+
    device_map=None,           # Let HF/Accelerate handle GPU/CPU split
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

policy_model.config.use_cache = True
policy_model = policy_model.to("cuda")   # ✅ manually move to GPU
policy_model.eval()

print("✅ Loaded Mistral-7B-Instruct successfully (no quantization).")


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Loaded Mistral-7B-Instruct successfully (no quantization).


In [30]:
model_name

'mistralai/Mistral-7B-Instruct-v0.2'

In [38]:
# %%
# CELL 9: PPO Config & Trainer setup
try:
    # Convert your policy model into a value-head model
    model_for_ppo = AutoModelForCausalLMWithValueHead.from_pretrained(policy_model)

    ppo_config = PPOConfig(
    model_name=model_name,
    learning_rate=1e-6,
    batch_size=int(4),
    mini_batch_size=int(1),
    ppo_epochs=int(2),
    gradient_accumulation_steps=int(2),
    )


    ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model_for_ppo,
    tokenizer=tokenizer,
    )
except Exception as e:
    print('Error initializing PPOTrainer:', e)
    ppo_trainer = None

Error initializing PPOTrainer: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.



In [39]:
# CELL 9: PPO Config & Trainer setup
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

try:
    # Convert your policy model into a value-head model
    model_for_ppo = AutoModelForCausalLMWithValueHead.from_pretrained(policy_model)

    # Define PPO configuration
    ppo_config = PPOConfig(
        model_name=model_name,
        learning_rate=1e-6,
        batch_size=8,
        mini_batch_size=4,
        ppo_epochs=2,
        gradient_accumulation_steps=1,
        log_with=None,
        remove_unused_columns=False,
    )

    # Create PPO trainer
    ppo_trainer = PPOTrainer(
        config=ppo_config,
        model=model_for_ppo,
        tokenizer=tokenizer,
    )

    print("✅ PPOTrainer initialized successfully with value head!")

except Exception as e:
    print("❌ Error initializing PPOTrainer:", e)
    ppo_trainer = None


❌ Error initializing PPOTrainer: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

