# Fake Dataset and RL Runner

In [60]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta

df = pd.read_csv('/Users/janduplessis/code/janduplessis883/attribut-app/data/synthetic_dataset2.csv')

In [61]:
df.head()

Unnamed: 0,state,action,reward,next_state
0,"{'tasks': [{'id': 0, 'name': 'monetize efficie...","{'task_id': 6, 'time_block': (datetime.datetim...",0.1,"{'tasks': [{'id': 0, 'name': 'monetize efficie..."
1,"{'tasks': [{'id': 0, 'name': 'empower customiz...","{'task_id': 1, 'time_block': (datetime.datetim...",0.1,"{'tasks': [{'id': 0, 'name': 'empower customiz..."
2,"{'tasks': [{'id': 0, 'name': 'maximize 24/7 we...","{'task_id': 3, 'time_block': (datetime.datetim...",0.2,"{'tasks': [{'id': 0, 'name': 'maximize 24/7 we..."
3,"{'tasks': [{'id': 0, 'name': 'productize B2C m...","{'task_id': 0, 'time_block': (datetime.datetim...",0.1,"{'tasks': [{'id': 1, 'name': 'incubate strateg..."
4,"{'tasks': [{'id': 0, 'name': 'brand 24/365 syn...","{'task_id': 1, 'time_block': (datetime.datetim...",0.1,"{'tasks': [{'id': 0, 'name': 'brand 24/365 syn..."


In [62]:
df['state'][1]

"{'tasks': [{'id': 0, 'name': 'empower customized channels', 'deadline': datetime.datetime(2025, 2, 11, 0, 12, 43, 449336), 'priority': 2, 'duration': 30, 'dependencies': []}, {'id': 1, 'name': 'redefine revolutionary models', 'deadline': datetime.datetime(2025, 2, 11, 11, 12, 2, 563684), 'priority': 3, 'duration': 30, 'dependencies': [0], 'type': 'meeting', 'scheduled_time': (datetime.datetime(2025, 2, 4, 11, 15), datetime.datetime(2025, 2, 4, 11, 45))}, {'id': 2, 'name': 'target compelling users', 'deadline': datetime.datetime(2025, 2, 11, 11, 27, 45, 325533), 'priority': 2, 'duration': 30, 'dependencies': [0]}], 'calendar': [{'start': datetime.datetime(2025, 2, 4, 12, 30), 'end': datetime.datetime(2025, 2, 4, 13, 0), 'type': 'meeting'}, {'start': datetime.datetime(2025, 2, 4, 13, 0), 'end': datetime.datetime(2025, 2, 4, 14, 0), 'type': 'meeting'}], 'preferences': {'productive_hours': [11, 18], 'work_days': [0, 1, 2, 3, 4]}}"

In [63]:
df['action'][0]

"{'task_id': 6, 'time_block': (datetime.datetime(2025, 2, 4, 14, 30), datetime.datetime(2025, 2, 4, 16, 0))}"

In [64]:
df['next_state'][2]

"{'tasks': [{'id': 0, 'name': 'maximize 24/7 web-readiness', 'deadline': datetime.datetime(2025, 2, 7, 0, 42, 20, 473396), 'priority': 1, 'duration': 90, 'dependencies': []}, {'id': 1, 'name': 'integrate strategic markets', 'deadline': datetime.datetime(2025, 2, 9, 15, 1, 10, 848200), 'priority': 3, 'duration': 30, 'dependencies': []}, {'id': 2, 'name': 'envisioneer open-source models', 'deadline': datetime.datetime(2025, 2, 11, 11, 20, 59, 135046), 'priority': 3, 'duration': 60, 'dependencies': []}, {'id': 4, 'name': 'evolve open-source solutions', 'deadline': datetime.datetime(2025, 2, 6, 0, 40, 43, 960830), 'priority': 3, 'duration': 30, 'dependencies': [3, 2], 'type': 'meeting'}, {'id': 5, 'name': 'extend enterprise models', 'deadline': datetime.datetime(2025, 2, 10, 11, 41, 18, 852463), 'priority': 2, 'duration': 30, 'dependencies': []}, {'id': 6, 'name': 'transition bleeding-edge portals', 'deadline': datetime.datetime(2025, 2, 6, 10, 25, 1, 957652), 'priority': 1, 'duration': 90

PreProcessing

In [65]:
import numpy as np
import pandas as pd
import json
import torch
from torch.utils.data import Dataset, DataLoader
from datetime import datetime, timedelta
import datetime as dt
from sklearn.model_selection import train_test_split

# -------------------------------
# 1. Helper Functions
# -------------------------------
def safe_eval(x):
    """
    Evaluate a string containing a dictionary with datetime objects.
    Provides the datetime module and timedelta for evaluation.
    """
    try:
        # Use the datetime module from dt so that "datetime.datetime(...)" works.
        return eval(x, {"datetime": dt, "timedelta": dt.timedelta})
    except Exception as e:
        print("Error evaluating string:", e)
        return x

def state_to_features(state, max_tasks=10, time_slots=48):
    """
    Convert the state dictionary into a numerical feature vector.

    Feature breakdown:
      - 2 features: sine & cosine of current time-of-day.
      - 1 feature: normalized productive hours remaining.
      - max_tasks * 6 features: for each task, include:
            * normalized priority,
            * days remaining until deadline,
            * normalized duration (relative to 4 hours),
            * normalized count of dependencies,
            * binary flag if type contains 'meeting',
            * binary flag if the task is already scheduled.
      - time_slots features: one-hot encoding of occupied calendar time slots (30-minute slots).
    """
    now = datetime.now()
    # Total feature length = 3 (time context) + (max_tasks*6) + time_slots
    features = np.zeros(3 + max_tasks * 6 + time_slots)

    # 1. Time of day (sine/cosine encoding)
    hour = now.hour + now.minute / 60.0
    features[0] = np.sin(2 * np.pi * hour / 24)
    features[1] = np.cos(2 * np.pi * hour / 24)

    # 2. Productive hours remaining (normalized over 24 hours)
    productive_start, productive_end = state['preferences']['productive_hours']
    features[2] = (productive_end - hour) / 24.0

    # 3. Task features (up to max_tasks)
    for i, task in enumerate(state['tasks'][:max_tasks]):
        offset = 3 + i * 6
        # Normalize priority (assuming values 1-3)
        features[offset] = task['priority'] / 3.0
        # Days until deadline (could be negative if past due)
        features[offset + 1] = (task['deadline'] - now).total_seconds() / 86400.0
        # Duration normalized to max 4 hours (240 minutes)
        features[offset + 2] = task['duration'] / 240.0
        # Number of dependencies normalized (assuming maximum of 5)
        features[offset + 3] = len(task['dependencies']) / 5.0
        # Binary flag: does the task type contain 'meeting'?
        features[offset + 4] = 1 if 'meeting' in task.get('type', '') else 0
        # Binary flag: has the task been scheduled?
        features[offset + 5] = 1 if task.get('scheduled_time') else 0

    # 4. Calendar time slots (48 slots for 30-minute intervals in a day)
    base_index = 3 + max_tasks * 6
    for event in state['calendar']:
        start = event['start']
        end = event['end']
        start_slot = int((start.hour * 60 + start.minute) / 30)
        duration_slots = int((end - start).total_seconds() / 1800)
        for j in range(start_slot, min(start_slot + duration_slots, time_slots)):
            features[base_index + j] = 1

    return features

def action_to_label(action, time_slots=48):
    """
    Convert the action into a discrete label.
    Label is computed as: task_id * time_slots + time_slot index.
    """
    start_time = action['time_block'][0]
    time_slot = int((start_time.hour * 60 + start_time.minute) / 30)
    return action['task_id'] * time_slots + time_slot

# -------------------------------
# 2. Process DataFrame
# -------------------------------
# Assuming you have already loaded your DataFrame `df` from a CSV.
# For example: df = pd.read_csv("your_dataset.csv")
# The DataFrame should contain the columns: 'state', 'action', 'reward', 'next_state'.

# Convert string representations to dictionaries.
df['state'] = df['state'].apply(safe_eval)
df['action'] = df['action'].apply(safe_eval)
df['next_state'] = df['next_state'].apply(safe_eval)

# Compute features for the state and next state, and compute discrete action labels.
df['features'] = df['state'].apply(state_to_features)
df['next_features'] = df['next_state'].apply(state_to_features)
df['action_label'] = df['action'].apply(action_to_label)

# Quick check of the features and action labels.
print(df[['features', 'action_label']].head())

# Convert the lists in the DataFrame to NumPy arrays.
observations = np.stack(df['features'].values)         # shape: [num_samples, feature_dim]
next_observations = np.stack(df['next_features'].values) # shape: [num_samples, feature_dim]
action_labels = df['action_label'].values                # shape: [num_samples]
rewards = df['reward'].values.astype(np.float32)         # shape: [num_samples]
# Mark each transition as terminal (modify if you have a "done" flag in your data).
dones = np.ones_like(rewards, dtype=bool)

# Split into training and validation sets.
indices = np.arange(len(df))
train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

# -------------------------------
# 3. Create a PyTorch Dataset
# -------------------------------
class RLTransitionDataset(Dataset):
    def __init__(self, observations, actions, rewards, next_observations, dones):
        self.observations = torch.tensor(observations, dtype=torch.float32)
        self.actions = torch.tensor(actions, dtype=torch.long)
        self.rewards = torch.tensor(rewards, dtype=torch.float32)
        self.next_observations = torch.tensor(next_observations, dtype=torch.float32)
        self.dones = torch.tensor(dones, dtype=torch.bool)
        
    def __len__(self):
        return len(self.observations)
    
    def __getitem__(self, idx):
        return {
            "obs": self.observations[idx],
            "action": self.actions[idx],
            "reward": self.rewards[idx],
            "next_obs": self.next_observations[idx],
            "done": self.dones[idx],
        }

# Create training and validation datasets.
train_dataset = RLTransitionDataset(
    observations[train_indices],
    action_labels[train_indices],
    rewards[train_indices],
    next_observations[train_indices],
    dones[train_indices]
)

val_dataset = RLTransitionDataset(
    observations[val_indices],
    action_labels[val_indices],
    rewards[val_indices],
    next_observations[val_indices],
    dones[val_indices]
)

# Create DataLoaders.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

print("Training Data Loader Ready")

                                            features  action_label
0  [-0.9426414910921783, -0.33380685923377124, -0...           317
1  [-0.9426414910921783, -0.33380685923377124, 0....            70
2  [-0.9426414910921783, -0.33380685923377124, 0....           178
3  [-0.9426414910921783, -0.33380685923377124, 0....            39
4  [-0.9426414910921783, -0.33380685923377124, 0....            76
Training Data Loader Ready


In [66]:
import json
import torch
from torch.utils.data import Dataset, DataLoader

class TorchOfflineDataset(Dataset):
    def __init__(self, jsonl_file):
        self.data = []
        with open(jsonl_file, "r") as f:
            for line in f:
                self.data.append(json.loads(line))
                
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        # Convert the fields to PyTorch tensors with the appropriate types.
        return {
            'obs': torch.tensor(sample['obs'], dtype=torch.float32),
            'action': torch.tensor(sample['actions'], dtype=torch.long),
            'reward': torch.tensor(sample['rewards'], dtype=torch.float32),
            'done': torch.tensor(sample['dones'], dtype=torch.bool),
            'next_obs': torch.tensor(sample['next_obs'], dtype=torch.float32)
        }

# Usage:
jsonl_filename = "offline_data.jsonl"  # Path to your prepared JSONL dataset.
dataset = TorchOfflineDataset(jsonl_filename)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Quick test of the loader:
for batch in data_loader:
    print(batch)
    break

import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# -------------------------------
# 1. Define the Offline Dataset (JSONL based)
# -------------------------------
class TorchOfflineDataset(Dataset):
    def __init__(self, jsonl_file):
        self.data = []
        with open(jsonl_file, "r") as f:
            for line in f:
                self.data.append(json.loads(line))
        print(f"Loaded {len(self.data)} transitions from {jsonl_file}.")
                
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        # Convert each field to a PyTorch tensor with the appropriate type.
        return {
            'obs': torch.tensor(sample['obs'], dtype=torch.float32),
            'action': torch.tensor(sample['actions'], dtype=torch.long),
            'reward': torch.tensor(sample['rewards'], dtype=torch.float32),
            'done': torch.tensor(sample['dones'], dtype=torch.bool),
            'next_obs': torch.tensor(sample['next_obs'], dtype=torch.float32)
        }

# Specify the JSONL file path for your offline dataset.
jsonl_filename = "offline_data.jsonl"
dataset = TorchOfflineDataset(jsonl_filename)
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)
print("DataLoader created. Ready to train the model.\n")

# -------------------------------
# 2. Define the Q-Network and Helper Functions
# -------------------------------
class QNetwork(nn.Module):
    def __init__(self, obs_dim, n_actions):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(obs_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.out = nn.Linear(256, n_actions)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.out(x)  # returns Q-values for each action

def soft_update(target, source, tau):
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)

# -------------------------------
# 3. Hyperparameters & Setup
# -------------------------------
obs_dim = 111      # observation dimension (as defined by your environment)
n_actions = 480    # total number of discrete actions
gamma = 0.99       # discount factor
alpha = 1.0        # conservative penalty weight (tune this!)
batch_size = 64
learning_rate = 3e-4
n_epochs = 10
target_update_tau = 0.005

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}\n")

q_net = QNetwork(obs_dim, n_actions).to(device)
target_q_net = QNetwork(obs_dim, n_actions).to(device)
target_q_net.load_state_dict(q_net.state_dict())

optimizer = optim.Adam(q_net.parameters(), lr=learning_rate)

# -------------------------------
# 4. Training Loop (CQL)
# -------------------------------
print("Starting training loop...\n")
for epoch in range(n_epochs):
    epoch_loss = 0.0
    batch_count = 0
    for batch in data_loader:
        # Prepare batch data as tensors (they are already tensors from our dataset)
        obs = batch['obs'].to(device)           # shape: [B, obs_dim]
        actions = batch['action'].to(device)      # shape: [B]
        rewards = batch['reward'].unsqueeze(1).to(device)  # shape: [B, 1]
        # Convert done flag to float (1.0 if done, 0.0 if not) for computation.
        dones = batch['done'].float().unsqueeze(1).to(device)  # shape: [B, 1]
        next_obs = batch['next_obs'].to(device)   # shape: [B, obs_dim]
        
        # Compute current Q-values for all actions.
        q_values = q_net(obs)  # shape: [B, n_actions]
        # Gather Q-values for the taken actions.
        q_value = q_values.gather(1, actions.unsqueeze(1))  # shape: [B, 1]
        
        # Compute target Q-value using the target network (max over actions).
        with torch.no_grad():
            next_q_values = target_q_net(next_obs)  # shape: [B, n_actions]
            next_q_value, _ = next_q_values.max(dim=1, keepdim=True)
            target = rewards + gamma * (1 - dones) * next_q_value  # shape: [B, 1]
        
        # Compute the Bellman error (MSE loss).
        bellman_loss = nn.MSELoss()(q_value, target)
        
        # Conservative penalty term:
        # Compute logsumexp over all actions to encourage lower Q-values on out-of-distribution actions.
        batch_q = q_net(obs)  # shape: [B, n_actions]
        max_q, _ = batch_q.max(dim=1, keepdim=True)
        logsumexp = torch.log(torch.sum(torch.exp(batch_q - max_q), dim=1, keepdim=True)) + max_q
        conservative_penalty = logsumexp - q_value  # shape: [B, 1]
        
        loss = bellman_loss + alpha * conservative_penalty.mean()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        batch_count += 1

    # Soft update the target network at the end of each epoch.
    soft_update(target_q_net, q_net, target_update_tau)
    
    print(f"Epoch {epoch+1}/{n_epochs}: Average Loss = {epoch_loss / batch_count:.4f}")
    
print("\nTraining complete.")

# -------------------------------
# 5. Save the Trained Model
# -------------------------------
torch.save(q_net.state_dict(), "offline_cql_model_torch.pth")
print("Model saved as offline_cql_model_torch.pth")

{'obs': tensor([[0.9958, 0.0915, 0.5562,  ..., 0.0000, 0.0000, 0.0000],
        [0.9958, 0.0915, 0.5146,  ..., 0.0000, 0.0000, 0.0000],
        [0.9958, 0.0915, 0.5562,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.9958, 0.0915, 0.6396,  ..., 0.0000, 0.0000, 0.0000],
        [0.9958, 0.0915, 0.4729,  ..., 0.0000, 0.0000, 0.0000],
        [0.9958, 0.0915, 0.6396,  ..., 0.0000, 0.0000, 0.0000]]), 'action': tensor([129,  62, 218, 187, 118,  71, 170,  87,  24, 308, 125,  36, 259, 128,
         61,  34,  27,  23, 214, 226, 160,  78, 156, 261,  74, 263,  27, 314,
        213,  22, 167,  77,  25,  19, 208,  25,  25, 120,  37, 122,  17, 170,
        167, 114, 352,  77, 119,  75,  36,  29, 123, 137, 279,  26, 263,  71,
        113,  85, 170, 267, 174, 165,  23,  34]), 'reward': tensor([ 3.0000e-01,  1.0000e-01,  7.0000e-02,  3.6667e-02,  2.0000e-01,
         1.0000e-01,  1.4000e-01,  1.0000e-01,  2.0000e-01,  1.4000e-01,
         1.0667e-01,  1.0000e-01,  7.0000e-02,  7.0000e-02,  1.00

In [67]:
import torch
import numpy as np

# Assume state_to_features is defined exactly as during training.
# And assume your QNetwork class is available.

# Define the device and recreate the network architecture.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
obs_dim = 111
n_actions = 480

# Initialize the network and load the trained weights.
q_net = QNetwork(obs_dim, n_actions).to(device)
q_net.load_state_dict(torch.load("offline_cql_model_torch.pth", map_location=device))
q_net.eval()  # set to evaluation mode

def select_action(real_world_state):
    """
    real_world_state: a dictionary with the same structure as the state used during training.
    """
    # Convert your state into features.
    features = state_to_features(real_world_state)  # returns a NumPy array of shape [111]
    features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)  # add batch dim

    # Forward pass to get Q-values.
    with torch.no_grad():
        q_values = q_net(features_tensor)  # shape: [1, n_actions]
    
    # Choose the action with highest Q-value.
    action_index = q_values.argmax(dim=1).item()  # greedy selection

    # Optional: If you have a mapping function to decode the action index back into a more interpretable action:
    # action = decode_action(action_index)
    
    return action_index

# Example usage:
# Construct a sample real-world state dictionary that mirrors your training data's structure.
# This is just a dummy example. Replace it with actual real-world data.
real_world_state = {
    "preferences": {
        "productive_hours": (14, 18)  # for instance, 9 AM to 5 PM
    },
    "tasks": [
        {
            "priority": 1,
            "deadline": datetime.now() + timedelta(days=1),
            "duration": 15,  # in minutes
            "dependencies": [],
            "type": "email",
            "scheduled_time": None
        },
        # ... up to however many tasks you want to simulate
    ],
    "calendar": [
        {
            "start": datetime.now() + timedelta(hours=1),
            "end": datetime.now() + timedelta(hours=2)
        },
        # Additional calendar events as needed.
    ]
}

# Select an action based on the current real-world state.
selected_action = select_action(real_world_state)
print("Selected Action Index:", selected_action)

Selected Action Index: 136


  q_net.load_state_dict(torch.load("offline_cql_model_torch.pth", map_location=device))


In [68]:
from datetime import datetime, timedelta

def decode_action(action_index, time_slots=48):
    """
    Given an encoded action index, decode it to (task_id, time_slot).
    """
    task_id = action_index // time_slots
    time_slot = action_index % time_slots
    return task_id, time_slot

def get_timeslot_start(time_slot, base_time=None):
    """
    Convert a time_slot index to an actual datetime object.
    Assumes that the day starts at base_time (default: midnight of today).
    Each slot is 30 minutes long.
    """
    if base_time is None:
        # Define midnight of the current day.
        base_time = datetime.combine(datetime.today(), datetime.min.time())
    return base_time + timedelta(minutes=time_slot * 30)

def update_task_calendar(state, action_index, time_slots=48):
    """
    Given a state dictionary and an encoded action index, update the state:
      - Decode the action to get task_id and time_slot.
      - Set the scheduled_time for that task.
      - Append a new calendar event for that time slot.
    """
    # Decode the action index.
    task_id, time_slot = decode_action(action_index, time_slots)
    print(f"Decoded action {action_index}: task_id = {task_id}, time_slot = {time_slot}")
    
    # Determine the start time for the time slot.
    slot_start = get_timeslot_start(time_slot)
    slot_end = slot_start + timedelta(minutes=30)  # assuming 30-minute duration
    print(f"Time slot {time_slot} corresponds to {slot_start.time()} - {slot_end.time()}")
    
    # Update the task if it exists.
    if task_id < len(state['tasks']):
        state['tasks'][task_id]['scheduled_time'] = slot_start
        print(f"Task {task_id} scheduled at {slot_start}")
    else:
        print(f"Warning: Task ID {task_id} not found in the state.")
    
    # Update the calendar: mark this time slot as occupied.
    new_event = {
        "start": slot_start,
        "end": slot_end
    }
    state['calendar'].append(new_event)
    print(f"Added calendar event: {new_event}")
    
    return state

# Example usage:
# Let's assume you have a real-world state dictionary similar to your training state.
real_world_state = {
    "preferences": {
        "productive_hours": (9, 17)  # example: 9 AM to 5 PM
    },
    "tasks": [
        {
            "priority": 2,
            "deadline": datetime.now() + timedelta(days=2),
            "duration": 60,  # in minutes
            "dependencies": [],
            "type": "email",
        },
        {
            "priority": 1,
            "deadline": datetime.now() + timedelta(days=3),
            "duration": 120,
            "dependencies": [],
            "type": "meeting",
            "scheduled_time": None
        },
        {
            "priority": 3,
            "deadline": datetime.now() + timedelta(days=1),
            "duration": 30,
            "dependencies": [],
            "type": "call",
            "scheduled_time": None
        },
        # Add more tasks if needed...
    ],
    "calendar": []  # initially empty
}

# Suppose your model selected action index 116.
selected_action_index = 27
updated_state = update_task_calendar(real_world_state, selected_action_index)

# At this point, the `updated_state` has the task with task_id 2 updated with a scheduled time,
# and the calendar now includes the corresponding event.

Decoded action 27: task_id = 0, time_slot = 27
Time slot 27 corresponds to 13:30:00 - 14:00:00
Task 0 scheduled at 2025-02-04 13:30:00
Added calendar event: {'start': datetime.datetime(2025, 2, 4, 13, 30), 'end': datetime.datetime(2025, 2, 4, 14, 0)}
