In [None]:
!pip install gym
!pip install stable-baselines3[extra]
!pip install shimmy>=2.0
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cublas_cu12-12.4.5.8-py

In [None]:
# Load the sales history, parse the report dates, and order rows chronologically.
data = (
    pd.read_csv('/content/soapnutshistory.csv')
    .assign(**{'Report Date': lambda frame: pd.to_datetime(frame['Report Date'])})
    .sort_values('Report Date')
)

  and should_run_async(code)


In [None]:
# Impute missing values in a single pass: medians for the sales/price
# columns, means for the two conversion-rate columns.
total_sales_median = data['Total Sales'].median()
fill_values = {
    'Total Sales': total_sales_median,
    'Product Price': data['Product Price'].median(),
    # NOTE(review): missing forecasts are filled with the *Total Sales* median,
    # not the 'Predicted Sales' median — confirm this is intentional.
    'Predicted Sales': total_sales_median,
    **{
        column: data[column].mean()
        for column in ('Organic Conversion Percentage', 'Ad Conversion Percentage')
    },
}

data = data.fillna(value=fill_values)

  and should_run_async(code)


In [None]:
# Feature engineering: derive calendar features from the report date.
report_dates = data['Report Date'].dt
data = data.assign(
    day_of_week=report_dates.dayofweek,
    month=report_dates.month,
    year=report_dates.year,
)


In [None]:
# Three independent standardisers: one for the calendar features, one for
# the product price, and one for total sales.
scaler, price_scaler, sales_scaler = (StandardScaler() for _ in range(3))

  and should_run_async(code)


In [None]:
# Standardise the model inputs. Plain NumPy arrays are passed to the scalers
# so they are fitted without feature names (avoids feature-name warnings).
# The scaled values overwrite the original columns of `data`.
features = ['day_of_week', 'month', 'year']
data[features] = scaler.fit_transform(data[features].to_numpy())
for column, column_scaler in (('Product Price', price_scaler), ('Total Sales', sales_scaler)):
    data[column] = column_scaler.fit_transform(data[[column]].to_numpy())


  and should_run_async(code)


In [None]:
# Fit the sales predictor: (price, calendar features) -> total sales, using
# only rows where every input and the target are present.
model_columns = ['Product Price', 'day_of_week', 'month', 'year']
target_column = 'Total Sales'
valid_rows = data[model_columns + [target_column]].dropna()
X = valid_rows[model_columns].values
y = valid_rows[target_column].values
sales_model = RandomForestRegressor(random_state=42, n_estimators=100)
sales_model.fit(X, y)

In [None]:
# Sanity check: the training matrices must be free of NaNs.
assert np.isnan(X).sum() == 0, "X contains NaN values"
assert np.isnan(y).sum() == 0, "y contains NaN values"

  and should_run_async(code)


In [None]:
from stable_baselines3.common.callbacks import BaseCallback

In [None]:
# Custom Environment
class PricingEnv(gym.Env):
    """Pricing environment that scores price moves with a fitted sales model.

    Each step walks one row forward through ``data``. The observation is
    ``[Product Price, day_of_week, month, year]`` exactly as stored in
    ``data`` (i.e. already standardised by the notebook's scalers). The action
    is a single fractional price change relative to the current row's price.

    The class follows the legacy Gym interface: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.

    The environment reads the module-level ``price_scaler`` and
    ``sales_scaler``; both must be fitted before an instance is created, and
    the 'Product Price' / 'Total Sales' columns of ``data`` must have been
    transformed with them. 'Predicted Sales' is read in original units.
    """

    # Largest fractional price move per step; also the bounds of the action space.
    MAX_PRICE_CHANGE = 0.1

    def __init__(self, data, sales_model):
        """Store the data and model and define the action/observation spaces.

        Args:
            data: DataFrame with the columns 'Product Price', 'day_of_week',
                'month', 'year', 'Total Sales', 'Predicted Sales',
                'Organic Conversion Percentage' and 'Ad Conversion Percentage'.
            sales_model: fitted regressor whose ``predict`` takes rows of
                ``[scaled price, day_of_week, month, year]`` and returns
                scaled sales.
        """
        super(PricingEnv, self).__init__()
        self.data = data
        self.sales_model = sales_model
        self.current_step = 0
        self.max_steps = len(data) - 1

        # Action space: fractional price change, limited to ±10% of current price
        self.action_space = spaces.Box(
            low=-self.MAX_PRICE_CHANGE,
            high=self.MAX_PRICE_CHANGE,
            shape=(1,),
            dtype=np.float32,
        )

        # State space: [current_price, day_of_week, month, year]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)

        # Medians in ORIGINAL units. The columns hold standardised values, so
        # the medians are mapped back through the scalers; otherwise a
        # z-score median would be compared with a real price in `step`.
        # (A standardiser is an increasing affine map, so the median of the
        # scaled column maps to the median of the original column.)
        self.median_sales = float(
            sales_scaler.inverse_transform([[np.median(data['Total Sales'])]])[0][0]
        )
        self.median_price = float(
            price_scaler.inverse_transform([[np.median(data['Product Price'])]])[0][0]
        )

        # Reward tracking
        self.current_reward_components = {}

    def reset(self):
        """Rewind to the first row and return its observation."""
        self.current_step = 0
        return self._get_state(self.current_step)

    def _get_state(self, step):
        """Return the float32 observation vector for row ``step`` of the data."""
        return np.array([
            self.data.iloc[step]['Product Price'],
            self.data.iloc[step]['day_of_week'],
            self.data.iloc[step]['month'],
            self.data.iloc[step]['year']
        ], dtype=np.float32)

    def step(self, action):
        """Apply a fractional price change and score it on the next row.

        Args:
            action: array-like whose first element is the requested
                fractional price change; it is clipped to
                ``±MAX_PRICE_CHANGE``.

        Returns:
            ``(next_state, total_reward, done, info)`` where ``info`` is an
            empty dict. ``done`` becomes True once the last row is reached.
        """
        # 1. Apply action: a relative change on top of the current (unscaled) price
        current_price = price_scaler.inverse_transform(
            [[self.data.iloc[self.current_step]['Product Price']]]
        )[0][0]
        requested_change = float(np.asarray(action, dtype=np.float64).reshape(-1)[0])
        price_change = float(
            np.clip(requested_change, -self.MAX_PRICE_CHANGE, self.MAX_PRICE_CHANGE)
        )
        proposed_price = current_price * (1.0 + price_change)
        proposed_price_scaled = price_scaler.transform([[proposed_price]])[0][0]

        # Move to next time step; clamp to the last row so the final
        # transition is evaluated on it rather than on the current row again.
        next_step = min(self.current_step + 1, self.max_steps)
        done = next_step >= self.max_steps

        # Predict sales for proposed price
        next_row = self.data.iloc[next_step]
        model_input = np.array([
            [proposed_price_scaled,
             next_row['day_of_week'],
             next_row['month'],
             next_row['year']]
        ], dtype=np.float32)
        predicted_sales_scaled = self.sales_model.predict(model_input)[0]
        predicted_sales = sales_scaler.inverse_transform([[predicted_sales_scaled]])[0][0]

        # Reference forecast in original units; fall back to the mean of the
        # sales the scaler was fitted on when the forecast is missing.
        forecast_sales = next_row['Predicted Sales']
        if np.isnan(forecast_sales):
            forecast_sales = sales_scaler.mean_[0]

        # 2. Reward Calculation
        # Sales Reward: the prediction is already a z-score, so tanh alone
        # bounds it to (-1, 1); dividing by the scale again would flatten it to ~0.
        sales_reward = np.tanh(predicted_sales_scaled)

        # Price Reward (progressive incentive), in units of price standard deviations
        price_diff = proposed_price - self.median_price
        if price_diff > 0:  # Reward for prices above median
            price_reward = price_diff * 1.5 / price_scaler.scale_[0]
        else:  # Small penalty for prices below median
            price_reward = price_diff * 0.5 / price_scaler.scale_[0]

        # Conversion Bonus (capped at 20%)
        conversion_bonus = np.clip(
            (next_row['Organic Conversion Percentage'] + next_row['Ad Conversion Percentage']) / 100.0,
            0, 0.2  # Max 20% bonus
        )

        # Punishment (only for significant misses). The 70% threshold is
        # tested in original units: on z-scores a 0.7 factor is not a "30%
        # miss" and flips meaning for below-average forecasts. The shortfall
        # is then expressed in sales standard deviations.
        punishment = 0.0
        if predicted_sales < 0.7 * forecast_sales:
            shortfall = max(0.0, forecast_sales - predicted_sales)
            punishment = shortfall / sales_scaler.scale_[0] * 10

        # Total Reward (weighted sum)
        total_reward = (
            1.5 * sales_reward +
            0.8 * price_reward +
            conversion_bonus -
            1.4 * punishment
        )

        # Reward Clipping for Stability
        total_reward = float(np.clip(total_reward, -5.0, 10.0))

        # Update state and reward tracking
        self.current_step = next_step
        next_state = self._get_state(next_step)
        self.current_reward_components = {
            'sales': sales_reward,
            'price': price_reward,
            'conversion': conversion_bonus,
            'punishment': -punishment if punishment else 0.0,
            'total': total_reward
        }

        return next_state, total_reward, bool(done), {}

    def get_reward_breakdown(self):
        """Return the reward components recorded by the most recent ``step``."""
        return self.current_reward_components


# Reward Logger Callback
class RewardLoggerCallback(BaseCallback):
    """Track per-episode reward totals for the first environment and plot them.

    Every 500 callback steps the latest reward breakdown and price
    information of the first training environment are printed. When an
    episode ends, its accumulated reward is stored in ``episode_rewards``.
    The stored totals are plotted once training finishes.
    """

    def __init__(self, verbose=0):
        super(RewardLoggerCallback, self).__init__(verbose)
        self.episode_rewards = []          # one total per finished episode
        self.current_episode_reward = 0    # running total of the episode in progress

    def _on_step(self) -> bool:
        """Accumulate the step reward, log periodically, and close finished episodes.

        Returns:
            Always True, so training is never interrupted by this callback.
        """
        # Accumulate reward
        self.current_episode_reward += self.locals['rewards'][0]

        # Print every 500 steps
        if self.n_calls % 500 == 0:
            env = self.training_env.envs[0].unwrapped
            if hasattr(env, 'get_reward_breakdown'):
                rewards = env.get_reward_breakdown()
                print("\nReward Components:")
                print(f"Sales: {rewards['sales']:.2f}")
                print(f"Price: {rewards['price']:.2f}")
                print(f"Conversion: {rewards['conversion']:.2f}")
                print(f"Punishment: {rewards['punishment']:.2f}")
                print(f"Total: {rewards['total']:.2f}")

            # Print price information
            current_price = price_scaler.inverse_transform(
                [[env.data.iloc[env.current_step]['Product Price']]]
            )[0][0]
            print(f"Current Price: ${current_price:.2f}")
            print(f"Median Price: ${env.median_price:.2f}")
            print(f"Price Change %: {(current_price - env.median_price)/env.median_price*100:.1f}%")

        # Track episode rewards. The per-environment flags of the step that
        # just ran are read from 'dones'; a scalar 'done' local, when present,
        # is only a leftover from the previous step and lags by one step.
        dones = self.locals.get('dones')
        if dones is not None and dones[0]:
            self.episode_rewards.append(self.current_episode_reward)
            print(f"\nEpisode {len(self.episode_rewards)}")
            print(f"Total Reward: {self.current_episode_reward:.2f}")
            print(f"Average Reward (last 10): {np.mean(self.episode_rewards[-10:]):.2f}")
            self.current_episode_reward = 0  # Reset for next episode

        return True

    def _on_training_end(self) -> None:
        """Plot the rewards over episodes once training has finished."""
        self.plot_rewards()

    def plot_rewards(self):
        """Draw the total reward of every finished episode as a line chart."""
        plt.figure(figsize=(10, 6))
        plt.plot(self.episode_rewards, label='Episode Reward')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.title('Rewards Over Episodes')
        plt.legend()
        plt.grid(True)
        plt.show()





In [None]:
# Training: build the environment and run PPO with the reward logger attached.
env = PricingEnv(data, sales_model)

# PPO hyperparameters gathered in one place so they are easy to review and tune.
ppo_settings = dict(
    verbose=1,
    learning_rate=5e-5,
    gamma=0.90,
    ent_coef=0.2,
    clip_range=0.15,
    n_steps=4096,
    batch_size=256,
    n_epochs=5,
    target_kl=0.05,
)
model = PPO("MlpPolicy", env, **ppo_settings)
model.learn(callback=RewardLoggerCallback(), total_timesteps=25000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.





Episode 1
Total Reward: 1340.10
Average Reward (last 10): 1340.10

Episode 2
Total Reward: 1331.83
Average Reward (last 10): 1335.96

Episode 3
Total Reward: 1330.38
Average Reward (last 10): 1334.10

Reward Components:
Sales: -0.01
Price: 25.09
Conversion: 0.20
Punishment: -2.01
Total: 10.00
Current Price: $15.05
Median Price: $-0.15
Price Change %: -9986.6%

Episode 4
Total Reward: 1337.46
Average Reward (last 10): 1334.94

Episode 5
Total Reward: 1329.52
Average Reward (last 10): 1333.86

Episode 6
Total Reward: 1335.03
Average Reward (last 10): 1334.05

Reward Components:
Sales: -0.00
Price: 25.09
Conversion: 0.20
Punishment: -0.19
Total: 10.00
Current Price: $14.95
Median Price: $-0.15
Price Change %: -9920.9%

Episode 7
Total Reward: 1329.34
Average Reward (last 10): 1333.38

Episode 8
Total Reward: 1330.42
Average Reward (last 10): 1333.01


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
from stable_baselines3.common.callbacks import BaseCallback

class LossCallback(BaseCallback):
    """Collect training losses and the positive/negative step rewards.

    Attributes are filled while training runs:
      * ``losses``: one value per captured policy update.
      * ``positive_rewards`` / ``negative_rewards``: step rewards split by sign
        (zero rewards are not stored).
    """

    def __init__(self, verbose=0):
        super(LossCallback, self).__init__(verbose)
        self.losses = []
        self.positive_rewards = []
        self.negative_rewards = []

    def _on_step(self) -> bool:
        """Record the rewards of the step that just ran, split by sign.

        Returns:
            Always True, so training is never interrupted by this callback.
        """
        rewards = self.locals.get("rewards")

        if rewards is not None:
            for reward in rewards:
                if reward > 0:
                    self.positive_rewards.append(float(reward))
                elif reward < 0:
                    self.negative_rewards.append(float(reward))

        return True

    def _capture_latest_loss(self) -> None:
        """Append the most recently logged training loss, if one exists."""
        # `_on_step` runs while rollouts are collected, where no "loss" local
        # exists; the loss of an update is only published through the logger
        # under "train/loss", so it is read from there instead.
        latest_loss = self.logger.name_to_value.get("train/loss")
        if latest_loss is not None:
            self.losses.append(float(latest_loss))

    def _on_rollout_end(self) -> None:
        """Capture the loss logged by the update that preceded this rollout."""
        self._capture_latest_loss()

    def _on_training_end(self) -> None:
        """Capture the loss of the last update, which no later rollout follows."""
        self._capture_latest_loss()

# Fresh PPO run with default hyperparameters, instrumented with a LossCallback
# so losses and rewards can be plotted afterwards.
loss_callback = LossCallback()

model = PPO(policy="MlpPolicy", env=env, verbose=1)
model.learn(callback=loss_callback, total_timesteps=10000)

# Plot the results: loss curve plus histograms of positive and negative rewards.
fig, (loss_ax, positive_ax, negative_ax) = plt.subplots(1, 3, figsize=(15, 5))

# Loss plot
loss_ax.plot(loss_callback.losses, label="Loss")
loss_ax.set_title("Loss Over Steps")
loss_ax.set_xlabel("Steps")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.grid()

# Reward histograms share the same layout, so they are drawn in one loop.
histogram_panels = (
    (positive_ax, loss_callback.positive_rewards, "Positive Rewards", "Distribution of Positive Rewards"),
    (negative_ax, loss_callback.negative_rewards, "Negative Rewards", "Distribution of Negative Rewards"),
)
for panel_ax, reward_samples, panel_label, panel_title in histogram_panels:
    panel_ax.hist(reward_samples, bins=20, label=panel_label)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Reward")
    panel_ax.set_ylabel("Frequency")
    panel_ax.legend()
    panel_ax.grid()

fig.tight_layout()
plt.show()