In [17]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Define dimensions
input_dim = 20      # Number of features in the state representation
action_dim = 1000   # Number of possible influencer actions (recommendations)
hidden_dim = 128



In [18]:
# Define the DQN model (for Q-value prediction)
class DQN(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


In [19]:
# Initialize the RL model
rl_model = DQN(input_dim, action_dim, hidden_dim)

# Load pre-trained weights (if available) for the shared layers (e.g., fc1)
try:
    pretrained_dict = torch.load("pretrained_recommendation_model.pth")
    model_dict = rl_model.state_dict()
    # Update only keys related to the first layer (if the architectures match)
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict and "fc1" in k}
    model_dict.update(pretrained_dict)
    rl_model.load_state_dict(model_dict)
    print("Pre-trained weights loaded for fc1.")
except Exception as e:
    print("Pre-trained weights not loaded, using random initialization:", e)

Pre-trained weights loaded for fc1.


  pretrained_dict = torch.load("pretrained_recommendation_model.pth")


In [20]:
# Set up the target network for stability
target_net = DQN(input_dim, action_dim, hidden_dim)
target_net.load_state_dict(rl_model.state_dict())

# Define optimizer and hyperparameters for RL training
optimizer_rl = optim.Adam(rl_model.parameters(), lr=1e-4)
gamma = 0.99
epsilon = 1.0
min_epsilon = 0.05
epsilon_decay = 0.995

# We'll use a simple list as our replay buffer for this example
replay_buffer = []

In [21]:
# Define a simple dummy environment (replace this with your actual environment)
class RLDummyEnv:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
    
    def reset(self):
        # Return an initial random state vector
        return np.random.rand(self.state_dim)
    
    def step(self, action):
        # Simulate an environment step:
        # - next_state: random state (in practice, depends on action and current context)
        # - reward: random reward (replace with your reward logic)
        # - done: small probability to end the episode
        next_state = np.random.rand(self.state_dim)
        reward = np.random.rand()  # Replace with a meaningful reward function
        done = random.random() < 0.05
        return next_state, reward, done, {}


In [22]:
env = RLDummyEnv(input_dim, action_dim)

# Simple function to sample a minibatch from the replay buffer
def sample_replay(buffer, batch_size):
    return random.sample(buffer, batch_size)

# RL training loop parameters
num_episodes = 500
batch_size = 32


In [23]:
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = random.randint(0, action_dim - 1)
        else:
            with torch.no_grad():
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                action = rl_model(state_tensor).argmax().item()
        
        next_state, reward, done, _ = env.step(action)
        replay_buffer.append((state, action, reward, next_state, done))
        state = next_state
        
        # Train the network if enough samples are available
        if len(replay_buffer) >= batch_size:
            batch = sample_replay(replay_buffer, batch_size)
            states, actions, rewards, next_states, dones = zip(*batch)
            states = torch.FloatTensor(states)
            actions = torch.LongTensor(actions).unsqueeze(1)
            rewards = torch.FloatTensor(rewards).unsqueeze(1)
            next_states = torch.FloatTensor(next_states)
            # Convert boolean done flags to float (1.0 if done, 0.0 if not)
            dones = torch.FloatTensor([1.0 if d else 0.0 for d in dones]).unsqueeze(1)
            
            # Compute current Q-values
            q_values = rl_model(states).gather(1, actions)
            # Compute next Q-values from the target network
            next_q_values = target_net(next_states).max(1)[0].unsqueeze(1)
            # Compute the target Q-values using the Bellman equation
            targets = rewards + gamma * next_q_values * (1 - dones)
            
            loss_rl = nn.MSELoss()(q_values, targets)
            optimizer_rl.zero_grad()
            loss_rl.backward()
            optimizer_rl.step()
    
    # Decay epsilon after each episode
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
    
    # Update target network periodically
    if episode % 10 == 0:
        target_net.load_state_dict(rl_model.state_dict())
        print(f"Episode {episode}, Epsilon: {epsilon:.3f}")

print("Reinforcement Learning fine-tuning completed.")

Episode 0, Epsilon: 0.995
Episode 10, Epsilon: 0.946
Episode 20, Epsilon: 0.900
Episode 30, Epsilon: 0.856
Episode 40, Epsilon: 0.814
Episode 50, Epsilon: 0.774
Episode 60, Epsilon: 0.737
Episode 70, Epsilon: 0.701
Episode 80, Epsilon: 0.666
Episode 90, Epsilon: 0.634
Episode 100, Epsilon: 0.603
Episode 110, Epsilon: 0.573
Episode 120, Epsilon: 0.545
Episode 130, Epsilon: 0.519
Episode 140, Epsilon: 0.493
Episode 150, Epsilon: 0.469
Episode 160, Epsilon: 0.446
Episode 170, Epsilon: 0.424
Episode 180, Epsilon: 0.404
Episode 190, Epsilon: 0.384
Episode 200, Epsilon: 0.365
Episode 210, Epsilon: 0.347
Episode 220, Epsilon: 0.330
Episode 230, Epsilon: 0.314
Episode 240, Epsilon: 0.299
Episode 250, Epsilon: 0.284
Episode 260, Epsilon: 0.270
Episode 270, Epsilon: 0.257
Episode 280, Epsilon: 0.245
Episode 290, Epsilon: 0.233
Episode 300, Epsilon: 0.221
Episode 310, Epsilon: 0.210
Episode 320, Epsilon: 0.200
Episode 330, Epsilon: 0.190
Episode 340, Epsilon: 0.181
Episode 350, Epsilon: 0.172
Epi