# Step5. Reinforcement Learning



### Configuration:
- Action: The 8 behaviors in the treatment period are treated as sequential actions: every user performs saves first, then follows, and so on, until the 8th action, programmed streams.
- Reward: The outcomes of the post period (the following four weeks) are the rewards for each data point (user), so there are five candidate rewards in total.
- Observation: The status after each action is an observation, so there are 8 observations per data point (user).
- Timesteps: In this setting the actions themselves are the timesteps: timestep 1 corresponds to action 1 (save), timestep 2 to action 2 (follow), etc.

### Objectives:
Find out how much reward each action contributes, e.g. whether one save leads to +2 reward, or whether two follows lead to -1 reward.
Since the reward is only known after all actions have been performed, the key to the problem is a method that redistributes the reward among the actions.

### Data representation:
- Action: An 8 dimensional vector, each position represents the corresponding action e.g. 1st position is save, 2nd position is follow.
- Reward: A number, which is the outcome value
- Observation: An 8 dimensional vector, each position also represents the corresponding action. Unlike action, observation records all the actions done before.

## Methods used:
- Rudder: Used to redistribute the reward among timesteps (actions). The main task in Rudder is to train an LSTM model to predict the return of each sample at the end of the sequence. As an auxiliary task, the model is also trained to predict the final return at every sequence position. This allows us to use the differences of consecutive predictions for contribution analysis.

In [None]:
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

In [None]:
def load_data(path):
    """Unpickle and return the object stored in the file at ``path``.

    The stream is decoded with ``encoding='latin1'`` (presumably because the
    pickle was written by Python 2 -- confirm against the data source).
    """
    # NOTE(review): unpickling can execute arbitrary code, so this must only
    # ever be pointed at trusted files.
    with open(path, 'rb') as handle:
        reader = pickle.Unpickler(handle, encoding='latin1')
        loaded = reader.load()
    return loaded

## 1. Specify name of actions and rewards

In [None]:
# Column names of the eight treatment-period behaviors. Their order here is
# the order in which they are replayed as timesteps.
action_list = [
    f'{behavior}_treatment_period'
    for behavior in (
        'saves',
        'follows',
        'playlists',
        'tickets',
        'merch',
        'shares',
        'streams_active_streams',
        'streams_programmed_streams',
    )
]
# Column names of the post-period outcomes that can serve as the reward.
# Note that 'ticket' is singular here, unlike 'tickets' among the actions.
reward_list = [
    f'{outcome}_following_four_weeks'
    for outcome in (
        'shares',
        'merch',
        'ticket',
        'streams_active_streams',
        'streams_programmed_streams',
    )
]

## 2. Load data from disk

In [None]:
# Read the full table from disk, then thin it out in two seeded sampling
# steps: 50 % of the rows, then 10 % of those (5 % of the rows overall).
# Assumes load_data returns a pandas DataFrame -- TODO confirm.
clean_data1 = load_data('../Data/CausalFandom_main_data.pickle')

clean_data2 = clean_data1.sample(
    frac=0.5,
    replace=False,
    random_state=41,
)

# reset_index() renumbers the surviving rows 0..n-1 so they can be addressed
# by their position; the previous labels are kept as an extra column.
clean_data3 = clean_data2.sample(
    frac=0.1,
    replace=False,
    random_state=41,
).reset_index()

## 3. Implement dataset class used for pytorch training

In [None]:
class CustomDataset(Dataset):
    """Turn each row of a behavior table into one fixed-length episode.

    Every row (user) becomes an episode with one timestep per entry of
    ``action_list``. At timestep ``t`` the action vector holds only the count
    of behavior ``t``; the observation vector holds the counts of behaviors
    ``0..t``; the reward is zero except at the last timestep, where it equals
    the row's ``reward_aim`` value.

    Args:
        data: pandas DataFrame containing the action and reward columns.
        action_list: ordered list of action column names.
        reward_aim: name of the single column used as the reward.
    """

    def __init__(self, data, action_list, reward_aim):
        super().__init__()
        self.data = data
        self.action_list = action_list
        self.reward_aim = reward_aim

    def __len__(self):
        """Return the number of rows (episodes)."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(obs, act, rew)`` for the row at position ``idx`` as float32 arrays."""
        # Positional lookup: samplers hand out 0..len-1, which only coincides
        # with the index labels when the frame happens to carry a fresh
        # RangeIndex. `.iloc` is correct for any index.
        row = self.data.iloc[[idx]]
        obs, act, rew = self.create_dataset(row, self.action_list, self.reward_aim)
        return obs.astype(np.float32), act.astype(np.float32), rew.astype(np.float32)

    # Transform the original data into 8d action, 8d observation and 1d reward
    def create_dataset(self, data, action_list, reward_aim):
        """Expand ``data`` into stacked per-timestep arrays.

        Returns:
            obs: ``(num_rows * num_actions, num_actions)`` cumulative counts.
            act: same shape, only the current timestep's count is non-zero.
            rew: ``(num_rows * num_actions,)``, non-zero only at the last
                timestep of each row.
        """
        counts = data[action_list].to_numpy(dtype=np.float64)
        rewards = data[reward_aim].to_numpy()
        num_data, num_pos = counts.shape

        # Masks over (timestep, action): behavior j is visible in the
        # observation from timestep j onwards, and is the action at timestep j.
        seen_so_far = np.tril(np.ones((num_pos, num_pos), dtype=bool))
        current_step = np.eye(num_pos, dtype=bool)

        # Broadcast every row against both masks. np.where (rather than a
        # multiplication) keeps masked-out entries exactly 0 even when a count
        # is NaN or inf.
        per_row = counts[:, np.newaxis, :]
        obs = np.where(seen_so_far, per_row, 0.0).reshape(num_data * num_pos, num_pos)
        act = np.where(current_step, per_row, 0.0).reshape(num_data * num_pos, num_pos)

        # The outcome is only known once all actions have been performed.
        rew = np.zeros((num_data, num_pos))
        rew[:, -1] = rewards
        return obs, act, rew.reshape(-1)

# The reward for this run is reward_list[3], i.e.
# 'streams_active_streams_following_four_weeks'.
batch_size = 128

dataset = CustomDataset(
    clean_data3,
    action_list=action_list,
    reward_aim=reward_list[3],
)
# Batches are drawn in a fresh random order every epoch.
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# test type/size of obs,act,rew
# Pull one batch from the loader and print the dtype of the observation tensor
# and the shapes of the action and reward tensors. The `break` stops the loop
# after the first batch, so nothing is printed if the loader is empty.
for data in dataloader:
    obs, act, rew = data
    print('obs:',obs.dtype)
    print('act:',act.shape)
    print('rew:',rew.shape)
    break

## 4. Custom LSTM training

In [None]:
# Width of the observation and action vectors (one slot per behavior) and the
# number of timesteps per episode (one per action).
# NOTE(review): 8 has to stay equal to len(action_list).
obs_dim = 8
act_dim = 8
time_steps = act_dim
# The model consumes observation and action concatenated along the feature axis.
input_dim = obs_dim + act_dim

class LSTMModel(nn.Module):
    """LSTM followed by a linear head that emits one scalar per timestep.

    Args:
        input_dim: size of the concatenated (observation, action) feature vector.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, obs, act):
        """Return predictions of shape ``(batch, time, 1)``.

        ``obs`` and ``act`` are ``(batch, time, features)`` tensors that are
        joined along the feature axis before entering the LSTM.
        """
        features = torch.cat((obs, act), dim=2)
        hidden_states, _ = self.lstm(features)
        # The head is applied at every timestep (not only the last one), so
        # that consecutive predictions can be differenced afterwards.
        return self.fc(hidden_states)

hidden_dim = 64
num_layers = 2
lstm = LSTMModel(input_dim, hidden_dim, num_layers)

criterion = nn.MSELoss()
optimizer = optim.Adam(lstm.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    for batch_obs, batch_act, batch_y in dataloader:
        optimizer.zero_grad()
        # (batch, time, 1) -> (batch, time). Only the trailing axis is
        # squeezed so that a final batch of size 1 keeps its batch axis.
        outputs = lstm(batch_obs, batch_act).squeeze(-1)

        # batch_y is the per-timestep reward sequence (zero everywhere except
        # the last step), so its sum over time is the episode return.
        returns = batch_y.sum(dim=1)

        # Main task: predict the return at the end of the sequence.
        main_loss = criterion(outputs[:, -1], returns)
        # Auxiliary task: predict the final return at every position.
        # Regressing onto the raw reward sequence instead would teach the
        # model to output 0 at all intermediate steps and leave nothing to
        # redistribute when consecutive predictions are differenced.
        aux_loss = criterion(outputs, returns.unsqueeze(1).expand_as(outputs))
        # 0.5 auxiliary weight as in the RUDDER tutorial.
        loss = main_loss + 0.5 * aux_loss

        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print('Training finished.')


## 5. Visualise two samples to see the redistribution performance

In [None]:
# Pick two episodes to inspect
obs0, a0, r0 = dataset[3]
obs1, a1, r1 = dataset[70]

# Stack each pair of arrays into a batch of two for the model
test_obs, test_act, test_rew = (
    torch.stack([torch.Tensor(first), torch.Tensor(second)], dim=0)
    for first, second in ((obs0, obs1), (a0, a1), (r0, r1))
)

# Inference only, no gradients needed
with torch.no_grad():
    predictions = lstm(test_obs, test_act)

print(predictions.shape)
print('-------------')
print(predictions[:, 1:])
print(predictions[:, :-1])

# The redistributed reward of a timestep is the change in prediction relative
# to the previous timestep.
step_changes = predictions[:, 1:] - predictions[:, :-1]

# The first timestep has no predecessor, so its redistributed reward is the
# first prediction itself (i.e. predictions[:, :1] - 0).
redistributed_reward = torch.cat([predictions[:, :1], step_changes], dim=1)

In [None]:
redistributed_reward = redistributed_reward.cpu().detach().numpy()
rr0, rr1 = redistributed_reward[0], redistributed_reward[1]

fig, axes = plt.subplots(4, 2, figsize=(8, 6), dpi=100)

# One figure row per quantity, one column per sample.
panels = [
    ('observations', (obs0.argmax(-1) - 6, obs1.argmax(-1) - 6)),
    ('actions', (a0.argmax(-1), a1.argmax(-1))),
    ('original rewards', (r0, r1)),
    ('redistributed rewards', (rr0, rr1)),
]
for row, (label, series_pair) in enumerate(panels):
    for col, series in enumerate(series_pair):
        ax = axes[row, col]
        ax.plot(series)
        ax.xaxis.grid(True)
        ax.set_title(f'{label} (sample {col + 1})')
        ax.set_xlabel('Actions')

# Extra decoration for the observation row only.
# NOTE(review): the `- 6` offset above, the (-6, 6) limits and the dashed line
# at 2 appear to be carried over from another environment; confirm they are
# meaningful for these count vectors.
for ax in axes[0]:
    ax.set_ylim(-6, 6)
    ax.axhline(2, linestyle='--', color='r')

fig.tight_layout()

## 6. Visualise the reward contribution that different actions have

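The difference between the predicted returns at two consecutive timesteps is used as the reward of the action value performed at the later timestep, e.g.
- timestep 1: predicted return = 5 after performing action A 5 times
- timestep 2: predicted return = 10 after performing action B 9 times ...
The reward attributed to performing action A 5 times is 5 - 0 = 5, and the reward attributed to performing action B 9 times is 10 - 5 = 5.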


In [None]:
# Run the model over the whole dataset in batches instead of one forward pass
# per sample. shuffle=False keeps the dataset order, so row k of the result
# belongs to row k of the table the dataset was built from.
eval_loader = DataLoader(dataset, batch_size=512, shuffle=False)

batch_predictions = []
with torch.no_grad():
    for obs, act, _ in eval_loader:
        batch_predictions.append(lstm(obs, act))

# Shape (num_samples, time_steps, 1)
all_predictions = torch.cat(batch_predictions, dim=0)

# Use the differences of predictions as redistributed reward
redistributed_reward = all_predictions[:, 1:] - all_predictions[:, :-1]

# The first timestep has no predecessor, so its redistributed reward is the
# first prediction itself (i.e. all_predictions[:, :1] - 0).
redistributed_reward = torch.cat([all_predictions[:, :1], redistributed_reward], dim=1)

# Drop the trailing singleton axis -> numpy array of shape (num_samples, time_steps)
redistributed_reward = redistributed_reward.squeeze(2).numpy()

In [None]:
fig, axes = plt.subplots(4, 2, figsize=(8, 6), dpi=100)

# axes.flat walks the 4x2 grid row by row, so the k-th panel shows the k-th
# action: its raw value on x against the reward redistributed to timestep k.
for position, (ax, action_name) in enumerate(zip(axes.flat, action_list)):
    ax.scatter(
        clean_data3[action_name].to_numpy(),
        redistributed_reward[:, position],
        s=10,
    )
    ax.set_xlabel('Value')
    ax.set_ylabel('Reward')
    ax.set_title(action_name)

fig.tight_layout()

As can be seen from the plots above, for the reward contribution of each value of the 8 behaviors:
- Save shows a clear trend
- Follow's reward contribution ranges from -1 to 2
- Playlist and share show no clear pattern, but their contributions lie almost entirely below 0
- In contrast, the contributions of active streams and programmed streams are both mainly larger than zero