In [None]:
import numpy as np
import torch
import torch.nn as nn
import random
import pandas as pd
from collections import deque
from copy import deepcopy
from tqdm import tqdm

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck-style update).

    Each call to :meth:`sample` moves the internal state towards ``mu`` at
    rate ``theta`` and adds Gaussian noise scaled by ``sigma``.

    Args:
        action_dimension: Length of the noise vector.
        mu: Value the process reverts to (also the value after a reset).
        theta: Mean-reversion rate.
        sigma: Scale of the Gaussian perturbation added on every step.
    """

    def __init__(self, action_dimension, mu=0, theta=0.15, sigma=0.3):
        self.action_dimension, self.mu = action_dimension, mu
        self.theta, self.sigma = theta, sigma
        # reset() creates ``self.state``.
        self.reset()

    def reset(self):
        """Put the process back at ``mu`` in every dimension."""
        self.state = np.ones(self.action_dimension) * self.mu

    def sample(self):
        """Advance the process by one step and return the new state array."""
        current = self.state
        pull_to_mean = self.theta * (self.mu - current)
        random_kick = self.sigma * np.random.randn(len(current))
        self.state = current + (pull_to_mean + random_kick)
        return self.state


class TwoLayersNeuralNetwork(nn.Module):
    """Fully connected network: two ReLU hidden layers plus a linear output layer.

    Args:
        input_dim: Size of the input features.
        layer1_dim: Width of the first hidden layer.
        layer2_dim: Width of the second hidden layer.
        output_dim: Size of the output.
        output_tanh: When truthy, the output is passed through ``tanh``.
    """

    def __init__(self, input_dim, layer1_dim, layer2_dim, output_dim, output_tanh):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, layer1_dim)
        self.layer2 = nn.Linear(layer1_dim, layer2_dim)
        self.layer3 = nn.Linear(layer2_dim, output_dim)
        self.output_tanh = output_tanh
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, input):
        """Run the network on ``input`` and return the (optionally squashed) output."""
        first_hidden = self.relu(self.layer1(input))
        second_hidden = self.relu(self.layer2(first_hidden))
        raw_output = self.layer3(second_hidden)
        return self.tanh(raw_output) if self.output_tanh else raw_output
        
        
class DDPG():
    """Deep Deterministic Policy Gradient agent with a replay buffer.

    Holds a policy network (``pi_model``), a critic (``q_model``), target
    copies of both, and an ``OUNoise`` process used for exploration. The
    exploration noise is scaled by ``noise_threshold``, which starts at 1 and
    is reduced by ``noise_decrease`` on every call to :meth:`fit`.

    Args:
        state_dim: Length of a state vector.
        action_dim: Length of an action vector.
        action_scale: Actions are scaled to and clipped at ``[-action_scale, action_scale]``.
        noise_decrease: Amount subtracted from ``noise_threshold`` per ``fit`` call.
        gamma: Discount factor.
        batch_size: Number of transitions sampled per update.
        q_lr: Learning rate of the critic optimizer.
        pi_lr: Learning rate of the policy optimizer.
        tau: Soft-update coefficient for the target networks.
        memory_size: Maximum number of transitions kept in the replay buffer.
    """

    def __init__(self, state_dim, action_dim, action_scale, noise_decrease,
                 gamma=0.99, batch_size=64, q_lr=1e-3, pi_lr=1e-4, tau=1e-2, memory_size=100000):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_scale = action_scale
        self.pi_model = TwoLayersNeuralNetwork(self.state_dim, 400, 300, self.action_dim, output_tanh=True)
        self.q_model = TwoLayersNeuralNetwork(self.state_dim + self.action_dim, 400, 300, 1, output_tanh=False)
        self.pi_target_model = deepcopy(self.pi_model)
        self.q_target_model = deepcopy(self.q_model)
        self.noise = OUNoise(self.action_dim)
        self.noise_threshold = 1
        self.noise_decrease = noise_decrease
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.q_optimazer = torch.optim.Adam(self.q_model.parameters(), lr=q_lr)
        self.pi_optimazer = torch.optim.Adam(self.pi_model.parameters(), lr=pi_lr)
        self.memory = deque(maxlen=memory_size)

    def get_action(self, state):
        """Return a noisy action for ``state``, clipped to the action range."""
        # No gradients are needed when only acting.
        with torch.no_grad():
            pred_action = self.pi_model(torch.FloatTensor(state)).numpy()
        action = self.action_scale * (pred_action + self.noise_threshold * self.noise.sample())
        return np.clip(action, -self.action_scale, self.action_scale)

    def update_target_model(self, target_model, model, optimazer, loss):
        """Take one optimizer step on ``loss``, then soft-update ``target_model``.

        Despite its name this method also performs the gradient step for
        ``model``; afterwards each target parameter becomes
        ``(1 - tau) * target + tau * online``.
        """
        optimazer.zero_grad()
        loss.backward()
        optimazer.step()
        # The soft update is bookkeeping, not part of any loss.
        with torch.no_grad():
            for target_param, param in zip(target_model.parameters(), model.parameters()):
                target_param.copy_((1 - self.tau) * target_param + self.tau * param)

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once enough are buffered, update both networks.

        An update runs only while the buffer holds more than ``batch_size``
        transitions. ``noise_threshold`` is decayed on every call, whether or
        not an update ran.
        """
        self.memory.append([state, action, reward, done, next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            # Stack each column into one float32 array first: building a tensor
            # from a tuple of separate ndarrays is a slow path in PyTorch.
            states, actions, rewards, dones, next_states = (
                torch.as_tensor(np.asarray(column, dtype=np.float32))
                for column in zip(*batch)
            )
            rewards = rewards.reshape(self.batch_size, 1)
            dones = dones.reshape(self.batch_size, 1)

            # TD targets are constants for the critic loss, so skip autograd.
            with torch.no_grad():
                pred_next_actions = self.action_scale * self.pi_target_model(next_states)
                next_states_and_pred_next_actions = torch.cat((next_states, pred_next_actions), dim=1)
                targets = rewards + self.gamma * (1 - dones) * self.q_target_model(next_states_and_pred_next_actions)

            states_and_actions = torch.cat((states, actions), dim=1)
            q_loss = torch.mean((targets - self.q_model(states_and_actions)) ** 2)
            self.update_target_model(self.q_target_model, self.q_model, self.q_optimazer, q_loss)

            # Policy objective: maximise the critic's value of the policy's own actions.
            pred_actions = self.action_scale * self.pi_model(states)
            states_and_pred_actions = torch.cat((states, pred_actions), dim=1)
            pi_loss = - torch.mean(self.q_model(states_and_pred_actions))
            self.update_target_model(self.pi_target_model, self.pi_model, self.pi_optimazer, pi_loss)

        if self.noise_threshold > 0:
            self.noise_threshold = max(0, self.noise_threshold - self.noise_decrease)


class MovieLensEnv:
    """Single-step recommendation environment built from a ratings table.

    A state is the concatenation of a one-hot user vector and a one-hot movie
    vector. An action is a score vector with one entry per movie; the
    highest-scoring entry is the recommended movie.

    All dimensions and index lookups are derived from the DataFrame passed to
    the constructor, so the environment does not rely on module-level globals.

    Args:
        ratings_data: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
    """

    def __init__(self, ratings_data):
        self.ratings_data = ratings_data
        self.users = ratings_data['userId'].unique()
        self.movies = ratings_data['movieId'].unique()
        # Position of each id inside self.users / self.movies.
        self.user_to_index = {user_id: index for index, user_id in enumerate(self.users)}
        self.movie_to_index = {movie_id: index for index, movie_id in enumerate(self.movies)}
        self.state_dim = len(self.users) + len(self.movies)
        self.action_dim = len(self.movies)
        self.current_user = None
        self.current_movie = None

    def reset(self):
        """Pick a random user and one of the movies that user rated; return the state."""
        self.current_user = np.random.choice(self.users)
        available_movies = self.ratings_data[self.ratings_data['userId'] == self.current_user]['movieId'].values
        self.current_movie = np.random.choice(available_movies)
        return self.preprocess_state(self.current_user, self.current_movie)

    def step(self, action):
        """Recommend the arg-max movie of ``action`` and return the outcome.

        Returns:
            ``(next_state, reward, done, info)``; ``done`` is always ``True``
            and ``info`` is an empty dict.
        """
        # argmax yields a position in the action vector, not a movieId, so it
        # has to be translated through self.movies before it is used as an id.
        selected_index = int(np.argmax(action))
        if selected_index < len(self.movies):
            self.current_movie = self.movies[selected_index]
        reward = self.get_reward()
        next_state = self.preprocess_state(self.current_user, self.current_movie)
        done = True
        return next_state, reward, done, {}

    def get_reward(self):
        """Return ``(rating - 2.5) / 2.5`` for the current pair, or -1 if it is unrated."""
        rating = self.ratings_data[(self.ratings_data['userId'] == self.current_user) &
                                   (self.ratings_data['movieId'] == self.current_movie)]['rating'].values
        return (rating.item() - 2.5) / 2.5 if len(rating) > 0 else -1

    def sample_action(self):
        """Return a random one-hot action vector of length ``action_dim``."""
        action = np.zeros(self.action_dim)
        action[np.random.choice(self.action_dim)] = 1
        return action

    def preprocess_state(self, user_id, movie_id):
        """Encode ``(user_id, movie_id)`` as concatenated one-hot vectors."""
        num_users = len(self.users)
        state = np.zeros(num_users + len(self.movies))
        state[self.user_to_index[user_id]] = 1
        # The movie one-hot occupies the slots after the user one-hot.
        state[num_users + self.movie_to_index[movie_id]] = 1
        return state

# Load the ratings table; the code below reads its userId and movieId columns.
ratings_data = pd.read_csv('ratings.csv')

user_ids = ratings_data['userId'].unique()
movie_ids = ratings_data['movieId'].unique()
num_users, num_movies = len(user_ids), len(movie_ids)

# Map every raw id to its position in the corresponding unique-id array.
user_to_index = dict(zip(user_ids, range(num_users)))
movie_to_index = dict(zip(movie_ids, range(num_movies)))

env = MovieLensEnv(ratings_data)
max_episodes, max_steps = 100, 1000

ddpg_agent = DDPG(env.state_dim, env.action_dim, action_scale=2, noise_decrease=0.0001)
episode_rewards, hit_rates = [], []

for episode in tqdm(range(max_episodes)):
    state = env.reset()
    episode_reward = 0
    hit_count = 0
    total_recommendations = 0

    for step in range(max_steps):
        action = ddpg_agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # A recommendation counts as a hit when its reward is positive.
        hit_count += int(reward > 0)
        total_recommendations += 1

        ddpg_agent.fit(state, action, reward, done, next_state)

        episode_reward += reward
        state = next_state

        if done:
            break

    # Guard against an episode in which no recommendation was made.
    hit_rate = hit_count / total_recommendations if total_recommendations > 0 else 0
    episode_rewards.append(episode_reward)
    hit_rates.append(hit_rate)


print(np.mean(hit_rates))


In [None]:
from torch.distributions import Normal
class SAC(nn.Module):
    """Soft Actor-Critic agent with twin critics and a Gaussian policy.

    ``pi_model`` outputs ``2 * action_dim`` values per state, split into
    means and log standard deviations. Two critics (``q1_model``,
    ``q2_model``) each have a softly updated target copy.

    Args:
        state_dim: Length of a state vector.
        action_dim: Length of an action vector.
        gamma: Discount factor.
        alpha: Weight of the log-probability (entropy) term.
        tau: Soft-update coefficient for the target critics.
        batch_size: Number of transitions sampled per update.
        pi_lr: Learning rate of the policy optimizer.
        q_lr: Learning rate of both critic optimizers.
        memory_size: Maximum number of transitions kept in the replay buffer;
            the oldest are dropped first.
    """

    def __init__(self, state_dim, action_dim, gamma=0.99, alpha=1e-3, tau=1e-2,
                 batch_size=64, pi_lr=1e-3, q_lr=1e-3, memory_size=100000):
        super().__init__()

        self.pi_model = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(),
                                      nn.Linear(128, 128), nn.ReLU(),
                                      nn.Linear(128, 2 * action_dim), nn.Tanh())

        self.q1_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(),
                                      nn.Linear(128, 128), nn.ReLU(),
                                      nn.Linear(128, 1))

        self.q2_model = nn.Sequential(nn.Linear(state_dim + action_dim, 128), nn.ReLU(),
                                      nn.Linear(128, 128), nn.ReLU(),
                                      nn.Linear(128, 1))

        self.gamma = gamma
        self.alpha = alpha
        self.tau = tau
        self.batch_size = batch_size
        # Bounded buffer so long runs cannot grow memory without limit.
        self.memory = deque(maxlen=memory_size)

        self.pi_optimizer = torch.optim.Adam(self.pi_model.parameters(), pi_lr)
        self.q1_optimizer = torch.optim.Adam(self.q1_model.parameters(), q_lr)
        self.q2_optimizer = torch.optim.Adam(self.q2_model.parameters(), q_lr)
        self.q1_target_model = deepcopy(self.q1_model)
        self.q2_target_model = deepcopy(self.q2_model)

    def get_action(self, state):
        """Sample an action for one state and return it as a 1-D NumPy array."""
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            action, _ = self.predict_actions(state)
        # Drop the batch axis added above (axis 0), not the action axis.
        return action.squeeze(0).numpy()

    def fit(self, state, action, reward, done, next_state):
        """Store one transition and, once enough are buffered, update all networks.

        An update runs only while the buffer holds more than ``batch_size``
        transitions.
        """
        self.memory.append([state, action, reward, done, next_state])

        if len(self.memory) > self.batch_size:
            batch = random.sample(self.memory, self.batch_size)
            # Stack each column into one float32 array before making a tensor.
            states, actions, rewards, dones, next_states = (
                torch.as_tensor(np.asarray(column, dtype=np.float32))
                for column in zip(*batch)
            )
            rewards, dones = rewards.unsqueeze(1), dones.unsqueeze(1)

            # TD targets are constants for the critic losses, so skip autograd.
            with torch.no_grad():
                next_actions, next_log_probs = self.predict_actions(next_states)
                next_states_and_actions = torch.cat((next_states, next_actions), dim=1)
                next_q1_values = self.q1_target_model(next_states_and_actions)
                next_q2_values = self.q2_target_model(next_states_and_actions)
                next_min_q_values = torch.min(next_q1_values, next_q2_values)
                targets = rewards + self.gamma * (1 - dones) * (next_min_q_values - self.alpha * next_log_probs)

            states_and_actions = torch.cat((states, actions), dim=1)
            q1_loss = torch.mean((self.q1_model(states_and_actions) - targets) ** 2)
            q2_loss = torch.mean((self.q2_model(states_and_actions) - targets) ** 2)
            self.update_model(q1_loss, self.q1_optimizer, self.q1_model, self.q1_target_model)
            self.update_model(q2_loss, self.q2_optimizer, self.q2_model, self.q2_target_model)

            pred_actions, log_probs = self.predict_actions(states)
            states_and_pred_actions = torch.cat((states, pred_actions), dim=1)
            q1_values = self.q1_model(states_and_pred_actions)
            q2_values = self.q2_model(states_and_pred_actions)
            min_q_values = torch.min(q1_values, q2_values)
            pi_loss = - torch.mean(min_q_values - self.alpha * log_probs)
            self.update_model(pi_loss, self.pi_optimizer)

    def update_model(self, loss, optimizer, model=None, target_model=None):
        """Take one optimizer step on ``loss`` and optionally soft-update a target.

        When both ``model`` and ``target_model`` are given, each target
        parameter becomes ``(1 - tau) * target + tau * online``.
        """
        # Clear gradients BEFORE backward: the policy loss back-propagates
        # through the critics and leaves gradients on their parameters, which
        # would otherwise be added to the next critic update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if model is not None and target_model is not None:
            with torch.no_grad():
                for param, target_param in zip(model.parameters(), target_model.parameters()):
                    target_param.copy_((1 - self.tau) * target_param + self.tau * param)

    def predict_actions(self, states):
        """Sample reparameterised actions for a batch of states.

        Returns:
            ``(actions, log_probs)`` where ``actions`` has one row per state
            and ``log_probs`` has shape ``(batch, 1)``.
        """
        means, log_stds = self.pi_model(states).chunk(2, dim=1)
        dists = Normal(means, torch.exp(log_stds))
        actions = dists.rsample()
        # The action dimensions are independent Gaussians, so the joint
        # log-probability is the sum over dimensions. keepdim keeps a
        # (batch, 1) column that lines up with the critic outputs.
        log_probs = dists.log_prob(actions).sum(dim=1, keepdim=True)
        return actions, log_probs

    def train_model(self, env, max_episodes=100, max_steps=1000, batch_size=64, warmup_steps=1000):
        """Train on ``env`` and return ``(episode_rewards, hit_rates)``.

        For the first ``warmup_steps`` environment steps, counted across all
        episodes, actions come from ``env.sample_action()``; after that the
        policy is used. After every episode the module-level ``hitrate``
        function is called and its result recorded.

        ``batch_size`` is accepted for backward compatibility but is not
        used; the batch size given to the constructor is what ``fit`` uses.
        """
        episode_rewards = []
        hit_rates = []
        # Warm-up is counted over the whole run: a per-episode counter would
        # never reach warmup_steps when episodes are shorter than it.
        total_steps = 0
        for episode in range(max_episodes):
            state = env.reset()
            episode_reward = 0
            for step in range(max_steps):
                if total_steps < warmup_steps:
                    action = env.sample_action()
                else:
                    action = self.get_action(state)
                total_steps += 1
                next_state, reward, done, _ = env.step(action)
                self.fit(state, action, reward, done, next_state)
                state = next_state
                episode_reward += reward
                if done:
                    break
            episode_rewards.append(episode_reward)
            hit_rate = hitrate(self, env)
            hit_rates.append(hit_rate)
        return episode_rewards, hit_rates

def hitrate(model, env, num_trials=1000):
    """Estimate how often the model recommends a movie the user has rated.

    For each trial a random user is drawn, a state containing only that
    user's one-hot entry is built, and the arg-max of ``model.get_action``
    is taken as the recommended movie.

    Args:
        model: Object with a ``get_action(state)`` method returning a score
            vector with one entry per movie.
        env: Environment exposing ``users``, ``movies``, ``state_dim`` and
            ``ratings_data``.
        num_trials: Number of random users to evaluate.

    Returns:
        Fraction of trials that were hits; ``0.0`` when ``num_trials`` is
        not positive.
    """
    if num_trials <= 0:
        return 0.0

    # Build each user's set of rated movies once rather than filtering the
    # whole ratings table on every trial.
    rated_by_user = {
        user_id: set(movie_ids)
        for user_id, movie_ids in env.ratings_data.groupby('userId')['movieId']
    }

    hits = 0
    for _ in range(num_trials):
        # A user's position in env.users doubles as their one-hot slot.
        # This assumes the state encoding indexes users in env.users order.
        user_index = random.randrange(len(env.users))
        user_id = env.users[user_index]
        user_state = np.zeros(env.state_dim)
        user_state[user_index] = 1

        # argmax is a position in the action vector; translate it to a
        # movieId before comparing with the ids in the ratings table.
        recommended_index = int(np.argmax(model.get_action(user_state)))
        if recommended_index >= len(env.movies):
            continue
        recommended_movie_id = env.movies[recommended_index]
        if recommended_movie_id in rated_by_user.get(user_id, ()):
            hits += 1
    return hits / num_trials


# Fresh environment for the SAC run, built from the same ratings table.
env = MovieLensEnv(ratings_data)

sac_model = SAC(env.state_dim, env.action_dim)
episode_rewards, hit_rates = sac_model.train_model(
    env,
    max_episodes=100,
    max_steps=1000,
    batch_size=64,
    warmup_steps=3000,
)

# Report the best per-episode hit rate seen during training.
print(np.max(hit_rates))