In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import deque
import random
import time
import os
import csv

torch.set_default_device("cuda")

In [2]:
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim=32, dropout=0.5):
        super(PolicyNetwork, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout, bidirectional=True)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

    def forward(self, x, hidden, available_cards_mask=None):
        out, hidden = self.lstm(x, hidden)
        out = self.layer_norm(out[:, -1, :])
        out = self.dropout(torch.relu(self.fc1(out)))
        logits = self.fc2(out)

        if available_cards_mask is not None:
            logits = logits.masked_fill(~available_cards_mask, float('-inf'))

        return F.softmax(logits, dim=-1), hidden

    def init_hidden(self, batch_size):
        return (torch.zeros(2 * self.lstm.num_layers, batch_size, self.lstm.hidden_size),
                torch.zeros(2 * self.lstm.num_layers, batch_size, self.lstm.hidden_size))

class ValueNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout=0.5):
        super(ValueNetwork, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        out, hidden = self.lstm(x, hidden)
        out = self.dropout(out[:, -1, :])
        value = self.fc(out)
        return value, hidden

    def init_hidden(self, batch_size):
        return (torch.zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size),
                torch.zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size))


In [3]:
SUITES = ['H','D','C','S']

RANKS = ['6', '7', '8', '9', 'J', 'Q', 'K', 'A']

CARDS = [f'{a}{r}' for a in SUITES for r in RANKS]

def sym_to_cat(symbol, d):
    return [1 if s in symbol else 0 for s in d]

def input_arr(alpha, c_hand, hand, desk_my, desk_enemy):
    return np.array(sym_to_cat(alpha, SUITES) + sym_to_cat(c_hand, SUITES) + sym_to_cat(hand, CARDS) + sym_to_cat(desk_my, CARDS) + sym_to_cat(desk_enemy, CARDS))

In [4]:
# Define the game environment (stub example)
class OomiEnvironment:
    def __init__(self, policy_net, value_net):
        self.players = []
        self.desk = [0] * 4
        self.p_rewards = [0] * 4
        self.c_player = None
        self.cards = CARDS.copy()
        self.trump = np.random.choice(SUITES)
        self.hand = 'z'
        self.team_score = [0] * 2

        for i in range(4):
            self.players.append(Player(self, policy_net, value_net, i%2, i))

    def reset_round(self):
        self.hand = 'z'
        self.desk = [0] * 4
        self.p_rewards = [0] * 4


    def reset(self):
        self.deal_cards()
        self.desk = [0] * 4
        self.p_rewards = [0] * 4
        self.c_player = None
        self.hand = 'z'
        for player in self.players:
            player.reset_hidden()

        return self.desk

    def desk_winner(self):
        values = [self.card_to_val(card, self.trump) for card in self.desk]
        max_value = max(values)

        return values.index(max_value)

    def prep_rewards(self, rewards):
        ret = []
        for i, r in enumerate(self.p_rewards):
            ret.append([l + r for l in rewards[i]])

        self.p_rewards = [0] * 4
        return ret

    def next_player(self):
        if self.c_player is None or self.c_player == 3:
            self.c_player = 0
        else:
            self.c_player += 1

        return self.c_player

    def deal_cards(self):
        random.shuffle(self.cards)
        for i in range(4):
            self.players[i].actions = self.cards[i*8:(i+1)*8]

    def card_to_val(self, card, alpha):
        v = CARDS.index(card) % 8

        if alpha in card:
            v += 16
        elif self.hand in card:
            v += 8

        return v

    def step(self, action, player_id):
        if action not in self.players[player_id].actions:
            return self.desk, -10, False, False

        # Penalize if the action card has already been played in this round
#         if any(self.card_to_val(card, self.alpha, self.players[player_id].actions) == self.card_to_val(action, self.alpha, self.players[player_id].actions) for card in self.desk):
#             return self.state, -2, False

        if self.desk == [0, 0, 0, 0]:
            self.hand = action[0]
        else:
            if self.hand != action[0]:
                if any(self.hand in a for a in self.players[player_id].actions):
                    return self.desk, -10, False, False
        self.desk[player_id] = action
        self.players[player_id].actions.remove(action)

        if 0 not in self.desk:
            winner = self.desk_winner()
            self.p_rewards[winner] += 1
            if winner%2==0:
                self.p_rewards[0] += 1
                self.p_rewards[2] += 1
                self.team_score[0] += 1
            else:
                self.p_rewards[1] += 1
                self.p_rewards[3] += 1
                self.team_score[1] += 1


            if self.players[player_id].actions == []:
                return self.desk, 0, True, True
            else:
                return self.desk, 0, True, False

        return self.desk, 0, False, False

In [5]:
class Player:
    def __init__(self, env, policy_net, value_net, team, id):
        self.policy_net = policy_net
        self.value_net = value_net
        self.env = env
        self.actions = []
        self.pre_rewards = []
        self.policy_hidden = self.policy_net.init_hidden(1)
        self.value_hidden = self.value_net.init_hidden(1)
        self.team = team
        self.id = id


    def reset_hidden(self):
        self.policy_hidden = self.policy_net.init_hidden(1)
        self.value_hidden = self.value_net.init_hidden(1)


    def play(self):
        state = torch.tensor(input_arr(self.env.trump, self.env.hand, self.actions, [c for i, c in enumerate(self.env.desk) if i%2 == self.team], [c for i, c in enumerate(self.env.desk) if i%2 != self.team]), dtype=torch.float32).unsqueeze(0).unsqueeze(0)

        # state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
        available_cards_mask = torch.tensor(sym_to_cat(self.actions, CARDS))
        available_cards_mask = available_cards_mask.to(torch.bool)

        action_probs, self.policy_hidden = self.policy_net(state, self.policy_hidden, available_cards_mask)

        try:
            action = torch.argmax(action_probs, 1).item()
        except RuntimeError:
            print(f'action probs - {action_probs}')
            print(f'state - {state}')
            print(f'policy_hidden - {self.policy_hidden}')
            raise RuntimeError

        next_state, reward, r_done, done = self.env.step(CARDS[action], self.id)
        
        return state,action,reward, r_done, done, action_probs[0][action].item()

In [6]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []

    def push(self, state, action, reward, action_prob):
        self.buffer.append((state, action, reward, action_prob))
        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)
        
    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


In [7]:
# Load the TensorBoard notebook extension
# %load_ext tensorboard

In [8]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

2024-07-11 06:31:27.851841: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-11 06:31:27.851955: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-11 06:31:27.998059: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
from math import e
import time

time.sleep(3)

# Define paths to save and load the models
policy_model_path = 'policy_model.pth'
value_model_path = 'value_model.pth'
backup_directory = '/content/drive/My Drive/Colab Notebooks/Oomi Backups' # Replace with your path
csv_path = 'dataset/'

# Initialize the models and optimizers
input_dim = 104
hidden_dim = 128
output_dim = 32
num_layers = 2
batch_size = 999

clip_param = 0.2  # PPO clipping parameter
entropy_coef = 0.01  # Entropy coefficient


# Initialize Replay Buffer
replay_buffer = ReplayBuffer(capacity=1000)  # Adjust capacity as needed

# policy_net = LSTMModel(input_dim, hidden_dim, output_dim)
# value_net = LSTMModel(input_dim, hidden_dim, output_dim, is_value_net=True)

# Load existing models if available, otherwise create new ones
if os.path.exists("/kaggle/input/rl-oomicardmodel/pytorch/v3.0.0/1/policy_model.pth"):
    print("loading exist policy model")
    policy_net = torch.load("/kaggle/input/rl-oomicardmodel/pytorch/v3.0.0/1/policy_model.pth")
else:
    print("creating new policy model")
    policy_net = PolicyNetwork(input_dim, hidden_dim, num_layers, output_dim, dropout=0.5)


if os.path.exists("/kaggle/input/rl-oomicardmodel/pytorch/v3.0.0/1/value_model.pth"):
    print("loading exist value model")
    value_net = torch.load("/kaggle/input/rl-oomicardmodel/pytorch/v3.0.0/1/value_model.pth")
else:
    print("creating new value model")
    value_net = ValueNetwork(input_dim, hidden_dim, num_layers, dropout=0.5)
    
policy_net.to("cuda")
value_net.to("cuda")

# Open CSV file and write header
csv_file_path = os.path.join(csv_path, 'training_data.csv')
os.makedirs(csv_path, exist_ok=True)
    
env = OomiEnvironment(policy_net, value_net)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)
value_optimizer = optim.Adam(value_net.parameters(), lr=1e-3)

with open(csv_file_path, mode='a+', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['PlayerID', 'State', 'Action', 'Reward', 'Done', 'Action Prob'])

    # Training loop
    num_episodes = 10000
    gamma = 0.99

    for episode in range(num_episodes):
        # Reset hidden states for all players
        for player in env.players:
            player.reset_hidden()

        desk = env.reset()
        done = False
        states = [[] for _ in range(4)]
        actions = [[] for _ in range(4)]
        rewards = [[] for _ in range(4)]
        action_probs = [[] for _ in range(4)]

        for _ in range(8):
            r_done = False
            r_rewards = [[] for _ in range(4)]
            for id in range(4):
                state, action, reward, r_done, done, action_prob = env.players[id].play()
                states[id].append(state.tolist())
                actions[id].append(action)
                r_rewards[id].append(reward)
                action_probs[id].append(action_prob)

                # Write step data to CSV
                csv_writer.writerow([id, state.tolist(), action, reward, done, action_prob])

            r_rewards = env.prep_rewards(r_rewards)
            for i in range(4):
                rewards[i].extend(r_rewards[i])
            env.reset_round()

        # Add some overall rewards
        for i in range(4):
            env.p_rewards[i] = env.team_score[i % 2]

        rewards = env.prep_rewards(rewards)

        for i in range(4):
            replay_buffer.push(torch.tensor(states[i], dtype=torch.float32).squeeze(1),
                               torch.tensor(actions[i], dtype=torch.int),
                               torch.tensor(rewards[i], dtype=torch.float32),
                               torch.tensor(action_probs[0][i], dtype=torch.float32))
            

        # Sample from the replay buffer and update the networks
        if len(replay_buffer) > batch_size:
            policy_losses = []
            value_losses = []
            
            for states, actions, rewards, old_action_probs in replay_buffer.sample(batch_size):
                sample_size = states.size(0)

                # Policy Network Forward Pass
#                 policy_hidden = policy_net.init_hidden(sample_size)
#                 action_probs, policy_hidden = policy_net(states, policy_hidden)
#                 selected_action_probs = action_probs[np.arange(sample_size), actions]

                # Value Network Forward Pass
                value_hidden = value_net.init_hidden(sample_size)
                state_values, value_hidden = value_net(states, value_hidden)
                state_values = state_values.squeeze()

#                 advantages = rewards - state_values

                # Compute policy loss with PPO clipped objective
#                 ratio = selected_action_probs / (old_action_probs + 1e-8)
#                 surr1 = ratio * advantages
#                 surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
#                 policy_loss = -torch.mean(torch.min(surr1, surr2)) + entropy_coef * -torch.mean(selected_action_probs * torch.log(selected_action_probs + 1e-8))
                
                # Compute value loss
                value_loss = F.mse_loss(state_values, rewards)

#                 policy_losses.append(policy_loss.item())
                value_losses.append(value_loss.item())

#                 total_loss = policy_loss + value_loss
                

#                 policy_optimizer.zero_grad()
                value_optimizer.zero_grad()

#                 total_loss.backward()
                value_loss
                # Gradient clipping
#                 torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 0.5)
                torch.nn.utils.clip_grad_norm_(value_net.parameters(), 0.5)

#                 policy_optimizer.step()
                value_optimizer.step()

#             policy_loss_mean = torch.mean(torch.tensor(policy_losses, dtype=torch.float32))
            value_loss_mean = torch.mean(torch.tensor(value_losses, dtype=torch.float32))

#             writer.add_scalar("Loss/policy", policy_loss_mean.item(), episode)
            writer.add_scalar("Loss/value", value_loss_mean.item(), episode)

        # Save the models after each episode
#         torch.save(policy_net, policy_model_path)
        torch.save(value_net, value_model_path)
        # os.makedirs(backup_directory, exist_ok=True)
        # torch.save(policy_net, os.path.join(backup_directory, f'policy_model.pth'))
        # torch.save(value_net, os.path.join(backup_directory, f'value_model.pth'))

        try:
            if episode % 10 == 0:
                print(f'Episode {episode}, Policy Loss: , Value Loss: {value_loss_mean.item()}')
        except NameError:
            pass


writer.flush()
writer.close()

creating new policy model
creating new value model
Episode 250, Policy Loss: , Value Loss: 466.36553955078125
Episode 260, Policy Loss: , Value Loss: 526.7574462890625
Episode 270, Policy Loss: , Value Loss: 589.7896118164062
Episode 280, Policy Loss: , Value Loss: 664.8306274414062
Episode 290, Policy Loss: , Value Loss: 748.8280639648438
Episode 300, Policy Loss: , Value Loss: 833.1404418945312
Episode 310, Policy Loss: , Value Loss: 915.961669921875
Episode 320, Policy Loss: , Value Loss: 1006.14404296875
Episode 330, Policy Loss: , Value Loss: 1100.9150390625
Episode 340, Policy Loss: , Value Loss: 1194.9593505859375
Episode 350, Policy Loss: , Value Loss: 1295.819580078125
Episode 360, Policy Loss: , Value Loss: 1407.4720458984375
Episode 370, Policy Loss: , Value Loss: 1517.032958984375
Episode 380, Policy Loss: , Value Loss: 1630.9869384765625
Episode 390, Policy Loss: , Value Loss: 1757.8873291015625
Episode 400, Policy Loss: , Value Loss: 1887.6246337890625
Episode 410, Policy

In [10]:
# %tensorboard --logdir runs

In [11]:
# !rm -rf /kaggle/working/*

In [12]:
# ! cp /kaggle/input/rl-oomicardmodel/pytorch/v3.0.0/1/dataset/training_data.csv /kaggle/working/dataset
