In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import deque
import random
import time
import os
import csv

# Adjustable variables
NUM_EPISODES = 100000
NUM_AGENTS = 4
HIDDEN_DIM = 64
BATCH_SIZE = 64
REPLAY_BUFFER_CAPACITY = 10000
LEARNING_RATE = 0.001
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 0.995

# Model saving and loading paths
MODEL_SAVE_PATH = "/kaggle/working/models/"
CSV_SAVE_PATH = "/kaggle/working/csvs/"
BEST_MODEL_PATH = os.path.join(MODEL_SAVE_PATH, "best_model.pth")
BACKUP_MODEL_PATH = os.path.join(MODEL_SAVE_PATH, "backup_model.pth")
CSV_FILE_PATH = os.path.join(CSV_SAVE_PATH, "training_stats.csv")

# Create model directory if it doesn't exist
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
os.makedirs(CSV_SAVE_PATH, exist_ok=True)

# Training control
CONTINUE_TRAINING = True  # Set to True to continue training after loading
ADDITIONAL_EPISODES = 100000  # Number of additional episodes to train if continuing


# Constants (unchanged)
SUITS = ['Hearts', 'Diamonds', 'Clubs', 'Spades']
VALUES = ['7', '8', '9', '10', 'J', 'Q', 'K', 'A']
NUM_PLAYERS = 4
HAND_SIZE = 8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class Card:
    def __init__(self, suit, value):
        self.suit = suit
        self.value = value

    def __str__(self):
        return f"{self.value} of {self.suit}"

    def __eq__(self, other):
        return self.suit == other.suit and self.value == other.value

    def to_index(self):
        return SUITS.index(self.suit) * len(VALUES) + VALUES.index(self.value)
    
class OomiGame:
    def __init__(self):
        self.deck = self.create_deck()
        self.hands = self.deal_cards()
        self.played_cards = []
        self.current_player = 0
        self.current_trick = []
        self.tricks_won = {'team1': 0, 'team2': 0}
        self.trump_suit = None

    def create_deck(self):
        return [Card(suit, value) for suit in SUITS for value in VALUES]

    def deal_cards(self):
        random.shuffle(self.deck)
        return [self.deck[i::NUM_PLAYERS] for i in range(NUM_PLAYERS)]

    def choose_trump(self, suit):
        self.trump_suit = suit

    def play_move(self, card):
        if card not in self.hands[self.current_player]:
            raise ValueError("Invalid move: card not in player's hand")

        self.hands[self.current_player].remove(card)
        self.current_trick.append(card)
        self.played_cards.append(card)

        if len(self.current_trick) == NUM_PLAYERS:
            winning_card = self.determine_trick_winner()
            winning_player = self.current_trick.index(winning_card)
            winning_team = 'team1' if winning_player in [0, 2] else 'team2'
            self.tricks_won[winning_team] += 1
            self.current_player = winning_player
            self.current_trick = []
        else:
            self.current_player = (self.current_player + 1) % NUM_PLAYERS

    def determine_trick_winner(self):
        lead_suit = self.current_trick[0].suit
        trump_cards = [card for card in self.current_trick if card.suit == self.trump_suit]
        if trump_cards:
            return max(trump_cards, key=lambda card: VALUES.index(card.value))
        else:
            return max((card for card in self.current_trick if card.suit == lead_suit), 
                       key=lambda card: VALUES.index(card.value))

    def get_state(self):
        return {
            'hand': self.hands[self.current_player].copy(),
            'desk': self.current_trick.copy(),
            'played': self.played_cards.copy(),
            'current_player': self.current_player,
            'tricks_won': self.tricks_won.copy(),
            'trump_suit': self.trump_suit
        }

    def is_game_over(self):
        return all(len(hand) == 0 for hand in self.hands)

    def get_winner(self):
        if self.tricks_won['team1'] > self.tricks_won['team2']:
            return 'team1'
        elif self.tricks_won['team2'] > self.tricks_won['team1']:
            return 'team2'
        else:
            return 'tie'
        
        
class CardGameState(nn.Module):
    def __init__(self, num_suits, num_ranks, hidden_dim):
        super(CardGameState, self).__init__()
        self.num_suits = num_suits
        self.num_ranks = num_ranks
        self.card_embedding = nn.Embedding(num_suits * num_ranks, hidden_dim)
        self.suit_embedding = nn.Embedding(num_suits + 1, hidden_dim)  # +1 for no trump
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.attention = nn.MultiheadAttention(hidden_dim, 4)
        
        # Add a final layer to combine all states
        self.combine_layer = nn.Linear(hidden_dim * 4, hidden_dim)

    def forward(self, trump_suit, hand, desk, played):
        # Embed trump suit
        trump_embed = self.suit_embedding(trump_suit).unsqueeze(1)
        
        # Process hand, desk, and played cards
        hand_state = self._process_cards(hand)
        desk_state = self._process_cards(desk)
        played_state = self._process_cards(played)
        
        # Combine all states
        game_state = torch.cat([trump_embed, hand_state, desk_state, played_state], dim=1)
        return self.combine_layer(game_state.view(game_state.size(0), -1))

    def _process_cards(self, cards):
        embedded = self.card_embedding(cards)
        lstm_out, _ = self.lstm(embedded)
        attn_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        return attn_out.mean(1).unsqueeze(1)
    

class PolicyNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_cards):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, num_cards)
        self.dropout = nn.Dropout(0.5)

    def forward(self, game_state, valid_actions):
        x = F.relu(self.fc1(game_state))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        logits = self.fc3(x)
        
        # Apply action mask
        logits = logits.masked_fill(~valid_actions, float('-inf'))
        return F.softmax(logits, dim=-1)
    
    
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)
    

class RandomPlayer:
    def random_move(self, state):
        return random.choice(state['hand'])
    
    
class OomiAgent:
    def __init__(self, num_suits, num_ranks, hidden_dim):
        self.num_suits = num_suits
        self.num_ranks = num_ranks
        self.hidden_dim = hidden_dim
        self.num_cards = num_suits * num_ranks
        self.game_state_model = CardGameState(num_suits, num_ranks, hidden_dim).to(device)
        self.policy_network = PolicyNetwork(hidden_dim, hidden_dim, self.num_cards).to(device)
        self.optimizer = optim.Adam(
            list(self.game_state_model.parameters()) + 
            list(self.policy_network.parameters()),
            lr=LEARNING_RATE
        )
        self.epsilon = EPSILON_START
        self.total_training_time = 0
        self.total_games_played = 0
        self.best_loss = float('inf')

    def state_to_tensor(self, state):
        trump_suit = torch.tensor([SUITS.index(state['trump_suit']) if state['trump_suit'] else len(SUITS)], device=device)
        hand = torch.tensor([card.to_index() for card in state['hand']], dtype=torch.long, device=device)
        desk = torch.tensor([card.to_index() for card in state['desk']], dtype=torch.long, device=device)
        played = torch.tensor([card.to_index() for card in state['played']], dtype=torch.long, device=device)

        # Pad sequences to fixed length
        hand = F.pad(hand, (0, HAND_SIZE - len(hand)))
        desk = F.pad(desk, (0, NUM_PLAYERS - len(desk)))
        played = F.pad(played, (0, self.num_suits * self.num_ranks - len(played)))
        
        return trump_suit, hand.unsqueeze(0), desk.unsqueeze(0), played.unsqueeze(0)

    
    def get_valid_actions(self, state):
        valid_actions = torch.zeros(self.num_cards, dtype=torch.bool, device=device)
        
        c_hand_suit = state['desk'][0].suit if len(state['desk'])>0 else None
        
        if any([c_hand_suit == c.suit for c in state['hand']]):
            for card in state['hand']:
                if c_hand_suit == card.suit:
                    valid_actions[card.to_index()] = True
        else:
            for card in state['hand']:
                valid_actions[card.to_index()] = True
                
        return valid_actions.unsqueeze(0)

    def predict(self, state):
        trump_suit, hand, desk, played = self.state_to_tensor(state)
        valid_actions = self.get_valid_actions(state)
        game_state = self.game_state_model(trump_suit, hand, desk, played)        
        action_probs = self.policy_network(game_state, valid_actions)
        return action_probs.squeeze(0).cpu().detach().numpy()
    
    def choose_action(self, state):
        action_probs = self.predict(state)
       
        # Choose action based on the probability distribution
        chosen_card_index = np.random.choice(len(action_probs), p=action_probs)
        
        # Convert chosen index back to Card object
        suit_index = chosen_card_index // len(VALUES)
        value_index = chosen_card_index % len(VALUES)
        return Card(SUITS[suit_index], VALUES[value_index])
    
    def evaluate(self, num_games=100):
        wins = 0
        for _ in range(num_games):
            game = OomiGame()
            players = [self if i % 2 == 0 else RandomPlayer() for i in range(NUM_PLAYERS)]

            while not game.is_game_over():
                current_player = players[game.current_player]
                state = game.get_state()
                action = current_player.choose_action(state) if isinstance(current_player, OomiAgent) else current_player.random_move(state)
                game.play_move(action)

            if game.get_winner() == 'team1':  # Assuming self plays as team1
                wins += 1
        
        win_rate = wins / num_games
        
        print(f'win rate - {win_rate}')

        return win_rate  # Return win rate


    def perturb(self):
        with torch.no_grad():
            for param in self.game_state_model.parameters():
                param.add_(torch.randn(param.size()) * 0.1)
            for param in self.policy_network.parameters():
                param.add_(torch.randn(param.size()) * 0.1)

    def train(self, states, actions, rewards):
        self.optimizer.zero_grad()

        # Data preparation
        trump_suits, hands, desks, playeds = zip(*[self.state_to_tensor(state) for state in states])
        trump_suits = torch.cat(trump_suits)
        hands = torch.cat(hands)
        desks = torch.cat(desks)
        playeds = torch.cat(playeds)
        valid_actions = torch.cat([self.get_valid_actions(state) for state in states])

        # Forward pass
        game_states = self.game_state_model(trump_suits, hands, desks, playeds)
        action_probs = self.policy_network(game_states, valid_actions)

        # Calculate loss
        action_indices = torch.tensor([action.to_index() for action in actions], device=device)
        rewards_tensor = torch.tensor(rewards, dtype=torch.float32, device=device)

        # Compute baseline (mean reward for the batch)
        baseline = rewards_tensor.mean()

        # Normalize rewards
        if rewards_tensor.std() > 1e-8:
            rewards_tensor = (rewards_tensor - rewards_tensor.mean()) / (rewards_tensor.std() + 1e-8)
        else:
            rewards_tensor = rewards_tensor - rewards_tensor.mean()

        # Compute advantages
        advantages = rewards_tensor - baseline

        # Policy gradient loss using REINFORCE with baseline
        log_probs = torch.log(action_probs + 1e-8)
        policy_loss = -(log_probs[torch.arange(len(actions)), action_indices] * advantages).mean()

        # Entropy regularization
        entropy = -(action_probs * log_probs).sum(dim=-1).mean()

        # Total loss (including entropy regularization)
        loss = policy_loss - 0.01 * entropy

        # Backpropagation
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.game_state_model.parameters(), max_norm=1.0)
        torch.nn.utils.clip_grad_norm_(self.policy_network.parameters(), max_norm=1.0)

        # Optimization step
        self.optimizer.step()

        return loss.item()


    def save_model(self, path):
        torch.save({
            'game_state_model': self.game_state_model.state_dict(),
            'policy_network': self.policy_network.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'epsilon': self.epsilon,
            'total_training_time': self.total_training_time,
            'total_games_played': self.total_games_played,
            'best_loss': self.best_loss
        }, path)

    def load_model(self, path):
        checkpoint = torch.load(path, map_location=torch.device(device))
        self.game_state_model.load_state_dict(checkpoint['game_state_model'])
        self.policy_network.load_state_dict(checkpoint['policy_network'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        self.epsilon = checkpoint['epsilon']
        self.total_training_time = checkpoint['total_training_time']
        self.total_games_played = checkpoint['total_games_played']
        self.best_loss = checkpoint['best_loss']

def self_play_training(num_episodes, starting_agent=None):
    if starting_agent:
        agents = [starting_agent] + [OomiAgent(len(SUITS), len(VALUES), HIDDEN_DIM) for _ in range(NUM_AGENTS - 1)]
    else:
        agents = [OomiAgent(len(SUITS), len(VALUES), HIDDEN_DIM) for _ in range(NUM_AGENTS)]
    
    replay_buffer = ReplayBuffer(REPLAY_BUFFER_CAPACITY)
    
    episode_losses = []
    episode_rewards = []
    start_time = time.time()
    best_agent = agents[0]
    loss_increase_count = 0
    
    for episode in range(num_episodes):
        game = OomiGame()
        player_agents = random.sample(agents, NUM_PLAYERS)
        
        # Choose trump suit
        first_player_hand = game.hands[0][:4]  # First 4 cards of the first player
        trump_suit = player_agents[0].choose_action({'hand': first_player_hand, 'desk': [], 'played': [],
                                                     'current_player': 0, 'tricks_won': game.tricks_won,
                                                     'trump_suit': None}).suit
        game.choose_trump(trump_suit)
        
        data = []
        episode_reward = 0
        
        while not game.is_game_over():
            state = game.get_state()
            player_id = game.current_player
            current_agent = player_agents[game.current_player]
            
            if random.random() < current_agent.epsilon:
                action = random.choice(state['hand'])
            else:
                action = current_agent.choose_action(state)
            
            prev_tricks_won = game.tricks_won.copy()
            game.play_move(action)
            
            new_state = game.get_state()
            
            # Intermediate reward
            if game.tricks_won['team1'] > prev_tricks_won['team1']:
                reward = 0.1 if player_id in [0, 2] else -0.1
            elif game.tricks_won['team2'] > prev_tricks_won['team2']:
                reward = 0.1 if player_id in [1, 3] else -0.1
            else:
                reward = 0
            
            # Bonus for playing trump
            if action.suit == game.trump_suit:
                reward += 0.05
                
            data.append((state, action, reward, new_state, player_id))
        
        # Game over, assign final rewards
        winner = game.get_winner()
        for experience in data:
            state, action, intermediate_reward, new_state, player = experience
            if winner == 'team1':
                final_reward = 1 if player in [0, 2] else -1
            elif winner == 'team2':
                final_reward = 1 if player in [1, 3] else -1
            else:  # tie
                final_reward = 0
            
            # Scale final reward based on margin of victory
            margin = abs(game.tricks_won['team1'] - game.tricks_won['team2'])
            final_reward *= (1 + 0.1 * margin)
            
            # Combine intermediate and final rewards
            total_reward = intermediate_reward + final_reward
            replay_buffer.add((state, action, total_reward, new_state, player))
            episode_rewards.append(total_reward)
        
        # Training phase
        if len(replay_buffer.buffer) >= BATCH_SIZE:
            for agent in agents:
                batch = replay_buffer.sample(BATCH_SIZE)
                states, actions, rewards, _, _ = zip(*batch)
                loss = agent.train(states, actions, rewards)
                episode_losses.append(loss)
                
                # Update best agent if this agent has lower loss
                if loss < best_agent.best_loss:
                    best_agent = agent
                    best_agent.best_loss = loss
                    best_agent.save_model(BEST_MODEL_PATH)
                    loss_increase_count = 0
                else:
                    loss_increase_count += 1
                            
        # Update epsilon
        for agent in agents:
            agent.epsilon = max(EPSILON_END, agent.epsilon * EPSILON_DECAY)
        
        if episode % 100 == 0:
            best_agent.save_model(BEST_MODEL_PATH)
            mean_loss = np.mean(episode_losses[-100:])
            mean_reward = np.mean(episode_rewards[-100:])
            print(f"Episode {episode} completed. Mean loss: {mean_loss:.4f}, Mean reward: {mean_reward:.4f}")
        
        # Backup model if loss increases continuously
        if loss_increase_count >= 50:
            best_agent.save_model(BACKUP_MODEL_PATH)
#             print(f"Backup model saved due to continuous loss increase.")
            loss_increase_count = 0
    
    end_time = time.time()
    total_training_time = end_time - start_time
    
    # Save training statistics
    with open(CSV_FILE_PATH, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Episode', 'Mean Loss', 'Mean Reward'])
        for i in range(num_episodes):
            writer.writerow([i, np.mean(episode_losses[max(0, i-10):i+1]), np.mean(episode_rewards[max(0, i-10):i+1])])
    
    print(f"Training completed in {total_training_time:.2f} seconds")
    return best_agent

def load_or_train_model():
    if os.path.exists('/kaggle/input/rl-oomicardmodel/pytorch/new2.0.0/1/models/best_model.pth'):
        print("Loading existing model...")
        agent = OomiAgent(len(SUITS), len(VALUES), HIDDEN_DIM)
        agent.load_model('/kaggle/input/rl-oomicardmodel/pytorch/new2.0.0/1/models/best_model.pth')
        if CONTINUE_TRAINING:
            print(f"Continuing training for {ADDITIONAL_EPISODES} more episodes...")
            agent = self_play_training(ADDITIONAL_EPISODES, starting_agent=agent)
        return agent
    else:
        print("No existing model found. Starting new training...")
        return self_play_training(NUM_EPISODES)

# Main execution
best_agent = load_or_train_model()
print("The agent is ready to play Oomi!")

Using device: cuda
Loading existing model...
Continuing training for 100000 more episodes...


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Episode 0 completed. Mean loss: nan, Mean reward: 0.0125
Episode 100 completed. Mean loss: -0.2128, Mean reward: 0.0160
Episode 200 completed. Mean loss: -0.2620, Mean reward: 0.0115
Episode 300 completed. Mean loss: -0.0534, Mean reward: 0.0050
Episode 400 completed. Mean loss: -0.0107, Mean reward: 0.0075
Episode 500 completed. Mean loss: -0.2058, Mean reward: 0.0015
Episode 600 completed. Mean loss: -0.0176, Mean reward: 0.0095
Episode 700 completed. Mean loss: -0.2418, Mean reward: 0.0015
Episode 800 completed. Mean loss: -0.1349, Mean reward: 0.0125
Episode 900 completed. Mean loss: 0.0492, Mean reward: 0.0150
Episode 1000 completed. Mean loss: -0.0228, Mean reward: 0.0035
Episode 1100 completed. Mean loss: -0.2138, Mean reward: 0.0180
Episode 1200 completed. Mean loss: -0.2668, Mean reward: 0.0035
Episode 1300 completed. Mean loss: -0.1216, Mean reward: 0.0075
Episode 1400 completed. Mean loss: -0.3955, Mean reward: 0.0160
Episode 1500 completed. Mean loss: 0.0503, Mean reward: 0

In [2]:
game = OomiGame()

In [3]:
best_agent.evaluate(1000)

win rate - 0.872


0.872

In [4]:
str(game.hands[1][0])

'Q of Hearts'

In [5]:
[str(c) for c in game.hands[0]]

['Q of Spades',
 'A of Spades',
 'K of Diamonds',
 '10 of Hearts',
 'A of Diamonds',
 'J of Clubs',
 'Q of Diamonds',
 '8 of Diamonds']

In [6]:
card = best_agent.choose_action({'hand': game.hands[0], 'desk': [game.hands[1][0]], 'played': [],
                                                     'current_player': 0, 'tricks_won': {'team1': 0, 'team2': 0},
                                                     'trump_suit': "Diamonds"})

In [7]:
str(card)

'10 of Hearts'

In [8]:
# !rm -rf /kaggle/working/*

In [9]:
# !mkdir /kaggle/working/models/

In [10]:
# !cp /kaggle/input/rl-model-new/models/backup_model.pth /kaggle/working/models/
# !cp /kaggle/input/rl-model-new/models/best_model.pth /kaggle/working/models/