In [10]:
# Requirements
!pip install gymnasium[atari]
!pip install gymnasium[accept-rom-license]
!pip install ale-py
!pip install stable-baselines3
!pip install imageio[ffmpeg]



In [11]:
# Import libraries
import os

import ale_py
import gymnasium as gym
import numpy as np
from google.colab import drive
from gymnasium.wrappers import TransformReward, TimeLimit
from stable_baselines3 import DQN
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

In [12]:
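# Disabled: earlier version of CustomRewardWrapper that also rewarded placing a block in an empty column.
# It is kept commented out for reference only; the active wrapper is defined in the next cell.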
#class CustomRewardWrapper(gym.Wrapper):
    #def __init__(self, env):
        #super().__init__(env)
        # Counters for specific actions taken by the agent
        #self.fire_count = 0
        #self.left_right_count = 0

        # Thresholds for applying penalties and rewards
        #self.left_right_threshold = 100
        #self.fire_penalty_threshold = 500

        # Penalties and rewards definitions
       # self.fire_penalty = -0.01  # Penalty for excessive "FIRE" actions
        #self.termination_penalty = -0.1  # Penalty for game over
        #self.movement_reward = 0.01  # Reward for moving left or right
       # self.survival_bonus = 0.005  # Bonus for surviving longer

    #def reset(self, **kwargs):
        # Reset counters when a new episode starts
       # self.fire_count = 0
        #self.left_right_count = 0
        #return self.env.reset(**kwargs)

    #def step(self, action):
        # Execute the action and observe the outcome
        #obs_before, _, _, _, _ = self.env.step(0)  # Taking a NOOP action to get the state without changing it
        #obs, reward, terminated, truncated, info = self.env.step(action)

        # Evaluate custom rewards and penalties
        #reward += self.reward_for_empty_space(obs_before, obs)
        #reward += self.manage_action_rewards(action, obs, terminated)

        #return obs, reward, terminated, truncated, info

   # def reward_for_empty_space(self, obs_before, obs_after):
       # # Evaluate the placement of a block in a completely empty column
        #gray_before = np.mean(obs_before, axis=-1) > 0
        #gray_after = np.mean(obs_after, axis=-1) > 0
       # new_blocks = np.bitwise_and(np.bitwise_not(gray_before), gray_after)

        #reward = 0
      #  for col in range(new_blocks.shape[1]):
           # if np.all(np.bitwise_not(gray_before[:, col])) and np.any(new_blocks[:, col]):
               # reward += 0.1  # Reward for placing in an empty column
        #return reward

    #def manage_action_rewards(self, action, obs, terminated):
        # Handle rewards and penalties based on action types
        #reward = 0
        #if action == 1:
         #   self.fire_count += 1
           # if self.fire_count > self.fire_penalty_threshold:
               # reward += self.fire_penalty

        #if action == 2 or action == 3:
           # self.left_right_count += 1
            #if self.left_right_count <= self.left_right_threshold:
                #reward += self.movement_reward

        #if terminated:
            #reward += self.termination_penalty

       # reward += self.survival_bonus  # Incremental reward for each step
        #return reward

In [13]:
#Create custom reward and penalty functions
class CustomRewardWrapper(gym.Wrapper):
    """Reward-shaping wrapper that adds bonuses and penalties to each step.

    On top of the reward returned by the wrapped environment, every call to
    ``step`` adds:

    * ``survival_bonus`` unconditionally;
    * ``movement_reward`` for each of the first ``left_right_threshold``
      left/right actions of the current episode;
    * ``fire_penalty`` for every FIRE action once more than
      ``fire_penalty_threshold`` FIRE actions were taken in the episode;
    * ``termination_penalty`` on the step where ``terminated`` is true.

    All shaping constants are keyword-only arguments; their defaults are the
    values that used to be hard-coded, so ``CustomRewardWrapper(env)`` behaves
    exactly as before.

    Args:
        env: The environment to wrap.
        left_right_threshold: Number of left/right actions per episode that
            still earn ``movement_reward``.
        fire_penalty_threshold: Number of FIRE actions per episode that are
            tolerated before ``fire_penalty`` is applied.
        fire_penalty: Value added to the reward for each excessive FIRE action.
        termination_penalty: Value added to the reward when the episode
            terminates.
        movement_reward: Value added to the reward for a rewarded left/right
            action.
        survival_bonus: Value added to the reward on every step.
    """

    # Discrete action indices the shaping rules react to.
    # NOTE(review): assumes the wrapped env maps 1 -> FIRE and 2/3 -> the two
    # horizontal moves, as the original comments state -- confirm against the
    # environment's action meanings.
    FIRE_ACTION = 1
    MOVE_ACTIONS = (2, 3)

    def __init__(
        self,
        env,
        *,
        left_right_threshold=100,
        fire_penalty_threshold=500,
        fire_penalty=-0.01,
        termination_penalty=-0.1,
        movement_reward=0.01,
        survival_bonus=0.005,
    ):
        super().__init__(env)
        # Per-episode counters for specific actions taken by the agent
        self.fire_count = 0
        self.left_right_count = 0

        # Thresholds for applying penalties and rewards
        self.left_right_threshold = left_right_threshold
        self.fire_penalty_threshold = fire_penalty_threshold

        # Penalties and rewards definitions
        self.fire_penalty = fire_penalty  # Penalty for excessive "FIRE" actions
        self.termination_penalty = termination_penalty  # Penalty for game over
        self.movement_reward = movement_reward  # Reward for moving left or right

        # Additional bonuses
        self.survival_bonus = survival_bonus  # Bonus for surviving longer

    def reset(self, **kwargs):
        """Reset the per-episode counters and the wrapped environment.

        Args:
            **kwargs: Forwarded unchanged to the wrapped environment's ``reset``.

        Returns:
            Whatever the wrapped environment's ``reset`` returns.
        """
        self.fire_count = 0
        self.left_right_count = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped environment and apply the reward shaping.

        Args:
            action: The action forwarded to the wrapped environment.

        Returns:
            The ``(obs, reward, terminated, truncated, info)`` tuple of the
            wrapped environment with ``reward`` replaced by the shaped reward.
        """
        obs, reward, terminated, truncated, info = self.env.step(action)

        # Count FIRE actions; only those beyond the threshold are penalised
        if action == self.FIRE_ACTION:
            self.fire_count += 1
            if self.fire_count > self.fire_penalty_threshold:
                reward += self.fire_penalty

        # Count left/right actions; only those within the threshold are rewarded
        if action in self.MOVE_ACTIONS:
            self.left_right_count += 1
            if self.left_right_count <= self.left_right_threshold:
                reward += self.movement_reward

        # Bonus for each step the game continues (also added on the final step)
        reward += self.survival_bonus

        # Penalty only for a real game over, not for a truncated episode
        if terminated:
            reward += self.termination_penalty

        return obs, reward, terminated, truncated, info

In [14]:
# Register the Atari environments provided by ale_py with Gymnasium
gym.register_envs(ale_py)


def make_tetris_env():
    """Build one Tetris environment with the custom reward shaping applied."""
    tetris_env = gym.make("ALE/Tetris-v5")
    #tetris_env = TimeLimit(tetris_env, max_episode_steps=5000)  # Limit episodes to 5000 steps
    return CustomRewardWrapper(tetris_env)


# Training environment.
env = make_tetris_env()

# Separate evaluation environment. Evaluating on the training env itself would
# reset and step it in the middle of a training episode every time the
# callback fires. It uses the same reward shaping as training, so the reported
# rewards stay comparable with the training logs. Monitor records the episode
# returns/lengths used by the evaluation helpers.
eval_env = Monitor(make_tetris_env())

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to the model
# NOTE(review): EvalCallback writes `best_model.zip` into `save_path` and the
# final save below writes `tetris_cnn_exp_v1.zip`, so `model_path` only exists
# if a checkpoint was renamed/copied by hand -- confirm this is intended.
model_path = "/content/drive/My Drive/tetris_checkpoints/best_model_cnn_exp_v1.zip"
save_path = "/content/drive/My Drive/tetris_checkpoints/"

# Create the checkpoint directory that is actually written to (on Drive)
# before anything tries to save into it.
os.makedirs(save_path, exist_ok=True)
# The original cell created this local directory instead; it is still created
# in case later cells rely on it -- TODO confirm and remove if unused.
os.makedirs('/content/tetris_checkpoints/', exist_ok=True)

# Check if the model exists and load or create a new one
if os.path.exists(model_path):
    print("Loading existing model to continue training.")
    model = DQN.load(model_path, env=env)
else:
    print("No existing model found, training a new model.")
    model = DQN(
        policy="CnnPolicy",
        env=env,
        learning_rate=1e-4,
        buffer_size=200000,
        learning_starts=20000,
        batch_size=32,
        tau=1.0,
        gamma=0.99,
        train_freq=4,
        target_update_interval=10000,
        #exploration_fraction=0.1,
        #exploration_final_eps=0.02,
        exploration_initial_eps = 1.0,  # Start with full exploration (for testing)
        exploration_final_eps = 0.1,    # End with 10% random actions(for testing)
        exploration_fraction = 0.5,     # Extend the exploration period to 50% of total training (for testing)
        verbose=1
    )

# Callback that periodically evaluates the agent on the dedicated evaluation
# environment and saves the best model seen so far
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=save_path,
    log_path=save_path,
    eval_freq=10000,
    deterministic=True,
    render=False
)

# Train the model
model.learn(total_timesteps=200000, callback=[eval_callback])

# Save the final model
final_model_path = os.path.join(save_path, "tetris_cnn_exp_v1")
model.save(final_model_path)
print("Training complete and final model saved at", final_model_path)

# Evaluate the trained model on the evaluation environment
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

# Close both environments
env.close()
eval_env.close()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
No existing model found, training a new model.
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 582      |
|    ep_rew_mean      | 4.06     |
|    exploration_rate | 0.979    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 766      |
|    time_elapsed     | 3        |
|    total_timesteps  | 2330     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 582      |
|    ep_rew_mean      | 3.93     |
|    exploration_rate | 0.958    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 787      |
|    time_elapsed     | 5        |
|    total_timesteps  | 4653     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 554      |
|    ep_rew_mean      | 3.75     |
|    exploration_rate | 0.94     |
| time/               |          |
|    episodes       

In [15]:
# Save the final model
#model.save("/content/drive/My Drive/tetris_checkpoints/cnn_tetris_v2_final.zip")