In [7]:
# 🔁 Remove problematic packages
!pip uninstall -y numpy protobuf gymnasium tensorflow dopamine-rl opencv-python-headless cupy-cuda12x

# ✅ Install compatible, stable versions
!pip install torch==2.1.2 \
             transformers==4.36.2 \
             trl==0.7.9 \
             peft==0.7.1 \
             accelerate==0.21.0 \
             bitsandbytes==0.42.0 \
             gymnasium==1.2.0 \
             numpy==1.26.4 \
             protobuf==4.25.3 \
             wandb==0.16.4


[0mCollecting torch==2.1.2
  Using cached torch-2.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting transformers==4.36.2
  Using cached transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
Collecting trl==0.7.9
  Using cached trl-0.7.9-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes==0.42.0
  Using cached bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting gymnasium==1.2.0
  Using cached gymnasium-1.2.0-py3-none-any.whl.metadata (9.9 kB)
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting protobuf==4.25.3
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting wandb==0.16.4
  Using cached wandb-0.16.4-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.1.2)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-ru

In [1]:
from abc import ABC, abstractmethod
from typing import List, Dict

import gymnasium as gym
import torch
from trl import (
    PPOTrainer,
    PPOConfig,
    create_reference_model,
)
import os
from tqdm import trange
import wandb

from transformers import AutoTokenizer
from peft import LoraConfig
from trl import AutoModelForCausalLMWithValueHead

import re

ImportError: peft>=0.15.0 is required for a normal functioning of this module, but found peft==0.7.1.

In [1]:
import os
from tqdm import trange
import wandb

from transformers import AutoTokenizer
#from peft import LoraConfig

from trl import (
    PPOTrainer,
    PPOConfig,
    create_reference_model,
    AutoModelForCausalLMWithValueHead
)

from abc import ABC, abstractmethod
from typing import List, Dict

import torch
import re
import gymnasium as gym
from llamagym import Agent

AttributeError: `np.float_` was removed in the NumPy 2.0 release. Use `np.float64` instead.

In [None]:
#from https://github.com/KhoomeiK/LlamaGym/blob/main/llamagym/agent.py
class Agent(ABC):
    """Abstract LLM agent that acts in a Gymnasium environment and is tuned with PPO.

    Subclasses supply the system prompt, the observation-to-text formatting and the
    text-to-action parsing. The base class keeps the chat history and rewards of the
    current episode, turns finished episodes into (query, response, reward) samples
    and runs a PPO step once at least ``batch_size`` samples have been collected.
    """

    def __init__(
        self, model, tokenizer, device, generate_config_dict=None, ppo_config_dict=None
    ):
        """Store the model/tokenizer and build the reference model and PPO trainer.

        Args:
            model: Model used for generation and handed to ``PPOTrainer``.
            tokenizer: Tokenizer used for chat templating, encoding and decoding.
            device: Device the tokenized prompt is moved to before generation.
            generate_config_dict: Keyword arguments for ``model.generate``. Keys may
                carry a ``"something/"`` prefix, which is stripped before use.
                Defaults to sampling up to 32 new tokens.
            ppo_config_dict: Keyword arguments for ``PPOConfig``. Defaults to a
                batch size and mini-batch size of 16.
        """
        if generate_config_dict is None:
            generate_config_dict = {
                "max_new_tokens": 32,
                "do_sample": True,
                "top_p": 0.6,
                "top_k": 0,
                "temperature": 0.9,
            }
        if ppo_config_dict is None:
            ppo_config_dict = {"batch_size": 16, "mini_batch_size": 16}

        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.generate_config_dict = generate_config_dict
        self.model_ref = create_reference_model(model)
        self.ppo_config = PPOConfig(**ppo_config_dict)
        self.ppo_trainer = PPOTrainer(self.ppo_config, model, self.model_ref, tokenizer)

        # Samples collected from finished episodes that have not been trained on yet.
        self.current_batch = {"queries": [], "responses": [], "rewards": []}

        # Chat history of the running episode; it always starts with the system prompt.
        self.current_episode_messages = [
            {
                "role": "system",
                "content": self.get_system_prompt(),
            }
        ]
        self.current_episode_rewards = []

    @abstractmethod
    def get_system_prompt(self) -> str:
        """Return the system message that opens every episode's conversation."""
        pass

    @abstractmethod
    def format_observation(self, observation: gym.core.ObsType) -> str:
        """Render an environment observation as the text of a user message."""
        pass

    @abstractmethod
    def extract_action(self, response: str) -> gym.core.ActType:
        """Parse the model's text response into an environment action."""
        pass

    def llm(self, messages: List[Dict[str, str]]) -> str:
        """Generate the assistant's reply for the given chat history.

        Args:
            messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.

        Returns:
            The decoded text after the last ``"[/INST]"`` marker, stripped of
            surrounding whitespace.
        """
        prompt = self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        generate_ids = self.model.generate(
            inputs=inputs.input_ids,
            # Drop any "prefix/" from the keys, e.g. "generate/top_p" -> "top_p".
            **{
                key.split("/")[-1]: value
                for key, value in self.generate_config_dict.items()
            }
        )
        outputs = self.tokenizer.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        # The decoded text contains the prompt as well; keep only what follows the
        # final instruction marker (assumes a "[/INST]"-style chat template).
        response = outputs[0].split("[/INST]")[-1].strip()

        return response

    def act(self, observation):
        """Ask the model for an action given a new observation.

        The formatted observation is appended to the episode history as a user
        message, and the model's response as an assistant message.

        Returns:
            The action parsed by ``extract_action``, or ``None`` if parsing raised.
            In that case the response is not recorded, so the history ends with the
            unanswered user message.
        """
        message = self.format_observation(observation)
        self.current_episode_messages += [{"role": "user", "content": message}]

        response = self.llm(self.current_episode_messages)
        try:
            action = self.extract_action(response)
        except Exception:
            # Best effort: signal an unparseable response to the caller with None.
            return None

        self.current_episode_messages += [{"role": "assistant", "content": response}]
        return action

    def assign_reward(self, reward):
        """Record the reward the environment returned for the latest action."""
        self.current_episode_rewards.append(reward)

    def format_episode_for_ppo(self, messages, rewards):
        """Convert one episode into per-turn PPO training samples.

        Every assistant message yields one sample: the query is the templated
        conversation up to and including the last ``"[/INST] "`` marker and the
        response is the text after it.

        Args:
            messages: Episode chat history (system message, then user/assistant turns).
            rewards: Rewards recorded during the episode.

        Returns:
            ``(queries, responses, rewards)``: two lists of token-id tensors and a
            list of float16 scalar reward tensors. If all rewards except the last
            are zero, the last reward is split evenly over the turns; otherwise
            each recorded reward is converted as-is. All three lists are empty when
            the episode contains no completed turn.
        """
        queries, responses = [], []
        # Index 0 is the system message, so assistant messages sit at even indices >= 2.
        for i in range(2, len(messages), 2):
            prompt = self.tokenizer.apply_chat_template(
                messages[: i + 1], tokenize=False, add_generation_prompt=False
            )
            conversation_chunks = prompt.split("[/INST] ")
            query = "[/INST] ".join(conversation_chunks[:-1]) + "[/INST] "
            response = conversation_chunks[-1]

            query = self.tokenizer(query, return_tensors="pt").input_ids[0]
            response = self.tokenizer(response, return_tensors="pt").input_ids[0]

            queries.append(query)
            responses.append(response)

        if not queries:
            # Nothing to train on; also avoids indexing into an empty reward list.
            return queries, responses, []

        if all(reward == 0 for reward in rewards[:-1]):
            # if sparse rewards, give equal reward to all conversation turns.
            # Divide by the number of turns (one query per turn); len(messages) / 2
            # over-counted by half a turn because of the system message.
            per_turn_reward = rewards[-1] / len(queries)
            rewards = [torch.tensor(per_turn_reward, dtype=torch.float16)] * len(
                queries
            )
        else:
            rewards = [torch.tensor(reward, dtype=torch.float16) for reward in rewards]

        return queries, responses, rewards

    def terminate_episode(self, train=True):
        """Close the current episode and, if enough samples exist, run a PPO step.

        Args:
            train: When true, the episode is added to the pending batch and a
                training step is attempted. When false, the episode is discarded.

        Returns:
            The statistics returned by ``train_batch`` if a step was taken,
            otherwise an empty dict.
        """
        if train:
            # Must happen before the history is reset below.
            queries, responses, rewards = self.format_episode_for_ppo(
                self.current_episode_messages, self.current_episode_rewards
            )

        self.current_episode_messages = [
            {
                "role": "system",
                "content": self.get_system_prompt(),
            }
        ]
        self.current_episode_rewards = []

        if train:
            self.current_batch["queries"].extend(queries)
            self.current_batch["responses"].extend(responses)
            self.current_batch["rewards"].extend(rewards)

            if len(self.current_batch["queries"]) >= self.ppo_config.batch_size:
                train_stats = self.train_batch(
                    self.current_batch["queries"],
                    self.current_batch["responses"],
                    self.current_batch["rewards"],
                )
                return train_stats

        return {}

    def train_batch(self, batch_queries, batch_responses, batch_rewards):
        """Run one PPO step on at most ``batch_size`` samples.

        Samples beyond ``batch_size`` are kept in ``self.current_batch`` for the
        next step; otherwise the pending batch is emptied.

        Returns:
            Whatever ``PPOTrainer.step`` returns for this batch.
        """
        if len(batch_queries) > self.ppo_config.batch_size:
            queries = batch_queries[: self.ppo_config.batch_size]
            responses = batch_responses[: self.ppo_config.batch_size]
            rewards = batch_rewards[: self.ppo_config.batch_size]

            # keep the remainder for the next batch
            self.current_batch["queries"] = batch_queries[self.ppo_config.batch_size :]
            self.current_batch["responses"] = batch_responses[
                self.ppo_config.batch_size :
            ]
            self.current_batch["rewards"] = batch_rewards[self.ppo_config.batch_size :]
        else:
            queries, responses, rewards = batch_queries, batch_responses, batch_rewards
            self.current_batch = {"queries": [], "responses": [], "rewards": []}

        train_stats = self.ppo_trainer.step(queries, responses, rewards)
        torch.cuda.empty_cache()

        return train_stats

In [None]:
# Read API tokens from Colab's user data store instead of hardcoding them in the notebook.
from google.colab import userdata
# Hugging Face token; main() passes it as `token=` when loading the model and tokenizer.
hf_token = userdata.get('hf_read_token')
# NOTE(review): wandb_token is fetched here but not referenced by any other visible cell.
wandb_token = userdata.get('wandb_token')

In [None]:
# Log in non-interactively with the token read from Colab user data, so the cell
# does not block on a prompt during "Run All".
wandb.login(key=wandb_token)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
class BlackjackAgent(Agent):
    """Agent that plays blackjack, answering with action 0 (stay) or 1 (take a card)."""

    def get_system_prompt(self) -> str:
        """Return the instructions that describe the game and the answer format."""
        return """You are an expert blackjack player. Every turn, you'll see your current sum, the dealer's showing card value, and whether you have a usable ace. Win by exceeding the dealer's hand but not exceeding 21.
Decide whether to stay with your current sum by writing "Action: 0" or accept another card by writing "Action: 1". Accept a card unless very close to 21."""

    def format_observation(self, observation: gym.core.ObsType) -> str:
        """Describe the observation triple (player sum, dealer card, ace flag) as text."""
        return f"You: {observation[0]}. Dealer: {observation[1]}. You have {'an' if bool(observation[2]) else 'no'} ace."

    def extract_action(self, response: str) -> gym.core.ActType:
        """Parse the model's response into action 0 or 1.

        An explicit ``"Action: 0"`` / ``"Action: 1"`` wins. Otherwise, if the
        response has no digits or its last digit is not 0/1, the keywords
        "stick" (-> 0) and "hit" (-> 1) are checked. Everything else maps to 0.
        """
        # Only 0 and 1 are accepted here: the previous pattern `(\d)` returned any
        # digit, e.g. 2 for "Action: 2", which is not one of the two actions.
        match = re.search(r"Action: ([01])", response)
        if match:
            return int(match.group(1))

        digits = [char for char in response if char.isdigit()]
        if len(digits) == 0 or digits[-1] not in ("0", "1"):
            lowered = response.lower()
            if "stick" in lowered:
                return 0
            elif "hit" in lowered:
                return 1

        # NOTE(review): a trailing 0/1 digit skips the keyword check but is not
        # itself used as the action; the default is always 0.
        return 0

    def train_batch(self, batch_queries, batch_responses, batch_rewards):
        """Run one PPO step on at most ``batch_size`` samples and drop the rest.

        Overrides the base method: the pending batch is always cleared, because
        the parameters are about to be updated and any leftover trajectories
        would have been generated by the old parameters.

        Returns:
            Whatever ``PPOTrainer.step`` returns for this batch.
        """
        if len(batch_queries) > self.ppo_config.batch_size:
            queries = batch_queries[: self.ppo_config.batch_size]
            responses = batch_responses[: self.ppo_config.batch_size]
            rewards = batch_rewards[: self.ppo_config.batch_size]
        else:
            queries, responses, rewards = batch_queries, batch_responses, batch_rewards
        self.current_batch = {"queries": [], "responses": [], "rewards": []}

        train_stats = self.ppo_trainer.step(queries, responses, rewards)
        torch.cuda.empty_cache()

        return train_stats


def _play_episode(agent, env, seed=None):
    """Play one full episode, passing every step reward to the agent.

    Args:
        agent: Object exposing ``act(observation)`` and ``assign_reward(reward)``.
        env: Gymnasium environment to play in.
        seed: Optional seed forwarded to ``env.reset``; ``None`` behaves like a
            plain ``env.reset()``.
    """
    observation, info = env.reset(seed=seed)
    done = False
    while not done:
        action = agent.act(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        agent.assign_reward(reward)
        done = terminated or truncated


def main():
    """Fine-tune the blackjack agent with PPO, then evaluate it.

    Training runs until ``hyperparams["num_steps"]`` PPO steps have been taken;
    afterwards ``hyperparams["episodes"]`` episodes are played without training.
    Per-episode statistics are logged to Weights & Biases. Reads the module-level
    ``hf_token`` for model and tokenizer access.

    Returns:
        List with the total return of each post-training episode.
    """
    # Imported here because the notebook's top-level `from peft import LoraConfig`
    # is commented out, which made `LoraConfig` undefined in this function.
    from peft import LoraConfig

    hyperparams = {
        "model_name": "meta-llama/Llama-2-7b-chat-hf",
        "env": "Blackjack-v1",
        "lora/r": 16,
        "lora/lora_alpha": 32,
        "lora/lora_dropout": 0.05,
        "lora/bias": "none",
        "lora/task_type": "CAUSAL_LM",
        "load_in_8bit": True,
        "batch_size": 8,
        "seed": 42069,
        "num_steps": 5000,
        "episodes": 100,
        "generate/max_new_tokens": 32,
        "generate/do_sample": True,
        "generate/top_p": 0.6,
        "generate/top_k": 0,
        "generate/temperature": 0.9,
    }
    wandb.init(entity="irfanjamil9-org", project="LLM_RLFT_Blackjack", config=hyperparams)
    env = None
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        HF_TOKEN = hf_token

        # "lora/r" -> r, "lora/lora_alpha" -> lora_alpha, ...
        lora_config = LoraConfig(
            **{
                key.split("/")[-1]: value
                for key, value in hyperparams.items()
                if key.startswith("lora/")
            }
        )
        model = AutoModelForCausalLMWithValueHead.from_pretrained(
            pretrained_model_name_or_path=hyperparams["model_name"],
            peft_config=lora_config,
            load_in_8bit=hyperparams["load_in_8bit"],
            token=HF_TOKEN,
        ).to(device)
        tokenizer = AutoTokenizer.from_pretrained(hyperparams["model_name"], token=HF_TOKEN)
        # A pad token is added, so the embedding matrix is resized to match.
        tokenizer.add_special_tokens({"pad_token": "<pad>"})
        model.pretrained_model.resize_token_embeddings(len(tokenizer))

        agent = BlackjackAgent(
            model,
            tokenizer,
            device,
            {
                key: value
                for key, value in hyperparams.items()
                if key.startswith("generate/")
            },
            {
                "batch_size": hyperparams["batch_size"],
                "mini_batch_size": hyperparams["batch_size"],
            },
        )
        env = gym.make(hyperparams["env"], natural=False, sab=False)

        # --- Training: play episodes until enough PPO steps have been taken ---
        num_steps = 0
        episode = 1
        # Seed the environment once, on the first reset, so the configured "seed"
        # is actually applied; it was previously logged but never used.
        env_seed = hyperparams["seed"]
        while num_steps < hyperparams["num_steps"]:
            _play_episode(agent, env, seed=env_seed)
            env_seed = None

            # Collect stats before terminate_episode() resets the episode state.
            episode_stats = {
                "episode": episode,
                "total_return": sum(agent.current_episode_rewards),
                "message_ct": len(agent.current_episode_messages),
                "episode_messages": agent.current_episode_messages,
            }
            episode += 1
            # Non-empty only when a PPO step was taken (output of ppo_trainer.step()).
            train_stats = agent.terminate_episode()
            if len(train_stats) > 0:
                num_steps += 1
            episode_stats.update(train_stats)
            wandb.log(episode_stats)

        # --- Evaluation: play a fixed number of episodes without training ---
        posttraining_rewards = []
        for episode_ in trange(hyperparams["episodes"]):
            _play_episode(agent, env)

            episode_return = sum(agent.current_episode_rewards)
            episode_stats = {
                "episode": str(episode_) + "_posttraining",
                "total_return": episode_return,
                "message_ct": len(agent.current_episode_messages),
                "episode_messages": agent.current_episode_messages,
            }
            posttraining_rewards.append(episode_return)

            agent.terminate_episode(train=False)
            wandb.log(episode_stats)
        return posttraining_rewards
    finally:
        # Release the environment and close the W&B run even if something failed.
        if env is not None:
            env.close()
        wandb.finish()

In [None]:
posttraining_rewards = main()

After fine-tuning, what portion of games does our LLM agent win, draw, and lose?

In [None]:
# Share of post-training games whose total return was 1 (win), 0 (draw) or -1 (loss).
n_games = len(posttraining_rewards)
win_portion, draw_portion, loss_portion = (
    posttraining_rewards.count(outcome) / n_games for outcome in (1, 0, -1)
)

print(f'Win Percentage: {100 * win_portion:.2f}%')
print(f'Draw Percentage: {100 * draw_portion:.2f}%')
print(f'Loss Percentage: {100 * loss_portion:.2f}%')
