In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.callbacks import StopTrainingOnRewardThreshold
from stable_baselines3.common.utils import get_linear_fn

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load datasets
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
sample_submission = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=1000)
train_tfidf = vectorizer.fit_transform(train['response_a'] + " " + train['response_b'])
test_tfidf = vectorizer.transform(test['response_a'] + " " + test['response_b'])

# Additional Features
train['response_a_length'] = train['response_a'].apply(len)
train['response_b_length'] = train['response_b'].apply(len)
train['response_a_word_count'] = train['response_a'].apply(lambda x: len(x.split()))
train['response_b_word_count'] = train['response_b'].apply(lambda x: len(x.split()))

test['response_a_length'] = test['response_a'].apply(len)
test['response_b_length'] = test['response_b'].apply(len)
test['response_a_word_count'] = test['response_a'].apply(lambda x: len(x.split()))
test['response_b_word_count'] = test['response_b'].apply(lambda x: len(x.split()))

# Combine Features
X_train_combined = np.hstack((train_tfidf.toarray(), train[['response_a_length', 'response_b_length', 'response_a_word_count', 'response_b_word_count']].values))
X_test_combined = np.hstack((test_tfidf.toarray(), test[['response_a_length', 'response_b_length', 'response_a_word_count', 'response_b_word_count']].values))

y_a = train['winner_model_a'].values
y_b = train['winner_model_b'].values
y_tie = train['winner_tie'].values

# Train a simple logistic regression as reward model
reward_model = LogisticRegression()
reward_model.fit(X_train_combined, y_a)

# Define custom environment for PPO
import gymnasium as gym
from gymnasium import spaces

class ChatbotEnv(gym.Env):
    def __init__(self, data, reward_model):
        super(ChatbotEnv, self).__init__()
        self.data = data
        self.reward_model = reward_model
        self.current_step = 0
        self.action_space = spaces.Discrete(2)  # Two actions: A or B
        self.observation_space = spaces.Box(low=0, high=1, shape=(data.shape[1],), dtype=np.float32)

    def reset(self):
        self.current_step = 0
        return self.data[self.current_step], {}

    def step(self, action):
        obs = self.data[self.current_step]
        reward = self.reward_model.predict_proba([obs])[0][action]
        self.current_step += 1
        done = self.current_step >= len(self.data)
        next_obs = self.data[self.current_step] if not done else self.data[0]
        return next_obs, reward, done, {}, {}  # Return observation, reward, done, truncated, info

# Initialize environment with smaller subset size and wrap it with Monitor and DummyVecEnv
env = ChatbotEnv(X_train_combined[:1000], reward_model)  # Start with 1k samples
env = Monitor(env)
env = DummyVecEnv([lambda: env])

# Initialize PPO model
model = PPO("MlpPolicy", env, verbose=1, batch_size=32, learning_rate=1e-3, n_steps=512, clip_range=0.2)

# Define early stopping callback
callback_on_best = StopTrainingOnRewardThreshold(reward_threshold=20000, verbose=1)
eval_callback = EvalCallback(env, best_model_save_path='./logs/',
                             log_path='./logs/', eval_freq=1000,
                             deterministic=True, render=False,
                             callback_on_new_best=callback_on_best)

# Training PPO model with fewer timesteps initially
model.learn(total_timesteps=1000, callback=eval_callback)

# Gradually increase the data subset and timesteps without changing the environment
for subset_size in [2000, 4000, 8000, len(X_train_combined)]:
    env.data = X_train_combined[:subset_size]
    model.learn(total_timesteps=1000, callback=eval_callback)

# Generate predictions using the trained PPO model
obs, _ = env.reset()
done = False
responses = []
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, _, info = env.step(action)  # updated to handle 4 return values
    responses.append(action)

# Convert responses to numpy array
responses = np.array(responses).flatten()

# Prepare the submission file with PPO model predictions
submission = pd.DataFrame({
    'id': test['id'],
    'winner_model_a': responses[:len(test)],
    'winner_model_b': responses[:len(test)],
    'winner_tie': responses[:len(test)]
})

# Save the submission file
submission.to_csv('submission.csv', index=False)


Using cpu device


TypeError: ChatbotEnv.reset() got an unexpected keyword argument 'seed'