# This code implements the Stage 2 Fine-tuning of SC-Phi2 model

### Import required libraries

In [None]:
from __future__ import print_function

import os
import json

import visdom
import numpy as np
import re

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

from Baselines.GlobalStateEvaluation.test import show_test_result
from data_loader.BatchEnv import BatchSpatialEnv
from transformers import DataCollatorForLanguageModeling

import evaluate


### Setup configuration

In [7]:
# Experiment configuration shared by the data-loading, training and evaluation cells.
args = {
    # Directory holding the race-specific train, val and test replay lists.
    'replay_path': './train_val_test/Protoss_vs_Terran/',
    # Name of the experiment (also used for the visdom environment name).
    'exp_name': 'SC:PvP-PvT-922',
    # Player race.
    'race': 'Protoss',
    # Enemy race.
    'enemy_race': 'Terran',
    # Phase; selects which `<phase>.json` replay list is loaded from `replay_path`.
    'phase': 'test',
    # GPU ID.
    'gpu_id': 0,
    # Initial learning rate.
    'lr': 5e-5,
    # No. of frames in each replay.
    'n_steps': 4,
    # No. of replays in each batch.
    'n_replays': 4,
    # No. of training epochs.
    'n_epochs': 1,
    # Interval (in env steps) for saving model params.
    'save_interval': 1e4,
    # Seed to make the model more deterministic.
    'seed': 1234,
    # File to convert action ids to actions.
    'path_to_actions_dict': './actions_protoss.json',
    # Model save path (used as a filename prefix when checkpoints are written).
    'save_path': './scgpt-s2/',
}

# Checkpoint directory derived from the experiment name.
# NOTE(review): the visible code saves under args['save_path'], not this variable — confirm it is still needed.
save_path = os.path.join('./checkpoints/', args['exp_name'])

#### Setup seeds

In [None]:
# Seed the PyTorch (CPU and CUDA) and NumPy random number generators with the configured seed.
torch.manual_seed(args['seed'])
torch.cuda.manual_seed(args['seed'])
np.random.seed(args['seed'])
# Turn on autograd anomaly detection (a debugging aid).
# NOTE(review): anomaly detection adds overhead to every backward pass — confirm it should stay on for full runs.
torch.autograd.set_detect_anomaly(True)

### Define SC-GPT 

#### Import SC-GPT specific modules

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from peft import get_peft_model, LoraConfig
import bitsandbytes as bnb
from accelerate import Accelerator

Function to visualize map features

In [None]:
def plot_spatial(feats):
    """Show the first plane of every entry of `feats` in a 5 x 13 grid of subplots.

    Args:
        feats: tensor that is moved to the CPU and converted to NumPy; entry
            ``i`` is drawn as ``feats[i][0]``. The grid has 65 panels, so more
            entries than that raise ``IndexError``.
    """
    planes = feats.cpu().numpy()
    import matplotlib.pyplot as plt

    _, axes_grid = plt.subplots(nrows=5, ncols=13, figsize=(10, 10))
    print(axes_grid.shape)
    panels = axes_grid.flatten()
    for idx, plane in enumerate(planes):
        panels[idx].imshow(plane[0])
    plt.show()


Load the data for training

In [None]:
# Create the batched replay environment and point it at `<replay_path>/<phase>.json`.
# NOTE(review): args['phase'] is currently 'test', so this loads test.json even though
# this env is the one passed to train() — confirm the intended split.
env = BatchSpatialEnv()
env.init(os.path.join(args['replay_path'], '{}.json'.format(args['phase'])),
         './', args['race'], args['enemy_race'], 
         n_steps=args['n_steps'], seed = args['seed'],
         n_replays=args['n_replays'], epochs=args['n_epochs'])

Set some parameters for training

In [15]:
# Module-level settings read by the prompt-building and training code below.
batch_size = 1
n_frames = 4
#n_global_feat = env.n_features_gbl
n_scores = 24
# Token budget passed to create_tokenizer() as `max_length`.
token_length = 288
# Race names interpolated into the prompts by prompt2() / eval_prompt().
race = env.race
# NOTE(review): `opponenet` is a misspelling of "opponent"; prompt2() and eval_prompt()
# read it under this exact name, so renaming requires updating them together.
opponenet = env.enemy_race
# Read by train() to decide how often optimizer.step() is called.
grad_accumulation_steps = 8

Name of the Phi-2 and BLIP-2 ViT from Huggingface

In [16]:
# Hugging Face hub identifiers: the Phi-2 language model and the BLIP-2 checkpoint
# used by create_visual_encoder() / create_blip_processor().
base_llm_id = "microsoft/phi-2"
base_blip = "Salesforce/blip2-opt-2.7b"

Load weights from Stage 1 fine-tuned Phi-2 model

In [17]:
# Directory from which load_saved_tokenizer() and load_saved_llm() read the saved tokenizer / adapter.
# NOTE(review): the heading says "Stage 1" but the path is named "scgpt-stage2" — confirm which checkpoint is intended.
peft_model = '/data/MSC-master/scgpt-stage2/PvP/'

Create LoRA config for loading model in LoRA and Quantized format

In [18]:
def create_lora_config(lora_alpha=16, lora_r=8):
    """Build the LoRA configuration used to wrap the causal language model.

    Args:
        lora_alpha: LoRA scaling value forwarded as ``lora_alpha``.
        lora_r: LoRA rank forwarded as ``r``.

    Returns:
        A ``LoraConfig`` with dropout 0.05, no bias training and task type
        ``CAUSAL_LM``.
    """
    # Target layers where we want to apply LoRA.
    lora_targets = ['Wqkv', 'fc1', 'fc2', 'out_proj']
    return LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=0.05,
        target_modules=lora_targets,
        bias="none",
        task_type="CAUSAL_LM",
    )

Create and setup tokenizer

In [19]:
def create_tokenizer(token_len):
    """Load the base model's tokenizer and reuse its EOS token for padding.

    Args:
        token_len: value forwarded to ``from_pretrained`` as ``max_length``.

    Returns:
        The slow (``use_fast=False``), left-padding tokenizer for ``base_llm_id``
        with ``pad_token`` set to its ``eos_token``.
    """
    tokenizer_options = {
        'padding_side': "left",
        'add_eos_token': True,
        'add_bos_token': False,
        'use_fast': False,
        'max_length': token_len,
    }
    tok = AutoTokenizer.from_pretrained(base_llm_id, **tokenizer_options)
    # The tokenizer is given a pad token by reusing the end-of-sequence token.
    tok.pad_token = tok.eos_token
    return tok

Load Phi-2 model from either Huggingface or fine-tuned version

In [20]:
def create_llm(tokenizer):
    """Load the base Phi-2 model in 8-bit and wrap it with fresh LoRA adapters.

    Args:
        tokenizer: tokenizer whose ``pad_token_id`` and length are used to set
            the model's padding id and to resize its token embeddings.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    # NOTE(review): BitsAndBytesConfig documents `bnb_4bit_*` options; confirm that the
    # `bnb_8bit_*` keyword arguments below have any effect with the installed version.
    quant_config = BitsAndBytesConfig(load_in_8bit=True,
                                      bnb_8bit_compute_dtype=torch.bfloat16,
                                      bnb_8bit_use_double_quant=False)
    model = AutoModelForCausalLM.from_pretrained(base_llm_id,
                                                 trust_remote_code=True,
                                                 quantization_config=quant_config,
                                                 low_cpu_mem_usage=True,
                                                 flash_attn=True,
                                                 flash_rotary=True,
                                                 fused_dense=True,
                                                 device_map='auto',
                                                 revision='refs/pr/23')
    model.config.pad_token_id = tokenizer.pad_token_id
    model.resize_token_embeddings(len(tokenizer))

    lora_alpha = 192
    lora_r = 96
    lora_config = create_lora_config(lora_alpha, lora_r)
    model = get_peft_model(model, lora_config)
    print(type(model))
    # print_trainable_parameters() prints its own summary and returns None, so it
    # must not be wrapped in print() (that emitted a stray "None" line).
    model.print_trainable_parameters()
    return model

Load a saved tokenizer

In [21]:
def load_saved_tokenizer():
    """Return the tokenizer stored in the `peft_model` directory."""
    return AutoTokenizer.from_pretrained(peft_model)

Load the model from the disk using PEFT

In [22]:
from peft import PeftModel
def load_saved_llm(tokenizer):
    """Load the 8-bit base model and attach the saved, trainable PEFT adapter.

    Args:
        tokenizer: tokenizer whose ``pad_token_id`` and length are used to set
            the base model's padding id and to resize its token embeddings.

    Returns:
        The ``PeftModel`` loaded from the `peft_model` directory with
        ``is_trainable=True``.
    """
    quant_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16,
        bnb_8bit_use_double_quant=False,
    )
    base_kwargs = {
        'trust_remote_code': True,
        'quantization_config': quant_config,
        'low_cpu_mem_usage': True,
        'flash_attn': True,
        'flash_rotary': True,
        'fused_dense': True,
        'device_map': 'auto',
        'revision': 'refs/pr/23',
    }
    base_model = AutoModelForCausalLM.from_pretrained(base_llm_id, **base_kwargs)

    base_model.config.pad_token_id = tokenizer.pad_token_id
    base_model.resize_token_embeddings(len(tokenizer))
    print(type(base_model))

    # The adapter weights are read from disk and kept trainable for further fine-tuning.
    adapted_model = PeftModel.from_pretrained(base_model, peft_model, is_trainable=True)
    print(type(adapted_model))
    return adapted_model

Load the BLIP-2 model (used as the visual encoder) and its processor

In [23]:
def create_visual_encoder():
    """Load the BLIP-2 model named by `base_blip` in 8-bit and switch it to eval mode."""
    blip = Blip2ForConditionalGeneration.from_pretrained(
        base_blip,
        trust_remote_code=True,
        load_in_8bit=True,
        low_cpu_mem_usage=True,
    )
    # eval() returns the module itself, so the same object is handed back.
    return blip.eval()

In [24]:
def create_blip_processor():
    """Return the BLIP-2 processor for `base_blip`, created with ``do_rescale=False``."""
    return Blip2Processor.from_pretrained(pretrained_model_name_or_path=base_blip, do_rescale=False)

Tokenize the given prompt

In [25]:
def tokenize_prompt(text, tokenizer = None, token_length=100):
    """Tokenize `text` to a fixed length and return only the token ids.

    Args:
        text: prompt to tokenize.
        tokenizer: callable tokenizer; must be supplied (the ``None`` default
            is not callable).
        token_length: length the prompt is truncated / padded to.

    Returns:
        The ``'input_ids'`` entry of the tokenizer output (PyTorch tensors are
        requested via ``return_tensors="pt"``).
    """
    encode_options = {
        'truncation': True,
        'max_length': token_length,
        'padding': "max_length",
        'return_tensors': "pt",
    }
    encoding = tokenizer(text=text, **encode_options)
    return encoding['input_ids']

In [26]:
def eval_tokenize_prompt(text, tokenizer = None, token_length=100):
    """Tokenize `text` without padding or truncation and move the result to CUDA.

    Args:
        text: prompt to tokenize.
        tokenizer: callable tokenizer; must be supplied.
        token_length: accepted for signature parity with tokenize_prompt(); not used.

    Returns:
        The tokenizer output after ``.to('cuda')``.
    """
    encoding = tokenizer(text, return_tensors='pt')
    return encoding.to('cuda')

We need a data collator to assemble the tokenized prompts into a batch

In [27]:
def create_collator(tokenizer):
    """Return a language-modeling data collator configured for causal LM.

    Args:
        tokenizer: tokenizer handed to the collator.

    Returns:
        A ``DataCollatorForLanguageModeling`` with ``mlm=False``; the
        ``mlm_probability`` value is still passed through as in the original
        configuration.
    """
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # causal language modeling, no token masking
        mlm_probability=0.15,
    )

Use global features, ground-truth actions and reward to create a dynamic prompt

In [28]:
def prompt2(score, vis_text, gt_actions, reward):
    """Build the training prompt: instruction, discretised game state and target output.

    Selected entries of `score` are replaced by coarse labels before being
    interpolated into the prompt text:

    * indices 0-10: ``low`` (0..0.2), ``medium`` (..0.7), ``high`` (above);
    * indices 19 and 23: ``low`` (0..2000), ``medium`` (..8000), ``high``;
    * indices 18 and 22: ``low`` (0..10000), ``medium`` (..30000), ``high``;
    * index 15: ``early`` (0..0.25), ``mid`` (..0.6), ``late`` (..0.9), ``end``.

    Values that match no range (negative or NaN) and all other indices are
    left as plain numbers.

    Args:
        score: 1-D tensor of score features (at least 24 entries are indexed).
        vis_text: text inserted after "Army Units/Buildings:".
        gt_actions: indexable with at least four entries, used as the target actions.
        reward: text inserted after "Result:".

    Returns:
        The prompt string. It also reads the module-level names `race` and
        `opponenet`.
    """
    def to_label(value, upper_bounds, labels):
        # Negative (or NaN) readings match none of the ranges and stay numeric.
        if not value >= 0:
            return value
        for bound, label in zip(upper_bounds, labels):
            if value <= bound:
                return label
        # Above the last bound: the final label applies.
        return labels[-1]

    levels = ('low', 'medium', 'high')
    # index -> (inclusive upper bounds, labels)
    binning = {idx: ((0.2, 0.7), levels) for idx in range(11)}
    binning.update({
        15: ((0.25, 0.6, 0.9), ('early', 'mid', 'late', 'end')),
        18: ((10000, 30000), levels),
        19: ((2000, 8000), levels),
        22: ((10000, 30000), levels),
        23: ((2000, 8000), levels),
    })

    score = score.cpu().detach().numpy().tolist()
    for idx, (upper_bounds, labels) in binning.items():
        if idx < len(score):
            score[idx] = to_label(score[idx], upper_bounds, labels)

    prompt = f'''Instruct: As an expert StarCraft II {race} player, playing against the {opponenet}, predict the next 4 actions and also the result of the game, given the following resources:
                Game state: {score[15]}, Army Count: {score[8]}, Army Units/Buildings: {vis_text}
                Minerals collected: {score[18]}, Minerals used: {score[22]}, Vespene gas collected: {score[19]}, Vespene gas used: {score[23]}
                Food used: {score[3]}, Food cap: {score[4]}, Food for Army: {score[5]}, Food for Workers: {score[6]}
                Idle Workers: {score[7]}, Warp gates count: {score[9]}, Larva count: {score[10]}.
                Output:                
                Action 1: {gt_actions[0]}
                Action 2: {gt_actions[1]}
                Action 3: {gt_actions[2]}
                Action 4: {gt_actions[3]}
                Result: {reward}'''
    return prompt

Once the prompt is ready, we can tokenize it and send it to SLM

In [29]:
def tokenize2(scores, text_feats, gt_actions, rewards, tokenizer, token_length=100):
    """Turn one batch of replay data into tokenized training prompts.

    The inputs are unpacked as time-major (``T, B, F = scores.shape``). The
    previous implementation used ``reshape(B, T, ...)`` to make them
    batch-major, but ``reshape`` only reinterprets memory and therefore mixed
    time steps and replays whenever both T and B were greater than 1. The axes
    are now swapped explicitly so that sample ``b`` only contains data of
    replay ``b``.

    Args:
        scores: tensor of shape (T, B, F).
        text_feats: per-sample sequence whose first element is the visual
            description inserted into the prompt.
        gt_actions: tensor with T*B action ids in time-major order.
        rewards: tensor with T*B rewards in time-major order; a value of 1 at
            the first step of a sample is rendered as ``'win'``, anything else
            as ``'loss'``.
        tokenizer: tokenizer forwarded to tokenize_prompt().
        token_length: fixed prompt length in tokens.

    Returns:
        dict with ``'input_ids'`` and ``'labels'`` lists; both hold the same
        token tensor for each sample.
    """
    n_steps, n_batch, _ = scores.shape
    # (T, B, ...) -> (B, T, ...): transpose swaps the axes, contiguous() gives
    # downstream code an ordinary dense layout.
    scores = scores.transpose(0, 1).contiguous()
    gt_actions = gt_actions.reshape(n_steps, n_batch).transpose(0, 1).contiguous()
    rewards = rewards.reshape(n_steps, n_batch, 1).transpose(0, 1).contiguous()

    data = {'input_ids': [], 'labels': []}
    for score, text_feat, gt_action, reward in zip(scores, text_feats, gt_actions, rewards):
        score = score.flatten()
        rew = 'win' if reward[0] == 1 else 'loss'
        # NOTE(review): id_to_actions is not defined in the visible part of this file;
        # presumably it maps action ids to names via `actions_dict` — confirm.
        actions = id_to_actions(gt_action)
        text_prompt = prompt2(score, text_feat[0], actions, rew)

        tokens = tokenize_prompt(text_prompt, tokenizer=tokenizer, token_length=token_length)[0]

        data['input_ids'].append(tokens)
        data['labels'].append(tokens)
    return data


Generate textual descriptions from map features

In [31]:
def spatial_to_text_features(spatial_features, visual_encoder, vit_processor):
    """Describe each replay's map features as an "<n> buildings" string.

    Feature planes 4..7 are averaged over time steps and planes for every
    replay, replicated to three channels, and passed to the visual
    question-answering model with a counting question. The number in the answer
    is kept; if no number is found the result is ``'0 buildings'``.

    The input is treated as time-major (T, B, F, H, W). The previous
    implementation used ``reshape(B, T*F, ...)``, which does not swap the T and
    B axes and therefore averaged across replays when both were greater than 1;
    the mean is now taken over the time and plane axes directly.

    Args:
        spatial_features: tensor of shape (T, B, F, H, W) with at least 8 planes.
        visual_encoder: model exposing ``generate(**inputs, max_new_tokens=...)``.
        vit_processor: processor that is callable on (image, text=...) and
            exposes ``batch_decode``.

    Returns:
        ``numpy`` string array of shape (B, 1).
    """
    # Select the planes first, then add a channel axis holding three copies of each plane.
    feats = spatial_features[:, :, 4:8].unsqueeze(3)
    feats = torch.cat([feats, feats, feats], dim=3)

    n_batch = feats.shape[1]
    # Average over time (dim 0) and selected planes (dim 2): (T, B, F, C, H, W) -> (B, C, H, W).
    feats = feats.mean(dim=(0, 2))

    quest = "Question: How many circles are there in the image? Answer:"
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    count_pattern = re.compile(r'\d+ buildings')

    text_feat = []
    with torch.no_grad():
        for image in feats:
            inputs = vit_processor(image, text=quest, return_tensors="pt")
            out = visual_encoder.generate(**inputs, max_new_tokens=40)
            answer = vit_processor.batch_decode(out, skip_special_tokens=True)[0]
            answer = answer.strip().replace('circles', 'buildings')
            found = count_pattern.search(answer)
            text_feat.append('0 buildings' if found is None else found.group(0))

    return np.array(text_feat).reshape(n_batch, 1)

Load race-specific actions

In [32]:
def load_actions(path=None):
    """Load the action-id to action mapping from a JSON file.

    Args:
        path: JSON file to read. Defaults to ``args['path_to_actions_dict']``
            so existing zero-argument calls keep working.

    Returns:
        The decoded JSON content.
    """
    if path is None:
        path = args['path_to_actions_dict']
    # The context manager closes the file; the previous version left the handle open.
    with open(path, mode='r', encoding='utf-8') as act_json:
        return json.load(act_json)
# Module-level action lookup table read from args['path_to_actions_dict'].
actions_dict = load_actions()

In [None]:
# Option A: build the tokenizer from the base Phi-2 checkpoint.
# NOTE(review): the next cell rebinds `tokenizer`; presumably only one of the two cells is meant to run.
tokenizer = create_tokenizer(token_len=token_length)

In [None]:
# Option B: load the tokenizer saved in the `peft_model` directory (replaces any tokenizer created earlier).
tokenizer = load_saved_tokenizer()

In [35]:
# Collator used by train() to turn the per-sample token dicts into one batch.
data_collator = create_collator(tokenizer=tokenizer)

In [None]:
# NOTE(review): `llm` is first assigned in a later cell; run top-to-bottom this line raises NameError.
llm.print_trainable_parameters()

In [None]:
# Option A: base model plus the saved PEFT adapter from `peft_model`, kept trainable.
llm = load_saved_llm(tokenizer)

In [None]:
# Report how many parameters of the current `llm` are trainable.
llm.print_trainable_parameters()

In [None]:
# Option B: base model with newly initialised LoRA adapters.
# NOTE(review): run in order, this rebinds `llm` and discards the adapter loaded by
# load_saved_llm() above — presumably only one of the two cells is meant to run.
llm = create_llm(tokenizer)

In [None]:
# Print the module structure of the current `llm`.
print(llm)

In [None]:
# BLIP-2 model and processor used by spatial_to_text_features() to describe map features.
visual_encoder = create_visual_encoder()
vit_processor = create_blip_processor()

#### Initialized Env and SC-GPT

#### SC-GPT: Trainable Parameters

#### Setup optimizer

In [38]:
# 8-bit AdamW from bitsandbytes over the parameters of `llm`, starting at args['lr'].
# train() reads this module-level `optimizer` directly.
optimizer = bnb.optim.AdamW8bit(llm.parameters(), lr=args['lr'])

In [None]:
# Load optimizer from a saved state
# NOTE(review): the checkpoint path is hard-coded and machine-specific; this cell fails
# if the file is absent — presumably it is only run when resuming training.
optimizer.load_state_dict(torch.load('/data/MSC-master/scgpt-s2/optimizer_911573.pth'))

In [None]:
# Connect to the visdom server on port 8097; the visdom environment is named
# "<exp_name>[<phase>]". train() plots the loss through this module-level client.
vis = visdom.Visdom(env=args['exp_name']+'[{}]'.format(args['phase']), port=8097)

#### Training Loop

In [32]:
def train(model=llm, visual_encoder=visual_encoder, vit_processor=vit_processor, env=env, args=args, tokenizer=tokenizer):
    """Fine-tune `model` on prompts built from replay batches until `env` is exhausted.

    Each iteration converts the current batch into text prompts
    (spatial_to_text_features + tokenize2), runs the model, back-propagates the
    loss and plots it to visdom. Checkpoints of the model and optimizer state
    are written every ``args['save_interval']`` env steps and once more when
    the environment runs out of data.

    Changes relative to the previous version:

    * Gradient accumulation now works: gradients are cleared only after an
      optimizer step (previously ``zero_grad()`` ran every iteration, throwing
      away everything but the last micro-batch), and the loss is divided by
      ``grad_accumulation_steps`` so the applied gradient is the window mean.
    * The `model` argument is honoured; the global ``llm`` is no longer used.
    * An environment that yields no data returns early instead of raising
      ``NameError``.
    * Input tensors are no longer created with ``requires_grad=True``; they are
      only converted to text, so no gradient ever flows into them.

    Args:
        model: language model to train (must return an object with ``.loss``).
        visual_encoder: model used to describe map features; kept in eval mode.
        vit_processor: processor paired with `visual_encoder`.
        env: batch environment providing ``step``, ``step_count`` and ``close``.
        args: configuration dict (``gpu_id``, ``save_interval``, ``save_path``).
        tokenizer: tokenizer used to encode the prompts.

    Also reads the module-level ``optimizer``, ``data_collator``, ``vis`` and
    ``grad_accumulation_steps``.
    """
    #################################### PLOT ###################################################
    loss_chart = vis.line(X=np.zeros(1), Y=np.zeros(1), opts=dict(title='Training Loss'))

    #################################### TRAIN ######################################################
    gpu_id = args['gpu_id']

    model.train()
    visual_encoder.eval()

    epoch = 0
    save = args['save_interval']

    env_return = env.step(reward=True, action=True)
    if env_return is None:
        # Nothing to train on.
        env.close()
        return
    (spatial_features, scores, rewards_gt, actions_gt), require_init = env_return

    with torch.cuda.device(gpu_id):
        spatial_features = torch.from_numpy(spatial_features).to(torch.float16)
        scores = torch.from_numpy(scores).to(torch.float16)
        rewards_gt = torch.from_numpy(rewards_gt).float()
        actions_gt = torch.from_numpy(actions_gt).to(torch.float32).squeeze()

        if gpu_id >= 0:
            spatial_features = spatial_features.cuda()
            scores = scores.cuda()
            rewards_gt = rewards_gt.cuda()
            actions_gt = actions_gt.cuda()

    # Number of micro-batches whose gradients are currently accumulated.
    micro_batches = 0
    optimizer.zero_grad()

    while True:
        vis_text = spatial_to_text_features(spatial_features, visual_encoder, vit_processor)

        data = tokenize2(scores, vis_text, actions_gt, rewards_gt, tokenizer, token_length=288)
        batch_samples = [{'input_ids': input_ids, 'labels': labels}
                         for input_ids, labels in zip(data['input_ids'], data['labels'])]
        del data

        batch = data_collator(batch_samples)
        outputs = model(**batch)
        loss = outputs.loss
        del batch

        # Scale so that the summed gradients equal the mean over the accumulation window.
        (loss / grad_accumulation_steps).backward()
        micro_batches += 1
        if micro_batches >= grad_accumulation_steps:
            optimizer.step()
            optimizer.zero_grad()
            micro_batches = 0

        step = env.step_count()
        if step % 30000 == 0:
            # Learning-rate decay: quarter the rate of every parameter group.
            for group in optimizer.param_groups:
                group['lr'] *= 0.25
        if step % 1000 == 0:
            print(f'epoch: {epoch}, steps: {np.asarray([env.step_count()])}, loss: {loss.item()}')

        ############################ PLOT ##########################################
        vis.line(X=np.asarray([step]),
                 Y=np.asarray([loss.item()]),
                 win=loss_chart,
                 name='loss',
                 update='append')

        ####################### NEXT BATCH ###################################
        env_return = env.step(reward=True, action=True)
        if env_return is not None:
            (raw_spatial, raw_scores, raw_rewards, raw_actions), require_init = env_return

            # Reuse the already-allocated device tensors for the next batch.
            spatial_features.copy_(torch.from_numpy(raw_spatial).to(torch.float16))
            scores.copy_(torch.from_numpy(raw_scores).to(torch.float16))
            rewards_gt.copy_(torch.from_numpy(raw_rewards).float())
            actions_gt.copy_(torch.from_numpy(raw_actions).to(torch.int32).squeeze())
        elif micro_batches:
            # Data exhausted mid-window: apply the gradients gathered so far before saving.
            optimizer.step()
            optimizer.zero_grad()
            micro_batches = 0

        if env.step_count() > save or env_return is None:
            save = env.step_count() + args['save_interval']
            s = str(env.step_count())
            # args['save_path'] is used as a filename prefix; make sure its directory exists.
            checkpoint_dir = os.path.dirname(args['save_path'])
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), args['save_path']+'model_subset_2_'+ s +'.pth')
            torch.save(optimizer.state_dict(), args['save_path']+'optimizer_' + s + '.pth')
        if env_return is None:
            env.close()
            break

Generate prompt for evaluation

In [39]:
def eval_prompt(score, vis_text, gt_actions, reward):
    """Build the evaluation prompt: the instruction and game state, ending at "Output:".

    This is the inference-time counterpart of the training prompt built by
    prompt2(): same instruction and state lines, but without the target actions
    and result. The first state field is labelled "Game state" to match the
    wording used in the training prompt (it previously read "Game stage", so
    the model was queried with text it had not been trained on).

    Selected entries of `score` are replaced by coarse labels:

    * indices 0-10: ``low`` (0..0.2), ``medium`` (..0.7), ``high`` (above);
    * indices 19 and 23: ``low`` (0..2000), ``medium`` (..8000), ``high``;
    * indices 18 and 22: ``low`` (0..10000), ``medium`` (..30000), ``high``;
    * index 15: ``early`` (0..0.25), ``mid`` (..0.6), ``late`` (..0.9), ``end``.

    Values that match no range (negative or NaN) and all other indices are
    left as plain numbers.

    Args:
        score: 1-D tensor of score features (at least 24 entries are indexed).
        vis_text: text inserted after "Army Units/Buildings:".
        gt_actions: unused; kept so the signature mirrors prompt2().
        reward: unused; kept so the signature mirrors prompt2().

    Returns:
        The prompt string. It also reads the module-level names `race` and
        `opponenet`.
    """
    def to_label(value, upper_bounds, labels):
        # Negative (or NaN) readings match none of the ranges and stay numeric.
        if not value >= 0:
            return value
        for bound, label in zip(upper_bounds, labels):
            if value <= bound:
                return label
        # Above the last bound: the final label applies.
        return labels[-1]

    levels = ('low', 'medium', 'high')
    # index -> (inclusive upper bounds, labels)
    binning = {idx: ((0.2, 0.7), levels) for idx in range(11)}
    binning.update({
        15: ((0.25, 0.6, 0.9), ('early', 'mid', 'late', 'end')),
        18: ((10000, 30000), levels),
        19: ((2000, 8000), levels),
        22: ((10000, 30000), levels),
        23: ((2000, 8000), levels),
    })

    score = score.cpu().detach().numpy().tolist()
    for idx, (upper_bounds, labels) in binning.items():
        if idx < len(score):
            score[idx] = to_label(score[idx], upper_bounds, labels)

    prompt = f'''Instruct: As an expert StarCraft II {race} player, playing against the {opponenet}, predict the next 4 actions and also the result of the game, given the following resources:
                Game state: {score[15]}, Army Count: {score[8]}, Army Units/Buildings: {vis_text}
                Minerals collected: {score[18]}, Minerals used: {score[22]}, Vespene gas collected: {score[19]}, Vespene gas used: {score[23]}
                Food used: {score[3]}, Food cap: {score[4]}, Food for Army: {score[5]}, Food for Workers: {score[6]}
                Idle Workers: {score[7]}, Warp gates count: {score[9]}, Larva count: {score[10]}.
                Output:'''
    return prompt

Load the testing replays

In [None]:
# Separate environment for evaluation: one replay per batch, a single pass over the data.
# NOTE(review): the replay list is under Terran_vs_Terran while args['race'] / args['enemy_race']
# are Protoss / Terran and the results file is named "pvp-to-pvt" — confirm this path is intended.
env2 = BatchSpatialEnv()
env2.init('./train_val_test/Terran_vs_Terran/test_subset.json',
         './', args['race'], args['enemy_race'], 
         n_steps=args['n_steps'], seed = args['seed'],
         n_replays=1, epochs=1)

Setup file to save the generated results

In [40]:
# Output file that test() appends rows to; opened in write mode, so any existing file is truncated.
# It stays open until the explicit eval_results.close() cell.
eval_results = open('eval_results_pvp-to-pvt.csv', mode='w')

#### Testing Loop

In [41]:
def test(model=llm, visual_encoder=visual_encoder, vit_processor=vit_processor, env=env):
    """Generate predictions for every batch of `env` and write them to `eval_results`.

    A header line is written first; then, for each batch, one tab-separated row
    with the ground-truth actions, the ground-truth result and the decoded
    model output (up to 70 new tokens) is appended.

    Changes relative to the previous version: the `model` argument is honoured
    (the global ``llm`` is no longer used), an environment that yields no data
    returns after writing the header instead of raising ``NameError``, the
    unused ``save`` local is gone, and the input tensors are no longer created
    with ``requires_grad=True``.

    Args:
        model: language model used for generation; switched to eval mode.
        visual_encoder: model used to describe map features.
        vit_processor: processor paired with `visual_encoder`.
        env: batch environment providing ``step`` and ``close``.

    Also reads the module-level ``args``, ``tokenizer`` and the open file
    ``eval_results``.
    """
    gpu_id = args['gpu_id']
    model.eval()
    visual_encoder.eval()

    env_return = env.step(reward=True, action=True)

    eval_results.write('Actions_GT \t Reward \t Predictions \n')

    if env_return is None:
        # No replays to evaluate.
        env.close()
        return
    (spatial_features, scores, rewards_gt, actions_gt), require_init = env_return

    with torch.cuda.device(gpu_id):
        spatial_features = torch.from_numpy(spatial_features).to(torch.float16)
        scores = torch.from_numpy(scores).to(torch.float16)
        rewards_gt = torch.from_numpy(rewards_gt).float()
        actions_gt = torch.from_numpy(actions_gt).to(torch.float32).squeeze()

        if gpu_id >= 0:
            spatial_features = spatial_features.cuda()
            scores = scores.cuda()
            rewards_gt = rewards_gt.cuda()
            actions_gt = actions_gt.cuda()

    while True:
        vis_text = spatial_to_text_features(spatial_features, visual_encoder, vit_processor)

        # NOTE(review): eval_tokenize is not defined in the visible part of this file;
        # it is expected to return (prompt, actions, reward text) — confirm.
        prompt, actions, rew = eval_tokenize(scores, vis_text, actions_gt, rewards_gt, tokenizer, token_length=384)

        inputs = tokenizer(prompt, return_tensors='pt').to('cuda')

        with torch.no_grad():
            generated = model.generate(**inputs, max_new_tokens=70)[0]
        prediction = tokenizer.decode(generated, skip_special_tokens=True)
        eval_results.write(f"{actions} \t {rew} \t '{prediction}'\n")

        env_return = env.step(reward=True, action=True)
        if env_return is None:
            env.close()
            break

        (raw_spatial, raw_scores, raw_rewards, raw_actions), require_init = env_return
        # Reuse the already-allocated device tensors for the next batch.
        spatial_features.copy_(torch.from_numpy(raw_spatial).to(torch.float16))
        scores.copy_(torch.from_numpy(raw_scores).to(torch.float16))
        rewards_gt.copy_(torch.from_numpy(raw_rewards).float())
        actions_gt.copy_(torch.from_numpy(raw_actions).to(torch.int32).squeeze())

In [None]:
# Close the results file.
# NOTE(review): this cell appears before the test(...) call further down; run in order,
# test() would write to an already-closed file — run this only after evaluation.
eval_results.close()

Save model's weights

In [None]:

# Write the current model and optimizer state dicts under the args['save_path'] prefix.
# NOTE(review): this cell appears before the train(...) call below — presumably it is run manually after training.
torch.save(llm.state_dict(), args['save_path']+'model.pth')
torch.save(optimizer.state_dict(), args['save_path']+'optimizer.pth')

#### Main Model Training and Evaluation

In [None]:
# Run fine-tuning on `env` until its replay data is exhausted.
train(model=llm, visual_encoder=visual_encoder, vit_processor=vit_processor, env=env)

In [None]:
# Run evaluation on the separate `env2` environment; rows are appended to `eval_results`.
test(model=llm, visual_encoder=visual_encoder, vit_processor=vit_processor, env=env2)

In [None]:
# Save the model and tokenizer via their save_pretrained() methods into one output directory.
out_dir = './scgpt-stage2'
llm.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)