# Sample outputs with LLaMA-Factory

In [None]:
import pandas as pd
import json

import spacy
NLP = spacy.load('en_core_web_sm')

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

def tokenize(txt):
    """Return *txt* with its spaCy tokens separated by single spaces.

    The text is run through the module-level ``NLP`` pipeline and the
    surface form of every resulting token is joined with ``" "``.
    """
    return " ".join(token.text for token in NLP(txt))

import subprocess
def run(cmd):
    """Echo a shell command line, execute it, and return the result.

    Args:
        cmd: Command line passed as a single string to the shell.

    Returns:
        The ``subprocess.CompletedProcess`` produced by ``subprocess.run``;
        the exit status is not checked here.
    """
    print("Run shell command:\n\t", cmd)
    completed = subprocess.run(cmd, shell=True)
    return completed

In [None]:
PROJECT_PATH = "./LLaMA-Factory"

# llama 6k updates
model_name = "LLaMA2-7B-chat-gec"
model_id = "meta-llama/Llama-2-7b-chat-hf"

# LoRA checkpoints to sample from. The position of a checkpoint in this list
# becomes the suffix of its output file (output-0.csv ... output-4.csv), so
# the order matters for anything that reads those files.
checkpoints = ['200', '2800', '800', '1200', '2000']

for i, checkpoint in enumerate(checkpoints):
    model_path = f"{PROJECT_PATH}/saves/{model_name}/lora-all-6k/sft/checkpoint-{checkpoint}"
    print(model_path)

    data_path = f'{model_path}/predict-nucle-samples/generated_predictions.jsonl'
    output_path = f'{model_path}/predict-nucle-samples/output-{i}.csv'

    # The trailing backslashes are consumed by Python (line continuation inside
    # a non-raw string literal), so the shell receives one long command line.
    cmd_infer = f"""
    WANDB_DISABLED=true CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
        --config_file {PROJECT_PATH}/examples/accelerate/single_config.yaml \
        {PROJECT_PATH}/src/train_bash.py \
        --stage sft \
        --do_predict \
        --model_name_or_path {model_id} \
        --adapter_name_or_path {model_path} \
        --dataset gec_dataset_test \
        --dataset_dir {PROJECT_PATH}/data \
        --template default \
        --temperature 1.0 \
        --do_sample true \
        --finetuning_type lora \
        --lora_target q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj \
        --output_dir {model_path}/predict-nucle-samples \
        --overwrite_cache \
        --overwrite_output_dir \
        --cutoff_len 256 \
        --preprocessing_num_workers 16 \
        --per_device_eval_batch_size 1 \
        --max_samples 1311 \
        --predict_with_generate \
        --fp16
    """

    # Stop on a failed inference run: otherwise a predictions file left over
    # from an earlier run could be read below and saved as if it were fresh.
    result = run(cmd_infer)
    if result.returncode != 0:
        raise RuntimeError(
            f"Inference for checkpoint-{checkpoint} failed "
            f"with exit code {result.returncode}"
        )

    # One JSON object per line; blank lines are skipped.
    with open(data_path, 'r') as f:
        data = [json.loads(line) for line in f if line.strip()]

    df_output = pd.DataFrame(data)
    df = pd.read_csv('./data/nucle.test.csv')
    # Assignment aligns on the row index.
    # NOTE(review): assumes the predictions are in the same order and number
    # as the rows of nucle.test.csv — confirm against --max_samples.
    df['output'] = df_output['predict']
    df['output'] = df.output.map(tokenize)
    df.to_csv(output_path, index=False)

# Sample outputs with OpenAI-like interface

In [None]:
from openai import OpenAI
# Client for an OpenAI-compatible HTTP endpoint served on this machine
# (port 8000) instead of the hosted OpenAI API.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    # NOTE(review): looks like a placeholder token for the local server —
    # confirm it is not a real credential before sharing this notebook.
    api_key="token-abc123",
)

In [None]:
import pandas as pd
from tqdm import tqdm
# Register tqdm's progress_* methods on pandas objects; progress_map is used
# later in this notebook to show a progress bar while sampling.
tqdm.pandas()

In [None]:
# Markers that separate a chatty preamble ("Here is the corrected text: ...")
# from the actual rewrite. They are tried in this order; the first hit wins.
_PREAMBLE_MARKERS = ("text:", "text is:", ":\n\n", ":\n")


def _strip_preamble(pred):
    """Drop a leading preamble from one completion and keep its first line."""
    for marker in _PREAMBLE_MARKERS:
        _, found, rest = pred.partition(marker)
        if found:
            # lstrip() already removes the separating space or newline, so
            # nothing beyond the marker itself is skipped; skipping one more
            # character would eat the first letter of the rewrite.
            rest = rest.lstrip()
            # Keep only the first line of what follows the marker.
            return rest.partition("\n")[0]
    # No marker found: the completion is returned untouched.
    return pred


def infer(text, *, api_client=None, model="gec-llama2-7b-public/", n=5,
          temperature=1):
    """Sample grammatical rewrites of *text* from the chat-completions endpoint.

    Args:
        text: Source sentence, sent as the user message.
        api_client: Object exposing ``chat.completions.create``; defaults to
            the module-level ``client``.
        model: Model name passed to the endpoint (for example
            ``"mistralai/Mistral-7B-Instruct-v0.2"`` for a different server).
        n: Number of completions requested per call.
        temperature: Sampling temperature.

    Returns:
        A list with one cleaned prediction per returned choice. A choice
        whose content is ``None`` yields an empty string.
    """
    api = client if api_client is None else api_client
    completion = api.chat.completions.create(
        model=model,
        temperature=temperature,
        n=n,
        messages=[
            {"role": "system", "content": "Rewrite this text to make it grammatically correct ."},
            {"role": "user", "content": text},
        ],
    )
    # message.content may be None; treat that as an empty prediction
    # instead of failing on the substring checks.
    return [
        _strip_preamble(choice.message.content or "")
        for choice in completion.choices
    ]

# Quick manual check of the endpoint and the post-processing on one noisy sentence.
infer("I hop ths mesage find u ..")

In [None]:
# Load the source sentences and draw 50000 random rows to sample rewrites for.
df = pd.read_csv('troy-blogs.train.tokenized.csv')
# NOTE(review): no random_state is given, so the subset differs between runs;
# sampling is without replacement by default, so this raises if the file has
# fewer than 50000 rows.
df_sample = df.sample(50000)

In [None]:
# Call infer on every value of the `src` column with a progress bar; infer
# returns a list, so each `outputs` cell holds a list of predictions.
df_sample['outputs'] = df_sample.src.progress_map(infer)

In [None]:
# Save the sampled rows with their predictions, without the index column.
# NOTE(review): the list-valued `outputs` cells are presumably written as
# their string form and need parsing when read back — confirm.
df_sample.to_csv('troy-blogs.train.tokenized-llama2-samples.csv', index=False)