In [11]:
import numpy as np
import torch
import os
import pandas as pd
import textstat

from pathlib import Path

from transformers import AutoTokenizer
from datasets import load_dataset

from vllm import LLM, SamplingParams
from utils.gpu_management import reset_vllm_gpu_environment

from zeus.monitor import ZeusMonitor

In [12]:
# Turn off the internal parallelism of Hugging Face tokenizers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Makes CUDA kernel launches synchronous.
# NOTE(review): this is normally a debugging switch and slows GPU execution —
# confirm it is intended for the measurement runs.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


print(f"CUDA available: {torch.cuda.is_available()}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")

# Read the cached Hugging Face access token. The surrounding whitespace is
# stripped so that a trailing newline in the file never ends up in the token.
HF_TOKEN = (Path.home() / ".cache/huggingface/token").read_text().strip()


# Models to run, smallest to largest.
MODELS = [
    "meta-llama/Llama-3.2-3B-Instruct", 
    "meta-llama/Llama-3.1-8B-Instruct",
    "meta-llama/Llama-3.3-70B-Instruct"
]

MAX_SEQ_LEN = 8192
NUM_SAMPLES = 10_000
NUM_BATCHES = 50
# Kept as a float on purpose: the batch loop truncates `SAMPLES_PER_BATCH * batch`
# with int(), so all samples are covered even when NUM_SAMPLES is not an exact
# multiple of NUM_BATCHES.
SAMPLES_PER_BATCH = NUM_SAMPLES / NUM_BATCHES

# See for reference: https://docs.vllm.ai/en/v0.5.5/dev/sampling_params.html
SAMPLING_PARAMS = SamplingParams(
    temperature=0.8, 
    top_p=0.95,
    min_tokens=1,  # this is key as some models may refuse to generate anything if set to 0.
    max_tokens=128,
)

NUM_GPUS = torch.cuda.device_count()

# Results file that the run reads from (if present) and writes to.
CSV_FILE_PATH = Path("data/simulation_data.csv")

CUDA available: False
cuDNN version: None


In [13]:
def add_instruction(sentence_pair, tokenizer: "AutoTokenizer | None" = None):
    """Attach a chat-formatted German-to-English translation prompt to one example.

    Args:
        sentence_pair: Mapping with a ``"translation"`` entry that holds the
            German source under ``"de"`` and the English reference under ``"en"``.
        tokenizer: Object providing ``apply_chat_template``. It is required; the
            ``None`` default only exists to keep the signature unchanged.

    Returns:
        The same ``sentence_pair`` object, with two keys added:
        ``"input_formatted"`` (the rendered chat prompt as text) and
        ``"target"`` (the English reference).

    Raises:
        ValueError: If ``tokenizer`` is ``None``.
    """
    if tokenizer is None:
        raise ValueError("add_instruction requires a tokenizer with a chat template")

    translation = sentence_pair["translation"]

    # The user turn must be a plain string. Wrapping the sentence in braces
    # would build a one-element set instead of text.
    # Alternative user prompt that was tried before:
    #   f"Please translate the following sentence from German to English: \n\n{translation['de']}"
    message = [
        {"role": "system", "content": "You are a helpful chatbot that translates text from German to English. Only provide the translation, nothing else."},
        {"role": "user", "content": translation["de"]},
    ]

    # tokenize=False returns the prompt as text; add_generation_prompt=True asks
    # the template to append the marker for the start of the assistant turn.
    sentence_pair["input_formatted"] = tokenizer.apply_chat_template(
        message, tokenize=False, add_generation_prompt=True
    )
    sentence_pair["target"] = translation["en"]

    return sentence_pair


def compute_text_metrics(row):
    """Add readability and simple lexical statistics for ``row["input_text"]``.

    The scores come from ``textstat``; the word-level statistics are computed
    from a whitespace split of the text. Each value is stored on ``row`` under
    its own key, and the same ``row`` object is returned.

    NOTE(review): no textstat language is configured in the visible code —
    confirm the default is what is wanted for this input.

    Args:
        row: Mapping-like record that contains the string ``"input_text"``.

    Returns:
        ``row`` with the metric entries added.
    """
    text = row["input_text"]
    words = text.split()
    word_total = len(words)

    # The groups below are applied in this exact order so that the keys are
    # added to the row in the same sequence as before.
    first_scores = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("smog_index", textstat.smog_index),
        ("automated_readability_index", textstat.automated_readability_index),
    )
    for column, scorer in first_scores:
        row[column] = scorer(text)

    # Share of distinct whitespace-separated tokens; 0 for text without tokens.
    if word_total > 0:
        row["lexical_diversity"] = len(set(words)) / word_total
    else:
        row["lexical_diversity"] = 0

    row["syllable_count"] = textstat.syllable_count(text)
    row["complex_word_count"] = textstat.difficult_words(text)

    # Mean token length in characters; 0 for text without tokens.
    if word_total > 0:
        row["avg_word_length"] = sum(map(len, words)) / word_total
    else:
        row["avg_word_length"] = 0

    # Number of whitespace-separated tokens.
    row["sentence_length"] = word_total

    remaining_scores = (
        ("flesch_kincaid_grade", textstat.flesch_kincaid_grade),
        ("coleman_liau_index", textstat.coleman_liau_index),
        ("dale_chall_readability_score", textstat.dale_chall_readability_score),
        ("linsear_write_formula", textstat.linsear_write_formula),
        ("text_standard", textstat.text_standard),
        ("fernandez_huerta", textstat.fernandez_huerta),
        ("szigriszt_pazos", textstat.szigriszt_pazos),
        ("gutierrez_polini", textstat.gutierrez_polini),
        ("crawford", textstat.crawford),
    )
    for column, scorer in remaining_scores:
        row[column] = scorer(text)

    # These two scores can raise ZeroDivisionError; store NaN in that case.
    guarded_scores = (
        ("gulpease_index", textstat.gulpease_index),
        ("osman", textstat.osman),
    )
    for column, scorer in guarded_scores:
        try:
            row[column] = scorer(text)
        except ZeroDivisionError:
            row[column] = np.nan

    return row

In [19]:
# Fixed seed so that every run draws the same NUM_SAMPLES sentence pairs.
# Without it, a results file loaded from disk would keep output columns that
# belong to a different random sample than the one written to "input_text".
SHUFFLE_SEED = 42

dataset = load_dataset('wmt14', 'de-en', split='train')
dataset = dataset.shuffle(seed=SHUFFLE_SEED).select(range(NUM_SAMPLES))

if CSV_FILE_PATH.exists():
    print("Loaded file")
    df = pd.read_csv(CSV_FILE_PATH)
    # Files written with the row index come back with "Unnamed: N" columns;
    # drop them so they do not pile up over repeated load/save cycles.
    df = df.drop(columns=[col for col in df.columns if str(col).startswith("Unnamed:")])
else:
    df = pd.DataFrame()
    # Make sure the target directory exists before the first save.
    CSV_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

# Read the translation column once instead of indexing the dataset row by row.
df["input_text"] = [pair['de'] for pair in dataset["translation"]]

for model_name in MODELS: 

    output_column = f"output_{model_name.replace('/', '_')}"

    tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)
    tokenizer.pad_token = tokenizer.eos_token
    dataset_formatted = dataset.map(lambda sentence_pair: add_instruction(sentence_pair, tokenizer))

    for batch in range(NUM_BATCHES):

        # Batch boundaries are truncated to int because SAMPLES_PER_BATCH may be a float.
        subset = dataset_formatted.select(range(
            int(SAMPLES_PER_BATCH * batch), 
            int(SAMPLES_PER_BATCH * (batch + 1))
        ))
        
        # NOTE(review): placeholder for the model generation step. Iterating
        # over this string yields the characters 'N', 'o', 'n', 'e', which the
        # loop below stores as the outputs of the first four rows of the batch.
        # Replace with the real list of generated texts.
        outputs = "None"

        for idx, output in enumerate(outputs): 
            df.loc[df["input_text"] == subset[idx]['translation']['de'], output_column] = output
    
        # Save after every batch so partial results survive an interruption.
        df.to_csv(CSV_FILE_PATH, index=False)

# Add the text statistics for every input sentence and write the final file.
df = df.apply(compute_text_metrics, axis=1)
df.to_csv(CSV_FILE_PATH, index=False)

Loaded file


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
