In [None]:
! pip install datasets einops sentencepiece tokenizers
! pip install git+https://github.com/huggingface/transformers
! pip install torch
! pip install protobuf

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Release cached, currently unused GPU memory before the model is loaded.
torch.cuda.empty_cache()
device = "cuda"

# Single source of truth for the checkpoint so tokenizer and model always match.
MODEL_NAME = "Open-Orca/Mistral-7B-OpenOrca"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the weights directly in half precision instead of materialising the
# full-precision model first and converting it afterwards with `.half()`;
# the resulting model is the same fp16 model on `device`.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16).to(device)

In [None]:
# System instruction that frames the assistant; set this to whatever you want,
# e.g. "A recommendation system for server configuration".
sys_prompt = "A recommendation system for home, office and kitchen containers of all kinds. respond in bolivian spanish"
prompt = "I need to store dozens of folders and files in my office."

# Delimiters wrapped around every chat turn.
prefix = "<|im_start|>"
suffix = "<|im_end|>\n"

# Each turn is: prefix + role + newline + content + suffix.
sys_format = f"{prefix}system\n{sys_prompt}{suffix}"
user_format = f"{prefix}user\n{prompt}{suffix}"
# The assistant turn is left open (no suffix) so the model writes the reply.
assistant_format = f"{prefix}assistant\n"
input_text = "".join((sys_format, user_format, assistant_format))

# Sampling settings; padding reuses the tokenizer's EOS token id.
generation_config = GenerationConfig(
    max_length=1026,
    temperature=1.1,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
    use_cache=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    transformers_version="4.34.0.dev0",
)

# Tokenize the prompt, move the tensors to the model's device and generate.
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    return_attention_mask=True,
).to(device)
outputs = model.generate(**inputs, generation_config=generation_config)

# Only one sequence was submitted, so show the first decoded entry.
text = tokenizer.batch_decode(outputs)[0]
print(text)

In [None]:
import threading
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

# Earlier stop-word sets that were tried, with the figure noted at the time:
#   {"the", "is", "in", "that", "does", "do", "and", "a"} -> Total memory used: 1767.55 MB
#   {"the", "is", "in", "that", "does", "do"}             -> Total memory used: 2445.98 MB
# Observation: sometimes removing stop words decreases memory usage.
stop_words = {"the", "is", "in", "that", "does", "do", "of"}


def remove_stop_words(text, stop_words):
    """Return `text` with every word contained in `stop_words` dropped.

    Words are the whitespace-separated tokens of `text`. Each token is
    lower-cased before the membership check, so `stop_words` is expected to
    hold lower-case entries. The surviving tokens are re-joined with single
    spaces, which means any run of whitespace in the input (including
    newlines) ends up as one space in the result.

    Args:
        text: The string to filter.
        stop_words: Container of words to remove (checked with `in`).

    Returns:
        The filtered, single-space-joined string.
    """
    kept_tokens = (token for token in text.split() if token.lower() not in stop_words)
    return ' '.join(kept_tokens)


def get_gpu_memory():
    """Return the value of `torch.cuda.memory_allocated()` after a sync.

    `torch.cuda.synchronize()` is called first so that pending GPU work has
    completed before the allocation counter is read.
    """
    torch.cuda.synchronize()
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes

# Device that tokenized inputs are moved to before generation.
device = "cuda"

# One entry per finished task: the allocated-memory delta it observed.
total_memory_used = list()

def run_model(prompt):
    """Generate a completion for `prompt` and report the GPU-memory delta.

    Allocated GPU memory is sampled before tokenization and again after
    generation. The difference is appended to the module-level
    `total_memory_used` list and printed in MB, followed by the decoded text
    of the first returned sequence.

    Relies on the module-level `tokenizer`, `model`, `generation_config`,
    `device` and `get_gpu_memory`.

    NOTE(review): the allocation counter is read without any per-call
    scoping, so when several calls run concurrently the delta presumably
    also reflects allocations made by the other calls — confirm before
    treating the figure as a per-task cost.

    Args:
        prompt: The text to feed to the model.
    """
    # Baseline reading taken before any tensors for this task exist.
    bytes_before = get_gpu_memory()

    encoded = tokenizer(prompt, return_tensors="pt", return_attention_mask=True).to(device)
    generated = model.generate(**encoded, generation_config=generation_config)

    # Second reading taken once generation has returned.
    bytes_after = get_gpu_memory()
    delta = bytes_after - bytes_before
    total_memory_used.append(delta)
    print(f"Memory used by this task: {delta / (1024**2):.2f} MB")

    print(tokenizer.batch_decode(generated)[0])


# Sampling settings read by `run_model`; padding reuses the tokenizer's EOS
# token id.
_generation_settings = dict(
    max_length=1026,
    temperature=1.1,
    top_p=0.95,
    repetition_penalty=1.0,
    do_sample=True,
    use_cache=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    transformers_version="4.34.0.dev0",
)
generation_config = GenerationConfig(**_generation_settings)

# Example prompts: every question shares the same system preamble, separated
# from the question by a newline.
_system_preamble = "A system about economics and investment. Specialize in macro-economics and bonds.\n"
_questions = (
    "Tell me about the euro-dollar",
    "Tell me about the TLT etf",
    "What is deflation",
    "what is inflation",
    "How does the fed work",
    "who is the fed chair",
    "What is the GDP of Germany",
    "what is the currency of Japan",
    "What is a central bank",
    "who is the president of the ECB",
)

# Strip the stop words from each full prompt before it is sent to the model.
prompts = [
    remove_stop_words(_system_preamble + question, stop_words)
    for question in _questions
]

# Launch one worker thread per prompt, then wait for every worker to finish
# before aggregating the recorded memory deltas.
threads = []
# A dedicated loop-variable name avoids overwriting a module-level `prompt`
# that another cell may have defined.
for task_prompt in prompts:
    thread = threading.Thread(target=run_model, args=(task_prompt,))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

# Keep `total_memory_used` as the per-task list (run_model appends to it) and
# store the aggregate under its own name: rebinding the list name to a number
# would make any later run_model call fail on `.append`.
total_memory_bytes = sum(total_memory_used)
print(f"Total memory used: {total_memory_bytes / (1024**2):.2f} MB")

In [None]:
# NOTE(review): the literals below look like hand-copied measurements from
# earlier runs — presumably 1292.56 MB summed over 6 tasks, a 25000 MB memory
# budget and 13500 MB already in use; confirm where each number comes from.
# Recorded observation: Total memory used: 3594.73 MB with stopping words
measured_total_mb = 1292.56
measured_task_count = 6
avg_task_mem = measured_total_mb / measured_task_count
print(f"Average memory used per task: {avg_task_mem:.2f} MB")

# Remaining headroom divided by the average per-task figure.
memory_budget_mb = 25000
memory_in_use_mb = 13500
max_num_tasks = (memory_budget_mb - memory_in_use_mb) / avg_task_mem
print(f"Max number of tasks: {max_num_tasks:.2f}")