# to do
- set up `accelerate` lib (redo the configs in a controlled manner) - `accelerate config`
- use the LlaMma model I was just given access to

# Summarisation: DistilBART

In [None]:
from transformers import pipeline
from accelerate import infer_auto_device_map


pipe = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

text = """
In a quiet town nestled between rolling hills and dense forests, a peculiar event took place that would be spoken of for generations. It began on a misty morning when an elderly watchmaker named Elias discovered an ornate key buried beneath the floorboards of his shop. The key, adorned with intricate engravings resembling constellations, sparked a curiosity in him that refused to wane. He spent days poring over ancient tomes and manuscripts, hoping to uncover its origin, yet every answer led to more questions. Then, late one evening, he stumbled upon a forgotten tale—an old legend of a hidden chamber beneath the town’s cathedral, said to contain an artifact of immeasurable power. Driven by an insatiable thirst for knowledge, Elias embarked on a journey deep beneath the city, where he discovered a door that perfectly matched the key’s intricate design. His hands trembled as he turned the key, unlocking a secret lost to time. What lay beyond would change everything.
"""

# Generate summary
summary = pipe(text) 

print(summary)

  from .autonotebook import tqdm as notebook_tqdm
Device set to use cuda:0


[{'summary_text': " An ornate key was buried beneath the floorboards of a watchmaker's shop . The key, adorned with intricate engravings, sparked a curiosity in him that refused to wane . He spent days poring over ancient tomes and manuscripts, hoping to uncover its origin . Then, late one evening, he stumbled upon a forgotten tale ."}]


In [None]:
# (a) Using codecarbon: estimates energy usage based on the CPU/GPU and time:
    
from codecarbon import EmissionsTracker

tracker = EmissionsTracker()
tracker.start()

# Run your inference code
pipe(text)

tracker.stop()


[codecarbon INFO @ 19:20:34] [setup] RAM Tracking...
[codecarbon INFO @ 19:20:34] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 19:20:36] CPU Model on constant consumption mode: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 19:20:36] [setup] GPU Tracking...
[codecarbon INFO @ 19:20:36] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 19:20:36] >>> Tracker's metadata:
[codecarbon INFO @ 19:20:36]   Platform system: Linux-5.15.0-113-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 19:20:36]   Python version: 3.10.14
[codecarbon INFO @ 19:20:36]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 19:20:36]   Available RAM : 503.532 GB
[codecarbon INFO @ 19:20:36]   CPU count: 128
[codecarbon INFO @ 19:20:36]   CPU model: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 19:20:36]   GPU count: 4
[codecarbon INFO @ 19:20:36]   GPU model: 4 x NVIDIA A100-PCIE-40GB
[codecarbon INFO @ 19:20:39] 

2.8000308861770503e-05

In [7]:
#  Using pyJoules for Fine-Grained Profiling - gives per-function energy consumption:

from pyJoules.device.rapl_device import RaplPackageDomain
from pyJoules.energy_meter import measure_energy

@measure_energy(domains=[RaplPackageDomain(0)])
def run_inference():
    pipe("This is a test.")

run_inference()


PermissionError: [Errno 13] Permission denied: '/sys/class/powercap/intel-rapl/intel-rapl:0/energy_uj'

In [2]:
# PyTorch Profiler (For Model-Specific Inference Profiling) - this shows GPU execution time, which can be combined with nvidia-smi power draw to estimate Joules per inference.
import torch

with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True
) as prof:
    pipe(text)

print(prof.key_averages().table(sort_by="cuda_time_total"))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::linear         4.50%      27.229ms        33.98%     205.429ms      58.460us       0.000us         0.00%     133.890ms      38.102us          3514  
                                            aten::addmm        13.74%      83.056ms        23.21%     140.348ms      40.751us     118.968ms        48.51%     118.968ms      34.544us          3444  
         

# Text generation: Llama3

In [None]:
from transformers import pipeline
from accelerate import infer_auto_device_map
import torch
from codecarbon import EmissionsTracker

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("text-generation", model="meta-llama/Llama-3.2-1B", device=device)

tracker = EmissionsTracker()
tracker.start()

output = pipe("Once upon a time", max_length=100, num_return_sequences=1)

tracker.stop()

print(output[0]['generated_text'])



Device set to use cuda
[codecarbon INFO @ 17:45:35] [setup] RAM Tracking...
[codecarbon INFO @ 17:45:35] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 17:45:36] CPU Model on constant consumption mode: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 17:45:36] [setup] GPU Tracking...
[codecarbon INFO @ 17:45:36] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:45:36] >>> Tracker's metadata:
[codecarbon INFO @ 17:45:36]   Platform system: Linux-5.15.0-113-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 17:45:36]   Python version: 3.10.14
[codecarbon INFO @ 17:45:36]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 17:45:36]   Available RAM : 503.532 GB
[codecarbon INFO @ 17:45:36]   CPU count: 128
[codecarbon INFO @ 17:45:36]   CPU model: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 17:45:36]   GPU count: 4
[codecarbon INFO @ 17:45:36]   GPU model: 4 x NVIDIA A100-PCIE-40GB
[codec

Once upon a time, there was a man who was so afraid of the dark that he would hide in his closet during the night. He would come out of his closet only to go to sleep, and he would wake up in the morning feeling like he had slept through the night. This man was so afraid of the dark that he would hide in his closet during the night. He would come out of his closet only to go to sleep, and he would wake up in the morning feeling like


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from accelerate import dispatch_model

model_name = "meta-llama/Llama-3.2-1B"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Automatically distribute across GPUs
device_map = infer_auto_device_map(model)
model = dispatch_model(model, device_map=device_map)

input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids.to(model.device)

tracker = EmissionsTracker()
tracker.start()

output = model.generate(input_ids, max_length=100)

tracker.stop()

print(tokenizer.decode(output[0], skip_special_tokens=True))


[codecarbon INFO @ 17:48:11] [setup] RAM Tracking...
[codecarbon INFO @ 17:48:11] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 17:48:12] CPU Model on constant consumption mode: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 17:48:12] [setup] GPU Tracking...
[codecarbon INFO @ 17:48:12] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 17:48:12] >>> Tracker's metadata:
[codecarbon INFO @ 17:48:12]   Platform system: Linux-5.15.0-113-generic-x86_64-with-glibc2.31
[codecarbon INFO @ 17:48:12]   Python version: 3.10.14
[codecarbon INFO @ 17:48:12]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 17:48:12]   Available RAM : 503.532 GB
[codecarbon INFO @ 17:48:12]   CPU count: 128
[codecarbon INFO @ 17:48:12]   CPU model: AMD EPYC 7742 64-Core Processor
[codecarbon INFO @ 17:48:12]   GPU count: 4
[codecarbon INFO @ 17:48:12]   GPU model: 4 x NVIDIA A100-PCIE-40GB
[codecarbon INFO @ 17:48:15] 

Once upon a time, there was a little girl named Lillie. Lillie was a very bright little girl, but she was also very curious. She wanted to know everything about everything. She wanted to know how things worked, and she wanted to know why things worked the way they did.
One day, Lillie was sitting on her bed, reading a book about the wonders of the world. She was fascinated by the descriptions of the different animals and plants, and she wanted


# translation: T5

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from accelerate import infer_auto_device_map


tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", device_map="auto")

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))


ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.
