### I. Import Libraries

In [1]:
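# Uncomment on a fresh runtime (e.g. Colab) to install the dependencies.
# `%pip` installs into the environment of the running kernel.
# %pip install llmlingua
# %pip install spacy
# !python -m spacy download en_core_web_sm

# %pip install datasets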

In [2]:
import torch
import torch.nn as nn
import queue
import threading
import time
from llmlingua import PromptCompressor
from datasets import load_dataset
import random

In [3]:
# check multiprocessors
# Report the GPU's name and multiprocessor layout, or say that no CUDA
# device is visible to PyTorch.
if not torch.cuda.is_available():
    print("CUDA not available")
else:
    device = torch.device("cuda")
    gpu_properties = torch.cuda.get_device_properties(device)

    # One print call, one property per output line.
    print(
        f"Device Name: {gpu_properties.name}",
        f"Multiprocessors (SMs): {gpu_properties.multi_processor_count}",
        f"Threads per Multiprocessor: {gpu_properties.max_threads_per_multi_processor}",
        sep="\n",
    )

Device Name: Tesla T4
Multiprocessors (SMs): 40
Threads per Multiprocessor: 1024


### II. Load model

In [4]:
# Hugging Face Hub id of the LLMLingua-2 checkpoint used for compression.
model_meetingbank = "microsoft/llmlingua-2-xlm-roberta-large-meetingbank"

# PromptCompressor places the model on CUDA unless told otherwise, which fails
# on a machine without a GPU. Pick the device explicitly so the notebook also
# runs (more slowly) on CPU-only runtimes.
compressor = PromptCompressor(
    model_name=model_meetingbank,
    device_map="cuda" if torch.cuda.is_available() else "cpu",
    use_llmlingua2=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### III. Define Producer and Consumer

In [5]:
# Simulate real-time data streaming
def simulate_text_stream(text_queue, max_items=20, max_chars=50):
    """Feed GSM8K test examples into ``text_queue`` to mimic a live text stream.

    Each queued item is an example's question concatenated with its answer,
    truncated to ``max_chars`` characters. A random pause of less than one
    second separates consecutive items. The string ``"STOP"`` is always
    enqueued last, even when loading the dataset or iterating over it raises,
    so a consumer blocked on ``text_queue.get()`` is never left waiting forever.

    Args:
        text_queue: Queue-like object with a ``put`` method that receives the
            texts followed by the ``"STOP"`` marker.
        max_items: Maximum number of examples to stream (default 20).
        max_chars: Number of leading characters kept from each text, to keep
            the printed output short (default 50).

    Raises:
        Any exception raised by ``load_dataset`` or by iterating the dataset
        propagates to the caller after ``"STOP"`` has been enqueued.
    """
    try:
        ds_test = load_dataset("openai/gsm8k", "main", split="test")

        # zip() with range() caps the stream at max_items and numbers the items.
        for idx, instance in zip(range(max_items), ds_test):
            text = instance['question'] + instance['answer']
            text = text[:max_chars]  # for display
            text_queue.put(text)
            print(f"Queued {idx}: {text}")
            time.sleep(random.random())  # Simulate delay between streaming texts
    finally:
        # Always signal the consumer to stop, including on failure.
        text_queue.put("STOP")

In [6]:
# Function for real-time inference
def process_text_stream(text_queue, results_queue):
    """Compress queued texts one at a time until the "STOP" marker arrives.

    Every text taken from ``text_queue`` is passed to the module-level
    ``compressor``. The pair ``(original text, compressed prompt)`` is put on
    ``results_queue`` and the compressed prompt is printed.

    Args:
        text_queue: Queue supplying the texts, terminated by the string "STOP".
        results_queue: Queue receiving one ``(text, compressed_prompt)`` tuple
            per processed text.
    """

    def compress(raw_text):
        # Argument lists are built fresh on each call, as in an inline call.
        return compressor.compress_prompt_llmlingua2(
            raw_text,
            rate=0.6,
            force_tokens=['\n', '.', '!', '?', ','],
            chunk_end_tokens=['.', '\n'],
            return_word_label=True,
            drop_consecutive=True,
        )

    # iter(callable, sentinel) keeps calling text_queue.get() (blocking) and
    # ends the loop at the first item equal to "STOP".
    for incoming in iter(text_queue.get, "STOP"):
        compressed = compress(incoming)['compressed_prompt']

        # Save results
        results_queue.put((incoming, compressed))
        print(f"Compressed Prompt: {compressed}")

In [7]:
# Two channels: incoming texts for the consumer, and (text, compressed) pairs
# coming back out of it.
text_queue, results_queue = queue.Queue(), queue.Queue()

# Producer: runs in a background daemon thread and feeds text_queue.
threading.Thread(
    target=simulate_text_stream,
    args=(text_queue,),
    daemon=True,
).start()

# Consumer: runs in the foreground and returns once "STOP" is received.
process_text_stream(text_queue, results_queue)

Queued 0: Janet’s ducks lay 16 eggs per day. She eats three 
Queued 1: A robe takes 2 bolts of blue fiber and half that m
Queued 2: Josh decides to try flipping a house.  He buys a h
Compressed Prompt: Janet’s ducks lay 16. eats
Compressed Prompt: robe takes 2 bolts blue fiber half
Queued 3: James decides to run 3 sprints 3 times a week.  He
Compressed Prompt: Josh decides try flipping house. buys
Compressed Prompt: James run 3 sprints times week.
Queued 4: Every day, Wendi feeds each of her chickens three 
Compressed Prompt: , Wendi feeds chickens three
Queued 5: Kylar went to the store to buy glasses for his new
Compressed Prompt: Kylar went store buy glasses new
Queued 6: Toulouse has twice as many sheep as Charleston. Ch
Compressed Prompt: Toulouse twice sheep Charleston.
Queued 7: Carla is downloading a 200 GB file. Normally she c
Queued 8: John drives for 3 hours at a speed of 60 mph and t
Compressed Prompt: Carla downloading 200 GB file.
Compressed Prompt: John drives 3 hours sp

* The producer and consumer are fully decoupled, which allows inference to make full use of the machine's compute resources instead of leaving it idle.