In [1]:
## Currently, if you call a model that is already loaded, the request goes straight through; if the model is not loaded, it is loaded first and then the request is sent to it.
## Requests after the first one are quite fast.
## Bigger models take much longer.



import requests

# Root address of the generation server (also worked on: "http://localhost:6000").
BASE_URL = "http://100.118.250.126:6000"
# Endpoint used by generate_text: the full result comes back as one JSON reply.
GENERATE_URL = BASE_URL + "/generate"
# Endpoint used by generate_text_stream: the result is streamed back piece by piece.
GENERATE_STREAM_URL = BASE_URL + "/generate_stream"

def generate_text(model_name, text, max_length=100, timeout=(10, None)):
    """
    Calls the /generate endpoint and prints the full generated text.

    Args:
        model_name: Model identifier, sent to the server as "model_name".
        text: Prompt, sent to the server as "text".
        max_length: Value sent to the server as "max_length" (default 100).
        timeout: Passed straight to ``requests.post``. The default waits at
            most 10 seconds for the connection but puts no limit on the wait
            for the reply, because the first request to a model also has to
            load it. Pass ``None`` to wait indefinitely for both.

    Returns:
        None. The generated text, or a line starting with "Error:", is printed.
    """
    payload = {
        "model_name": model_name,
        "text": text,
        "max_length": max_length
    }

    try:
        response = requests.post(GENERATE_URL, json=payload, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Unreachable host, refused connection, timeout, broken transfer, ...:
        # report it like an HTTP error instead of raising a traceback.
        print(f"Error: request failed - {exc}")
        return

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return

    try:
        data = response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        data = None
    if not isinstance(data, dict):
        # The .get() lookups below need a JSON object.
        print(f"Error: expected a JSON object in the response - {response.text}")
        return

    print("\n=== /generate ===")
    print(f"Model Name: {model_name}")
    print(f"Prompt: {text}")
    print("Generated Text:")
    print(data.get("response", "No response field in JSON"))
    print(f"Time taken: {data.get('time_taken', 'N/A')} seconds")

def generate_text_stream(model_name, text, max_length=100, timeout=(10, None)):
    """
    Calls the /generate_stream endpoint and prints tokens as they arrive.

    Args:
        model_name: Model identifier, sent to the server as "model_name".
        text: Prompt, sent to the server as "text".
        max_length: Value sent to the server as "max_length" (default 100).
        timeout: Passed straight to ``requests.post``. The default waits at
            most 10 seconds for the connection but puts no limit on the wait
            between chunks, because the first request to a model also has to
            load it. Pass ``None`` to wait indefinitely for both.

    Returns:
        None. Each non-empty line of the stream is printed followed by a
        space; failures are printed as a line starting with "Error:".
    """
    payload = {
        "model_name": model_name,
        "text": text,
        "max_length": max_length
    }

    try:
        # The context manager closes the streamed connection even if the
        # loop below is interrupted part-way through.
        with requests.post(
            GENERATE_STREAM_URL, json=payload, stream=True, timeout=timeout
        ) as response:
            if response.status_code != 200:
                print(f"Error: {response.status_code} - {response.text}")
                return

            # Without a known encoding, iter_lines(decode_unicode=True)
            # yields raw bytes, which would be printed as b'...'.
            if response.encoding is None:
                response.encoding = "utf-8"

            print("\n=== /generate_stream ===")
            print(f"Model Name: {model_name}")
            print(f"Prompt: {text}")
            print("Streaming Generated Text:")
            for chunk in response.iter_lines(decode_unicode=True):
                if chunk:
                    print(chunk, end=" ", flush=True)
            print()  # New line after the stream ends
    except requests.exceptions.RequestException as exc:
        # Covers connection failures and a stream that breaks mid-way; the
        # leading newline keeps the message off the partially printed line.
        print(f"\nError: request failed - {exc}")

# Example prompts shared by the test cells below.
prompt1 = (
    "Explain the key differences between artificial intelligence "
    "and human intelligence."
)
prompt2 = (
    "What are the potential risks and benefits of "
    "artificial general intelligence (AGI)?"
)



| **Model**                              | **Architecture / Basis**                                             | **Estimated Max Token Limit** | **Estimated Max Word Limit**  |
|----------------------------------------|----------------------------------------------------------------------|-------------------------------|-------------------------------|
| Equall/Saul-7B-Instruct-v1             | Continued pretraining of Mistral‑7B                                  | ~8,192 tokens                 | ~6,000 words                  |
| ricdomolm/lawma-8b                     | Fine‑tuned on Llama‑3 8B Instruct                                    | ~8,192 tokens                 | ~6,000 words                  |
| ricdomolm/lawma-70b                    | Fine‑tuned on Llama‑3 70B Instruct for legal classification tasks¹    | ~4,096 tokens                 | ~3,072 words                  |
| deepseek‑ai/DeepSeek‑V2‑Lite (and Chat)  | Mixture‑of‑Experts (MoE) model with an extended context window       | 32,000 tokens                 | ~24,000 words                 |



¹ Although the base Llama‑3‑70B model supports an 8k-token context, this version was configured for legal classification—where shorter inputs (and corresponding outputs) are sufficient—resulting in an effective limit of approximately 4k tokens.

Also, the command run up to this point (to start the server), together with its output, was:

```
python3 app.py 
 * Serving Flask app 'app'
 * Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:6000
 * Running on http://100.118.250.126:6000

```

In [None]:
# Test the "lawma_70b" model (a large model) on both endpoints:
# prompt1 goes to /generate, prompt2 to /generate_stream.
generate_text(model_name="lawma_70b", text=prompt1, max_length=100)
generate_text_stream(model_name="lawma_70b", text=prompt2, max_length=100)

In [None]:
# Test the "saul_7b_instruct" model on both endpoints:
# prompt1 goes to /generate, prompt2 to /generate_stream.
generate_text(model_name="saul_7b_instruct", text=prompt1, max_length=50)
generate_text_stream(model_name="saul_7b_instruct", text=prompt2, max_length=50)

In [None]:
# Test the "lawma_8b" model on both endpoints:
# prompt1 goes to /generate, prompt2 to /generate_stream.
generate_text(model_name="lawma_8b", text=prompt1, max_length=50)
generate_text_stream(model_name="lawma_8b", text=prompt2, max_length=50)


In [None]:
# Test the "DeepSeek-V2-Lite" model on both endpoints. Note the prompts are
# swapped here: prompt2 goes to /generate, prompt1 to /generate_stream.
generate_text(model_name="DeepSeek-V2-Lite", text=prompt2, max_length=200)
generate_text_stream(model_name="DeepSeek-V2-Lite", text=prompt1, max_length=200)