In [None]:
!pip install --upgrade requests datasets

In [None]:
import os
is_colab = any('COLAB_' in k for k in os.environ.keys())

if is_colab:
  from google.colab import userdata
  HF_TOKEN = userdata.get('HF_TOKEN')
  OPENROUTER_API_KEY = userdata.get("OPENROUTER_API_KEY")
else:
  !pip install dotenv
  from dotenv import load_dotenv
  load_dotenv()
  HF_TOKEN = os.environ["HF_TOKEN"]
  OPENROUTER_API_KEY = os.environ["OPENROUTER_API_KEY"]


In [None]:
from pathlib import Path
import os
from collections import deque
import json
import re
import datasets
from concurrent.futures import ThreadPoolExecutor
from typing import Optional, Any, Dict
import random

# Directory the synthesized log files are written to and later read back from.
logfiles_path = Path("logfiles")
# Directory the synthesized non-log text files are written to and later read back from.
non_logfiles_path = Path("non_logfiles")

In [None]:
from typing import List, Dict
import requests

# Model identifier sent as "model" in every chat completion request made by `chat`.
CHAT_MODEL = "google/gemini-2.0-flash-001"


def chat(messages: List[Dict[str, str]], timeout: float = 120.0) -> str:
    """Send a conversation to the OpenRouter chat completions endpoint and return the reply text.

    Args:
        messages: The conversation as a list of dicts with "role" and "content" keys.
        timeout: Seconds to wait for the server before giving up. Without a timeout
            `requests` waits indefinitely, which would block a worker thread forever
            on a stalled connection.

    Returns:
        The "content" of the first choice in the response.

    Raises:
        requests.exceptions.RequestException: on connection problems, timeouts, or a
            non-success HTTP status (raised by `raise_for_status`).
    """
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        },
        # `json=` serializes the payload and sets the Content-Type header to application/json
        json={
            "model": CHAT_MODEL,
            "messages": messages,
        },
        timeout=timeout,
    )
    response.raise_for_status()
    message = response.json()["choices"][0]["message"]
    assert message["role"] == "assistant"
    reply = message["content"]
    return reply


def find_next_unused_file(root: Path, suffix: str) -> Path:
    """Return the first path `root/<number><suffix>` that does not exist yet.

    Numbers are tried in ascending order starting at 0, so a gap left by a
    removed file is reused before a higher number is handed out.
    `root` itself must already exist.
    """
    assert root.exists()
    # TODO: binary search
    candidates = (root / f"{index}{suffix}" for index in range(0, 1000000))
    unused = next(
        (candidate for candidate in candidates if not candidate.exists()),
        None,
    )
    # every candidate name being taken is not expected to happen
    assert unused is not None
    return unused

# Opening fence at the very start of the text: ``` plus an optional language tag and a newline.
_start_markdown_re = re.compile(r"^```\w*\n")
# Closing fence at the very end of the text: ``` followed by nothing but newlines.
_end_markdown_re = re.compile(r"```\n*$")
def strip_markdown_framing(s: str) -> str:
    """Return `s` without a surrounding markdown code fence.

    The text is only changed when BOTH an opening fence at the start and a
    closing fence at the end are found; otherwise `s` is returned untouched.
    """
    opening = _start_markdown_re.search(s)
    closing = _end_markdown_re.search(s)
    if not (opening and closing):
        return s
    return s[opening.end() : closing.start()]

In [None]:
# synthesize a bunch of log files. put them in `logfiles`

from concurrent.futures import ThreadPoolExecutor
from typing import Optional

NUM_FILES_TO_GENERATE = 100

# Create the output directory once up front instead of on every loop iteration.
logfiles_path.mkdir(exist_ok=True, parents=True)

# pass max_workers=<some high value> to ThreadPoolExecutor to generate data faster
with ThreadPoolExecutor() as executor:

    def gen_log() -> Optional[str]:
        """Ask the model for one synthetic log file.

        Up to four requests are made. A markdown fence wrapping the whole reply is
        stripped; if a lone fence is still left at the start or at the end, the reply
        is discarded and a new one is requested.

        Returns:
            The log text, or None if none of the attempts produced a reply without
            leftover framing.
        """
        for _ in range(4):
            content = chat(
                [
                    {
                        "role": "user",
                        "content": """Generate a plausible log file, as would be emitted from some application or service.
It should contain both uninteresting and interesting lines, including interesting lines that aren't clearly marked as that.
Include poorly formatted log lines.
Pick a random timestamp format, or don't include one at all. But use the same format for the entire file.
Reply ONLY with the log lines. No explanations, markdown quotes or any other form of framing.
            """,
                    }
                ]
            )
            content = strip_markdown_framing(content)
            # The closing fence sits at the END of the text, so it has to be looked for
            # with `search`; `match` only ever inspects the beginning of the string.
            if _start_markdown_re.match(content) or _end_markdown_re.search(content):
                print("undesired framing in llm response. retrying")
            else:
                return content
        print("model is stubborn. giving up")
        return None

    logs = executor.map(lambda _: gen_log(), range(NUM_FILES_TO_GENERATE))
    for log in logs:
        if log is None:
            # the model never produced a usable reply for this file
            continue
        outpath = find_next_unused_file(logfiles_path, ".log")
        with outpath.open("w") as f:
            f.write(log)

In [None]:
# Synthesize a bunch of non-log files and store them in `non_logfiles` with a .txt extension.


NUM_NON_LOG_FILES = 50

# Create the output directory once up front instead of on every loop iteration.
non_logfiles_path.mkdir(exist_ok=True, parents=True)

# set max_workers to some high value to generate data faster
with ThreadPoolExecutor(max_workers=50) as executor:

    def gen_non_log() -> str:
        """Ask the model for one piece of text that does not resemble a log file.

        The reply is accepted as is after a surrounding markdown fence (if any) has
        been stripped, so exactly one request is made per call.

        Returns:
            The generated text.
        """
        content = chat(
            [
                {
                    "role": "user",
                    "content": """Generate random text content that does NOT resemble a log file.
It could be a snippet of an article, code, configuration, documentation, or other non-log content.
Reply ONLY with the text content. No explanations, markdown quotes or any other form of framing.
Do NOT start or end with ```.
            """,
                }
            ]
        )
        # the model likes to add markdown code framing even though we tell it not to. so we strip that out
        return strip_markdown_framing(content)

    non_logs = executor.map(lambda _: gen_non_log(), range(NUM_NON_LOG_FILES))
    for non_log in non_logs:
        outpath = find_next_unused_file(non_logfiles_path, ".txt")
        with outpath.open("w") as f:
            f.write(non_log)

In [None]:
def iterate_logs():
    """Yield one `{"logs": <file text>}` record for every `*.log` file in `logfiles_path`."""
    for log_file in logfiles_path.glob("*.log"):
        yield {"logs": log_file.read_text()}


# Build a dataset from the records above.
# NOTE(review): `from_generator` may reuse a cached result when only the files on disk
# changed - confirm the dataset is rebuilt after regenerating the logs.
hf_raw_logs_dataset = datasets.Dataset.from_generator(iterate_logs)

In [None]:
# Upload the raw log dataset to the Hub repository "jnises/llmog-raw-logs", authenticating with HF_TOKEN.
hf_raw_logs_dataset.push_to_hub("jnises/llmog-raw-logs", token=HF_TOKEN)

In [None]:
import datasets


def iterate_non_logs():
    """Yield one `{"non_log": <file text>}` record for every `*.txt` file in `non_logfiles_path`."""
    for text_file in non_logfiles_path.glob("*.txt"):
        yield {"non_log": text_file.read_text()}


# Build a dataset from the records above.
hf_raw_non_logs_dataset = datasets.Dataset.from_generator(iterate_non_logs)

In [None]:
# Upload the raw non-log dataset to the Hub repository "jnises/llmog-raw-non-logs", authenticating with HF_TOKEN.
hf_raw_non_logs_dataset.push_to_hub("jnises/llmog-raw-non-logs", token=HF_TOKEN)

In [None]:
# Optional sanity check: ask the model whether each generated file looks like a raw log file.
# Disabled (`if False`) because the answers are not very accurate or useful.

if False:
    # Number of retries allowed per file after the initial attempt.
    MAX_FORMAT_CHECK_RETRIES = 10

    def fun(msg: str) -> bool:
        """Interpret the model's yes/no answer.

        Whole words are matched so that e.g. "not" or "know" is not mistaken for "no".

        Raises:
            Exception: if the reply contains neither "yes" nor "no" as a word.
        """
        lmsg = msg.lower()
        if re.search(r"\byes\b", lmsg):
            return True
        if re.search(r"\bno\b", lmsg):
            return False
        raise Exception(f"bad reply from model: {msg}")

    # list of (path, verdict) pairs, one per checked log file
    format_check = []
    for p in logfiles_path.glob("*.log"):
        logfile = p.read_text()

        for retry in range(MAX_FORMAT_CHECK_RETRIES + 1):
            try:
                response = chat(
                    [
                        {
                            "role": "user",
                            "content": f"Does the following text look like a raw log file? It mustn't contain any additional framing, only the actual log lines. Answer ONLY with yes or no:\n{logfile}",
                        }
                    ]
                )
                good = fun(response)
                break
            except Exception as e:
                # give up (and surface the error) once the retries are used up
                if retry == MAX_FORMAT_CHECK_RETRIES:
                    raise
                print(f"{e}. retrying..")
        format_check.append((p, good))
        print(f"{p}: {'good' if good else 'bad'}")


In [None]:
# Load the 'train' split of both raw datasets from the Hub and keep only the text column of each
# ('logs' and 'non_log' respectively).
log_dataset = datasets.load_dataset('jnises/llmog-raw-logs', token=HF_TOKEN, split='train')['logs']
non_log_dataset = datasets.load_dataset('jnises/llmog-raw-non-logs', token=HF_TOKEN, split='train')['non_log']


In [None]:
# synthesize conversations where the model rates each line in the log files

# System prompt sent to the model when the ratings are generated (see generate_conversations).
# It is separate from `system_prompt` because it is only ever used with actual log files,
# so it carries no instructions for input that is not a log.
trainer_system_prompt = {
    "role": "system",
    "content": """You are a developer log analyzer.
Given a sequence of log lines. Rate the last line by how interesting you think it is for diagnosing an issue with the system.
Rate only the last line. Use the prior lines only for context.
If a prior line looks unrelated to the last one, disregard it.
Output EXACTLY in this format:
```
Very brief single-sentence analysis on a single line
SCORE: 0-100
```

Do NOT include any code examples, snippets, or additional explanations.
Keep responses strictly limited to the analysis and score.
Do NOT include any additional framing such as ```.
Do NOT start the analysis with "The last line" or similar redundant information.

Score guide:
Low (0-30): Routine/minor info
Medium (31-70): Noteworthy/important
High (71-100): Critical/security issues
""",
}

# System prompt stored in every generated conversation (see generate_conversations).
# In addition to the rating instructions it tells the model to answer
# "Not a log" / "SCORE: 0" when the input does not look like a log file.
system_prompt = {
    "role": "system",
    "content": """You are a developer log analyzer.
Given a sequence of lines of text. Determine if it looks like a log file or not.
If it looks like a log rate the last line by how interesting you think it is for diagnosing an issue with the system.
Rate only the last line. Use the prior lines only for context.
If a prior line looks unrelated to the last one, disregard it.
Output EXACTLY in this format:
```
Very brief single-sentence analysis on a single line
SCORE: 0-100
```

If it doesn't look like a log file just respond with:
```
Not a log
SCORE: 0
```

Do NOT include any code examples, snippets, or additional explanations.
Keep responses strictly limited to the analysis and score.
Do NOT include any additional framing such as ```.
Do NOT start the analysis with "The last line" or similar redundant information.

Score guide:
Low (0-30): Routine/minor info
Medium (31-70): Noteworthy/important
High (71-100): Critical/security issues
""",
}

# Accepted reply shape: a single analysis line followed by a "SCORE: <number>" line (100 or one to
# two digits), optionally wrapped in ``` fences. The named group `response` excludes the fences.
formatre = re.compile(r"^(?:```\n)?(?P<response>.*\nSCORE: (?:100|\d{1,2})\n?)(?:\n?```\n?)?$")


def iterate_line_windows(files, max_history: int = 10, rng=None):
    """Yield, for every line of every file, that line preceded by a random amount of context.

    Args:
        files: Iterable of strings, each holding the complete text of one file.
        max_history: Largest number of lines (the current one included) a window may hold.
        rng: Optional object providing `randint(a, b)`, e.g. a seeded `random.Random`,
            to make the windows reproducible. Defaults to the global `random` module.

    Yields:
        One string per input line: between 1 and `max_history` consecutive lines that
        end with the current line, each stripped of trailing whitespace and terminated
        by a newline.

    Raises:
        ValueError: if `max_history` is smaller than 1.
    """
    if max_history < 1:
        raise ValueError(f"max_history must be at least 1, got {max_history}")
    if rng is None:
        rng = random
    for file in files:
        # the history starts empty for each file, so a window never spans two files
        history = deque(maxlen=max_history)
        for line in file.splitlines():
            history.append(line.rstrip())
            # random context size, limited by how many lines have been seen so far
            num_lines = rng.randint(1, len(history))
            window = list(history)[-num_lines:]
            yield "".join(f"{text}\n" for text in window)


def generate_conversations():
    """Yield the conversations that make up the final dataset.

    Two kinds of records are produced, each shaped as
    `{"conversations": [system_prompt, user message, assistant message]}`:

    * For every line window of `non_log_dataset` a fixed "Not a log" answer is used,
      so no request is made.
    * For every line window of `log_dataset` the model is asked (in worker threads)
      to rate the last line. The request uses `trainer_system_prompt`, while the
      stored conversation carries `system_prompt`.

    Windows for which no well-formed reply could be obtained are skipped.
    """
    import time  # local import: only needed for the retry backoff below

    max_attempts = 4

    # set max_workers to some high value to generate data faster
    with ThreadPoolExecutor(max_workers=100) as executor:

        def f(lines) -> Optional[Dict[str, Any]]:
            """Rate one line window; return the conversation, or None after `max_attempts` failures."""
            query = {
                "role": "user",
                "content": lines,
            }
            for attempt in range(max_attempts):
                try:
                    reply = chat(
                        [
                            trainer_system_prompt,
                            query,
                        ]
                    )
                except (
                    ConnectionResetError,
                    requests.exceptions.RequestException,
                ) as e:
                    print(f"exception communicating with model: {e}")
                    if attempt + 1 < max_attempts:
                        # Exponential backoff with jitter, so that the many workers do not
                        # all retry at once against a failing or rate-limiting server.
                        time.sleep(2**attempt + random.random())
                    continue
                # formatre tolerates an optional ``` wrapper; only the text inside it is kept
                if m := formatre.match(reply):
                    reply = m.group('response')
                    # if reply == 'Not a log\nSCORE: 0\n':
                    #     # we know this is a log. so just tell the model to try again
                    #     continue
                else:
                    print(f"bad reply from model: {reply}")
                    continue
                break
            else:
                # every attempt failed
                print("the model is obstinate, ignoring this line")
                return None
            assert isinstance(system_prompt["content"], str)
            assert isinstance(lines, str)
            assert isinstance(reply, str)
            return {
                "conversations": [
                    system_prompt,
                    query,
                    {"role": "assistant", "content": reply},
                ]
            }

        # negative examples first: the answer is known, so the model is not consulted
        for non_log in iterate_line_windows(non_log_dataset):
            assert isinstance(non_log, str)
            yield {
                'conversations': [
                    system_prompt,
                    {'role': 'user', 'content': non_log},
                    {'role': 'assistant', 'content': 'Not a log\nSCORE: 0\n'}
                ]
            }
        results_iterator = executor.map(f, iterate_line_windows(log_dataset))
        for result in results_iterator:
            if result is not None:
                yield result


hf_dataset = datasets.Dataset.from_generator(generate_conversations)

In [None]:
# Display the dataset's representation as the cell output.
hf_dataset

In [None]:
# Hold out 10% of the conversations as a test split; the fixed seed is passed to make the split repeatable.
# NOTE(review): this rebinds `hf_dataset` to the result of the split, so re-running this cell
# would operate on that result rather than on the original dataset.
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=7834761)

In [None]:
# Upload the split conversations to the Hub repository "jnises/llmog-conversations", authenticating with HF_TOKEN.
hf_dataset.push_to_hub("jnises/llmog-conversations", token=HF_TOKEN)