In [None]:
from pathlib import Path
import os
from dotenv import load_dotenv
from collections import deque
import json
import re

# Load environment variables via python-dotenv; chat() reads
# OPENROUTER_API_KEY from os.environ.
load_dotenv()

# Directory (relative to the current working directory) that holds the
# synthetic log files.
logfiles_path = Path('logfiles')


In [None]:
from typing import List, Dict
import requests

CHAT_MODEL = 'google/gemini-2.0-flash-001'
# Upper bound (seconds) for a single completion request. `requests` has no
# default timeout, so without this a stalled connection blocks forever.
CHAT_TIMEOUT_SECONDS = 120

def chat(messages: List[Dict[str, str]]) -> str:
    """Send `messages` to the OpenRouter chat-completions endpoint and return the reply text.

    Args:
        messages: Chat messages, each a dict with 'role' and 'content' keys.

    Returns:
        The 'content' field of the first choice in the response.

    Raises:
        KeyError: If OPENROUTER_API_KEY is not set in the environment, or the
            response JSON lacks the expected fields.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the request exceeds CHAT_TIMEOUT_SECONDS.
    """
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
        },
        # `json=` serializes the payload and sets the Content-Type header.
        json={
            'model': CHAT_MODEL,
            "messages": messages,
        },
        timeout=CHAT_TIMEOUT_SECONDS,
    )
    response.raise_for_status()
    return response.json()['choices'][0]['message']['content']
    
def find_next_unused_file(root: Path, suffix: str) -> Path:
    '''Return the first non-existing path of the form ``root/[number][suffix]``.

    Numbers are tried in ascending order starting at 0.

    Args:
        root: Existing directory in which to look for a free file name.
        suffix: Text appended after the number, e.g. ``'.log'``.

    Returns:
        A path inside `root` that does not exist yet. Nothing is created.

    Raises:
        AssertionError: If `root` does not exist, or no free name was found
            below the search limit.
    '''
    assert root.exists(), f'{root} does not exist'
    # TODO: binary search
    for i in range(0, 1000000):
        # Probe inside the directory the caller asked for, not the global one.
        outpath = root / f'{i}{suffix}'
        if not outpath.exists():
            return outpath
    # Raised explicitly so the failure is not compiled away under `python -O`.
    raise AssertionError(f'no unused file name found in {root}')

In [None]:
# synthesize a bunch of log files. put them in `logfiles`

from concurrent.futures import ThreadPoolExecutor
from typing import Optional

NUM_FILES_TO_GENERATE = 10

def gen_log() -> Optional[str]:
  '''Ask the model for one synthetic log file.

  Makes up to four attempts; a reply wrapped in a markdown code fence is
  rejected and retried.

  Returns:
    The raw reply text, or None if every attempt came back with framing.
  '''
  for _ in range(4):
    content = chat([
        {
          "role": "user",
          "content": '''Generate a plausible log file, as would be emitted from some application or service.
It should contain both uninteresting and interesting lines, including interesting lines that aren't clearly marked as that.
Include poorly formatted log lines.
Reply ONLY with the log lines. No explanations, markdown quotes or any other form of framing.
            '''
        }
      ])
    # Check a stripped copy so a fence hidden behind leading or trailing
    # whitespace is still detected; the reply itself is returned untouched.
    trimmed = content.strip()
    if trimmed.startswith('```') or trimmed.endswith('```'):
      print('undesired framing in llm response. retrying')
    else:
      return content
  print('model is stubborn. giving up')
  return None

# Create the output directory once instead of once per generated file.
logfiles_path.mkdir(exist_ok=True, parents=True)

with ThreadPoolExecutor() as executor:
  # Requests run concurrently; files are written sequentially below so the
  # "next unused file name" lookup never races with itself.
  logs = executor.map(lambda _: gen_log(), range(NUM_FILES_TO_GENERATE))
  for log in logs:
    if log is None:
      continue
    outpath = find_next_unused_file(logfiles_path, '.log')
    outpath.write_text(log)

In [None]:
# ask the model if the generated files look good.
# it isn't very accurate or useful

# Total attempts per file: the first try plus ten retries.
MAX_FORMAT_CHECK_ATTEMPTS = 11

def parse_yes_no(msg: str) -> bool:
    '''Interpret a model reply as a yes/no answer.

    Matches whole words only, so replies such as "unknown" or "cannot tell"
    are not mistaken for "no". "yes" takes precedence when both words occur.

    Raises:
        Exception: If the reply contains neither word.
    '''
    lmsg = msg.lower()
    if re.search(r'\byes\b', lmsg):
        return True
    if re.search(r'\bno\b', lmsg):
        return False
    raise Exception(f'bad reply from model: {msg}')

# List of (path, looks_like_raw_log) pairs, one per checked log file.
format_check = []
for p in logfiles_path.glob('*.log'):
    logfile = p.read_text()
    for attempt in range(1, MAX_FORMAT_CHECK_ATTEMPTS + 1):
        try:
            response = chat([
                {
                  "role": "user",
                  "content": f"Does the following text look like a raw log file? It mustn't contain any additional framing, only the actual log lines. Answer ONLY with yes or no:\n{logfile}"
                }
              ]
            )
            good = parse_yes_no(response)
            break
        except Exception as e:
            # Request failures and unparseable replies are both retried;
            # the last failure is re-raised once the budget is spent.
            if attempt == MAX_FORMAT_CHECK_ATTEMPTS:
                raise
            print(f'{e}. retrying..')
    format_check.append((p, good))
    print(f'{p}: {"good" if good else "bad"}')
          


In [None]:
import datasets

# System message sent with every rating request. The same dict is also
# embedded in each generated training conversation.
system_prompt = dict(
    role='system',
    content='''You are a developer log analyzer.
Given a sequence of log lines. Rate only the last line. Use the prior lines only for context.
If a prior line looks unrelated to the last one, disregard it.
Rate the last line by how interesting you think it is for diagnosing an issue with the system.
Output EXACTLY in this format:
```
Very brief single-sentence analysis on a single line
SCORE: 0-100
```

Do NOT include any code examples, snippets, or additional explanations.
Keep responses strictly limited to the analysis and score.
Do NOT include any additional framing such as ````.

Score guide:
Low (0-30): Routine/minor info
Medium (31-70): Noteworthy/important
High (71-100): Critical/security issues
''',
)

# Accepts exactly one analysis line followed by a "SCORE: N" line, where N is
# 100 or a one- or two-digit number.
formatre = re.compile(r'^.*\nSCORE: (?:100|\d{1,2})$')

def iterate_line_windows(root=None, max_files=1, window_size=3):
  '''Yield sliding windows of consecutive log lines from the files in a directory.

  For every line of every selected ``*.log`` file, yields the last
  `window_size` lines seen so far in that file (fewer at the start of a
  file), each stripped of trailing whitespace and terminated by a newline.
  The final line of each window is the line being visited.

  Args:
    root: Directory to scan (a Path). Defaults to the global `logfiles_path`.
    max_files: Maximum number of files to read, or None for all of them.
    window_size: Maximum number of lines per window.

  Yields:
    One string per log line, containing the window ending at that line.
  '''
  if root is None:
    root = logfiles_path
  # glob() order is unspecified; sort so the selected files are reproducible.
  log_paths = sorted(root.glob('*.log'))
  if max_files is not None:
    log_paths = log_paths[:max_files]
  for log_path in log_paths:
    # A fresh history per file keeps windows from spanning two files.
    history = deque(maxlen=window_size)
    with log_path.open('r') as f:
      for line in f:
        history.append(line.rstrip())
        # TODO: concat a random number of lines to avoid overfitting
        yield ''.join(f'{l}\n' for l in history)

def generate_conversations():
  '''Yield one labelled training conversation per log-line window.

  Each window from iterate_line_windows() is sent to the model together with
  `system_prompt`. A reply is accepted only if it matches `formatre`; up to
  four attempts are made per window, after which the window is skipped.

  Yields:
    Dicts with a single 'conversations' key holding the system, user and
    assistant messages (each a dict with 'role' and 'content').
  '''
  for lines in iterate_line_windows():
    query = {
              'role': 'user',
              'content': lines,
            }
    for _ in range(4):
      # Strip surrounding whitespace before validating, so harmless padding
      # neither wastes a retry nor ends up in the stored training target.
      reply = chat([
              system_prompt,
              query,
            ]).strip()
      if formatre.match(reply) is not None:
        break
      print(f'bad reply from model: {reply}')
    else:
      # No attempt produced a well-formed reply.
      print('the model is obstinate, ignoring this line')
      continue
    assert isinstance(system_prompt['content'], str)
    assert isinstance(lines, str)
    assert isinstance(reply, str)
    yield {
      'conversations': [
        system_prompt,
        query,
        {
          'role': 'assistant',
          'content': reply
        }
      ]
    }

# Run the generator to completion; every yielded item costs at least one
# chat() request, so this cell is slow.
dataset_list = list(generate_conversations())


In [None]:
# NOTE(review): this passes the generator function itself, so the dataset is
# presumably built by running generate_conversations() again (new chat()
# requests) rather than by reusing `dataset_list`. A later cell reassigns
# `hf_dataset` from `dataset_list` — confirm this cell is still needed.
hf_dataset = datasets.Dataset.from_generator(generate_conversations)

In [None]:
# FIXME(review): executor.map() is called without a function or an iterable,
# so this cell raises TypeError as written. It looks like an unfinished
# attempt to parallelize dataset generation — complete it or remove the cell.
with ThreadPoolExecutor() as executor:
    hf_dataset = datasets.Dataset.from_generator(executor.map())

In [None]:
# Display the generated conversations (the whole list is rendered).
dataset_list

In [None]:
# NOTE(review): this reads 'instruction', 'input' and 'output' from each item
# of `dataset_list`, but generate_conversations() in this notebook yields
# dicts with a single 'conversations' key, so on a fresh kernel this
# presumably raises KeyError. `dataset_list2` is only displayed afterwards,
# never saved. TODO confirm, then fix or remove.
dataset_list2 = [{'conversations': [{'role': 'system', 'content': d['instruction']}, {'role': 'user', 'content': d['input']}, {'role': 'assistant', 'content': d['output']}]} for d in dataset_list]

In [None]:
# Display the reshaped records (the whole list is rendered).
dataset_list2

In [None]:
# Build the dataset from the conversations already held in `dataset_list`.
hf_dataset = datasets.Dataset.from_list(dataset_list)


In [None]:
# Root directory for saved datasets. Set LLMOG_DATA_DIR (for example in .env)
# to override the machine-specific default below.
hf_data_root = Path(os.environ.get(
    'LLMOG_DATA_DIR',
    '/Users/joel/moushkka@gmail.com - Google Drive/My Drive/colab/llmog_data',
))
hf_dataset_path = hf_data_root / 'dataset3'
hf_dataset_path.mkdir(exist_ok=True, parents=True)
hf_dataset.save_to_disk(hf_dataset_path)

In [None]:
import datasets
import gzip
import json
import re

def load_conversations_generator(root=None):
    """Yield one dataset record per ``*.json.gz`` conversation file.

    Files are read in sorted name order so the resulting dataset order is
    reproducible.

    Args:
        root: Directory to scan. Defaults to the global `conversations_root`,
            which must be defined before calling without arguments (it is
            not defined in the cells shown here).

    Yields:
        Dicts of the form ``{'conversations': <parsed JSON content of one file>}``.
    """
    if root is None:
        root = conversations_root
    for file_path in sorted(Path(root).glob('*.json.gz')):
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            conversations_list = json.load(f)
        yield {'conversations': conversations_list}

# Create the Hugging Face Dataset using the generator
hf_dataset = datasets.Dataset.from_generator(load_conversations_generator)

# Root directory for saved datasets. Set LLMOG_DATA_DIR (for example in .env)
# to override the machine-specific default below.
hf_data_root = Path(os.environ.get(
    'LLMOG_DATA_DIR',
    '/Users/joel/moushkka@gmail.com - Google Drive/My Drive/colab/llmog_data',
))
hf_dataset_path = hf_data_root / 'dataset'
hf_dataset_path.mkdir(exist_ok=True, parents=True)
hf_dataset.save_to_disk(hf_dataset_path)

# Print the dataset info and the first example
print(hf_dataset)
if len(hf_dataset) > 0:
    print("\nFirst example:")
    print(hf_dataset[0])
else:
    print("\nDataset is empty. Check warnings/errors during generation.")

In [None]:
# Pretty-print the first stored conversation file for manual inspection.
with gzip.open('conversations/0.json.gz', 'rt', encoding='utf-8') as fh:
    data = json.load(fh)
    pretty = json.dumps(data, indent=4)
    print(pretty)