In [26]:
#from openai import OpenAI
from pathlib import Path
import os
from dotenv import load_dotenv
#import time
from collections import deque
import json
# import gzip
# import itertools
import re

# Load variables from a local .env file into the environment.
# presumably this is where OPENROUTER_API_KEY comes from — confirm against the .env file.
load_dotenv()

# NOTE(review): the OpenAI client is disabled here, so no `client` name exists
# after this cell runs; code that references `client` needs it restored or must
# go through an HTTP helper instead.
# client = OpenAI(
#   base_url="https://openrouter.ai/api/v1",
#   api_key=os.environ['OPENROUTER_API_KEY']
# )

# Directory that holds the sample log files (`<number>.log`).
logfiles_path = Path('logfiles')
# Directory for the gzip-compressed conversation archives; created eagerly so
# later cells can glob it without checking for existence.
conversations_root = Path('conversations')
conversations_root.mkdir(exist_ok=True, parents=True)


In [30]:
from typing import List, Dict
import requests

# Default model used for every chat completion request.
CHAT_MODEL = 'google/gemini-2.0-flash-001'

def chat(messages: List[Dict[str, str]], model: str = CHAT_MODEL, timeout: float = 120.0) -> str:
    '''Send a chat completion request to OpenRouter and return the reply text.

    Args:
        messages: Conversation so far, as a list of ``{'role': ..., 'content': ...}`` dicts.
        model: OpenRouter model identifier. Defaults to ``CHAT_MODEL``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits forever and a stalled connection would
            hang the notebook cell.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        KeyError: if ``OPENROUTER_API_KEY`` is not set in the environment.
        requests.HTTPError: if the server answers with an HTTP error status.
        RuntimeError: if the response body carries an error object or the
            reply has no text content.
    '''
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {os.environ['OPENROUTER_API_KEY']}",
        },
        # `json=` serialises the payload and sets the JSON Content-Type header.
        json={
            'model': model,
            'messages': messages,
        },
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()
    # A failure can be reported inside an otherwise successful response body;
    # surface it explicitly instead of failing later with an opaque KeyError.
    if payload.get('error'):
        raise RuntimeError(f"API call failed: {payload['error']}")
    reply = payload['choices'][0]['message']['content']
    if not isinstance(reply, str):
        raise RuntimeError(f'model returned no text content: {payload}')
    return reply
    
def find_next_unused_file(root: Path, suffix: str, limit: int = 1000000) -> Path:
    '''Return the first path of the form ``root/[number][suffix]`` that does not exist.

    Numbers are probed in ascending order starting at 0, so gaps left by
    deleted files are reused before new numbers are handed out.

    Args:
        root: Existing directory to search in.
        suffix: File suffix including the dot, e.g. ``'.log'``.
        limit: Number of candidates to probe before giving up.

    Returns:
        A path inside ``root`` that does not exist yet. The file is not created.

    Raises:
        FileNotFoundError: if ``root`` does not exist.
        RuntimeError: if all ``limit`` candidate names are taken.
    '''
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not root.exists():
        raise FileNotFoundError(f'directory does not exist: {root}')
    # Linear probe on purpose: a binary search would only be valid if the
    # existing numbers were guaranteed to be contiguous.
    for i in range(limit):
        # Probe inside `root`, the directory the caller asked for.
        outpath = root / f'{i}{suffix}'
        if not outpath.exists():
            return outpath
    raise RuntimeError(f'no unused {suffix} file name below {limit} in {root}')

In [None]:
# Prompt used to synthesise one sample log file.
LOG_GENERATION_PROMPT = "Generate a plausible log file, as would be emitted from some application or service. It should contain both uninteresting and interesting lines. Reply ONLY with the log lines. No explanations, markdown quotes or any other form of framing."
# How many times the model is asked before the cell gives up.
LOG_GENERATION_ATTEMPTS = 4

for retry in range(LOG_GENERATION_ATTEMPTS):
    content = chat([
        {
            "role": "user",
            "content": LOG_GENERATION_PROMPT,
        }
    ])
    if not content.strip():
        # An empty reply would otherwise be written out as an empty log file.
        print('empty llm response. retrying')
    elif content.lstrip().startswith('```'):
        # Ignore leading whitespace so a fence on the second line is still caught.
        print('undesired framing in llm response. retrying')
    else:
        break
else:
    # Reached only when no attempt hit `break`.
    raise Exception('model is stubborn. giving up')

# Store the accepted reply under the next free `<number>.log` name.
logfiles_path.mkdir(exist_ok=True, parents=True)
outpath = find_next_unused_file(logfiles_path, '.log')
with outpath.open('w') as f:
    f.write(content)

In [None]:
# Total attempts per log file before the last error is re-raised.
FORMAT_CHECK_MAX_ATTEMPTS = 11
# Whole-word match, so "not" or "know" are not mistaken for "no".
_YES_NO_RE = re.compile(r'\b(yes|no)\b')


def parse_yes_no(msg: str) -> bool:
    '''Interpret a yes/no reply from the model.

    The first whole word "yes" or "no" (case-insensitive) decides the result.

    Raises:
        Exception: if the reply contains neither word.
    '''
    verdict = _YES_NO_RE.search(msg.lower())
    if verdict is None:
        raise Exception(f'bad reply from model: {msg}')
    return verdict.group(1) == 'yes'


# Collects (path, looks_like_raw_log) for every stored log file.
format_check = []
for p in sorted(logfiles_path.glob('*.log')):
    logfile = p.read_text()
    for retry in range(FORMAT_CHECK_MAX_ATTEMPTS):
        try:
            # Goes through the shared chat() helper: the OpenAI `client` object
            # is commented out in the setup cell, so calling it raised NameError.
            # NOTE: chat() uses CHAT_MODEL; the removed client call targeted
            # 'meta-llama/llama-4-maverick:free'.
            reply = chat([
                {
                    "role": "user",
                    "content": f"Does the following text look like a raw log file? It mustn't contain any additional framing, only the actual log lines. Answer ONLY with yes or no:\n{logfile}"
                }
            ])
            good = parse_yes_no(reply)
            break
        except Exception as e:
            # Retry on any failure (transport error or unparsable reply) until
            # the attempt budget is used up, then let the last error propagate.
            if retry + 1 < FORMAT_CHECK_MAX_ATTEMPTS:
                print(f'{e}. retrying..')
            else:
                raise
    format_check.append((p, good))
    print(f'{p}: {good}')
          


In [None]:
import datasets

# System message sent with every rating request. Its text is also stored as the
# 'instruction' field of each generated example, so it must stay stable.
system_prompt = dict(
    role='system',
    content='''You are a developer log analyzer.
Given a sequence of log lines. Rate only the last line. Use the prior lines only for context.
Rate that line by how interesting you think that line is for diagnosing an issue with the system.
Output EXACTLY in this format:
```
Very brief single-sentence analysis on a single line
SCORE: 0-100
```

Do NOT include any code examples, snippets, or additional explanations.
Keep responses strictly limited to the analysis and score.
Do NOT include any additional framing such as ````.

Score guide:
Low (0-30): Routine/minor info
Medium (31-70): Noteworthy/important
High (71-100): Critical/security issues
''',
)

# Accepted reply shape: exactly one line of analysis, a newline, then
# "SCORE: " followed by 100 or a one- or two-digit number.
formatre = re.compile(r'^.*\nSCORE: (?:100|\d{1,2})$')

#for log_path in [output / '0.log']:
def _rate_last_line(context: str, max_attempts: int = 4):
    '''Ask the model to rate the last line of ``context``.

    Args:
        context: Newline-terminated log lines; the final one is the line to rate.
        max_attempts: How many replies to request before giving up.

    Returns:
        The first reply (surrounding whitespace removed) that matches
        ``formatre``, or ``None`` if no attempt produced a well-formed reply.
    '''
    query = {
        'role': 'user',
        'content': context,
    }
    for _ in range(max_attempts):
        # Uses the shared chat() helper: the OpenAI `client` object is commented
        # out in the setup cell. chat() defaults to CHAT_MODEL, the same model
        # that was hard-coded here ('google/gemini-2.0-flash-001').
        message = chat([system_prompt, query]).strip()
        if formatre.match(message) is not None:
            return message
        print(f'bad reply from model: {message}')
    return None


def generate_conversations(context_size: int = 3, max_attempts: int = 4):
    '''Yield one training example per rated log line.

    Every ``*.log`` file under ``logfiles_path`` is read line by line. For each
    line the model sees a sliding window of the most recent ``context_size``
    lines (the current one last) and is asked to rate the last line.

    Args:
        context_size: Number of lines in the sliding window, current line included.
        max_attempts: Replies requested per line before the line is skipped.

    Yields:
        Dicts with the keys ``'instruction'`` (system prompt text), ``'input'``
        (the window, one line per row) and ``'output'`` (the model's reply).
        Lines for which no well-formed reply was obtained are skipped.
    '''
    for log_path in logfiles_path.glob('*.log'):
        print(f'evaluating {log_path}\n===========\n\n')
        # The deque drops the oldest line automatically once it is full.
        history = deque(maxlen=context_size)
        with log_path.open('r') as f:
            for raw_line in f:
                history.append(raw_line.rstrip())
                lines = ''.join(f'{l}\n' for l in history)
                message = _rate_last_line(lines, max_attempts)
                if message is None:
                    print('the model is obstinate, ignoring this line')
                    continue
                print(f'{lines}{message}\n')
                yield {
                    'instruction': system_prompt['content'],
                    'input': lines,
                    'output': message,
                }

# Materialise the generator; this issues one or more API calls per log line.
dataset_list = list(generate_conversations())




In [None]:
# Re-shape every instruction/input/output record into a three-turn chat
# transcript: system -> user -> assistant.
dataset_list2 = [
    {
        'conversations': [
            {'role': role, 'content': record[field]}
            for role, field in (
                ('system', 'instruction'),
                ('user', 'input'),
                ('assistant', 'output'),
            )
        ]
    }
    for record in dataset_list
]

In [None]:
# Show the size and a short preview instead of dumping every record into the
# cell output, which bloats the notebook file.
print(f'{len(dataset_list2)} examples')
dataset_list2[:3]

In [None]:
# Build a Hugging Face Dataset from the list of {'conversations': [...]} records
# (one dataset row per list element).
hf_dataset = datasets.Dataset.from_list(dataset_list2)


In [None]:
# The output root can be overridden with the LLMOG_DATA_DIR environment
# variable so the notebook also runs on machines without this user-specific
# Google Drive mount; the default keeps the original location.
llmog_data_root = Path(os.environ.get(
    'LLMOG_DATA_DIR',
    '/Users/joel/moushkka@gmail.com - Google Drive/My Drive/colab/llmog_data',
))
hf_dataset_path = llmog_data_root / 'dataset2'
hf_dataset_path.mkdir(exist_ok=True, parents=True)
hf_dataset.save_to_disk(hf_dataset_path)

In [None]:
import datasets
import gzip
import json
import re

def load_conversations_generator(root=None):
    """Load and parse conversations from ``*.json.gz`` files.

    Args:
        root: Directory to scan. Defaults to ``conversations_root`` when omitted,
            so the function can still be passed to ``Dataset.from_generator``
            without arguments.

    Yields:
        One ``{'conversations': <parsed JSON>}`` dict per archive, in sorted
        path order so the resulting dataset has a reproducible row order.
    """
    if root is None:
        root = conversations_root
    # glob() order is unspecified; sort to make the output deterministic.
    for file_path in sorted(Path(root).glob('*.json.gz')):
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            conversations_list = json.load(f)
        yield {'conversations': conversations_list}

# Create the Hugging Face Dataset using the generator.
# NOTE(review): this rebinds `hf_dataset` and `hf_dataset_path`, replacing the
# values assigned by the earlier list-based cells.
hf_dataset = datasets.Dataset.from_generator(load_conversations_generator)

# The output root can be overridden with the LLMOG_DATA_DIR environment
# variable so the notebook also runs on machines without this user-specific
# Google Drive mount; the default keeps the original location.
llmog_data_root = Path(os.environ.get(
    'LLMOG_DATA_DIR',
    '/Users/joel/moushkka@gmail.com - Google Drive/My Drive/colab/llmog_data',
))
hf_dataset_path = llmog_data_root / 'dataset'
hf_dataset_path.mkdir(exist_ok=True, parents=True)
hf_dataset.save_to_disk(hf_dataset_path)

# Print the dataset info and the first example
print(hf_dataset)
if len(hf_dataset) > 0:
    print("\nFirst example:")
    print(hf_dataset[0])
else:
    print("\nDataset is empty. Check warnings/errors during generation.")

In [None]:
# Pretty-print the first conversation archive for manual inspection.
with gzip.open('conversations/0.json.gz', mode='rt', encoding='utf-8') as archive:
    data = json.loads(archive.read())
print(json.dumps(data, indent=4))