In [27]:
from datasets import Dataset
import pandas as pd


# On-disk location of the dataset this part of the notebook edits.
dataset_path = "nl-logql-dataset-classified"

# Read the saved dataset, then materialise it as a pandas DataFrame so columns
# can be added and renamed with ordinary DataFrame operations.
dataset = Dataset.load_from_disk(dataset_path=dataset_path)
df = dataset.to_pandas()

In [21]:
# Show the line-filter classification stored for the first row.
# The classification dicts live in "log_category" until the rename further down
# in this notebook and in "log_category_result" afterwards, so accept either
# name; otherwise this cell raises KeyError when re-run on a renamed frame.
first_log_category = (
    df["log_category"] if "log_category" in df.columns else df["log_category_result"]
).iloc[0]
first_log_category.get("line_filter")

'multiple line filters'

In [22]:
# Pick whichever column holds the log-classification dicts: "log_category"
# before the rename performed later in this notebook, "log_category_result"
# after it. Without this fallback the cell raises KeyError when it is re-run
# on a frame whose column has already been renamed.
log_category_col = (
    "log_category" if "log_category" in df.columns else "log_category_result"
)

# Lift each classification label out of the per-row dict into its own column.
# Rows whose value is not a dict get None.
for key in ("line_filter", "label_filter"):
    df[key] = df[log_category_col].apply(
        # Bind `key` as a default so each lambda reads its own label name.
        lambda x, key=key: x.get(key) if isinstance(x, dict) else None
    )

# Display the first row to verify the new columns
display(df.head(1))

Unnamed: 0,application,id,question,logql_query,query_explanation,query_result,category,log_category,line_filter,label_filter
0,openstack,2,How long did it take to spawn instance 3edec1e...,"{application=""openstack"", log_file_type=""nova-...",bla,3edec1e4-9678-4a3a-a21b-a145a4ee5e61 took 20.58,,"{'chain_of_thought': 'Analyzing the query, it ...",multiple line filters,multiple log stream selectors


In [23]:
# Keep the raw classification dict under a "_result" name; the flattened
# line_filter / label_filter columns carry the individual labels.
df.rename({"log_category": "log_category_result"}, axis="columns", inplace=True)

In [25]:
# Convert the edited DataFrame back into a Dataset and write it to dataset_path.
# NOTE(review): dataset_path is the directory the data was loaded from at the
# top of the notebook, so this overwrites the source copy with the renamed and
# added columns — confirm that is intended.
Dataset.from_pandas(df).save_to_disk(dataset_path)

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [28]:
# Display the DataFrame (the notebook renders a truncated view of the rows).
df

Unnamed: 0,application,id,question,logql_query,query_explanation,query_result,category,log_category_result,line_filter,label_filter
0,openstack,2,How long did it take to spawn instance 3edec1e...,"{application=""openstack"", log_file_type=""nova-...",bla,3edec1e4-9678-4a3a-a21b-a145a4ee5e61 took 20.58,,"{'chain_of_thought': 'Analyzing the query, it ...",multiple line filters,multiple log stream selectors
1,openstack,3,What was the total time taken to build instanc...,"{application=""openstack"", log_file_type=""nova-...","1. {application=""openstack"", log_file_type=""no...",21.38,,{'chain_of_thought': 'The user query submits u...,multiple line filters,multiple log stream selectors
2,openstack,4,What was the total time taken to build instanc...,"{application=""openstack"", log_file_type=""nova-...","1\n{application=""openstack"", log_file_type=""no...",vcpu: 0.00 VCPU used out of 16 VCPU\ndisk: 0.0...,,{'chain_of_thought': 'For the log query in que...,multiple line filters,multiple log stream selectors
3,openstack,5,What is the vCPU usage for compute node cp-1.s...,max by (node) (\n max_over_time(\n {applic...,"1\n{application=""openstack"", log_file_type=""no...",<graph>\ngraph with plot of used_vcpus across ...,,{'chain_of_thought': 'The query contains three...,multiple line filters,multiple log stream selectors
4,openstack,6,What is the RAM usage for compute node cp-1.sl...,max by (node) (\n max_over_time(\n {applic...,"1\n{application=""openstack"", log_file_type=""no...",<graph>\\ngraph with plot of used_vcpus across...,,{'chain_of_thought': 'This query has three lab...,multiple line filters,multiple log stream selectors
...,...,...,...,...,...,...,...,...,...,...
95,openstack,97,What is the total size of all active base files?,sum by (component) (\n count_over_time({appli...,"1\n{application=""openstack"", component=""nova.v...",12.0k\n<graph>,Image and File Management,{'chain_of_thought': 'The log query contains t...,single line filter,multiple log stream selectors
96,openstack,98,What is the average response time for GET requ...,"avg(\n avg_over_time(\n {application=""open...","\n1. `{application=""openstack"", log_file_type=...",0.264\n<graph>,API Performance and Requests,{'chain_of_thought': 'The query specifies two ...,multiple line filters,multiple log stream selectors
97,openstack,99,How many POST requests to /v2/{tenant_id}/os-s...,"sum(count_over_time({application=""openstack""}\...","1. `{application=""openstack"", log_file_type=""n...",0,API Performance and Requests,{'chain_of_thought': 'The query provided uses ...,multiple line filters,single log stream selector
98,openstack,100,What is the 95th percentile response time for ...,"quantile_over_time(0.95,\n {application=""open...","1. `{application=""openstack"", log_file_type=""n...",0.23,API Performance and Requests,"{'chain_of_thought': 'In this query, there are...",multiple line filters,multiple log stream selectors


# Metric Queries

In [8]:
import os

import instructor
from datasets import Dataset
from dotenv import load_dotenv
from models import LogClass, MetricClass
from openai import OpenAI
from prompts import METRIC_CATEGORY_PROMPT

# Load environment variables from the local .env file before the OpenAI client
# is constructed below.
load_dotenv(".env")

# Input dataset directory, and the directory the metric-classified copy is
# written to once classification has finished.
dataset_path = "nl-logql-dataset-classified"
output_path = "nl-logql-dataset-classified-metric"
dataset = Dataset.load_from_disk(dataset_path)

# OpenAI client wrapped by instructor; the classifier below passes it a
# response_model to get a structured result back.
client = instructor.from_openai(OpenAI())


def classify_metric_query(example):
    """Attach a metric-aggregation classification to one dataset example.

    Sends the example's LogQL query to the chat model together with
    ``METRIC_CATEGORY_PROMPT``, requests a ``MetricClass`` response, and stores
    ``res.model_dump()`` under ``example["metric_category"]``.

    Any failure is reported on stdout and recorded as
    ``metric_category = None`` instead of being raised, so a single bad row
    does not abort the surrounding ``dataset.map`` run.

    Args:
        example: Mapping for a single row; its ``"logql_query"`` entry is the
            text sent to the model.

    Returns:
        The same ``example`` object with ``"metric_category"`` set to the
        dumped model response, or ``None`` if classification failed.
    """
    # Read the query inside the try-block but keep a fallback value, so the
    # error handler never has to index `example` again (which would raise a
    # second time if the lookup itself was what failed).
    query = None
    try:
        query = example["logql_query"]
        res = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {
                    "role": "system",
                    "content": METRIC_CATEGORY_PROMPT,
                },
                {
                    "role": "user",
                    "content": "Identify the metric aggregation types for the following log query:",
                },
                {
                    "role": "user",
                    "content": query,
                },
            ],
            response_model=MetricClass,
        )
        example["metric_category"] = res.model_dump()
    except Exception as e:
        # Deliberate best-effort: log and mark the row, keep the run going.
        print(f"Error processing query: {query}")
        print(f"Error: {e}")
        example["metric_category"] = None
    return example


# Run the classifier over every example; the result is a new dataset that also
# carries the "metric_category" field set by classify_metric_query.
new_dataset = dataset.map(
    function=classify_metric_query,
    desc="Classifying metric queries",
    num_proc=os.cpu_count(),  # one worker process per CPU reported by the OS
)


# The updated dataset is saved in the next cell.



Classifying metric queries (num_proc=20):   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# Write the metric-classified dataset to output_path, a different directory
# from the one the input was loaded from.
new_dataset.save_to_disk(output_path)

# Plain-text confirmation once the save call has returned.
print("Classification complete. Results saved to the dataset.")

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

Classification complete. Results saved to the dataset.


In [1]:
from datasets import Dataset
import pandas as pd


# From here on the notebook works on the metric-classified copy of the dataset.
dataset_path = "nl-logql-dataset-classified-metric"

# Load it from disk and convert to a DataFrame for the column edits below.
dataset = Dataset.load_from_disk(dataset_path=dataset_path)
df = dataset.to_pandas()

In [2]:
# Move the raw classification dict to "metric_category_result" — but only once.
# An unconditional rename is not re-runnable: on a second execution the derived
# "metric_category" column created below would be renamed onto the existing
# "metric_category_result", leaving two columns with the same name and breaking
# the .apply() that follows.
if "metric_category_result" not in df.columns:
    df = df.rename(columns={"metric_category": "metric_category_result"})

# Expose just the list of categories as its own column; rows whose raw value
# is not a dict (e.g. a failed classification stored as None) get None.
df["metric_category"] = df["metric_category_result"].apply(
    lambda x: x.get("categories") if isinstance(x, dict) else None
)

display(df.head(1))

Unnamed: 0,application,id,question,logql_query,query_explanation,query_result,category,log_category_result,line_filter,label_filter,metric_category_result,variables,metric_category
0,openstack,2,How long did it take to spawn instance 3edec1e...,"{application=""openstack"", log_file_type=""nova-...",bla,3edec1e4-9678-4a3a-a21b-a145a4ee5e61 took 20.58,,"{'chain_of_thought': 'Analyzing the query, it ...",multiple line filters,multiple log stream selectors,"{'categories': None, 'chain_of_thought': 'The ...",,


In [14]:
# Preview the first 19 rows to check the raw and flattened metric columns.
df.head(19)

Unnamed: 0,application,id,question,logql_query,query_explanation,query_result,category,log_category_result,line_filter,label_filter,metric_category_result,metric_category
0,openstack,2,How long did it take to spawn instance 3edec1e...,"{application=""openstack"", log_file_type=""nova-...",bla,3edec1e4-9678-4a3a-a21b-a145a4ee5e61 took 20.58,,"{'chain_of_thought': 'Analyzing the query, it ...",multiple line filters,multiple log stream selectors,"{'categories': None, 'chain_of_thought': 'The ...",
1,openstack,3,What was the total time taken to build instanc...,"{application=""openstack"", log_file_type=""nova-...","1. {application=""openstack"", log_file_type=""no...",21.38,,{'chain_of_thought': 'The user query submits u...,multiple line filters,multiple log stream selectors,"{'categories': None, 'chain_of_thought': 'This...",
2,openstack,4,What was the total time taken to build instanc...,"{application=""openstack"", log_file_type=""nova-...","1\n{application=""openstack"", log_file_type=""no...",vcpu: 0.00 VCPU used out of 16 VCPU\ndisk: 0.0...,,{'chain_of_thought': 'For the log query in que...,multiple line filters,multiple log stream selectors,"{'categories': None, 'chain_of_thought': 'The ...",
3,openstack,5,What is the vCPU usage for compute node cp-1.s...,max by (node) (\n max_over_time(\n {applic...,"1\n{application=""openstack"", log_file_type=""no...",<graph>\ngraph with plot of used_vcpus across ...,,{'chain_of_thought': 'The query contains three...,multiple line filters,multiple log stream selectors,"{'categories': ['unwrapped_range_aggregation',...","[unwrapped_range_aggregation, built_in_range_a..."
4,openstack,6,What is the RAM usage for compute node cp-1.sl...,max by (node) (\n max_over_time(\n {applic...,"1\n{application=""openstack"", log_file_type=""no...",<graph>\\ngraph with plot of used_vcpus across...,,{'chain_of_thought': 'This query has three lab...,multiple line filters,multiple log stream selectors,"{'categories': ['built_in_range_aggregation', ...","[built_in_range_aggregation, unwrapped_range_a..."
5,openstack,7,What was the maximum response time for GET req...,"max (\nmax_over_time(\n {application=""open...","1\n{application=""openstack"", log_file_type=""no...",1.30\n<graph>\ngraph with 1.30 as a line.\n</g...,,{'chain_of_thought': 'The query defines two la...,multiple line filters,multiple log stream selectors,"{'categories': ['built_in_range_aggregation', ...","[built_in_range_aggregation, unwrapped_range_a..."
6,openstack,8,How many POST requests to /v2/e9746973ac574c6b...,"sum(count_over_time({application=""openstack"", ...","1\n{application=""openstack"", log_file_type=""no...",2.07k\n<graph>\ngraph with 2.07k as straight l...,,{'chain_of_thought': 'The given log query invo...,multiple line filters,multiple log stream selectors,"{'categories': ['log_range_aggregation', 'buil...","[log_range_aggregation, built_in_range_aggrega..."
7,openstack,9,What's the average response time for DELETE re...,avg by (application) (\n avg_over_time({app...,"1\n{application=""openstack"", log_file_type=""no...",0.267\n<graph>\nsingle line graph of 0.267 for...,,{'chain_of_thought': 'The query includes two l...,multiple line filters,multiple log stream selectors,"{'categories': ['built_in_range_aggregation', ...","[built_in_range_aggregation, unwrapped_range_a..."
8,openstack,10,How many instances of 'HTTP exception thrown: ...,sum by (application)\n(count_over_time({applic...,"1\n{application=""openstack"", log_file_type=""no...",2.06k\n<graph>,,{'chain_of_thought': 'The query uses three lab...,single line filter,multiple log stream selectors,"{'categories': ['built_in_range_aggregation', ...","[built_in_range_aggregation, log_range_aggrega..."
9,openstack,11,What errors were encountered during the Comput...,"sum(count_over_time({application=""openstack"", ...","1\n{application=""openstack"", log_file_type=~""n...",8\n<graph>,,"{'chain_of_thought': 'In the given query, ther...",single line filter,multiple log stream selectors,"{'categories': ['log_range_aggregation', 'buil...","[log_range_aggregation, built_in_range_aggrega..."


In [3]:
# Convert the edited DataFrame back into a Dataset and write it to dataset_path.
# NOTE(review): dataset_path is the directory this frame was loaded from, so
# the on-disk copy is overwritten with the renamed/added columns — confirm
# that is intended.
Dataset.from_pandas(df).save_to_disk(dataset_path)

Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

In [15]:
# Publish the current DataFrame as a dataset to the Hugging Face Hub repo
# "sidbin/natural-logql".
Dataset.from_pandas(df).push_to_hub("sidbin/natural-logql")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/507 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sidbin/natural-logql/commit/e6a355b510eda4e94a33cd4b141b5fb714746361', commit_message='Upload dataset', commit_description='', oid='e6a355b510eda4e94a33cd4b141b5fb714746361', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/sidbin/natural-logql', endpoint='https://huggingface.co', repo_type='dataset', repo_id='sidbin/natural-logql'), pr_revision=None, pr_num=None)