In [None]:
import os
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import pyrootutils
import seaborn as sns

In [None]:
# Resolve the project root: pyrootutils looks for a `.project-root` marker
# file, starting from the current working directory (abspath("") resolves
# to the cwd, since `__file__` is not available inside a notebook).
_search_start = os.path.abspath("")
PROJECT_ROOT = pyrootutils.find_root(
    search_from=_search_start,
    indicator=".project-root",
)

In [None]:
# Name of the sample set to analyze; it is also the name of the
# sub-directory under data/samples that holds the sample files.
GRAMMAR_NAME: str = "sample_trim_20250115102355"

# <project root>/data/samples/<GRAMMAR_NAME>
samples_dir = PROJECT_ROOT.joinpath("data", "samples", GRAMMAR_NAME)

In [None]:
def _read_labeled_samples(path: pathlib.Path, label: str) -> list:
    """Read one sample per line from ``path`` and tag each with ``label``.

    Args:
        path: Text file containing one whitespace-tokenized sample per line.
        label: Ground-truth label stored with every sample read from ``path``
            (``"positive"`` or ``"negative"`` in this notebook).

    Returns:
        A list of records with the keys ``sample`` (the stripped line),
        ``sample.type.ground_truth`` (``label``) and ``sample.length``
        (number of whitespace-separated tokens). The list is empty when
        ``path`` does not exist.
    """
    if not path.exists():
        return []

    records = []
    with open(path, "r") as f:
        # Iterate the file lazily instead of materializing it with readlines().
        for line in f:
            sample = line.strip()
            records.append(
                {
                    "sample": sample,
                    "sample.type.ground_truth": label,
                    # split() without a separator yields 0 tokens for an
                    # empty sample and ignores repeated spaces, whereas
                    # split(" ") would report a length of 1 for "" and
                    # count phantom empty tokens.
                    "sample.length": len(sample.split()),
                }
            )
    return records


# Collect whichever of positive.txt / negative.txt exist as records of the
# form {sample, sample.type.ground_truth, sample.length}.
samples = []
for label in ("positive", "negative"):
    samples.extend(_read_labeled_samples(samples_dir / f"{label}.txt", label))

# Pin the columns so the frame keeps its schema even if no sample file was
# found (otherwise later cells fail with an opaque KeyError).
sample_df = pd.DataFrame(
    samples,
    columns=["sample", "sample.type.ground_truth", "sample.length"],
)

sample_df.head()

In [None]:
# Histogram of sample lengths, split by ground-truth label.
fig, ax = plt.subplots(figsize=(6, 3))

# Fixed colors per label so positive/negative look the same across figures.
label_palette = {"positive": "orange", "negative": "purple"}

sns.histplot(
    data=sample_df,
    x="sample.length",
    hue="sample.type.ground_truth",
    palette=label_palette,
    bins=25,
    ax=ax,
)

# Log-scaled counts keep sparsely populated length bins visible.
ax.set_yscale("log")

# Replace the raw column name seaborn uses as the legend title.
legend = ax.get_legend()
legend.set_title("Sample type")

ax.set_xlabel("Sample length")