## Dataset Inspection

In [1]:
# Extend the import path with '../' relative to the current working directory
# (os.path.abspath('') resolves to the cwd), so that the `toyDb`, `misc` and
# `dataset` packages imported below can be found.
import os, sys
sys.path.append(os.path.join(os.path.abspath(''), '../'))

# NOTE(review): `pw` and `ShaderDb` are not referenced by the cells shown in
# this notebook — confirm before removing.
import peewee as pw
from toyDb.databases import ExperimentDb, ShaderDb

# NOTE(review): `np` and `json` are likewise not referenced by the cells shown
# here; `hashlib` is used to print the md5 of each generated snapshot file.
import numpy as np
import json
import hashlib

# Filters, the post-processor and the snapshotter that the cells below register
# and drive.
from misc.ComplexDatasetSnapshotter import (
    EnvironmentFilter,
    CycleTrialsFilter,
    ErrorFilter,
    ShadertoyIdFilter,
    WidthHeightFilter,
    ResourceFilter,
    TraceAvailabilityFilter,
    SpvTokenizedLengthFilter,
    TraceDuplicationPostFilter,
    AugmentationFilter,
    ComplexDatasetSnapshotter
)
# getIntermediateDir() supplies the directory in which the length-filter cache
# and every snapshot `.dat` file are looked up / written.
from misc.Directory import (
  getIntermediateDir
)
# Dataset reader used to load the "train" / "test" splits of an existing snapshot.
from dataset.FragmentPerformanceTracedSnapshotDataset import FragmentPerformanceTracedSnapshotDataset

# Runs once at import time, before any filter is constructed.
# NOTE(review): presumably binds ExperimentDb to its default database file —
# confirm against toyDb.databases.
ExperimentDb.init_from_default_db()

## Resolution augmentation test

Target filename: `FragPerfSnapshotTracedDataset4096-3060-add2resolution-train-augmented.dat`

This is to split the following
- environment: the 3060 one
- train / dev split: same as 3060-4096.dat
- width & height: (800, 600), (1024, 768), (1920, 1080)
- Resource: 1, 2, 3 (that is, the screen resolution changes, but not the rest; iTime = iFrame = 1)
- cycles & trials: use canonical one (30, 10)
- errors: None
- traces: those that have non-null traces
- lengths: `<= 4096`

In [2]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters: one instance of each class, registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

  0%|          | 0/13904 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 124014
WidthHeightFilter_1024-768 = 82676
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 13871
ResourceFilter_resourceNone = 40768
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
CycleTrialsFilter_30cycles-10trials = 124014
ErrorFilter_error0 = 83246
ErrorFilter_error1 = 37386
ErrorFilter_error2 = 3197
ErrorFilter_error3 = 185
TraceAvailabilityFilter_noTrace = 40796
TraceAvailabilityFilter_haveTrace = 83218
SpvTokenizedLengthFilter_belowOrEqualThreshold4096 = 67687
SpvTokenizedLengthFilter_aboveThreshold4096 = 15553
SpvTokenizedLengthFilter_failedTokenize = 6
ShadertoyIdFilter_trainShdrExprs = 54114
ShadertoyIdFilter_testShdrExprs = 13530


In [3]:
# Group names selected for the multi-valued filters of this dataset
resourceGroupNames = ('resource1', 'resource2', 'resource3')
widthHeightGroupNames = ('800-600', '1024-768', '1920-1080')

# One inner list per filter, each holding (filter name, group name) pairs.
# NOTE(review): presumably the pairs inside one inner list are alternatives
# and all inner lists must match — confirm against doSnapshot.
commonFilter = [
    [('EnvironmentFilter', 'EnvId1')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', selectedGroup) for selectedGroup in resourceGroupNames],
    [('WidthHeightFilter', selectedGroup) for selectedGroup in widthHeightGroupNames],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-3060-add2resolution-train-augmented.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    trainSelection = commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]]
    snapshotter.doSnapshot(destFile, trainSelection, testGroupCopySrc=testSet)

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

Train samples: 27047
Test samples: 2255
Hash for c:\Projects\NGPP\vkPredict\misc\.././intermediates\./FragPerfSnapshotTracedDataset4096-3060-add2resolution-train-augmented.dat:
- md5sum: e4eb263f2d904899886f97b34fda22ef


## Resgroup and resolution augmentation test

Target filename: `FragPerfSnapshotTracedDataset4096-3060-add2resolution-add3resgroup-train-augmented.dat`

This is to split the following
- environment: the 3060 one
- train / dev split: same as 3060-4096.dat
- width & height: (800, 600), (1024, 768), (1920, 1080)
- Resource: 1, 2, 3 (that is, the screen resolution changes, but not the rest; iTime = iFrame = 1), plus 4, 5, 6
- cycles & trials: use canonical one (30, 10)
- errors: None
- traces: those that have non-null traces
- lengths: `<= 4096`

In [4]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters: one instance of each class, registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

  0%|          | 0/13904 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 124014
WidthHeightFilter_1024-768 = 82676
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 13871
ResourceFilter_resourceNone = 40768
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
CycleTrialsFilter_30cycles-10trials = 124014
ErrorFilter_error0 = 83246
ErrorFilter_error1 = 37386
ErrorFilter_error2 = 3197
ErrorFilter_error3 = 185
TraceAvailabilityFilter_noTrace = 40796
TraceAvailabilityFilter_haveTrace = 83218
SpvTokenizedLengthFilter_belowOrEqualThreshold4096 = 67687
SpvTokenizedLengthFilter_aboveThreshold4096 = 15553
SpvTokenizedLengthFilter_failedTokenize = 6
ShadertoyIdFilter_trainShdrExprs = 54114
ShadertoyIdFilter_testShdrExprs = 13530


In [5]:
# Group names selected for the multi-valued filters of this dataset
resourceGroupNames = (
    'resource1', 'resource2', 'resource3',
    'resource4', 'resource5', 'resource6',
)
widthHeightGroupNames = ('800-600', '1024-768', '1920-1080')

# One inner list per filter, each holding (filter name, group name) pairs.
# NOTE(review): presumably the pairs inside one inner list are alternatives
# and all inner lists must match — confirm against doSnapshot.
commonFilter = [
    [('EnvironmentFilter', 'EnvId1')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', selectedGroup) for selectedGroup in resourceGroupNames],
    [('WidthHeightFilter', selectedGroup) for selectedGroup in widthHeightGroupNames],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-3060-add2resolution-add3resgroup-train-augmented.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    trainSelection = commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]]
    snapshotter.doSnapshot(destFile, trainSelection, testGroupCopySrc=testSet)

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

Train samples: 54084
Test samples: 2255
Hash for c:\Projects\NGPP\vkPredict\misc\.././intermediates\./FragPerfSnapshotTracedDataset4096-3060-add2resolution-add3resgroup-train-augmented.dat:
- md5sum: 6a1b007f54cbd629bc6db40bc973a2bd


## Use trace de-dup post processor

In [6]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters: one instance of each class, registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# Post processing filter (trace de-duplication), registered before the
# length filter just like the basic filters above
snapshotter.registerPostProcessor(TraceDuplicationPostFilter())

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

  0%|          | 0/13904 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 124014
WidthHeightFilter_1024-768 = 82676
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 13871
ResourceFilter_resourceNone = 40768
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
CycleTrialsFilter_30cycles-10trials = 124014
ErrorFilter_error0 = 83246
ErrorFilter_error1 = 37386
ErrorFilter_error2 = 3197
ErrorFilter_error3 = 185
TraceAvailabilityFilter_noTrace = 40796
TraceAvailabilityFilter_haveTrace = 83218
SpvTokenizedLengthFilter_belowOrEqualThreshold4096 = 67687
SpvTokenizedLengthFilter_aboveThreshold4096 = 15553
SpvTokenizedLengthFilter_failedTokenize = 6
ShadertoyIdFilter_trainShdrExprs = 54114
ShadertoyIdFilter_testShdrExprs = 13530


In [7]:
# Group names selected for the multi-valued filters of this dataset
resourceGroupNames = (
    'resource1', 'resource2', 'resource3',
    'resource4', 'resource5', 'resource6',
)
widthHeightGroupNames = ('800-600', '1024-768', '1920-1080')

# One inner list per filter, each holding (filter name, group name) pairs.
# NOTE(review): presumably the pairs inside one inner list are alternatives
# and all inner lists must match — confirm against doSnapshot.
commonFilter = [
    [('EnvironmentFilter', 'EnvId1')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', selectedGroup) for selectedGroup in resourceGroupNames],
    [('WidthHeightFilter', selectedGroup) for selectedGroup in widthHeightGroupNames],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-3060-add2resolution-add3resgroup-dedup-train-augmented.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    trainSelection = commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]]
    # The registered post-processor is applied to the train group
    snapshotter.doSnapshot(
        destFile,
        trainSelection,
        testGroupCopySrc=testSet,
        applyPostProcessorToTrain=True,
    )

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

TraceDuplicationPostFilter: Total 54084, Accepted 36180, Rejected 17904
Train samples: 36180
Test samples: 2255
Hash for c:\Projects\NGPP\vkPredict\misc\.././intermediates\./FragPerfSnapshotTracedDataset4096-3060-add2resolution-add3resgroup-dedup-train-augmented.dat:
- md5sum: de5b6e9a975fae9cfe0ece6e351ff677


## 18 resgroup augmentation test

Target filename: `FragPerfSnapshotTracedDataset4096-3060-add18resgroup-train-dedup-augmented.dat`

This is to split the following
- environment: the 3060 one
- train / dev split: same as 3060-4096.dat
- width & height: (1024, 768)
- Resource: 4 ~ 21 (that is, sample iTime from 2,3,7, ...)
- cycles & trials: use canonical one (30, 10)
- errors: None
- traces: those that have non-null traces
- lengths: `<= 4096`

In [3]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters: one instance of each class, registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# Post processing filter (trace de-duplication)
snapshotter.registerPostProcessor(TraceDuplicationPostFilter())

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

# The eighteen resource groups (4 through 21) selected for this dataset
resourceGroupNames = (
    'resource4', 'resource5', 'resource6',
    'resource7', 'resource8', 'resource9',
    'resource10', 'resource11', 'resource12',
    'resource13', 'resource14', 'resource15',
    'resource16', 'resource17', 'resource18',
    'resource19', 'resource20', 'resource21',
)

# One inner list per filter, each holding (filter name, group name) pairs.
# NOTE(review): presumably the pairs inside one inner list are alternatives
# and all inner lists must match — confirm against doSnapshot.
commonFilter = [
    [('EnvironmentFilter', 'EnvId1')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', selectedGroup) for selectedGroup in resourceGroupNames],
    [('WidthHeightFilter', '1024-768')],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-3060-add18resgroup-train-dedup-augmented.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    trainSelection = commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]]
    # The registered post-processor is applied to the train group
    snapshotter.doSnapshot(
        destFile,
        trainSelection,
        testGroupCopySrc=testSet,
        applyPostProcessorToTrain=True,
    )

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

  0%|          | 0/13923 [00:00<?, ?it/s]

  0%|          | 0/13923 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 434049
WidthHeightFilter_1024-768 = 392711
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 13871
ResourceFilter_resourceNone = 142708
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
ResourceFilter_resource7 = 13872
ResourceFilter_resource8 = 13867
ResourceFilter_resource9 = 13872
ResourceFilter_resource10 = 13881
ResourceFilter_resource11 = 13873
ResourceFilter_resource12 = 13879
ResourceFilter_resource13 = 13871
ResourceFilter_resource14 = 13869
ResourceFilter_resource15 = 13868
ResourceFilter_resource16 = 13874
ResourceFilter_resource17 = 13874
ResourceFilter_resource18 = 13872
ResourceFilter_resource19 = 13869
ResourceFilter_resource20 = 13869
ResourceFilter_resource21 = 13885
CycleTrialsFilter_30cycles-10trials = 434049
ErrorFilter_error0 = 291341
ErrorFilter_error1 = 130851
ErrorFilter_error

## 6600xt standard test

Target filename: `FragPerfSnapshotTracedDataset4096-6600xt-train.dat`

This is to split the following
- environment: the 6600xt one
- train / dev split: same as 3060-4096.dat
- width & height: (1024, 768)
- Resource: 1
- cycles & trials: use canonical one (30, 10)
- errors: None
- traces: those that have non-null traces
- lengths: `<= 4096`

In [3]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters: one instance of each class, registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# No post-processor (TraceDuplicationPostFilter) is registered for this dataset

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

# One inner list per filter, each holding (filter name, group name) pairs;
# the environment group selected here is EnvId2
commonFilter = [
    [('EnvironmentFilter', 'EnvId2')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', 'resource1')],
    [('WidthHeightFilter', '1024-768')],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-6600xt-train.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    # Both splits are filtered from the database; no post-processor is applied
    splitSelections = {
        "trainGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]],
        "testGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'testShdrExprs')]],
    }
    snapshotter.doSnapshot(destFile, **splitSelections)

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

  0%|          | 0/14239 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 434049
EnvironmentFilter_EnvId2 = 20669
WidthHeightFilter_1024-768 = 413380
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 27942
ResourceFilter_resourceNone = 149306
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
ResourceFilter_resource7 = 13872
ResourceFilter_resource8 = 13867
ResourceFilter_resource9 = 13872
ResourceFilter_resource10 = 13881
ResourceFilter_resource11 = 13873
ResourceFilter_resource12 = 13879
ResourceFilter_resource13 = 13871
ResourceFilter_resource14 = 13869
ResourceFilter_resource15 = 13868
ResourceFilter_resource16 = 13874
ResourceFilter_resource17 = 13874
ResourceFilter_resource18 = 13872
ResourceFilter_resource19 = 13869
ResourceFilter_resource20 = 13869
ResourceFilter_resource21 = 13885
CycleTrialsFilter_30cycles-10trials = 454718
ErrorFilter_error0 = 305412
ErrorFilter_

## 3060 `-O` optimized test

Target filename: `FragPerfSnapshotTracedDataset4096-3060-Optim20000.dat`

This is to split the following
- environment: the 3060 one
- **augmentation**: 20000
- train / dev split: same as 3060-4096.dat
- width & height: (1024, 768)
- Resource: 1
- cycles & trials: use canonical one (30, 10)
- errors: None
- traces: those that have non-null traces
- lengths: `<= 4096`

In [4]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters plus the augmentation filter: one instance of each class,
# registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
    AugmentationFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# No post-processor (TraceDuplicationPostFilter) is registered for this dataset

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

# One inner list per filter, each holding (filter name, group name) pairs;
# this dataset additionally restricts the augmentation group to aug20000
commonFilter = [
    [('EnvironmentFilter', 'EnvId1')],
    [('AugmentationFilter', 'aug20000')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', 'resource1')],
    [('WidthHeightFilter', '1024-768')],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-3060-Optim20000.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    # Both splits are filtered from the database; no post-processor is applied
    splitSelections = {
        "trainGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]],
        "testGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'testShdrExprs')]],
    }
    snapshotter.doSnapshot(destFile, **splitSelections)

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

  0%|          | 0/28119 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 447888
EnvironmentFilter_EnvId2 = 20669
EnvironmentFilter_EnvId3 = 20669
WidthHeightFilter_1024-768 = 447888
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 55396
ResourceFilter_resourceNone = 156360
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
ResourceFilter_resource7 = 13872
ResourceFilter_resource8 = 13867
ResourceFilter_resource9 = 13872
ResourceFilter_resource10 = 13881
ResourceFilter_resource11 = 13873
ResourceFilter_resource12 = 13879
ResourceFilter_resource13 = 13871
ResourceFilter_resource14 = 13869
ResourceFilter_resource15 = 13868
ResourceFilter_resource16 = 13874
ResourceFilter_resource17 = 13874
ResourceFilter_resource18 = 13872
ResourceFilter_resource19 = 13869
ResourceFilter_resource20 = 13869
ResourceFilter_resource21 = 13885
CycleTrialsFilter_30cycles-10trials = 489226
ErrorFi

## uhd630 standard test

Target filename: `FragPerfSnapshotTracedDataset4096-uhd630.dat`

This is to split the following
- environment: the uhd630 one
- train / dev split: same as 3060-4096.dat
- width & height: (1024, 768)
- Resource: 1
- cycles & trials: use canonical one (30, 10)
- errors: None
- traces: those that have non-null traces
- lengths: `<= 4096`

In [None]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters: one instance of each class, registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# No post-processor (TraceDuplicationPostFilter) is registered for this dataset

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

# One inner list per filter, each holding (filter name, group name) pairs;
# the environment group selected here is EnvId3
commonFilter = [
    [('EnvironmentFilter', 'EnvId3')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', 'resource1')],
    [('WidthHeightFilter', '1024-768')],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-uhd630.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    # Both splits are filtered from the database; no post-processor is applied
    splitSelections = {
        "trainGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]],
        "testGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'testShdrExprs')]],
    }
    snapshotter.doSnapshot(destFile, **splitSelections)

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

  0%|          | 0/14280 [00:00<?, ?it/s]

  0%|          | 0/14280 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 434049
EnvironmentFilter_EnvId2 = 20669
EnvironmentFilter_EnvId3 = 20669
WidthHeightFilter_1024-768 = 434049
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 41557
ResourceFilter_resourceNone = 156360
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
ResourceFilter_resource7 = 13872
ResourceFilter_resource8 = 13867
ResourceFilter_resource9 = 13872
ResourceFilter_resource10 = 13881
ResourceFilter_resource11 = 13873
ResourceFilter_resource12 = 13879
ResourceFilter_resource13 = 13871
ResourceFilter_resource14 = 13869
ResourceFilter_resource15 = 13868
ResourceFilter_resource16 = 13874
ResourceFilter_resource17 = 13874
ResourceFilter_resource18 = 13872
ResourceFilter_resource19 = 13869
ResourceFilter_resource20 = 13869
ResourceFilter_resource21 = 13885
CycleTrialsFilter_30cycles-10trials = 475387
ErrorFi

## 4060 standard test

Target filename: `FragPerfSnapshotTracedDataset4096-4060.dat`

This is to split the following
- environment: the 4060 one
- train / dev split: same as 3060-4096.dat
- width & height: (1024, 768)
- Resource: 1
- cycles & trials: use canonical one (30, 10)
- errors: None
- traces: those that have non-null traces
- lengths: `<= 4096`

In [3]:
# Fresh snapshotter for this section
snapshotter = ComplexDatasetSnapshotter()

# Basic filters: one instance of each class, registered in this exact order
for basicFilterCls in (
    EnvironmentFilter,
    WidthHeightFilter,
    ResourceFilter,
    CycleTrialsFilter,
    ErrorFilter,
    TraceAvailabilityFilter,
):
    snapshotter.registerFilter(basicFilterCls())

# No post-processor (TraceDuplicationPostFilter) is registered for this dataset

# Length filter: reuse the JSON cache when it is already on disk, otherwise
# run the filter and write the cache for later runs
lengthFilter = SpvTokenizedLengthFilter()
lengthFilter.setThreshold(4096)

lengthCachePath = os.path.join(getIntermediateDir(), "./lengthFilterCache.json")
if not os.path.isfile(lengthCachePath):
    lengthFilter.process(parallel=True)
    lengthFilter.writeToCache(lengthCachePath)
else:
    lengthFilter.readFromCache(lengthCachePath)

snapshotter.registerFilter(lengthFilter)

# Train / test split filter: shader ids come from both splits of the
# existing 3060 snapshot
origDatasetPath = os.path.join(
    getIntermediateDir(), "./FragPerfSnapshotTracedDataset4096-3060.dat"
)
trainSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "train")
testSet = FragmentPerformanceTracedSnapshotDataset(origDatasetPath, "test")

trainShdrIds = [trainSet[sampleIdx]["shaderId"] for sampleIdx in range(len(trainSet))]
testShdrIds = [testSet[sampleIdx]["shaderId"] for sampleIdx in range(len(testSet))]

shdrIdFilter = ShadertoyIdFilter()
for splitGroupName, splitShdrIds in (
    ("trainShdrExprs", trainShdrIds),
    ("testShdrExprs", testShdrIds),
):
    shdrIdFilter.registerGroup(splitGroupName, splitShdrIds)
snapshotter.registerFilter(shdrIdFilter)

# Print the size of every registered filter group
snapshotter.examineGroups(1)

# One inner list per filter, each holding (filter name, group name) pairs;
# the environment group selected here is EnvId4
commonFilter = [
    [('EnvironmentFilter', 'EnvId4')],
    [('CycleTrialsFilter', '30cycles-10trials')],
    [('ResourceFilter', 'resource1')],
    [('WidthHeightFilter', '1024-768')],
    [('ErrorFilter', 'error0')],
    [('TraceAvailabilityFilter', 'haveTrace')],
    [('SpvTokenizedLengthFilter', 'belowOrEqualThreshold4096')],
]

destFileName = "./FragPerfSnapshotTracedDataset4096-4060.dat"
destFile = os.path.join(getIntermediateDir(), destFileName)

# Build the snapshot (and report its md5) only when it is not on disk yet
if not os.path.isfile(destFile):
    # Both splits are filtered from the database; no post-processor is applied
    splitSelections = {
        "trainGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'trainShdrExprs')]],
        "testGroupFilters": commonFilter + [[('ShadertoyIdFilter', 'testShdrExprs')]],
    }
    snapshotter.doSnapshot(destFile, **splitSelections)

    # Hash the written file in 8 KiB chunks; iter() stops on the empty read at EOF
    file_hash = hashlib.md5()
    with open(destFile, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            file_hash.update(chunk)

    print(f"Hash for {destFile}:\n- md5sum: {file_hash.hexdigest()}")

  0%|          | 0/28127 [00:00<?, ?it/s]

  0%|          | 0/28127 [00:00<?, ?it/s]

EnvironmentFilter_EnvId1 = 447888
EnvironmentFilter_EnvId2 = 20669
EnvironmentFilter_EnvId3 = 20669
EnvironmentFilter_EnvId4 = 20669
WidthHeightFilter_1024-768 = 468557
WidthHeightFilter_800-600 = 20669
WidthHeightFilter_1920-1080 = 20669
ResourceFilter_resource1 = 69252
ResourceFilter_resourceNone = 163173
ResourceFilter_resource2 = 13878
ResourceFilter_resource3 = 13873
ResourceFilter_resource4 = 13873
ResourceFilter_resource5 = 13875
ResourceFilter_resource6 = 13876
ResourceFilter_resource7 = 13872
ResourceFilter_resource8 = 13867
ResourceFilter_resource9 = 13872
ResourceFilter_resource10 = 13881
ResourceFilter_resource11 = 13873
ResourceFilter_resource12 = 13879
ResourceFilter_resource13 = 13871
ResourceFilter_resource14 = 13869
ResourceFilter_resource15 = 13868
ResourceFilter_resource16 = 13874
ResourceFilter_resource17 = 13874
ResourceFilter_resource18 = 13872
ResourceFilter_resource19 = 13869
ResourceFilter_resource20 = 13869
ResourceFilter_resource21 = 13885
CycleTrialsFilter_3