From 57f993f68bd3775b2073691085ff8542c1afcf8a Mon Sep 17 00:00:00 2001
From: Rod Vagg
Date: Fri, 12 May 2023 12:32:50 +1000
Subject: [PATCH] feat(testutil): add test data generation utils (extracted from Lassie)

Previously: github.com/filecoin-project/lassie/pkg/internal/itest/unixfs
---
 testutil/directory.go       | 144 +++++++++++++++++++++++++
 testutil/generator.go       | 206 ++++++++++++++++++++++++++++++++++++
 testutil/namegen/namegen.go | 106 +++++++++++++++++++
 3 files changed, 456 insertions(+)
 create mode 100644 testutil/directory.go
 create mode 100644 testutil/generator.go
 create mode 100644 testutil/namegen/namegen.go

diff --git a/testutil/directory.go b/testutil/directory.go
new file mode 100644
index 0000000..9f373e0
--- /dev/null
+++ b/testutil/directory.go
@@ -0,0 +1,144 @@
+package testutil
+
+import (
+	"context"
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"io"
+	"testing"
+
+	"github.com/ipfs/go-cid"
+	dagpb "github.com/ipld/go-codec-dagpb"
+	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/datamodel"
+	"github.com/ipld/go-ipld-prime/linking"
+	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+	"github.com/ipld/go-ipld-prime/node/basicnode"
+	"github.com/stretchr/testify/require"
+)
+
+// DirEntry represents a flattened directory entry, where Path is from the
+// root of the directory and Content is the file contents. It is intended
+// that a DirEntry slice can be used to represent a full-depth directory without
+// needing nesting.
+type DirEntry struct {
+	// Path of this entry, from the root of the directory.
+	Path string
+	// Content holds the file bytes; it is left unset for directories.
+	Content []byte
+	// Root is the CID of the entry's root block.
+	Root cid.Cid
+	// SelfCids lists the CIDs written while building this entry. It is not
+	// populated by ToDirEntry.
+	SelfCids []cid.Cid
+	// TSize is the size reported by the builder for this entry. It is not
+	// populated by ToDirEntry.
+	TSize uint64
+	// Children holds the entries of a directory.
+	Children []DirEntry
+}
+
+// Size returns TSize as an int64. The returned error is always nil.
+func (de DirEntry) Size() (int64, error) {
+	return int64(de.TSize), nil
+}
+
+// Link returns a CID link pointing at Root.
+func (de DirEntry) Link() ipld.Link {
+	return cidlink.Link{Cid: de.Root}
+}
+
+// ToDirEntry takes a LinkSystem containing UnixFS data and builds a DirEntry
+// tree representing the file and directory structure it finds starting at the
+// rootCid. If expectFull is true, it will fail the test if it encounters a
+// node that it cannot load. If expectFull is false, nodes whose load error
+// reports NotFound are left out of the tree; any other load error still fails
+// the test. The root itself must always be loadable, since there is no
+// DirEntry value that could represent a missing root.
+func ToDirEntry(t *testing.T, linkSys linking.LinkSystem, rootCid cid.Cid, expectFull bool) DirEntry {
+	de := toDirEntryRecursive(t, linkSys, rootCid, "", expectFull)
+	// a nil result means the root block was not found (only possible when
+	// !expectFull); fail clearly instead of panicking on the dereference
+	require.NotNil(t, de, "root %s could not be loaded", rootCid)
+	return *de
+}
+
+// toDirEntryRecursive loads rootCid from linkSys and converts it, and
+// everything reachable below it, into a DirEntry whose Path is name. Raw
+// codec CIDs are loaded with the generic "Any" prototype, everything else as
+// a dag-pb node. It returns nil only when expectFull is false and the load
+// error reports NotFound.
+func toDirEntryRecursive(t *testing.T, linkSys linking.LinkSystem, rootCid cid.Cid, name string, expectFull bool) *DirEntry {
+	var proto datamodel.NodePrototype = dagpb.Type.PBNode
+	if rootCid.Prefix().Codec == cid.Raw {
+		proto = basicnode.Prototype.Any
+	}
+	node, err := linkSys.Load(linking.LinkContext{Ctx: context.TODO()}, cidlink.Link{Cid: rootCid}, proto)
+	if expectFull {
+		require.NoError(t, err)
+	} else if err != nil {
+		if e, ok := err.(interface{ NotFound() bool }); ok && e.NotFound() {
+			return nil
+		}
+		require.NoError(t, err)
+	}
+
+	if node.Kind() == ipld.Kind_Bytes { // is a file
+		byts, err := node.AsBytes()
+		require.NoError(t, err)
+		return &DirEntry{
+			Path:    name,
+			Content: byts,
+			Root:    rootCid,
+		}
+	}
+	// else is a directory
+	children := make([]DirEntry, 0)
+	for itr := node.MapIterator(); !itr.Done(); {
+		k, v, err := itr.Next()
+		require.NoError(t, err)
+		childName, err := k.AsString()
+		require.NoError(t, err)
+		childLink, err := v.AsLink()
+		require.NoError(t, err)
+		child := toDirEntryRecursive(t, linkSys, childLink.(cidlink.Link).Cid, name+"/"+childName, expectFull)
+		if child == nil {
+			// the child block is absent and we were asked to tolerate that
+			// (!expectFull): leave it out rather than dereferencing nil
+			continue
+		}
+		children = append(children, *child)
+	}
+	return &DirEntry{
+		Path:     name,
+		Root:     rootCid,
+		Children: children,
+	}
+}
+
+// CompareDirEntries is a safe, recursive comparison between two DirEntry
+// values. It doesn't strictly require child ordering to match, but it does
+// require that all children exist and match, in some order.
+func CompareDirEntries(t *testing.T, a, b DirEntry) {
+	require.Equal(t, a.Path, b.Path)
+	require.Equal(t, a.Root.String(), b.Root.String(), a.Path+" root mismatch")
+	// compare SHA-256 digests of the content rather than the raw bytes
+	// (presumably to keep failure output short for large files)
+	hashA := sha256.Sum256(a.Content)
+	hashB := sha256.Sum256(b.Content)
+	require.Equal(t, hex.EncodeToString(hashA[:]), hex.EncodeToString(hashB[:]), a.Path+" content hash mismatch")
+	require.Equal(t, len(a.Children), len(b.Children), fmt.Sprintf("%s child length mismatch %d <> %d", a.Path, len(a.Children), len(b.Children)))
+	for i := range a.Children {
+		// not necessarily in order, so each child of a is looked up in b by
+		// path and compared recursively against every match
+		var found bool
+		for j := range b.Children {
+			if a.Children[i].Path == b.Children[j].Path {
+				found = true
+				CompareDirEntries(t, a.Children[i], b.Children[j])
+			}
+		}
+		require.True(t, found, fmt.Sprintf("%s child %s not found in b", a.Path, a.Children[i].Path))
+	}
+}
+
+// WrapContent embeds the content we want in some random nested content such
+// that it's fetchable under the provided path. If exclusive is true, the
+// content will be the only thing under the path. If false, there will be
+// content before and after the wrapped content at each point in the path.
+func WrapContent(t *testing.T, rndReader io.Reader, lsys *ipld.LinkSystem, content DirEntry, wrapPath string, exclusive bool) DirEntry {
+	wrapped := content
+	// consume the path from its last segment back to the first, building one
+	// enclosing directory per segment
+	for remaining := datamodel.ParsePath(wrapPath); remaining.Len() > 0; remaining = remaining.Pop() {
+		var entries []DirEntry
+		if exclusive {
+			wrapped.Path = remaining.Last().String()
+			entries = []DirEntry{wrapped}
+		} else {
+			// "!before" and "~after" are presumably named so that they sort
+			// on either side of the wrapped entry - verify against the
+			// generated names if ordering matters
+			before := GenerateDirectory(t, lsys, rndReader, 4<<10, false)
+			before.Path = "!before"
+			wrapped.Path = remaining.Last().String()
+			after := GenerateDirectory(t, lsys, rndReader, 4<<11, true)
+			after.Path = "~after"
+			entries = []DirEntry{before, wrapped, after}
+		}
+		wrapped = BuildDirectory(t, lsys, entries, false)
+	}
+	return wrapped
+}
diff --git a/testutil/generator.go b/testutil/generator.go
new file mode 100644
index 0000000..4d3eacb
--- /dev/null
+++ b/testutil/generator.go
@@ -0,0 +1,206 @@
+package testutil
+
+import (
+	"bytes"
+	"crypto/rand"
+	"io"
+	"math/big"
+	"sort"
+	"strings"
+	"testing"
+
+	"github.com/ipfs/go-cid"
+	"github.com/ipfs/go-unixfsnode/data/builder"
+	"github.com/ipfs/go-unixfsnode/testutil/namegen"
+	dagpb "github.com/ipld/go-codec-dagpb"
+	"github.com/ipld/go-ipld-prime"
+	"github.com/ipld/go-ipld-prime/linking"
+	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
+	"github.com/multiformats/go-multihash"
+	"github.com/stretchr/testify/require"
+)
+
+// GenerateFile generates a random unixfs file of the given size, storing the
+// blocks in the provided LinkSystem and returns a DirEntry representation of
+// the file.
+func GenerateFile(t *testing.T, linkSys *linking.LinkSystem, randReader io.Reader, size int) DirEntry {
+	// read exactly `size` bytes from randReader, keeping a copy of everything
+	// the builder consumes so it can be returned as Content
+	var captured bytes.Buffer
+	captured.Grow(size)
+	src := io.TeeReader(io.LimitReader(randReader, int64(size)), &captured)
+
+	// record the CID of every block written while the file is built; the
+	// original StorageWriteOpener is put back when this function returns
+	written := make([]cid.Cid, 0)
+	collector, restore := cidCollector(linkSys, &written)
+	linkSys.StorageWriteOpener = collector
+	defer restore()
+
+	// "size-256144" sets the chunker, splitting bytes at 256144b boundaries
+	lnk, tsize, err := builder.BuildUnixFSFile(src, "size-256144", linkSys)
+	require.NoError(t, err)
+
+	return DirEntry{
+		Path:     "",
+		Content:  captured.Bytes(),
+		Root:     lnk.(cidlink.Link).Cid,
+		SelfCids: written,
+		TSize:    uint64(tsize),
+	}
+}
+
+// GenerateDirectory builds a random UnixFS directory of roughly targetSize
+// bytes (it will usually land somewhat below that), writes its blocks to the
+// provided LinkSystem and returns the matching DirEntry. With rootSharded set
+// the root directory is built as a HAMT shard; nested directories are not.
+// It is GenerateDirectoryFrom with an empty starting path.
+func GenerateDirectory(t *testing.T, linkSys *linking.LinkSystem, randReader io.Reader, targetSize int, rootSharded bool) DirEntry {
+	const rootPath = ""
+	return GenerateDirectoryFrom(t, linkSys, randReader, targetSize, rootPath, rootSharded)
+}
+
+// GenerateDirectoryFrom is the same as GenerateDirectory but allows the caller
+// to specify a directory path to start from. This is useful for generating
+// nested directories.
+func GenerateDirectoryFrom(
+	t *testing.T,
+	linkSys *linking.LinkSystem,
+	randReader io.Reader,
+	targetSize int,
+	dir string,
+	sharded bool,
+) DirEntry {
+	var curSize int
+	targetFileSize := targetSize / 16
+	if targetFileSize < 2 {
+		// rand.Int panics for a bound <= 0 and can only ever return 0 for a
+		// bound of 1, which would spin forever in the "no empty files" loop
+		// below; 2 is the smallest bound that can yield a non-empty file
+		targetFileSize = 2
+	}
+	children := make([]DirEntry, 0)
+	for curSize < targetSize {
+		switch rndInt(randReader, 6) {
+		case 0: // 1 in 6 chance of finishing this directory if not at root
+			if dir != "" && len(children) > 0 {
+				curSize = targetSize // not really, but we're done with this directory
+			} // else at the root we don't get to finish early
+		case 1: // 1 in 6 chance of making a new directory
+			if targetSize-curSize <= 1024 { // don't make tiny directories
+				continue
+			}
+			// keep drawing names until one is not already used in this directory
+			var newDir string
+			for {
+				var err error
+				newDir, err = namegen.RandomDirectoryName(randReader)
+				require.NoError(t, err)
+				if !isDupe(children, newDir) {
+					break
+				}
+			}
+			// nested directories get the remaining budget and are never sharded
+			child := GenerateDirectoryFrom(t, linkSys, randReader, targetSize-curSize, dir+"/"+newDir, false)
+			children = append(children, child)
+			curSize += int(child.TSize)
+		default: // 4 in 6 chance of making a new file
+			var size int
+			for size == 0 { // don't make empty files
+				sizeB, err := rand.Int(randReader, big.NewInt(int64(targetFileSize)))
+				require.NoError(t, err)
+				size = int(sizeB.Int64())
+				// never exceed what is left of the directory's budget
+				if size > targetSize-curSize {
+					size = targetSize - curSize
+				}
+			}
+			entry := GenerateFile(t, linkSys, randReader, size)
+			// keep drawing names until one is not already used in this directory
+			var name string
+			for {
+				var err error
+				name, err = namegen.RandomFileName(randReader)
+				require.NoError(t, err)
+				if !isDupe(children, name) {
+					break
+				}
+			}
+			entry.Path = dir + "/" + name
+			curSize += size
+			children = append(children, entry)
+		}
+	}
+	dirEntry := BuildDirectory(t, linkSys, children, sharded)
+	dirEntry.Path = dir
+	return dirEntry
+}
+
+// BuildDirectory builds a directory from the given children, storing the
+// blocks in the provided LinkSystem and returns a DirEntry representation of
+// the directory. If sharded is true, the root directory will be built as HAMT
+// sharded (with a low "width" to maximise the chance of collisions and
+// therefore greater depth for smaller number of files).
+//
+// The children slice is sorted in place by Path and is reused as the
+// Children of the returned entry, whose own Path is left empty. Each child is
+// linked under the last "/"-separated segment of its Path.
+func BuildDirectory(t *testing.T, linkSys *linking.LinkSystem, children []DirEntry, sharded bool) DirEntry {
+	// sort children by path, which should match the encoded form in dag-pb
+	sort.Slice(children, func(i, j int) bool {
+		return strings.Compare(children[i].Path, children[j].Path) < 0
+	})
+
+	dirLinks := make([]dagpb.PBLink, 0)
+	for _, child := range children {
+		paths := strings.Split(child.Path, "/")
+		name := paths[len(paths)-1]
+		lnk, err := builder.BuildUnixFSDirectoryEntry(name, int64(child.TSize), cidlink.Link{Cid: child.Root})
+		require.NoError(t, err)
+		dirLinks = append(dirLinks, lnk)
+	}
+	// collect the CIDs of the blocks written for this directory; the original
+	// StorageWriteOpener is restored on return
+	cids := make([]cid.Cid, 0)
+	var undo func()
+	linkSys.StorageWriteOpener, undo = cidCollector(linkSys, &cids)
+	defer undo()
+	var root ipld.Link
+	var size uint64
+	var err error
+	if sharded {
+		// node arity of 16, quite small to increase collision probability so we actually get sharding
+		const width = 16
+		const hasher = multihash.MURMUR3X64_64
+		root, size, err = builder.BuildUnixFSShardedDirectory(width, hasher, dirLinks, linkSys)
+		require.NoError(t, err)
+	} else {
+		root, size, err = builder.BuildUnixFSDirectory(dirLinks, linkSys)
+		require.NoError(t, err)
+	}
+
+	return DirEntry{
+		Path:     "",
+		Root:     root.(cidlink.Link).Cid,
+		SelfCids: cids,
+		TSize:    size,
+		Children: children,
+	}
+}
+
+// rndInt returns a number in [0, max) drawn from randReader. If the draw
+// fails it deliberately falls back to 0 rather than reporting the error.
+func rndInt(randReader io.Reader, max int) int {
+	coin, err := rand.Int(randReader, big.NewInt(int64(max)))
+	if err != nil {
+		return 0 // eh, whatever
+	}
+	return int(coin.Int64())
+}
+
+// cidCollector returns a BlockWriteOpener that wraps the StorageWriteOpener
+// currently set on ls and appends the CID of every committed link to cids,
+// together with a function that puts the original opener back on ls. It does
+// not install the wrapper itself; the caller assigns it.
+func cidCollector(ls *ipld.LinkSystem, cids *[]cid.Cid) (ipld.BlockWriteOpener, func()) {
+	swo := ls.StorageWriteOpener
+	return func(linkCtx ipld.LinkContext) (io.Writer, ipld.BlockWriteCommitter, error) {
+			w, c, err := swo(linkCtx)
+			if err != nil {
+				return nil, nil, err
+			}
+			return w, func(lnk ipld.Link) error {
+				*cids = append(*cids, lnk.(cidlink.Link).Cid)
+				return c(lnk)
+			}, nil
+		}, func() {
+			// reset
+			ls.StorageWriteOpener = swo
+		}
+}
+
+// isDupe reports whether any of children has a Path ending in "/"+name.
+func isDupe(children []DirEntry, name string) bool {
+	for _, child := range children {
+		if strings.HasSuffix(child.Path, "/"+name) {
+			return true
+		}
+	}
+	return false
+}
diff --git a/testutil/namegen/namegen.go b/testutil/namegen/namegen.go
new file mode 100644
index 0000000..d1d6369
--- /dev/null
+++ b/testutil/namegen/namegen.go
@@ -0,0 +1,106 @@
+package namegen
+
+import (
+	"encoding/binary"
+	"io"
+	"strings"
+)
+
+// words is wordData split on whitespace; extensions are the suffixes that
+// RandomFileName may append (the empty string means "no extension").
+var words = strings.Fields(wordData)
+var extensions = []string{"", ".txt", ".pdf", ".docx", ".png", ".jpg", ".csv", ".json", ".xml"}
+
+// getRandomIndex reads a big-endian uint32 from r and reduces it modulo max.
+// max must be positive: a max of zero would divide by zero.
+func getRandomIndex(r io.Reader, max int) (int, error) {
+	var n uint32
+	err := binary.Read(r, binary.BigEndian, &n)
+	if err != nil {
+		return 0, err
+	}
+	return int(n % uint32(max)), nil
+}
+
+// RandomDirectoryName returns a random directory name from the provided word list.
+func RandomDirectoryName(r io.Reader) (string, error) {
+	index, err := getRandomIndex(r, len(words))
+	if err != nil {
+		return "", err
+	}
+	return words[index], nil
+}
+
+// RandomFileName returns a random file name with an extension from the provided word list and common extensions.
+func RandomFileName(r io.Reader) (string, error) {
+	// the stem is drawn exactly as a directory name is: one index into words
+	stem, err := RandomDirectoryName(r)
+	if err != nil {
+		return "", err
+	}
+	// a second draw picks the extension, which may be the empty string
+	ext, err := getRandomIndex(r, len(extensions))
+	if err != nil {
+		return "", err
+	}
+	return stem + extensions[ext], nil
+}
+
+const wordData = `jabberwocky Snark whiffling borogoves mome raths brillig slithy toves outgrabe
+Tumtum Frabjous Bandersnatch Jubjub Callay slumgullion snicker-snack brobdingnagian Jabberwock
+tree Poglorian Binkleborf Wockbristle Zizzotether dinglewock Flumgurgle Glimperwick RazzleDazzle8
+gyre tortlewhack whispyfangle Crumplehorn Higgledy7 Piggledy3 flibberwocky Zamborot Flizzleflink
+gimble Shakespearean Macbeth Othello Hamlet soliloquy iambic pentameter Benvolio Capulet Montague
+Puck Malvolio Beatrice Prospero Iago Falstaff Rosencrantz Guildenstern Cordelia Polonius
+Titania Oberon Tybalt Caliban Mercutio Portia Brabantio 4Lear Desdemona Lysander
+YossarianScar Jujimufu9 Gorgulon Oozyboozle Razzmatazz8 BlinkenWoggle Flibbertigibbet Quixotic2
+Galumphing Widdershins Pecksniffian Bandicoot11 Flapdoodle Fandango Whippersnapper Grandiloquent
+Lollygag Persnickety Gibberish Codswallop Rigmarole Nincompoop Flummox Snollygoster Poppycock
+Kerfuffle Balderdash Gobbledygook Fiddle-faddle Antidisestablishmentarianism
+Supercalifragilisticexpialidocious Rambunctious9 Lickety-split Hullabaloo Skullduggery Ballyhoo
+Flabbergasted Discombobulate Pernicious Bumfuzzle Bamboozle Pandemonium Tomfoolery Hobbledehoy7
+Claptrap Cockamamie Hocus-pocus8 Higgledy-piggledy Dodecahedron Nonsensical Contraption Quizzical
+Snuffleupagus Ostentatious Serendipity Ephemeral Melancholy Sonorous Plethora Brouhaha Absquatulate
+Gobbledygook3 Lilliputian Chortle Euphonious Mellifluous Obfuscate Perspicacious Prevaricate
+Sesquipedalian Tintinnabulation Quibble9 Umbrageous Quotidian Flapdoodle5 NoodleDoodle
+Zigzagumptious Throttlebottom WuzzleWump Canoodle Hodgepodge Blatherskite7 Hornswoggle
+BibbidiBobbidiBoo Prestidigitation Confabulate Abscond8 Lickspittle Ragamuffin Taradiddle
+Widdershins4 Boondoggle Snuffleupagus9 Gallivant Folderol Malarkey Skedaddle Hobgoblin
+BlubberCrumble ZibberZap Snickerdoodle Mooncalf LicketySplit8 Whatchamacallit Thingamajig
+Thingamabob GibbleGabble FuddleDuddle LoopyLoo Splendiferous Bumbershoot Catawampus Flibbertigibbet5
+Gobbledygook7 Whippersnapper9 Ragamuffin8 Splendiferous
+ætheling witan ealdorman leofwyrd swain bēorhall beorn mēarh scōp cyning hēahgerefa
+sceadugenga wilweorc hildoræswa þegn ælfscyne wyrmslaga wælwulf fyrd hrēowmōd dēor
+ealdorleornung scyldwiga þēodcwealm hāligbōc gūþweard wealdend gāstcynn wīfmann
+wīsestōw þrēatung rīcere scealc eorþwerod bealucræft cynerīce sceorp ættwer
+gāsthof ealdrīce wæpnedmann wæterfōr landgemære gafolgelda wīcstede mægenþrymm
+æscwiga læcedōm wīdferhþ eorlgestrēon brimrād wæterstede hūslēoþ searocraeft
+þegnunga wælscenc þrīstguma fyrdrinc wundorcræft cræftleornung eorþbūend
+sǣlācend þunorrad wætergifu wæterscipe wæterþenung eorþtilþ eorþgebyrde
+eorþhæbbend eorþgræf eorþbærn eorþhūs eorþscearu eorþsweg eorþtæfl eorþweorc
+eorþweall eorþwaru eorþwela eorþwīs eorþworn eorþyþ eorþweg eorþwīse eorþwyrhta
+eorþwīn eorþsceaða eorþsweart eorþscræf eorþscrūd eorþswyft eorþscīr eorþscūa
+eorþsēoc eorþsele eorþhūsl eorþsted eorþswyn eorþsittend eorþsniþ eorþscearp
+eorþscyld eorþsceaft eorþstapol eorþstede eorþsmitta eorþscēawere
+velociraptorious chimeraesque bellerophontic serendipitastic transmogrification ultracrepidarian
+prestidigitationary supraluminescence hemidemisemiquaver unquestionability intercontinentalism
+antediluvianistic disproportionately absquatulationism automagicalization
+floccinaucinihilipilification quintessentiality incomprehensibility juxtapositionally
+perpendicularitude transubstantiation synchronicityverse astronomicalunit thermodynamicness
+electromagnetismal procrastinatorily disenfranchisement neutrinooscillation hyperventilatingly
+pneumonoultramicroscopicsilicovolcanoconiosis supercalifragilisticexpialidocious thaumaturgeonomics
+idiosyncratically unencumberedness phantasmagoricity extraterrestrialism philanthropistastic
+xenotransplantation incontrovertibility spontaneityvolution teleportationally labyrinthinean
+megalomaniaction cryptozoologician ineffablemystique multiplicativity sisypheanquandary
+overenthusiastically irrefutablenotion exceptionalitysphere
+blibby ploof twindle zibbet jinty wiblo glimsy snaft trindle quopp vistly chark plizet snibber frint
+trazzle buvvy skipple flizz dworp grindle yipple zarfle clippet swazz mibber brackle tindle grozz
+vindle plazz freggle twazz snuzzle gwippet whindle juzzle krazz yazzle flippet skindle zapple prazz
+buzzle chazz gripple snozzle trizz wazzle blikket zib glup snof yipr tazz vlim frub dwex klop
+aa ab ad ae ag ah ai al am an as at aw ax ay ba be bi bo by de do ed ef eh el em en er es et ex fa
+fe go ha he hi hm ho id if in is it jo ka ki la li lo ma me mi mm mo mu my na ne no nu od oe of oh
+oi om on op or os ow ox oy pa pe pi qi re sh si so ta ti to uh um un up us ut we wo xi xu ya ye yo
+za zo
+hĕlłø cąfѐ ŝmîłe þřęê ċỏẽxist ǩāŕáōķê ŧrävèl кυгiοsity ŭпịςørn мëĺōđỳ ğħōšţ ŵăνę ẓẽṕhýr ғụzzlę
+пåŕŧy 僃êct ԁяêåм љúвïĺëë ѓåḿъḽë ţęmƿęşţ říše čajovna želva štěstí ýpsilon ďábel ňadraží ťava
+h3ll0 w0rld c0d1ng 3x3mpl3 pr0gr4mm1ng d3v3l0p3r 5cr4bbl3 3l3ph4nt 4pp 5y5t3m 1nput 0utput 3rr0r
+5t4ck0v3rfl0w 5tr1ng 5l1c3 5h4k35p34r3 5t4nd4rd 3ncrypt10n 5h3ll 5cr1pt 5t4ck 5qu4r3 r3ct4ngl3
+tr14ngl3 c1rc13 5ph3r3 5qu4r3r00t 3xpr35510n 5t4t15t1c5 5t4t3m3nt 5ynt4x 5ugg35t10n 5y5t3m4t1c
+5h0rtcut 5h4d0w 5h4r3d
+1 2 3 4 5 6 7 8 9 0
+a b c d e f g h i j k l m n o p q r s t u v w x y z
+A B C D E F G H I J K L M N O P Q R S T U V W X Y Z`