diff --git a/.gitignore b/.gitignore index 2ee099a828e3a..942c7a9761a47 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /build /lake-packages +/.cache diff --git a/Cache/Hashing.lean b/Cache/Hashing.lean new file mode 100644 index 0000000000000..11797fb3fcc24 --- /dev/null +++ b/Cache/Hashing.lean @@ -0,0 +1,67 @@ +/- +Copyright (c) 2023 Arthur Paulino. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. +Authors: Arthur Paulino +-/ + +import Cache.IO +import Lean.Elab.ParseImportsFast + +namespace Cache.Hashing + +open System IO + +/-- We cache the hash of each file for faster lookup -/ +abbrev HashM := StateT IO.HashMap IO + +/-- Gets the file paths to Mathlib files imported on a Lean source -/ +def getFileImports (source : String) (pkgDirs : PackageDirs) : Array FilePath := + let s := Lean.ParseImports.main source (Lean.ParseImports.whitespace source {}) + let imps := s.imports.map (·.module.toString) + |>.map (·.splitOn ".") + |>.filter fun parts => match parts.head? with + | some head => pkgDirs.contains head + | none => false + imps.map (mkFilePath · |>.withExtension "lean") + +/-- +Computes the root hash, which mixes the hashes of the content of: +* `lakefile.lean` +* `lean-toolchain` +* `lake-manifest.json` + +TODO: make it `IO UInt64` when the initialization bug is fixed in core. `IO Nat` is a workaround. + See https://github.com/leanprover/lean4/issues/1998 +-/ +def getRootHash : IO Nat := do + let rootFiles : List FilePath := ["lakefile.lean", "lean-toolchain", "lake-manifest.json"] + let isMathlibRoot ← isMathlibRoot + let h := hash $ ← rootFiles.mapM fun path => + return ← IO.FS.readFile $ if isMathlibRoot then path else mathlibDepPath / path + return h.toNat + +initialize rootHashFix : Nat ← getRootHash + +def rootHash : UInt64 := + .ofNat rootHashFix + +/-- +Computes the hash of a file, which mixes: +* The root hash +* The hash of its content +* The hashes of the imported files that are part of `Mathlib` +-/ +partial def getFileHash (filePath : FilePath) : HashM UInt64 := do + match (← get).find? filePath with + | some hash => pure hash + | none => + let content ← IO.FS.readFile $ (← IO.getPackageDir filePath) / filePath + let importHashes ← (getFileImports content pkgDirs).mapM getFileHash + let fileHash := hash $ rootHash :: content.hash :: importHashes.toList + modifyGet (fileHash, ·.insert filePath fileHash) + +/-- Main API to retrieve the hashes of the Lean files -/ +def getHashes : IO IO.HashMap := + return (← StateT.run (getFileHash ⟨"Mathlib.lean"⟩) default).2 + +end Cache.Hashing diff --git a/Cache/IO.lean b/Cache/IO.lean new file mode 100644 index 0000000000000..8abc879293e6c --- /dev/null +++ b/Cache/IO.lean @@ -0,0 +1,156 @@ +/- +Copyright (c) 2023 Arthur Paulino. All rights reserved. +Released under Apache 2.0 license as described in the file LICENSE. 
+Authors: Arthur Paulino +-/ + +import Lean.Data.HashMap +import Lean.Data.RBMap +import Lean.Data.RBTree + +/-- Removes a parent path from the beginning of a path -/ +def System.FilePath.withoutParent (path parent : FilePath) : FilePath := + let rec aux : List String → List String → List String + | z@(x :: xs), y :: ys => if x == y then aux xs ys else z + | [], _ => [] + | x, [] => x + mkFilePath $ aux path.components parent.components + +def UInt64.asTarGz (n : UInt64) : String := + s!"{n}.tar.gz" + +namespace Cache.IO + +open System (FilePath) + +/-- Target directory for build files -/ +def LIBDIR : FilePath := + "build" / "lib" + +/-- Target directory for IR files -/ +def IRDIR : FilePath := + "build" / "ir" + +/-- Target directory for caching -/ +def CACHEDIR : FilePath := + ⟨".cache"⟩ + +/-- Target directory for caching -/ +def TMPDIR : FilePath := + CACHEDIR / "tmp" + +def LAKEPACKAGESDIR : FilePath := + ⟨"lake-packages"⟩ + +abbrev PackageDirs := Lean.RBMap String FilePath compare + +/-- Whether this is running on Mathlib repo or not -/ +def isMathlibRoot : IO Bool := + FilePath.mk "Mathlib" |>.pathExists + +def mathlibDepPath : FilePath := + LAKEPACKAGESDIR / "Mathlib" + +def getPackageDirs : IO PackageDirs := return .ofList [ + ("Mathlib", if ← isMathlibRoot then "." else mathlibDepPath), + ("Aesop", LAKEPACKAGESDIR / "aesop"), + ("Std", LAKEPACKAGESDIR / "std"), + ("Qq", LAKEPACKAGESDIR / "Qq") +] + +initialize pkgDirs : PackageDirs ← getPackageDirs + +def getPackageDir (path : FilePath) : IO FilePath := + match path.withExtension "" |>.components.head? with + | none => throw $ IO.userError "Can't find package directory for empty path" + | some pkg => match pkgDirs.find? pkg with + | none => throw $ IO.userError s!"Unknown package directory for {pkg}" + | some path => return path + +/-- Runs a terminal command and retrieves its output -/ +def runCmd (cmd : String) (args : Array String) : IO String := do + let out ← IO.Process.output { cmd := cmd, args := args } + if out.exitCode != 0 then throw $ IO.userError out.stderr + else return out.stdout + +/-- Recursively gets all files from a directory with a certain extension -/ +partial def getFilesWithExtension + (fp : FilePath) (extension : String) (acc : Array FilePath := #[]) : + IO $ Array FilePath := do + if ← fp.isDir then + (← fp.readDir).foldlM (fun acc dir => getFilesWithExtension dir.path extension acc) acc + else return if fp.extension == some extension then acc.push fp else acc + +abbrev HashMap := Lean.HashMap FilePath UInt64 + +namespace HashMap + +def filter (hashMap : HashMap) (set : Lean.RBTree String compare) (keep : Bool) : HashMap := + hashMap.fold (init := default) fun acc path hash => + let contains := set.contains hash.asTarGz + let add := if keep then contains else !contains + if add then acc.insert path hash else acc + +def hashes (hashMap : HashMap) : Lean.RBTree UInt64 compare := + hashMap.fold (init := default) fun acc _ hash => acc.insert hash + +end HashMap + +def mkDir (path : FilePath) : IO Unit := do + if !(← path.pathExists) then IO.FS.createDirAll path + +/-- Given a path to a Lean file, concatenates the paths to its build files -/ +def mkBuildPaths (path : FilePath) : IO $ Array String := do + let packageDir ← getPackageDir path + return #[ + packageDir / LIBDIR / path.withExtension "olean" |>.toString, + packageDir / LIBDIR / path.withExtension "ilean" |>.toString, + packageDir / LIBDIR / path.withExtension "trace" |>.toString, + packageDir / IRDIR / path.withExtension "c" |>.toString, + packageDir 
/ IRDIR / path.withExtension "c.trace" |>.toString]
+
+/-- Compresses build files into the local cache -/
+def mkCache (hashMap : HashMap) (overwrite : Bool) : IO $ Array String := do
+  mkDir CACHEDIR
+  IO.println "Compressing cache"
+  let mut acc := default
+  for (path, hash) in hashMap.toList do
+    let zip := hash.asTarGz
+    let zipPath := CACHEDIR / zip
+    if overwrite || !(← zipPath.pathExists) then
+      discard $ runCmd "tar" $ #["-I", "gzip -9", "-cf", zipPath.toString] ++
+        (← mkBuildPaths path)
+    acc := acc.push zip
+  return acc
+
+/-- Gets the set of all cached files -/
+def getLocalCacheSet : IO $ Lean.RBTree String compare := do
+  let paths ← getFilesWithExtension CACHEDIR "gz"
+  return .ofList (paths.data.map (·.withoutParent CACHEDIR |>.toString))
+
+/-- Decompresses build files into their respective folders -/
+def setCache (hashMap : HashMap) : IO Unit := do
+  IO.println "Decompressing cache"
+  hashMap.filter (← getLocalCacheSet) true |>.forM fun path hash => do
+    match path.parent with
+    | none | some path => do
+      let packageDir ← getPackageDir path
+      mkDir $ packageDir / LIBDIR / path
+      mkDir $ packageDir / IRDIR / path
+      discard $ runCmd "tar" #["-xzf", s!"{CACHEDIR / hash.asTarGz}"]
+
+instance : Ord FilePath where
+  compare x y := compare x.toString y.toString
+
+/-- Retrieves the Azure token from the environment -/
+def getToken : IO String := do
+  let some token ← IO.getEnv "MATHLIB_CACHE_SAS"
+    | throw $ IO.userError "environment variable MATHLIB_CACHE_SAS must be set to upload caches"
+  return token
+
+/-- Removes all cache files except for what's in the `keep` set -/
+def clearCache (keep : Lean.RBTree FilePath compare := default) : IO Unit := do
+  for path in ← getFilesWithExtension CACHEDIR "gz" do
+    if ! keep.contains path then IO.FS.removeFile path
+
+end Cache.IO
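As a reading aid (not part of the diff), the pieces above compose as follows: each Lean source file maps to a hash, the hash names a `.tar.gz` archive inside `.cache`, and `withoutParent` recovers the bare archive name when the cache directory is scanned. A minimal sketch, assuming `Cache/IO.lean` as defined above and a made-up hash value:

```lean
import Cache.IO

open Cache.IO in
#eval show IO Unit from do
  let h : UInt64 := 1234  -- hypothetical hash, for illustration only
  let zipPath := CACHEDIR / h.asTarGz
  IO.println zipPath                           -- ".cache/1234.tar.gz"
  IO.println $ zipPath.withoutParent CACHEDIR  -- "1234.tar.gz"
```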
diff --git a/Cache/Main.lean b/Cache/Main.lean
new file mode 100644
index 0000000000000..e4034e28b0e4d
--- /dev/null
+++ b/Cache/Main.lean
@@ -0,0 +1,55 @@
+/-
+Copyright (c) 2023 Arthur Paulino. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Arthur Paulino
+-/
+
+import Cache.Requests
+
+def help : String := "Mathlib4 caching CLI
+Usage: cache [COMMAND]
+
+Commands:
+  # No privilege required
+  get     Download and decompress linked files missing on the local cache
+  get!    Download and decompress all linked files
+  mk      Compress non-compressed build files into the local cache
+  mk!     Compress build files into the local cache (no skipping)
+  set     Decompress linked files
+  clear   Delete non-linked files
+  clear!  Delete everything on the local cache
+
+  # Privilege required
+  put     Run 'mk' then upload linked files missing on the server
+  put!    Run 'mk' then upload all linked files
+  commit  Run 'put' then write a commit on the server
+  commit! Run 'put!' then (over)write a commit on the server
+  collect TODO
+
+* Linked files refer to local cache files with corresponding Lean sources
+* Commands ending with '!' should be used manually, when hot-fixes are needed"
+
+open Cache IO Hashing Requests in
+def main (args : List String) : IO Unit := do
+  let hashMap ← getHashes
+  match args with
+  | ["get"] => getFiles (hashMap.filter (← getLocalCacheSet) false)
+  | ["get!"] => getFiles hashMap
+  | ["mk"] => discard $ mkCache hashMap false
+  | ["mk!"] => discard $ mkCache hashMap true
+  | ["set"] => setCache hashMap
+  | ["clear"] =>
+    clearCache $ hashMap.fold (fun acc _ hash => acc.insert $ CACHEDIR / hash.asTarGz) .empty
+  | ["clear!"] => clearCache
+  | ["put"] => putFiles (← mkCache hashMap false) false (← getToken)
+  | ["put!"] => putFiles (← mkCache hashMap false) true (← getToken)
+  | ["commit"] =>
+    if !(← isStatusClean) then IO.println "Please commit your changes first" else
+      putFiles (← mkCache hashMap false) false (← getToken)
+      commit hashMap false (← getToken)
+  | ["commit!"] =>
+    if !(← isStatusClean) then IO.println "Please commit your changes first" else
+      putFiles (← mkCache hashMap false) true (← getToken)
+      commit hashMap true (← getToken)
+  | ["collect"] => IO.println "TODO"
+  | _ => println help
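One subtlety worth spelling out (not part of the diff): `get` passes `false` to `HashMap.filter`, keeping only the entries whose archives are *not* already in `.cache`, while `setCache` passes `true` to keep the ones that are. A small sketch of that filtering step, assuming the `Cache.IO` definitions above and hypothetical paths and hashes:

```lean
import Cache.IO

open Cache.IO in
#eval show IO Unit from do
  -- hypothetical map from Lean sources to their hashes
  let hashMap : HashMap := (default : HashMap)
    |>.insert ⟨"Mathlib/Foo.lean"⟩ 1111
    |>.insert ⟨"Mathlib/Bar.lean"⟩ 2222
  -- pretend only 1111.tar.gz is already present in .cache
  let localCache : Lean.RBTree String compare := .ofList ["1111.tar.gz"]
  -- keep = false: entries missing locally, i.e. what `get` would download
  let missing := hashMap.filter localCache false
  IO.println $ missing.toList.map (·.1)  -- [Mathlib/Bar.lean]
```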
+-/ +def getFiles (hashMap : IO.HashMap) : IO Unit := do + let size := hashMap.size + if size > 0 then + let pairs := hashMap.fold (init := #[]) fun acc _ hash => + let fileName := hash.asTarGz + acc.push (fileName, IO.TMPDIR / fileName) + IO.println s!"Attempting to download {size} file(s)" + IO.mkDir IO.TMPDIR + discard $ IO.runCmd "curl" $ #["-X", "GET", "--parallel"] ++ (← mkGetPairs pairs) + for (fileName, path) in pairs do + let bytes ← IO.FS.readBinFile path + if !bytes.startsWith invalidFileStart then + IO.FS.writeBinFile (IO.CACHEDIR / fileName) bytes + IO.FS.removeDirAll IO.TMPDIR + IO.setCache hashMap + else IO.println "No file to download" + +end Get + +section Put + +/-- Formats part of the `curl` command that corresponds to the listing of files to be uploaded -/ +def mkPutPairs (fileNames : Array String) (token : String) : IO $ Array String := + fileNames.foldlM (init := default) fun acc fileName => do + pure $ acc.append #["-T", s!"{IO.CACHEDIR / fileName}", ← mkFileURL fileName (some token)] + +/-- Calls `curl` to send a set of cached files to the server -/ +def putFiles (fileNames : Array String) (overwrite : Bool) (token : String) : IO Unit := do + let size := fileNames.size + if size > 0 then + IO.println s!"Attempting to upload {size} file(s)" + let params := if overwrite + then #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob", "--parallel"] + else #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob", "-H", "If-None-Match: *", "--parallel"] + discard $ IO.runCmd "curl" $ params ++ (← mkPutPairs fileNames token) + else IO.println "No file to upload" + +end Put + +section Commit + +def isStatusClean : IO Bool := + return (← IO.runCmd "git" #["status", "--porcelain"]).isEmpty + +def getCommitHash : IO String := do + let ret := (← IO.runCmd "git" #["log", "-1"]).replace "\n" " " + match ret.splitOn " " with + | "commit" :: hash :: _ => return hash + | _ => throw $ IO.userError "Invalid format for the return of `git log -1`" + +/-- +Sends a commit file to the server, containing the hashes of the respective commited files. + +The file name is the current Git hash and the `c/` prefix means that it's a commit file. +-/ +def commit (hashMap : IO.HashMap) (overwrite : Bool) (token : String) : IO Unit := do + let hash ← getCommitHash + let path := IO.TMPDIR / hash + IO.mkDir IO.TMPDIR + IO.FS.writeFile path $ ("\n".intercalate $ hashMap.hashes.toList.map toString) ++ "\n" + let params := if overwrite + then #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob"] + else #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob", "-H", "If-None-Match: *"] + discard $ IO.runCmd "curl" $ params ++ #["-T", path.toString, s!"{URL}/c/{hash}?{token}"] + IO.FS.removeDirAll IO.TMPDIR + +end Commit + +section Collect + +inductive QueryType + | files | commits | all + +def QueryType.prefix : QueryType → String + | files => "&prefix=f/" + | commits => "&prefix=c/" + | all => default + +def formatError : IO α := + throw $ IO.userError "Invalid format for curl return" + +def QueryType.desc : QueryType → String + | files => "hosted files" + | commits => "hosted commits" + | all => "everything" + +/-- +Retrieves metadata about hosted files: their names and the timestamps of last modification. 
diff --git a/lakefile.lean b/lakefile.lean
index af1044f1fff30..20c8bc49cceb7 100644
--- a/lakefile.lean
+++ b/lakefile.lean
@@ -19,3 +19,9 @@ require «doc-gen4» from git "https://github.com/leanprover/doc-gen4" @ "main"
 require std from git "https://github.com/leanprover/std4" @ "main"
 require Qq from git "https://github.com/gebner/quote4" @ "master"
 require aesop from git "https://github.com/JLimperg/aesop" @ "master"
+
+lean_lib Cache where
+  roots := #[`Cache]
+
+lean_exe cache where
+  root := `Cache.Main
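With the `lean_lib Cache` and `lean_exe cache` targets above, the tool can then be built and invoked through Lake: `lake build cache` compiles it, and `lake exe cache get`, `lake exe cache put`, etc. dispatch to the commands listed in `help`. Only the upload commands (`put`, `put!`, `commit`, `commit!`) need the `MATHLIB_CACHE_SAS` environment variable read in `Cache/IO.lean`.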