Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: caching build files on Azure (#1230)
This PR implements caching for Mathlib build files. The basic API is: * `lake exe cache get` to download missing build files * `lake exe cache put` to upload files that are missing on the server * `lake exe cache` to print the help menu This caching method has a few enhancements on top of the solution we had for mathlib in Lean 3: 1. Source files are content-addressed and build files are referenced by such hashes separately 2. Minimized download traffic with queries that only pull files that are missing locally 3. Minimized upload traffic with queries that only push files that are missing on the server
- Loading branch information
1 parent
bfe06a7
commit 01a2153
Showing
6 changed files
with
452 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
/build | ||
/lake-packages | ||
/.cache |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
/- | ||
Copyright (c) 2023 Arthur Paulino. All rights reserved. | ||
Released under Apache 2.0 license as described in the file LICENSE. | ||
Authors: Arthur Paulino | ||
-/ | ||
|
||
import Cache.IO | ||
import Lean.Elab.ParseImportsFast | ||
|
||
namespace Cache.Hashing | ||
|
||
open System IO | ||
|
||
/-- We cache the hash of each file for faster lookup -/ | ||
abbrev HashM := StateT IO.HashMap IO | ||
|
||
/-- Gets the file paths to Mathlib files imported on a Lean source -/ | ||
def getFileImports (source : String) (pkgDirs : PackageDirs) : Array FilePath := | ||
let s := Lean.ParseImports.main source (Lean.ParseImports.whitespace source {}) | ||
let imps := s.imports.map (·.module.toString) | ||
|>.map (·.splitOn ".") | ||
|>.filter fun parts => match parts.head? with | ||
| some head => pkgDirs.contains head | ||
| none => false | ||
imps.map (mkFilePath · |>.withExtension "lean") | ||
|
||
/-- | ||
Computes the root hash, which mixes the hashes of the content of: | ||
* `lakefile.lean` | ||
* `lean-toolchain` | ||
* `lake-manifest.json` | ||
TODO: make it `IO UInt64` when the initialization bug is fixed in core. `IO Nat` is a workaround. | ||
See https://github.com/leanprover/lean4/issues/1998 | ||
-/ | ||
def getRootHash : IO Nat := do | ||
let rootFiles : List FilePath := ["lakefile.lean", "lean-toolchain", "lake-manifest.json"] | ||
let isMathlibRoot ← isMathlibRoot | ||
let h := hash $ ← rootFiles.mapM fun path => | ||
return ← IO.FS.readFile $ if isMathlibRoot then path else mathlibDepPath / path | ||
return h.toNat | ||
|
||
initialize rootHashFix : Nat ← getRootHash | ||
|
||
def rootHash : UInt64 := | ||
.ofNat rootHashFix | ||
|
||
/-- | ||
Computes the hash of a file, which mixes: | ||
* The root hash | ||
* The hash of its content | ||
* The hashes of the imported files that are part of `Mathlib` | ||
-/ | ||
partial def getFileHash (filePath : FilePath) : HashM UInt64 := do | ||
match (← get).find? filePath with | ||
| some hash => pure hash | ||
| none => | ||
let content ← IO.FS.readFile $ (← IO.getPackageDir filePath) / filePath | ||
let importHashes ← (getFileImports content pkgDirs).mapM getFileHash | ||
let fileHash := hash $ rootHash :: content.hash :: importHashes.toList | ||
modifyGet (fileHash, ·.insert filePath fileHash) | ||
|
||
/-- Main API to retrieve the hashes of the Lean files -/ | ||
def getHashes : IO IO.HashMap := | ||
return (← StateT.run (getFileHash ⟨"Mathlib.lean"⟩) default).2 | ||
|
||
end Cache.Hashing |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
/- | ||
Copyright (c) 2023 Arthur Paulino. All rights reserved. | ||
Released under Apache 2.0 license as described in the file LICENSE. | ||
Authors: Arthur Paulino | ||
-/ | ||
|
||
import Lean.Data.HashMap | ||
import Lean.Data.RBMap | ||
import Lean.Data.RBTree | ||
|
||
/-- Removes a parent path from the beginning of a path -/ | ||
def System.FilePath.withoutParent (path parent : FilePath) : FilePath := | ||
let rec aux : List String → List String → List String | ||
| z@(x :: xs), y :: ys => if x == y then aux xs ys else z | ||
| [], _ => [] | ||
| x, [] => x | ||
mkFilePath $ aux path.components parent.components | ||
|
||
def UInt64.asTarGz (n : UInt64) : String := | ||
s!"{n}.tar.gz" | ||
|
||
namespace Cache.IO | ||
|
||
open System (FilePath) | ||
|
||
/-- Target directory for build files -/ | ||
def LIBDIR : FilePath := | ||
"build" / "lib" | ||
|
||
/-- Target directory for IR files -/ | ||
def IRDIR : FilePath := | ||
"build" / "ir" | ||
|
||
/-- Target directory for caching -/ | ||
def CACHEDIR : FilePath := | ||
⟨".cache"⟩ | ||
|
||
/-- Target directory for caching -/ | ||
def TMPDIR : FilePath := | ||
CACHEDIR / "tmp" | ||
|
||
def LAKEPACKAGESDIR : FilePath := | ||
⟨"lake-packages"⟩ | ||
|
||
abbrev PackageDirs := Lean.RBMap String FilePath compare | ||
|
||
/-- Whether this is running on Mathlib repo or not -/ | ||
def isMathlibRoot : IO Bool := | ||
FilePath.mk "Mathlib" |>.pathExists | ||
|
||
def mathlibDepPath : FilePath := | ||
LAKEPACKAGESDIR / "Mathlib" | ||
|
||
def getPackageDirs : IO PackageDirs := return .ofList [ | ||
("Mathlib", if ← isMathlibRoot then "." else mathlibDepPath), | ||
("Aesop", LAKEPACKAGESDIR / "aesop"), | ||
("Std", LAKEPACKAGESDIR / "std"), | ||
("Qq", LAKEPACKAGESDIR / "Qq") | ||
] | ||
|
||
initialize pkgDirs : PackageDirs ← getPackageDirs | ||
|
||
def getPackageDir (path : FilePath) : IO FilePath := | ||
match path.withExtension "" |>.components.head? with | ||
| none => throw $ IO.userError "Can't find package directory for empty path" | ||
| some pkg => match pkgDirs.find? pkg with | ||
| none => throw $ IO.userError s!"Unknown package directory for {pkg}" | ||
| some path => return path | ||
|
||
/-- Runs a terminal command and retrieves its output -/ | ||
def runCmd (cmd : String) (args : Array String) : IO String := do | ||
let out ← IO.Process.output { cmd := cmd, args := args } | ||
if out.exitCode != 0 then throw $ IO.userError out.stderr | ||
else return out.stdout | ||
|
||
/-- Recursively gets all files from a directory with a certain extension -/ | ||
partial def getFilesWithExtension | ||
(fp : FilePath) (extension : String) (acc : Array FilePath := #[]) : | ||
IO $ Array FilePath := do | ||
if ← fp.isDir then | ||
(← fp.readDir).foldlM (fun acc dir => getFilesWithExtension dir.path extension acc) acc | ||
else return if fp.extension == some extension then acc.push fp else acc | ||
|
||
abbrev HashMap := Lean.HashMap FilePath UInt64 | ||
|
||
namespace HashMap | ||
|
||
def filter (hashMap : HashMap) (set : Lean.RBTree String compare) (keep : Bool) : HashMap := | ||
hashMap.fold (init := default) fun acc path hash => | ||
let contains := set.contains hash.asTarGz | ||
let add := if keep then contains else !contains | ||
if add then acc.insert path hash else acc | ||
|
||
def hashes (hashMap : HashMap) : Lean.RBTree UInt64 compare := | ||
hashMap.fold (init := default) fun acc _ hash => acc.insert hash | ||
|
||
end HashMap | ||
|
||
def mkDir (path : FilePath) : IO Unit := do | ||
if !(← path.pathExists) then IO.FS.createDirAll path | ||
|
||
/-- Given a path to a Lean file, concatenates the paths to its build files -/ | ||
def mkBuildPaths (path : FilePath) : IO $ Array String := do | ||
let packageDir ← getPackageDir path | ||
return #[ | ||
packageDir / LIBDIR / path.withExtension "olean" |>.toString, | ||
packageDir / LIBDIR / path.withExtension "ilean" |>.toString, | ||
packageDir / LIBDIR / path.withExtension "trace" |>.toString, | ||
packageDir / IRDIR / path.withExtension "c" |>.toString, | ||
packageDir / IRDIR / path.withExtension "c.trace" |>.toString] | ||
|
||
/-- Compresses build files into the local cache -/ | ||
def mkCache (hashMap : HashMap) (overwrite : Bool) : IO $ Array String := do | ||
mkDir CACHEDIR | ||
IO.println "Compressing cache" | ||
let mut acc := default | ||
for (path, hash) in hashMap.toList do | ||
let zip := hash.asTarGz | ||
let zipPath := CACHEDIR / zip | ||
if overwrite || !(← zipPath.pathExists) then | ||
discard $ runCmd "tar" $ #["-I", "gzip -9", "-cf", zipPath.toString] ++ | ||
(← mkBuildPaths path) | ||
acc := acc.push zip | ||
return acc | ||
|
||
/-- Gets the set of all cached files -/ | ||
def getLocalCacheSet : IO $ Lean.RBTree String compare := do | ||
let paths ← getFilesWithExtension CACHEDIR "gz" | ||
return .ofList (paths.data.map (·.withoutParent CACHEDIR |>.toString)) | ||
|
||
/-- Decompresses build files into their respective folders -/ | ||
def setCache (hashMap : HashMap) : IO Unit := do | ||
IO.println "Decompressing cache" | ||
hashMap.filter (← getLocalCacheSet) true |>.forM fun path hash => do | ||
match path.parent with | ||
| none | some path => do | ||
let packageDir ← getPackageDir path | ||
mkDir $ packageDir / LIBDIR / path | ||
mkDir $ packageDir / IRDIR / path | ||
discard $ runCmd "tar" #["-xzf", s!"{CACHEDIR / hash.asTarGz}"] | ||
|
||
instance : Ord FilePath where | ||
compare x y := compare x.toString y.toString | ||
|
||
/-- Retrieves the azure token from the file system -/ | ||
def getToken : IO String := do | ||
let some token ← IO.getEnv "MATHLIB_CACHE_SAS" | ||
| throw $ IO.userError "environment variable MATHLIB_CACHE_SAS must be set to upload caches" | ||
return token | ||
|
||
/-- Removes all cache files except for what's in the `keep` set -/ | ||
def clearCache (keep : Lean.RBTree FilePath compare := default) : IO Unit := do | ||
for path in ← getFilesWithExtension CACHEDIR "gz" do | ||
if ! keep.contains path then IO.FS.removeFile path | ||
|
||
end Cache.IO |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/- | ||
Copyright (c) 2023 Arthur Paulino. All rights reserved. | ||
Released under Apache 2.0 license as described in the file LICENSE. | ||
Authors: Arthur Paulino | ||
-/ | ||
|
||
import Cache.Requests | ||
|
||
def help : String := "Mathlib4 caching CLI | ||
Usage: cache [COMMAND] | ||
Commands: | ||
# No priviledge required | ||
get Download and decompress linked files missing on the local cache | ||
get! Download and decompress all linked files | ||
mk Compress non-compressed build files into the local cache | ||
mk! Compress build files into the local cache (no skipping) | ||
set Decompress linked files | ||
clear Delete non-linked files | ||
clear! Delete everything on the local cache | ||
# Privilege required | ||
put Run 'mk' then upload linked files missing on the server | ||
put! Run 'mk' then upload all linked files | ||
commit Run 'put' then write a commit on the server | ||
commit! Run 'put!' then (over)write a commit on the server | ||
collect TODO | ||
* Linked files refer to local cache files with corresponding Lean sources | ||
* Commands ending with '!' should be used manually, when hot-fixes are needed" | ||
|
||
open Cache IO Hashing Requests in | ||
def main (args : List String) : IO Unit := do | ||
let hashMap ← getHashes | ||
match args with | ||
| ["get"] => getFiles (hashMap.filter (← getLocalCacheSet) false) | ||
| ["get!"] => getFiles hashMap | ||
| ["mk"] => discard $ mkCache hashMap false | ||
| ["mk!"] => discard $ mkCache hashMap true | ||
| ["set"] => setCache hashMap | ||
| ["clear"] => | ||
clearCache $ hashMap.fold (fun acc _ hash => acc.insert $ CACHEDIR / hash.asTarGz) .empty | ||
| ["clear!"] => clearCache | ||
| ["put"] => putFiles (← mkCache hashMap false) false (← getToken) | ||
| ["put!"] => putFiles (← mkCache hashMap false) true (← getToken) | ||
| ["commit"] => | ||
if !(← isStatusClean) then IO.println "Please commit your changes first" return else | ||
putFiles (← mkCache hashMap false) false (← getToken) | ||
commit hashMap false (← getToken) | ||
| ["commit!"] => | ||
if !(← isStatusClean) then IO.println "Please commit your changes first" return else | ||
putFiles (← mkCache hashMap false) true (← getToken) | ||
commit hashMap true (← getToken) | ||
| ["collect"] => IO.println "TODO" | ||
| _ => println help |
Oops, something went wrong.