Skip to content

Commit 01a2153

Browse files
committed
feat: caching build files on Azure (#1230)
This PR implements caching for Mathlib build files. The basic API is: * `lake exe cache get` to download missing build files * `lake exe cache put` to upload files that are missing on the server * `lake exe cache` to print the help menu This caching method has a few enhancements on top of the solution we had for mathlib in Lean 3: 1. Source files are content-addressed and build files are referenced by such hashes separately 2. Minimized download traffic with queries that only pull files that are missing locally 3. Minimized upload traffic with queries that only push files that are missing on the server
1 parent bfe06a7 commit 01a2153

File tree

6 files changed

+452
-0
lines changed

6 files changed

+452
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
/build
22
/lake-packages
3+
/.cache

Cache/Hashing.lean

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/-
2+
Copyright (c) 2023 Arthur Paulino. All rights reserved.
3+
Released under Apache 2.0 license as described in the file LICENSE.
4+
Authors: Arthur Paulino
5+
-/
6+
7+
import Cache.IO
8+
import Lean.Elab.ParseImportsFast
9+
10+
namespace Cache.Hashing
11+
12+
open System IO
13+
14+
/-- We cache the hash of each file for faster lookup -/
15+
abbrev HashM := StateT IO.HashMap IO
16+
17+
/-- Gets the file paths to Mathlib files imported on a Lean source -/
18+
def getFileImports (source : String) (pkgDirs : PackageDirs) : Array FilePath :=
19+
let s := Lean.ParseImports.main source (Lean.ParseImports.whitespace source {})
20+
let imps := s.imports.map (·.module.toString)
21+
|>.map (·.splitOn ".")
22+
|>.filter fun parts => match parts.head? with
23+
| some head => pkgDirs.contains head
24+
| none => false
25+
imps.map (mkFilePath · |>.withExtension "lean")
26+
27+
/--
28+
Computes the root hash, which mixes the hashes of the content of:
29+
* `lakefile.lean`
30+
* `lean-toolchain`
31+
* `lake-manifest.json`
32+
33+
TODO: make it `IO UInt64` when the initialization bug is fixed in core. `IO Nat` is a workaround.
34+
See https://github.com/leanprover/lean4/issues/1998
35+
-/
36+
def getRootHash : IO Nat := do
37+
let rootFiles : List FilePath := ["lakefile.lean", "lean-toolchain", "lake-manifest.json"]
38+
let isMathlibRoot ← isMathlibRoot
39+
let h := hash $ ← rootFiles.mapM fun path =>
40+
return ← IO.FS.readFile $ if isMathlibRoot then path else mathlibDepPath / path
41+
return h.toNat
42+
43+
initialize rootHashFix : Nat ← getRootHash
44+
45+
def rootHash : UInt64 :=
46+
.ofNat rootHashFix
47+
48+
/--
49+
Computes the hash of a file, which mixes:
50+
* The root hash
51+
* The hash of its content
52+
* The hashes of the imported files that are part of `Mathlib`
53+
-/
54+
partial def getFileHash (filePath : FilePath) : HashM UInt64 := do
55+
match (← get).find? filePath with
56+
| some hash => pure hash
57+
| none =>
58+
let content ← IO.FS.readFile $ (← IO.getPackageDir filePath) / filePath
59+
let importHashes ← (getFileImports content pkgDirs).mapM getFileHash
60+
let fileHash := hash $ rootHash :: content.hash :: importHashes.toList
61+
modifyGet (fileHash, ·.insert filePath fileHash)
62+
63+
/-- Main API to retrieve the hashes of the Lean files -/
64+
def getHashes : IO IO.HashMap :=
65+
return (← StateT.run (getFileHash ⟨"Mathlib.lean"⟩) default).2
66+
67+
end Cache.Hashing

Cache/IO.lean

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
/-
2+
Copyright (c) 2023 Arthur Paulino. All rights reserved.
3+
Released under Apache 2.0 license as described in the file LICENSE.
4+
Authors: Arthur Paulino
5+
-/
6+
7+
import Lean.Data.HashMap
8+
import Lean.Data.RBMap
9+
import Lean.Data.RBTree
10+
11+
/-- Removes a parent path from the beginning of a path -/
12+
def System.FilePath.withoutParent (path parent : FilePath) : FilePath :=
13+
let rec aux : List String → List String → List String
14+
| z@(x :: xs), y :: ys => if x == y then aux xs ys else z
15+
| [], _ => []
16+
| x, [] => x
17+
mkFilePath $ aux path.components parent.components
18+
19+
def UInt64.asTarGz (n : UInt64) : String :=
20+
s!"{n}.tar.gz"
21+
22+
namespace Cache.IO
23+
24+
open System (FilePath)
25+
26+
/-- Target directory for build files -/
27+
def LIBDIR : FilePath :=
28+
"build" / "lib"
29+
30+
/-- Target directory for IR files -/
31+
def IRDIR : FilePath :=
32+
"build" / "ir"
33+
34+
/-- Target directory for caching -/
35+
def CACHEDIR : FilePath :=
36+
".cache"
37+
38+
/-- Target directory for caching -/
39+
def TMPDIR : FilePath :=
40+
CACHEDIR / "tmp"
41+
42+
def LAKEPACKAGESDIR : FilePath :=
43+
"lake-packages"
44+
45+
abbrev PackageDirs := Lean.RBMap String FilePath compare
46+
47+
/-- Whether this is running on Mathlib repo or not -/
48+
def isMathlibRoot : IO Bool :=
49+
FilePath.mk "Mathlib" |>.pathExists
50+
51+
def mathlibDepPath : FilePath :=
52+
LAKEPACKAGESDIR / "Mathlib"
53+
54+
def getPackageDirs : IO PackageDirs := return .ofList [
55+
("Mathlib", if ← isMathlibRoot then "." else mathlibDepPath),
56+
("Aesop", LAKEPACKAGESDIR / "aesop"),
57+
("Std", LAKEPACKAGESDIR / "std"),
58+
("Qq", LAKEPACKAGESDIR / "Qq")
59+
]
60+
61+
initialize pkgDirs : PackageDirs ← getPackageDirs
62+
63+
def getPackageDir (path : FilePath) : IO FilePath :=
64+
match path.withExtension "" |>.components.head? with
65+
| none => throw $ IO.userError "Can't find package directory for empty path"
66+
| some pkg => match pkgDirs.find? pkg with
67+
| none => throw $ IO.userError s!"Unknown package directory for {pkg}"
68+
| some path => return path
69+
70+
/-- Runs a terminal command and retrieves its output -/
71+
def runCmd (cmd : String) (args : Array String) : IO String := do
72+
let out ← IO.Process.output { cmd := cmd, args := args }
73+
if out.exitCode != 0 then throw $ IO.userError out.stderr
74+
else return out.stdout
75+
76+
/-- Recursively gets all files from a directory with a certain extension -/
77+
partial def getFilesWithExtension
78+
(fp : FilePath) (extension : String) (acc : Array FilePath := #[]) :
79+
IO $ Array FilePath := do
80+
if ← fp.isDir then
81+
(← fp.readDir).foldlM (fun acc dir => getFilesWithExtension dir.path extension acc) acc
82+
else return if fp.extension == some extension then acc.push fp else acc
83+
84+
abbrev HashMap := Lean.HashMap FilePath UInt64
85+
86+
namespace HashMap
87+
88+
def filter (hashMap : HashMap) (set : Lean.RBTree String compare) (keep : Bool) : HashMap :=
89+
hashMap.fold (init := default) fun acc path hash =>
90+
let contains := set.contains hash.asTarGz
91+
let add := if keep then contains else !contains
92+
if add then acc.insert path hash else acc
93+
94+
def hashes (hashMap : HashMap) : Lean.RBTree UInt64 compare :=
95+
hashMap.fold (init := default) fun acc _ hash => acc.insert hash
96+
97+
end HashMap
98+
99+
def mkDir (path : FilePath) : IO Unit := do
100+
if !(← path.pathExists) then IO.FS.createDirAll path
101+
102+
/-- Given a path to a Lean file, concatenates the paths to its build files -/
103+
def mkBuildPaths (path : FilePath) : IO $ Array String := do
104+
let packageDir ← getPackageDir path
105+
return #[
106+
packageDir / LIBDIR / path.withExtension "olean" |>.toString,
107+
packageDir / LIBDIR / path.withExtension "ilean" |>.toString,
108+
packageDir / LIBDIR / path.withExtension "trace" |>.toString,
109+
packageDir / IRDIR / path.withExtension "c" |>.toString,
110+
packageDir / IRDIR / path.withExtension "c.trace" |>.toString]
111+
112+
/-- Compresses build files into the local cache -/
113+
def mkCache (hashMap : HashMap) (overwrite : Bool) : IO $ Array String := do
114+
mkDir CACHEDIR
115+
IO.println "Compressing cache"
116+
let mut acc := default
117+
for (path, hash) in hashMap.toList do
118+
let zip := hash.asTarGz
119+
let zipPath := CACHEDIR / zip
120+
if overwrite || !(← zipPath.pathExists) then
121+
discard $ runCmd "tar" $ #["-I", "gzip -9", "-cf", zipPath.toString] ++
122+
(← mkBuildPaths path)
123+
acc := acc.push zip
124+
return acc
125+
126+
/-- Gets the set of all cached files -/
127+
def getLocalCacheSet : IO $ Lean.RBTree String compare := do
128+
let paths ← getFilesWithExtension CACHEDIR "gz"
129+
return .ofList (paths.data.map (·.withoutParent CACHEDIR |>.toString))
130+
131+
/-- Decompresses build files into their respective folders -/
132+
def setCache (hashMap : HashMap) : IO Unit := do
133+
IO.println "Decompressing cache"
134+
hashMap.filter (← getLocalCacheSet) true |>.forM fun path hash => do
135+
match path.parent with
136+
| none | some path => do
137+
let packageDir ← getPackageDir path
138+
mkDir $ packageDir / LIBDIR / path
139+
mkDir $ packageDir / IRDIR / path
140+
discard $ runCmd "tar" #["-xzf", s!"{CACHEDIR / hash.asTarGz}"]
141+
142+
instance : Ord FilePath where
143+
compare x y := compare x.toString y.toString
144+
145+
/-- Retrieves the azure token from the file system -/
146+
def getToken : IO String := do
147+
let some token ← IO.getEnv "MATHLIB_CACHE_SAS"
148+
| throw $ IO.userError "environment variable MATHLIB_CACHE_SAS must be set to upload caches"
149+
return token
150+
151+
/-- Removes all cache files except for what's in the `keep` set -/
152+
def clearCache (keep : Lean.RBTree FilePath compare := default) : IO Unit := do
153+
for path in ← getFilesWithExtension CACHEDIR "gz" do
154+
if ! keep.contains path then IO.FS.removeFile path
155+
156+
end Cache.IO

Cache/Main.lean

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
/-
2+
Copyright (c) 2023 Arthur Paulino. All rights reserved.
3+
Released under Apache 2.0 license as described in the file LICENSE.
4+
Authors: Arthur Paulino
5+
-/
6+
7+
import Cache.Requests
8+
9+
def help : String := "Mathlib4 caching CLI
10+
Usage: cache [COMMAND]
11+
12+
Commands:
13+
# No priviledge required
14+
get Download and decompress linked files missing on the local cache
15+
get! Download and decompress all linked files
16+
mk Compress non-compressed build files into the local cache
17+
mk! Compress build files into the local cache (no skipping)
18+
set Decompress linked files
19+
clear Delete non-linked files
20+
clear! Delete everything on the local cache
21+
22+
# Privilege required
23+
put Run 'mk' then upload linked files missing on the server
24+
put! Run 'mk' then upload all linked files
25+
commit Run 'put' then write a commit on the server
26+
commit! Run 'put!' then (over)write a commit on the server
27+
collect TODO
28+
29+
* Linked files refer to local cache files with corresponding Lean sources
30+
* Commands ending with '!' should be used manually, when hot-fixes are needed"
31+
32+
open Cache IO Hashing Requests in
33+
def main (args : List String) : IO Unit := do
34+
let hashMap ← getHashes
35+
match args with
36+
| ["get"] => getFiles (hashMap.filter (← getLocalCacheSet) false)
37+
| ["get!"] => getFiles hashMap
38+
| ["mk"] => discard $ mkCache hashMap false
39+
| ["mk!"] => discard $ mkCache hashMap true
40+
| ["set"] => setCache hashMap
41+
| ["clear"] =>
42+
clearCache $ hashMap.fold (fun acc _ hash => acc.insert $ CACHEDIR / hash.asTarGz) .empty
43+
| ["clear!"] => clearCache
44+
| ["put"] => putFiles (← mkCache hashMap false) false (← getToken)
45+
| ["put!"] => putFiles (← mkCache hashMap false) true (← getToken)
46+
| ["commit"] =>
47+
if !(← isStatusClean) then IO.println "Please commit your changes first" return else
48+
putFiles (← mkCache hashMap false) false (← getToken)
49+
commit hashMap false (← getToken)
50+
| ["commit!"] =>
51+
if !(← isStatusClean) then IO.println "Please commit your changes first" return else
52+
putFiles (← mkCache hashMap false) true (← getToken)
53+
commit hashMap true (← getToken)
54+
| ["collect"] => IO.println "TODO"
55+
| _ => println help

0 commit comments

Comments
 (0)