Skip to content

Commit

Permalink
feat: caching build files on Azure (#1230)
Browse files Browse the repository at this point in the history
This PR implements caching for Mathlib build files. The basic API is:
* `lake exe cache get` to download missing build files
* `lake exe cache put` to upload files that are missing on the server
* `lake exe cache` to print the help menu

This caching method has a few enhancements on top of the solution we had for mathlib in Lean 3:
1. Source files are content-addressed and build files are referenced by such hashes separately
2. Minimized download traffic with queries that only pull files that are missing locally
3. Minimized upload traffic with queries that only push files that are missing on the server
  • Loading branch information
arthurpaulino committed Jan 3, 2023
1 parent bfe06a7 commit 01a2153
Show file tree
Hide file tree
Showing 6 changed files with 452 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
/build
/lake-packages
/.cache
67 changes: 67 additions & 0 deletions Cache/Hashing.lean
@@ -0,0 +1,67 @@
/-
Copyright (c) 2023 Arthur Paulino. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Arthur Paulino
-/

import Cache.IO
import Lean.Elab.ParseImportsFast

namespace Cache.Hashing

open System IO

/-- We cache the hash of each file for faster lookup -/
abbrev HashM := StateT IO.HashMap IO

/-- Gets the file paths to Mathlib files imported on a Lean source -/
def getFileImports (source : String) (pkgDirs : PackageDirs) : Array FilePath :=
let s := Lean.ParseImports.main source (Lean.ParseImports.whitespace source {})
let imps := s.imports.map (·.module.toString)
|>.map (·.splitOn ".")
|>.filter fun parts => match parts.head? with
| some head => pkgDirs.contains head
| none => false
imps.map (mkFilePath · |>.withExtension "lean")

/--
Computes the root hash, which mixes the hashes of the content of:
* `lakefile.lean`
* `lean-toolchain`
* `lake-manifest.json`
TODO: make it `IO UInt64` when the initialization bug is fixed in core. `IO Nat` is a workaround.
See https://github.com/leanprover/lean4/issues/1998
-/
def getRootHash : IO Nat := do
let rootFiles : List FilePath := ["lakefile.lean", "lean-toolchain", "lake-manifest.json"]
let isMathlibRoot ← isMathlibRoot
let h := hash $ ← rootFiles.mapM fun path =>
return ← IO.FS.readFile $ if isMathlibRoot then path else mathlibDepPath / path
return h.toNat

initialize rootHashFix : Nat ← getRootHash

def rootHash : UInt64 :=
.ofNat rootHashFix

/--
Computes the hash of a file, which mixes:
* The root hash
* The hash of its content
* The hashes of the imported files that are part of `Mathlib`
-/
partial def getFileHash (filePath : FilePath) : HashM UInt64 := do
match (← get).find? filePath with
| some hash => pure hash
| none =>
let content ← IO.FS.readFile $ (← IO.getPackageDir filePath) / filePath
let importHashes ← (getFileImports content pkgDirs).mapM getFileHash
let fileHash := hash $ rootHash :: content.hash :: importHashes.toList
modifyGet (fileHash, ·.insert filePath fileHash)

/-- Main API to retrieve the hashes of the Lean files -/
def getHashes : IO IO.HashMap :=
return (← StateT.run (getFileHash ⟨"Mathlib.lean"⟩) default).2

end Cache.Hashing
156 changes: 156 additions & 0 deletions Cache/IO.lean
@@ -0,0 +1,156 @@
/-
Copyright (c) 2023 Arthur Paulino. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Arthur Paulino
-/

import Lean.Data.HashMap
import Lean.Data.RBMap
import Lean.Data.RBTree

/-- Removes a parent path from the beginning of a path -/
def System.FilePath.withoutParent (path parent : FilePath) : FilePath :=
let rec aux : List String → List String → List String
| z@(x :: xs), y :: ys => if x == y then aux xs ys else z
| [], _ => []
| x, [] => x
mkFilePath $ aux path.components parent.components

def UInt64.asTarGz (n : UInt64) : String :=
s!"{n}.tar.gz"

namespace Cache.IO

open System (FilePath)

/-- Target directory for build files -/
def LIBDIR : FilePath :=
"build" / "lib"

/-- Target directory for IR files -/
def IRDIR : FilePath :=
"build" / "ir"

/-- Target directory for caching -/
def CACHEDIR : FilePath :=
".cache"

/-- Target directory for caching -/
def TMPDIR : FilePath :=
CACHEDIR / "tmp"

def LAKEPACKAGESDIR : FilePath :=
"lake-packages"

abbrev PackageDirs := Lean.RBMap String FilePath compare

/-- Whether this is running on Mathlib repo or not -/
def isMathlibRoot : IO Bool :=
FilePath.mk "Mathlib" |>.pathExists

def mathlibDepPath : FilePath :=
LAKEPACKAGESDIR / "Mathlib"

def getPackageDirs : IO PackageDirs := return .ofList [
("Mathlib", if ← isMathlibRoot then "." else mathlibDepPath),
("Aesop", LAKEPACKAGESDIR / "aesop"),
("Std", LAKEPACKAGESDIR / "std"),
("Qq", LAKEPACKAGESDIR / "Qq")
]

initialize pkgDirs : PackageDirs ← getPackageDirs

def getPackageDir (path : FilePath) : IO FilePath :=
match path.withExtension "" |>.components.head? with
| none => throw $ IO.userError "Can't find package directory for empty path"
| some pkg => match pkgDirs.find? pkg with
| none => throw $ IO.userError s!"Unknown package directory for {pkg}"
| some path => return path

/-- Runs a terminal command and retrieves its output -/
def runCmd (cmd : String) (args : Array String) : IO String := do
let out ← IO.Process.output { cmd := cmd, args := args }
if out.exitCode != 0 then throw $ IO.userError out.stderr
else return out.stdout

/-- Recursively gets all files from a directory with a certain extension -/
partial def getFilesWithExtension
(fp : FilePath) (extension : String) (acc : Array FilePath := #[]) :
IO $ Array FilePath := do
if ← fp.isDir then
(← fp.readDir).foldlM (fun acc dir => getFilesWithExtension dir.path extension acc) acc
else return if fp.extension == some extension then acc.push fp else acc

abbrev HashMap := Lean.HashMap FilePath UInt64

namespace HashMap

def filter (hashMap : HashMap) (set : Lean.RBTree String compare) (keep : Bool) : HashMap :=
hashMap.fold (init := default) fun acc path hash =>
let contains := set.contains hash.asTarGz
let add := if keep then contains else !contains
if add then acc.insert path hash else acc

def hashes (hashMap : HashMap) : Lean.RBTree UInt64 compare :=
hashMap.fold (init := default) fun acc _ hash => acc.insert hash

end HashMap

def mkDir (path : FilePath) : IO Unit := do
if !(← path.pathExists) then IO.FS.createDirAll path

/-- Given a path to a Lean file, concatenates the paths to its build files -/
def mkBuildPaths (path : FilePath) : IO $ Array String := do
let packageDir ← getPackageDir path
return #[
packageDir / LIBDIR / path.withExtension "olean" |>.toString,
packageDir / LIBDIR / path.withExtension "ilean" |>.toString,
packageDir / LIBDIR / path.withExtension "trace" |>.toString,
packageDir / IRDIR / path.withExtension "c" |>.toString,
packageDir / IRDIR / path.withExtension "c.trace" |>.toString]

/-- Compresses build files into the local cache -/
def mkCache (hashMap : HashMap) (overwrite : Bool) : IO $ Array String := do
mkDir CACHEDIR
IO.println "Compressing cache"
let mut acc := default
for (path, hash) in hashMap.toList do
let zip := hash.asTarGz
let zipPath := CACHEDIR / zip
if overwrite || !(← zipPath.pathExists) then
discard $ runCmd "tar" $ #["-I", "gzip -9", "-cf", zipPath.toString] ++
(← mkBuildPaths path)
acc := acc.push zip
return acc

/-- Gets the set of all cached files -/
def getLocalCacheSet : IO $ Lean.RBTree String compare := do
let paths ← getFilesWithExtension CACHEDIR "gz"
return .ofList (paths.data.map (·.withoutParent CACHEDIR |>.toString))

/-- Decompresses build files into their respective folders -/
def setCache (hashMap : HashMap) : IO Unit := do
IO.println "Decompressing cache"
hashMap.filter (← getLocalCacheSet) true |>.forM fun path hash => do
match path.parent with
| none | some path => do
let packageDir ← getPackageDir path
mkDir $ packageDir / LIBDIR / path
mkDir $ packageDir / IRDIR / path
discard $ runCmd "tar" #["-xzf", s!"{CACHEDIR / hash.asTarGz}"]

instance : Ord FilePath where
compare x y := compare x.toString y.toString

/-- Retrieves the azure token from the file system -/
def getToken : IO String := do
let some token ← IO.getEnv "MATHLIB_CACHE_SAS"
| throw $ IO.userError "environment variable MATHLIB_CACHE_SAS must be set to upload caches"
return token

/-- Removes all cache files except for what's in the `keep` set -/
def clearCache (keep : Lean.RBTree FilePath compare := default) : IO Unit := do
for path in ← getFilesWithExtension CACHEDIR "gz" do
if ! keep.contains path then IO.FS.removeFile path

end Cache.IO
55 changes: 55 additions & 0 deletions Cache/Main.lean
@@ -0,0 +1,55 @@
/-
Copyright (c) 2023 Arthur Paulino. All rights reserved.
Released under Apache 2.0 license as described in the file LICENSE.
Authors: Arthur Paulino
-/

import Cache.Requests

def help : String := "Mathlib4 caching CLI
Usage: cache [COMMAND]
Commands:
# No priviledge required
get Download and decompress linked files missing on the local cache
get! Download and decompress all linked files
mk Compress non-compressed build files into the local cache
mk! Compress build files into the local cache (no skipping)
set Decompress linked files
clear Delete non-linked files
clear! Delete everything on the local cache
# Privilege required
put Run 'mk' then upload linked files missing on the server
put! Run 'mk' then upload all linked files
commit Run 'put' then write a commit on the server
commit! Run 'put!' then (over)write a commit on the server
collect TODO
* Linked files refer to local cache files with corresponding Lean sources
* Commands ending with '!' should be used manually, when hot-fixes are needed"

open Cache IO Hashing Requests in
def main (args : List String) : IO Unit := do
let hashMap ← getHashes
match args with
| ["get"] => getFiles (hashMap.filter (← getLocalCacheSet) false)
| ["get!"] => getFiles hashMap
| ["mk"] => discard $ mkCache hashMap false
| ["mk!"] => discard $ mkCache hashMap true
| ["set"] => setCache hashMap
| ["clear"] =>
clearCache $ hashMap.fold (fun acc _ hash => acc.insert $ CACHEDIR / hash.asTarGz) .empty
| ["clear!"] => clearCache
| ["put"] => putFiles (← mkCache hashMap false) false (← getToken)
| ["put!"] => putFiles (← mkCache hashMap false) true (← getToken)
| ["commit"] =>
if !(← isStatusClean) then IO.println "Please commit your changes first" return else
putFiles (← mkCache hashMap false) false (← getToken)
commit hashMap false (← getToken)
| ["commit!"] =>
if !(← isStatusClean) then IO.println "Please commit your changes first" return else
putFiles (← mkCache hashMap false) true (← getToken)
commit hashMap true (← getToken)
| ["collect"] => IO.println "TODO"
| _ => println help

0 comments on commit 01a2153

Please sign in to comment.