diff --git a/.gitignore b/.gitignore
index 2ee099a828e3a..942c7a9761a47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
/build
/lake-packages
+/.cache
diff --git a/Cache/Hashing.lean b/Cache/Hashing.lean
new file mode 100644
index 0000000000000..11797fb3fcc24
--- /dev/null
+++ b/Cache/Hashing.lean
@@ -0,0 +1,67 @@
+/-
+Copyright (c) 2023 Arthur Paulino. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Arthur Paulino
+-/
+
+import Cache.IO
+import Lean.Elab.ParseImportsFast
+
+namespace Cache.Hashing
+
+open System IO
+
+/-- We cache the hash of each file for faster lookup -/
+abbrev HashM := StateT IO.HashMap IO
+
+/-- Gets the file paths to Mathlib files imported on a Lean source -/
+def getFileImports (source : String) (pkgDirs : PackageDirs) : Array FilePath :=
+ let s := Lean.ParseImports.main source (Lean.ParseImports.whitespace source {})
+ let imps := s.imports.map (·.module.toString)
+ |>.map (·.splitOn ".")
+ |>.filter fun parts => match parts.head? with
+ | some head => pkgDirs.contains head
+ | none => false
+ imps.map (mkFilePath · |>.withExtension "lean")
+
+/--
+Computes the root hash, which mixes the hashes of the content of:
+* `lakefile.lean`
+* `lean-toolchain`
+* `lake-manifest.json`
+
+TODO: make it `IO UInt64` when the initialization bug is fixed in core. `IO Nat` is a workaround.
+ See https://github.com/leanprover/lean4/issues/1998
+-/
+def getRootHash : IO Nat := do
+ let rootFiles : List FilePath := ["lakefile.lean", "lean-toolchain", "lake-manifest.json"]
+ let isMathlibRoot ← isMathlibRoot
+ let h := hash $ ← rootFiles.mapM fun path =>
+ return ← IO.FS.readFile $ if isMathlibRoot then path else mathlibDepPath / path
+ return h.toNat
+
+initialize rootHashFix : Nat ← getRootHash
+
+def rootHash : UInt64 :=
+ .ofNat rootHashFix
+
+/--
+Computes the hash of a file, which mixes:
+* The root hash
+* The hash of its content
+* The hashes of the imported files that are part of `Mathlib`
+-/
+partial def getFileHash (filePath : FilePath) : HashM UInt64 := do
+ match (← get).find? filePath with
+ | some hash => pure hash
+ | none =>
+ let content ← IO.FS.readFile $ (← IO.getPackageDir filePath) / filePath
+ let importHashes ← (getFileImports content pkgDirs).mapM getFileHash
+ let fileHash := hash $ rootHash :: content.hash :: importHashes.toList
+ modifyGet (fileHash, ·.insert filePath fileHash)
+
+/-- Main API to retrieve the hashes of the Lean files -/
+def getHashes : IO IO.HashMap :=
+ return (← StateT.run (getFileHash ⟨"Mathlib.lean"⟩) default).2
+
+end Cache.Hashing
diff --git a/Cache/IO.lean b/Cache/IO.lean
new file mode 100644
index 0000000000000..8abc879293e6c
--- /dev/null
+++ b/Cache/IO.lean
@@ -0,0 +1,156 @@
+/-
+Copyright (c) 2023 Arthur Paulino. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Arthur Paulino
+-/
+
+import Lean.Data.HashMap
+import Lean.Data.RBMap
+import Lean.Data.RBTree
+
+/-- Removes a parent path from the beginning of a path -/
+def System.FilePath.withoutParent (path parent : FilePath) : FilePath :=
+ let rec aux : List String → List String → List String
+ | z@(x :: xs), y :: ys => if x == y then aux xs ys else z
+ | [], _ => []
+ | x, [] => x
+ mkFilePath $ aux path.components parent.components
+
+def UInt64.asTarGz (n : UInt64) : String :=
+ s!"{n}.tar.gz"
+
+namespace Cache.IO
+
+open System (FilePath)
+
+/-- Target directory for build files -/
+def LIBDIR : FilePath :=
+ "build" / "lib"
+
+/-- Target directory for IR files -/
+def IRDIR : FilePath :=
+ "build" / "ir"
+
+/-- Target directory for caching -/
+def CACHEDIR : FilePath :=
+ ⟨".cache"⟩
+
+/-- Target directory for caching -/
+def TMPDIR : FilePath :=
+ CACHEDIR / "tmp"
+
+def LAKEPACKAGESDIR : FilePath :=
+ ⟨"lake-packages"⟩
+
+abbrev PackageDirs := Lean.RBMap String FilePath compare
+
+/-- Whether this is running on Mathlib repo or not -/
+def isMathlibRoot : IO Bool :=
+ FilePath.mk "Mathlib" |>.pathExists
+
+def mathlibDepPath : FilePath :=
+ LAKEPACKAGESDIR / "Mathlib"
+
+def getPackageDirs : IO PackageDirs := return .ofList [
+ ("Mathlib", if ← isMathlibRoot then "." else mathlibDepPath),
+ ("Aesop", LAKEPACKAGESDIR / "aesop"),
+ ("Std", LAKEPACKAGESDIR / "std"),
+ ("Qq", LAKEPACKAGESDIR / "Qq")
+]
+
+initialize pkgDirs : PackageDirs ← getPackageDirs
+
+def getPackageDir (path : FilePath) : IO FilePath :=
+ match path.withExtension "" |>.components.head? with
+ | none => throw $ IO.userError "Can't find package directory for empty path"
+ | some pkg => match pkgDirs.find? pkg with
+ | none => throw $ IO.userError s!"Unknown package directory for {pkg}"
+ | some path => return path
+
+/-- Runs a terminal command and retrieves its output -/
+def runCmd (cmd : String) (args : Array String) : IO String := do
+ let out ← IO.Process.output { cmd := cmd, args := args }
+ if out.exitCode != 0 then throw $ IO.userError out.stderr
+ else return out.stdout
+
+/-- Recursively gets all files from a directory with a certain extension -/
+partial def getFilesWithExtension
+ (fp : FilePath) (extension : String) (acc : Array FilePath := #[]) :
+ IO $ Array FilePath := do
+ if ← fp.isDir then
+ (← fp.readDir).foldlM (fun acc dir => getFilesWithExtension dir.path extension acc) acc
+ else return if fp.extension == some extension then acc.push fp else acc
+
+abbrev HashMap := Lean.HashMap FilePath UInt64
+
+namespace HashMap
+
+def filter (hashMap : HashMap) (set : Lean.RBTree String compare) (keep : Bool) : HashMap :=
+ hashMap.fold (init := default) fun acc path hash =>
+ let contains := set.contains hash.asTarGz
+ let add := if keep then contains else !contains
+ if add then acc.insert path hash else acc
+
+def hashes (hashMap : HashMap) : Lean.RBTree UInt64 compare :=
+ hashMap.fold (init := default) fun acc _ hash => acc.insert hash
+
+end HashMap
+
+def mkDir (path : FilePath) : IO Unit := do
+ if !(← path.pathExists) then IO.FS.createDirAll path
+
+/-- Given a path to a Lean file, concatenates the paths to its build files -/
+def mkBuildPaths (path : FilePath) : IO $ Array String := do
+ let packageDir ← getPackageDir path
+ return #[
+ packageDir / LIBDIR / path.withExtension "olean" |>.toString,
+ packageDir / LIBDIR / path.withExtension "ilean" |>.toString,
+ packageDir / LIBDIR / path.withExtension "trace" |>.toString,
+ packageDir / IRDIR / path.withExtension "c" |>.toString,
+ packageDir / IRDIR / path.withExtension "c.trace" |>.toString]
+
+/-- Compresses build files into the local cache -/
+def mkCache (hashMap : HashMap) (overwrite : Bool) : IO $ Array String := do
+ mkDir CACHEDIR
+ IO.println "Compressing cache"
+ let mut acc := default
+ for (path, hash) in hashMap.toList do
+ let zip := hash.asTarGz
+ let zipPath := CACHEDIR / zip
+ if overwrite || !(← zipPath.pathExists) then
+ discard $ runCmd "tar" $ #["-I", "gzip -9", "-cf", zipPath.toString] ++
+ (← mkBuildPaths path)
+ acc := acc.push zip
+ return acc
+
+/-- Gets the set of all cached files -/
+def getLocalCacheSet : IO $ Lean.RBTree String compare := do
+ let paths ← getFilesWithExtension CACHEDIR "gz"
+ return .ofList (paths.data.map (·.withoutParent CACHEDIR |>.toString))
+
+/-- Decompresses build files into their respective folders -/
+def setCache (hashMap : HashMap) : IO Unit := do
+ IO.println "Decompressing cache"
+ hashMap.filter (← getLocalCacheSet) true |>.forM fun path hash => do
+ match path.parent with
+ | none | some path => do
+ let packageDir ← getPackageDir path
+ mkDir $ packageDir / LIBDIR / path
+ mkDir $ packageDir / IRDIR / path
+ discard $ runCmd "tar" #["-xzf", s!"{CACHEDIR / hash.asTarGz}"]
+
+instance : Ord FilePath where
+ compare x y := compare x.toString y.toString
+
+/-- Retrieves the azure token from the file system -/
+def getToken : IO String := do
+ let some token ← IO.getEnv "MATHLIB_CACHE_SAS"
+ | throw $ IO.userError "environment variable MATHLIB_CACHE_SAS must be set to upload caches"
+ return token
+
+/-- Removes all cache files except for what's in the `keep` set -/
+def clearCache (keep : Lean.RBTree FilePath compare := default) : IO Unit := do
+ for path in ← getFilesWithExtension CACHEDIR "gz" do
+ if ! keep.contains path then IO.FS.removeFile path
+
+end Cache.IO
diff --git a/Cache/Main.lean b/Cache/Main.lean
new file mode 100644
index 0000000000000..e4034e28b0e4d
--- /dev/null
+++ b/Cache/Main.lean
@@ -0,0 +1,55 @@
+/-
+Copyright (c) 2023 Arthur Paulino. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Arthur Paulino
+-/
+
+import Cache.Requests
+
+def help : String := "Mathlib4 caching CLI
+Usage: cache [COMMAND]
+
+Commands:
+ # No priviledge required
+ get Download and decompress linked files missing on the local cache
+ get! Download and decompress all linked files
+ mk Compress non-compressed build files into the local cache
+ mk! Compress build files into the local cache (no skipping)
+ set Decompress linked files
+ clear Delete non-linked files
+ clear! Delete everything on the local cache
+
+ # Privilege required
+ put Run 'mk' then upload linked files missing on the server
+ put! Run 'mk' then upload all linked files
+ commit Run 'put' then write a commit on the server
+ commit! Run 'put!' then (over)write a commit on the server
+ collect TODO
+
+* Linked files refer to local cache files with corresponding Lean sources
+* Commands ending with '!' should be used manually, when hot-fixes are needed"
+
+open Cache IO Hashing Requests in
+def main (args : List String) : IO Unit := do
+ let hashMap ← getHashes
+ match args with
+ | ["get"] => getFiles (hashMap.filter (← getLocalCacheSet) false)
+ | ["get!"] => getFiles hashMap
+ | ["mk"] => discard $ mkCache hashMap false
+ | ["mk!"] => discard $ mkCache hashMap true
+ | ["set"] => setCache hashMap
+ | ["clear"] =>
+ clearCache $ hashMap.fold (fun acc _ hash => acc.insert $ CACHEDIR / hash.asTarGz) .empty
+ | ["clear!"] => clearCache
+ | ["put"] => putFiles (← mkCache hashMap false) false (← getToken)
+ | ["put!"] => putFiles (← mkCache hashMap false) true (← getToken)
+ | ["commit"] =>
+ if !(← isStatusClean) then IO.println "Please commit your changes first" return else
+ putFiles (← mkCache hashMap false) false (← getToken)
+ commit hashMap false (← getToken)
+ | ["commit!"] =>
+ if !(← isStatusClean) then IO.println "Please commit your changes first" return else
+ putFiles (← mkCache hashMap false) true (← getToken)
+ commit hashMap true (← getToken)
+ | ["collect"] => IO.println "TODO"
+ | _ => println help
diff --git a/Cache/Requests.lean b/Cache/Requests.lean
new file mode 100644
index 0000000000000..a28d03ecade97
--- /dev/null
+++ b/Cache/Requests.lean
@@ -0,0 +1,167 @@
+/-
+Copyright (c) 2023 Arthur Paulino. All rights reserved.
+Released under Apache 2.0 license as described in the file LICENSE.
+Authors: Arthur Paulino
+-/
+
+import Cache.Hashing
+
+def ByteArray.startsWith (a b : ByteArray) : Bool := Id.run do
+ let size := b.size
+ let a := a.copySlice 0 .empty 0 size
+ if size != a.size then return false
+ for i in [0 : size] do
+ if a.get! i != b.get! i then return false
+ return true
+
+namespace Cache.Requests
+
+/-- Azure blob URL -/
+def URL : String :=
+ "https://lakecache.blob.core.windows.net/mathlib4"
+
+open System (FilePath)
+
+/--
+Given a file name like `"1234.tar.gz"`, makes the URL to that file on the server.
+
+The `f/` prefix means that it's a common file for caching.
+-/
+def mkFileURL (fileName : String) (token : Option String) : IO String :=
+ return match token with
+ | some token => s!"{URL}/f/{fileName}?{token}"
+ | none => s!"{URL}/f/{fileName}"
+
+section Get
+
+/-- Formats part of the `curl` command that corresponds to the listing of files to be downloaded -/
+def mkGetPairs (fileNames : Array $ String × FilePath) : IO $ Array String :=
+ fileNames.foldlM (init := default) fun acc (fileName, path) => do
+ pure $ acc ++ #[← mkFileURL fileName none, "-o", path.toString]
+
+/-- Bytes that correspond to the beginning of invalid downloaded files, introducing a XML -/
+def invalidFileStart : ByteArray :=
+ ⟨#[239, 187, 191, 60, 63, 120, 109, 108, 32, 118, 101, 114, 115, 105, 111, 110, 61]⟩
+
+/--
+Calls `curl` to download files from the server.
+
+It first downloads the files to a temporary folder then extracts valid `tar.gz` files to the cache
+folder. The temporary folder is then deleted.
+-/
+def getFiles (hashMap : IO.HashMap) : IO Unit := do
+ let size := hashMap.size
+ if size > 0 then
+ let pairs := hashMap.fold (init := #[]) fun acc _ hash =>
+ let fileName := hash.asTarGz
+ acc.push (fileName, IO.TMPDIR / fileName)
+ IO.println s!"Attempting to download {size} file(s)"
+ IO.mkDir IO.TMPDIR
+ discard $ IO.runCmd "curl" $ #["-X", "GET", "--parallel"] ++ (← mkGetPairs pairs)
+ for (fileName, path) in pairs do
+ let bytes ← IO.FS.readBinFile path
+ if !bytes.startsWith invalidFileStart then
+ IO.FS.writeBinFile (IO.CACHEDIR / fileName) bytes
+ IO.FS.removeDirAll IO.TMPDIR
+ IO.setCache hashMap
+ else IO.println "No file to download"
+
+end Get
+
+section Put
+
+/-- Formats part of the `curl` command that corresponds to the listing of files to be uploaded -/
+def mkPutPairs (fileNames : Array String) (token : String) : IO $ Array String :=
+ fileNames.foldlM (init := default) fun acc fileName => do
+ pure $ acc.append #["-T", s!"{IO.CACHEDIR / fileName}", ← mkFileURL fileName (some token)]
+
+/-- Calls `curl` to send a set of cached files to the server -/
+def putFiles (fileNames : Array String) (overwrite : Bool) (token : String) : IO Unit := do
+ let size := fileNames.size
+ if size > 0 then
+ IO.println s!"Attempting to upload {size} file(s)"
+ let params := if overwrite
+ then #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob", "--parallel"]
+ else #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob", "-H", "If-None-Match: *", "--parallel"]
+ discard $ IO.runCmd "curl" $ params ++ (← mkPutPairs fileNames token)
+ else IO.println "No file to upload"
+
+end Put
+
+section Commit
+
+def isStatusClean : IO Bool :=
+ return (← IO.runCmd "git" #["status", "--porcelain"]).isEmpty
+
+def getCommitHash : IO String := do
+ let ret := (← IO.runCmd "git" #["log", "-1"]).replace "\n" " "
+ match ret.splitOn " " with
+ | "commit" :: hash :: _ => return hash
+ | _ => throw $ IO.userError "Invalid format for the return of `git log -1`"
+
+/--
+Sends a commit file to the server, containing the hashes of the respective commited files.
+
+The file name is the current Git hash and the `c/` prefix means that it's a commit file.
+-/
+def commit (hashMap : IO.HashMap) (overwrite : Bool) (token : String) : IO Unit := do
+ let hash ← getCommitHash
+ let path := IO.TMPDIR / hash
+ IO.mkDir IO.TMPDIR
+ IO.FS.writeFile path $ ("\n".intercalate $ hashMap.hashes.toList.map toString) ++ "\n"
+ let params := if overwrite
+ then #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob"]
+ else #["-X", "PUT", "-H", "x-ms-blob-type: BlockBlob", "-H", "If-None-Match: *"]
+ discard $ IO.runCmd "curl" $ params ++ #["-T", path.toString, s!"{URL}/c/{hash}?{token}"]
+ IO.FS.removeDirAll IO.TMPDIR
+
+end Commit
+
+section Collect
+
+inductive QueryType
+ | files | commits | all
+
+def QueryType.prefix : QueryType → String
+ | files => "&prefix=f/"
+ | commits => "&prefix=c/"
+ | all => default
+
+def formatError : IO α :=
+ throw $ IO.userError "Invalid format for curl return"
+
+def QueryType.desc : QueryType → String
+ | files => "hosted files"
+ | commits => "hosted commits"
+ | all => "everything"
+
+/--
+Retrieves metadata about hosted files: their names and the timestamps of last modification.
+
+Example: `["path/to/file.extension", "Sat, 24 Dec 2022 17:33:01 GMT"]`
+-/
+def getFilesInfo (q : QueryType) : IO $ List (String × String) := do
+ IO.println s!"Downloading info list of {q.desc}"
+ let ret ← IO.runCmd "curl" #["-X", "GET", s!"{URL}?comp=list&restype=container{q.prefix}"]
+ match ret.splitOn "" with
+ | [] => formatError
+ | [_] => return default
+ | _ :: parts =>
+ parts.mapM fun part => match part.splitOn "" with
+ | [name, rest] => match rest.splitOn "" with
+ | [_, rest] => match rest.splitOn "" with
+ | [date, _] => pure (name, date)
+ | _ => formatError
+ | _ => formatError
+ | _ => formatError
+
+/-- WIP garbage collection. Currently deletes the entire cache. Still useful for development -/
+def collectCache (token : String) : IO Unit := do
+ let hostedCacheSet ← getFilesInfo .all
+ let arr ← hostedCacheSet.foldlM (init := #[]) fun acc (fileName, _) => do
+ pure $ acc.push (← mkFileURL fileName (some token))
+ discard $ IO.runCmd "curl" $ #["-X", "DELETE", "--parallel"] ++ arr
+
+end Collect
+
+end Cache.Requests
diff --git a/lakefile.lean b/lakefile.lean
index af1044f1fff30..20c8bc49cceb7 100644
--- a/lakefile.lean
+++ b/lakefile.lean
@@ -19,3 +19,9 @@ require «doc-gen4» from git "https://github.com/leanprover/doc-gen4" @ "main"
require std from git "https://github.com/leanprover/std4" @ "main"
require Qq from git "https://github.com/gebner/quote4" @ "master"
require aesop from git "https://github.com/JLimperg/aesop" @ "master"
+
+lean_lib Cache where
+ roots := #[`Cache]
+
+lean_exe cache where
+ root := `Cache.Main