In [1]:
// #r "nuget: NGitLab"
#r "nuget: Fshttp"
#r "nuget: dotenv.net"

In [2]:
open dotenv.net
open System.Text.Json

let envVars = DotEnv.Read();
let token = envVars.["DATAHUB_API_TOKEN"]

In [3]:
open FsHttp


let all_projects =
    let getResp (page: int) =
        http {
            GET $"https://git.nfdi4plants.org/api/v4/projects"
            query [ "per_page", "100"; "page", string page ]
            headers
                [ 
                    "Authorization", token
                ]
        }
        |> Request.send
        
    let rec loop page (acc: (JsonElement array) list) =
        let response = getResp page

        let items = response |> Response.toJsonArray

        let total_items = 
            response.headers    
            |> Array.ofSeq
            |> Array.tryFind (fun kv -> kv.Key = "X-Total")
            |> Option.bind (fun kv -> if String.IsNullOrEmpty(kv.Value |> Seq.exactlyOne) then None else Some (kv.Value |> Seq.exactlyOne |> int))

        printfn "Fetched page %d with %d items (total items: %A)" page items.Length total_items

        let nextLink =
            response.headers    
            |> Array.ofSeq
            // |> fun x -> printfn "%A" x; x
            |> Array.tryFind (fun kv -> kv.Key = "X-Next-Page")
            |> Option.bind (fun kv -> if String.IsNullOrEmpty(kv.Value |> Seq.exactlyOne) then None else Some (kv.Value |> Seq.exactlyOne |> int))

        printfn "Next link: %A" nextLink

        match nextLink with
        | Some nextPage ->
            loop nextPage (items :: acc)
        | _ -> 
            let allItems =
                acc
                |> List.rev
                |> List.append [ items ]
                |> Array.concat
            allItems
    loop 1 []

all_projects.Length

Fetched page 1 with 100 items (total items: Some 156)
Next link: Some 2
Fetched page 2 with 56 items (total items: Some 156)
Next link: None


In [4]:
open System.Collections.Generic
open System.IO
let saved_files = 
    Directory.GetFiles("./data")
    |> Array.ofSeq
    |> Array.map (fun x -> x.Split("_").[2])

let progress = ResizeArray<string*string>()
let mapperino = Dictionary<string, (string * string * JsonElement array)>()

saved_files

In [5]:
let getTree (repoId: string) =
    let getResp (page: int) =
        http {
            GET $"https://git.nfdi4plants.org/api/v4/projects/{repoId}/repository/tree"
            query [ "per_page", "100"; "recursive", "true"; "page", string page ]
            headers
                [ 
                    "Authorization", token
                ]
        }
        |> Request.send

    let rec loop page (acc: (JsonElement array) list) =
        let response = getResp page

        let items = response |> Response.toJsonArray

        let total_items = 
            response.headers    
            |> Array.ofSeq
            |> Array.tryFind (fun kv -> kv.Key = "X-Total")
            |> Option.bind (fun kv -> if String.IsNullOrEmpty(kv.Value |> Seq.exactlyOne) then None else Some (kv.Value |> Seq.exactlyOne |> int))

        // printfn "Fetched page %d with %d items (total items: %A)" page items.Length total_items

        let nextLink =
            response.headers    
            |> Array.ofSeq
            // |> fun x -> printfn "%A" x; x
            |> Array.tryFind (fun kv -> kv.Key = "X-Next-Page")
            |> Option.bind (fun kv -> if String.IsNullOrEmpty(kv.Value |> Seq.exactlyOne) then None else Some (kv.Value |> Seq.exactlyOne |> int))

        // printfn "Next link: %A" nextLink

        match nextLink with
        | Some nextPage ->
            loop nextPage (items :: acc)
        | _ -> 
            let allItems =
                acc
                |> List.rev
                |> List.append [ items ]
                |> Array.concat
            allItems
    loop 1 []


In [6]:
let projects_to_skip = ["1842"]

In [7]:
let all_trees =
    all_projects
    |> Array.iter (fun projectJson ->
        let repoId : string = (projectJson?id).ToString()
        if 
            not (mapperino.ContainsKey(repoId))
            && not (List.contains repoId projects_to_skip)
            && not (Array.contains repoId saved_files)
        then
            printfn "starting repo %A" repoId
            try
                let repoName : string = (projectJson?name).ToString()
                let tree = getTree repoId
                let r = (repoId, repoName, tree |> Array.ofSeq)
                progress.Add(repoId,"done")
                mapperino.Add(repoId, r)
                printfn "Processed repo %s (%s) with %d items" repoName repoId tree.Length
                File.WriteAllText($"data/datahub_repo_{repoId}_tree.json", JsonSerializer.Serialize(tree)) 
                // r
            with ex ->
                progress.Add(repoId, sprintf "error: %s" ex.Message)
                failwithf "Error processing repo %s: %s" repoId ex.Message
        else
            printfn "skipping repo %A" repoId
            // mapperino.[repoId]
    )

skipping repo "1399"
skipping repo "1321"
skipping repo "1222"
skipping repo "1118"
skipping repo "1082"
skipping repo "1050"
skipping repo "1048"
skipping repo "1040"
skipping repo "1000"
skipping repo "997"
starting repo "984"
Processed repo Rat MRI + LS (984) with 32406 items
starting repo "979"
Processed repo 2023_Krueger_prophage_lysis-lysogeny_MrpR (979) with 100 items
starting repo "969"
Processed repo Identification and characterization of DICER-LIKE genes and their roles in Marchantia polymorpha development and salt stress response (969) with 129 items
starting repo "933"
Processed repo Samuilov-2018-BOU-PSP (933) with 8542 items
starting repo "923"
Processed repo 2022_Krueger_pseudokinase_heme_tolerance (923) with 47 items
starting repo "897"
Processed repo MBEN_Resolve (897) with 92 items
starting repo "784"
Processed repo Germann-2023 (784) with 133 items
starting repo "720"
Processed repo Evolutionary responses to CuZnSODs inhibition in plants 2023 (720) with 265 items
sta

In [None]:
progress

In [None]:
all_trees.Length

In [None]:
let filterXLSX (tree: JsonElement array) = 
    tree
    |> Array.ofSeq
    |> Array.filter (fun x -> 
        let path = (x?path).ToString()  
        path.EndsWith(".xlsx")
        && not (path.Contains("isa.investigation.xlsx"))
        && not (path.Contains("isa.assay.xlsx"))
        && not (path.Contains("isa.study.xlsx"))
    )

In [None]:
all_trees
|> Array.map (fun (repoId, tree) ->
    repoId,
    filterXLSX tree
    |> Array.map (fun x -> x?path.ToString())
)
|> Array.filter (fun (_, paths) -> paths.Length > 0)
|> Map.ofArray

In [None]:
all_trees
|> Array.map (fun (repoId, tree) ->
    repoId,
    tree
    |> Array.iter (fun itemJson ->
        let itemName : string = (itemJson?name).ToString()
        printfn " - %s" itemName
    )
)

In [None]:
let all_visible_projects = 
    client.Projects.GetAsync()
    |> Async.AwaitTask
    |> Async.RunSynchronously
    |> Array.ofSeq

In [None]:
all_visible_projects[0].Id

In [None]:
client.Projects.

In [None]:
client.

In [None]:
all_visible_projects
|> Array.map (fun project ->
    project.
)