Skip to content

Commit

Permalink
Merge pull request #445 from imperialCHEPI/input_data_from_url
Browse files Browse the repository at this point in the history
Allow for downloading zipped input data from URL
  • Loading branch information
alexdewar committed Jun 21, 2024
2 parents 13e9c8b + 1f5a175 commit 75a6008
Show file tree
Hide file tree
Showing 14 changed files with 125 additions and 44 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ The code in this repository is licensed under the [BSD 3-Clause](LICENSE.txt) li
| [libzippp](https://github.com/ctabin/libzippp) | MIT |
| [openssl](https://www.openssl.org) | Apache 2.0 |
| [PlatformFolders](https://github.com/sago007/PlatformFolders) | MIT |
| [curlpp](http://www.curlpp.org) | MIT |

### Tools and Frameworks

Expand Down
19 changes: 10 additions & 9 deletions src/HealthGPS.Console/command_options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,17 +61,18 @@ CommandOptions parse_arguments(cxxopts::Options &options, int &argc, char *argv[
}

if (result.count("storage")) {
cmd.storage_folder = result["storage"].as<std::string>();
if (cmd.storage_folder.is_relative()) {
cmd.storage_folder = std::filesystem::absolute(cmd.storage_folder);
fmt::print("File storage folder.: {}\n", cmd.storage_folder.string());
cmd.data_path_or_url = result["storage"].as<std::string>();

if (!cmd.data_path_or_url.starts_with("http://") &&
!cmd.data_path_or_url.starts_with("https://")) {
const std::filesystem::path path = cmd.data_path_or_url;

if (path.is_relative()) {
cmd.data_path_or_url = std::filesystem::absolute(path).string();
}
}
}

if (!fs::exists(cmd.storage_folder)) {
fmt::print(fg(fmt::color::red), "\nFile storage folder: {} not found.\n",
cmd.storage_folder.string());
cmd.exit_code = EXIT_FAILURE;
fmt::print("Data source: {}\n", cmd.data_path_or_url);
}

if (result.count("jobid")) {
Expand Down
4 changes: 2 additions & 2 deletions src/HealthGPS.Console/command_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ struct CommandOptions {
/// @brief The configuration file argument value
std::filesystem::path config_file{};

/// @brief The back-end storage full path argument value
std::filesystem::path storage_folder{};
/// @brief The back-end storage full path or URL argument value
std::string data_path_or_url;

/// @brief Indicates whether the application logging is verbose
bool verbose{};
Expand Down
2 changes: 1 addition & 1 deletion src/HealthGPS.Console/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ int main(int argc, char *argv[]) { // NOLINT(bugprone-exception-escape)
try {
#endif
// Create back-end data store, cached data repository wrapper
auto data_api = data::DataManager(cmd_args.storage_folder, config.verbosity);
auto data_api = data::DataManager(cmd_args.data_path_or_url, config.verbosity);
auto data_repository = hgps::CachedRepository{data_api};

// Register the input risk factors model definitions
Expand Down
5 changes: 4 additions & 1 deletion src/HealthGPS.Datastore/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
find_package(fmt CONFIG REQUIRED)
find_package(jsoncons CONFIG REQUIRED)
find_package(libzippp CONFIG REQUIRED)
find_package(unofficial-curlpp CONFIG REQUIRED)

add_library(HealthGPS.Datastore STATIC "")
target_compile_features(HealthGPS.Datastore PUBLIC cxx_std_${CMAKE_CXX_STANDARD})
Expand All @@ -10,12 +11,14 @@ target_sources(
PRIVATE "api.h"
"datamanager.cpp"
"datamanager.h"
"download_file.cpp"
"download_file.h"
"schema.cpp"
"schema.h"
"zip_file.cpp"
"zip_file.h")

target_link_libraries(HealthGPS.Datastore PRIVATE HealthGPS.Core fmt::fmt jsoncons
libzippp::libzippp)
libzippp::libzippp unofficial::curlpp::curlpp)

set(ROOT_NAMESPACE hgps::data)
20 changes: 14 additions & 6 deletions src/HealthGPS.Datastore/datamanager.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "datamanager.h"
#include "download_file.h"
#include "schema.h"
#include "zip_file.h"

Expand Down Expand Up @@ -46,15 +47,22 @@ nlohmann::json read_input_files_from_directory(const std::filesystem::path &root
} // anonymous namespace

namespace hgps::data {
DataManager::DataManager(std::filesystem::path path, VerboseMode verbosity)
DataManager::DataManager(const std::string &path_or_url, VerboseMode verbosity)
: verbosity_{verbosity} {
if (std::filesystem::is_directory(path)) {
root_ = std::move(path);
} else if (std::filesystem::is_regular_file(path) && path.extension() == ".zip") {
if (path_or_url.starts_with("http:") || path_or_url.starts_with("https:")) {
// Download file to temporary folder and extract it
const auto path = download_file_to_temporary(path_or_url, ".zip");
root_ = extract_zip_file_or_load_from_cache(path);
} else {
throw std::runtime_error(
fmt::format("Path must either point to a zip file or a directory: {}", path.string()));
std::filesystem::path path = path_or_url;
if (std::filesystem::is_directory(path)) {
root_ = std::move(path);
} else if (std::filesystem::is_regular_file(path) && path.extension() == ".zip") {
root_ = extract_zip_file_or_load_from_cache(path);
} else {
throw std::runtime_error(fmt::format(
"Path must either point to a zip file or a directory: {}", path_or_url));
}
}

index_ = read_input_files_from_directory(root_);
Expand Down
4 changes: 2 additions & 2 deletions src/HealthGPS.Datastore/datamanager.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ class DataManager : public Datastore {
DataManager() = delete;

/// @brief Initialises a new instance of the hgps::data::DataManager class.
/// @param path The path to store root folder containing the index.json file or a zip file.
/// @param path_or_url The path or URL pointing to the input files.
/// @param verbosity The terminal logging verbosity mode to use.
/// @throws std::invalid_argument if the root directory or index.json is missing.
/// @throws std::runtime_error for invalid or unsupported index.json file schema version.
explicit DataManager(std::filesystem::path path, VerboseMode verbosity = VerboseMode::none);
explicit DataManager(const std::string &path_or_url, VerboseMode verbosity = VerboseMode::none);

std::vector<Country> get_countries() const override;

Expand Down
61 changes: 61 additions & 0 deletions src/HealthGPS.Datastore/download_file.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#include "download_file.h"
#include "HealthGPS/program_dirs.h"

#include <curlpp/Easy.hpp>
#include <curlpp/Exception.hpp>
#include <curlpp/Options.hpp>
#include <curlpp/cURLpp.hpp>
#include <fmt/format.h>

#include <fstream>
#include <random>
#include <sstream>

namespace {
/// @brief Pick a path for a not-yet-existing file inside the program's temporary directory
/// @details The temporary directory is created if needed. Candidate file names are built as
/// prefix + random hexadecimal number + extension until one is found that does not exist.
/// The file itself is not created here.
/// @param file_prefix Text placed at the start of the file name
/// @param file_extension Text placed at the end of the file name (e.g. ".zip")
/// @return A path which did not exist at the time it was checked
std::filesystem::path get_temporary_file_path(const std::string &file_prefix,
                                              const std::string &file_extension) {
    const auto directory = hgps::get_temporary_directory();
    std::filesystem::create_directories(directory);

    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_int_distribution<unsigned> distribution;

    for (;;) {
        std::stringstream name;
        name << file_prefix << std::hex << distribution(engine) << file_extension;

        auto candidate = directory / name.str();
        if (std::filesystem::exists(candidate)) {
            // Name already taken: draw another random number
            continue;
        }

        return candidate;
    }
}
} // anonymous namespace

namespace hgps::data {
/// @brief Download the file at the specified URL
/// @param url URL to download from
/// @param download_path Destination for the downloaded file, including filename
/// @return Path to downloaded file (same as @p download_path)
/// @throws std::runtime_error if the destination file cannot be created or written, or if
/// the transfer fails (curlpp reports transfer errors, including HTTP error statuses, as
/// exceptions derived from std::runtime_error / std::logic_error)
std::filesystem::path download_file(const std::string &url,
                                    const std::filesystem::path &download_path) {
    // Binary mode matters: the payload is usually a zip archive and text mode would apply
    // newline translation on Windows, corrupting the file.
    std::ofstream ofs{download_path, std::ios::binary};
    if (!ofs) {
        throw std::runtime_error(fmt::format("Failed to create file {}", download_path.string()));
    }

    try {
        curlpp::Cleanup cleanup;

        // Our request to be sent
        curlpp::Easy request;
        request.setOpt<curlpp::options::Url>(url);
        request.setOpt<curlpp::options::WriteStream>(&ofs);

        // Many hosting services serve downloads via a redirect
        request.setOpt<curlpp::options::FollowLocation>(true);

        // Without this, the body of a 404/500 response would be saved as if it were the data
        request.setOpt<curlpp::options::FailOnError>(true);

        // Make request
        request.perform();

        // Flush now so that write errors (e.g. disk full) are reported rather than lost
        ofs.close();
        if (!ofs) {
            throw std::runtime_error(
                fmt::format("Failed to write file {}", download_path.string()));
        }
    } catch (...) {
        // Don't leave a partial or bogus file behind; ignore errors from the clean-up itself
        ofs.close();
        std::error_code ec;
        std::filesystem::remove(download_path, ec);
        throw;
    }

    return download_path;
}

/// @brief Download the file at the specified URL into the program's temporary directory
/// @param url URL to download from
/// @param file_extension File extension (including dot) for downloaded file
/// @return Path to downloaded file
std::filesystem::path download_file_to_temporary(const std::string &url,
                                                 const std::string &file_extension) {
    // The destination name is "data" + random suffix + extension
    return download_file(url, get_temporary_file_path("data", file_extension));
}

} // namespace hgps::data
20 changes: 20 additions & 0 deletions src/HealthGPS.Datastore/download_file.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#pragma once

#include <filesystem>
#include <string>

namespace hgps::data {
/// @brief Download the file at the specified URL
/// @param url URL to download from
/// @param download_path Destination for downloaded files, including filename
/// @return Path to downloaded file (the same value as @p download_path)
/// @throws std::runtime_error if the destination file cannot be created
/// @note NOTE(review): the underlying curlpp request presumably throws its own exceptions
/// when the transfer fails -- confirm against the curlpp documentation.
std::filesystem::path download_file(const std::string &url,
const std::filesystem::path &download_path);

/// @brief Download the file at the specified URL to a temporary folder
/// @details The destination is a new file in the program's temporary directory whose name
/// is "data" followed by a random hexadecimal number and @p file_extension.
/// @param url URL to download from
/// @param file_extension File extension (including dot) for downloaded file
/// @return Path to downloaded file
/// @throws std::runtime_error if the destination file cannot be created
std::filesystem::path download_file_to_temporary(const std::string &url,
const std::string &file_extension);
} // namespace hgps::data
18 changes: 0 additions & 18 deletions src/HealthGPS.Datastore/zip_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,6 @@
#include <random>

namespace hgps::data {
//! Create a new, uniquely named directory inside the system temporary directory
//! and return its path. Names are random hexadecimal numbers.
std::filesystem::path create_temporary_directory() {
    const auto parent = std::filesystem::temp_directory_path();

    std::mt19937 engine(std::random_device{}());
    std::uniform_int_distribution<unsigned> distribution;

    for (;;) {
        std::stringstream name;
        name << std::hex << distribution(engine);

        auto candidate = parent / name.str();

        // create_directory() yields false when the path already exists, in which case we
        // simply try again with another random name.
        if (std::filesystem::create_directory(candidate)) {
            return candidate;
        }
    }
}

std::filesystem::path get_zip_cache_directory(const std::string &file_hash) {
if (file_hash.size() != 64) {
throw std::invalid_argument("file_hash does not appear to be a valid SHA256 hash");
Expand Down
3 changes: 0 additions & 3 deletions src/HealthGPS.Datastore/zip_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,6 @@
#include <filesystem>

namespace hgps::data {
//! Create a temporary directory with a unique path
std::filesystem::path create_temporary_directory();

/// @brief Get cache directory for extracting a file into
/// @param file_hash The SHA256 hash of the file
/// @return The path to the directory
Expand Down
4 changes: 4 additions & 0 deletions src/HealthGPS/program_dirs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,8 @@ std::filesystem::path get_program_path() {
std::filesystem::path get_cache_directory() {
return std::filesystem::path{sago::getCacheDir()} / program_name;
}

//! A temporary directory for Health-GPS: the system temporary directory with the
//! program name appended. The directory is not created by this function.
std::filesystem::path get_temporary_directory() {
    const auto system_tmp_dir = std::filesystem::temp_directory_path();
    return system_tmp_dir / program_name;
}
} // namespace hgps
5 changes: 4 additions & 1 deletion src/HealthGPS/program_dirs.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ std::filesystem::path get_program_directory();
//! Get the path to the currently executing program
std::filesystem::path get_program_path();

/// The cache folder for Health-GPS
//! The cache folder for Health-GPS
std::filesystem::path get_cache_directory();

//! A temporary directory for Health-GPS
std::filesystem::path get_temporary_directory();
} // namespace hgps
3 changes: 2 additions & 1 deletion vcpkg.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
"tbb",
"libzippp",
"openssl",
"platform-folders"
"platform-folders",
"curlpp"
],
"builtin-baseline": "bd2b54836beed96e1efbe9aaf8ee800f5448856d"
}

0 comments on commit 75a6008

Please sign in to comment.