Skip to content

Commit

Permalink
feat: Add standalone proto extractor (#21)
Browse files Browse the repository at this point in the history
The extractor takes a list of 1+ "toplevel" protos and generates a kzip
file containing those protos and all of their dependencies. It also
handles the --proto_path/-I flags for adding directories to the search
path.
  • Loading branch information
justbuchanan committed Jan 15, 2019
1 parent 0ab7501 commit c1c53ce
Show file tree
Hide file tree
Showing 21 changed files with 744 additions and 1 deletion.
2 changes: 1 addition & 1 deletion WORKSPACE
Expand Up @@ -15,7 +15,7 @@ http_archive(

git_repository(
name = "io_kythe",
commit = "7eba1c0a26b3de921f64fd9e06d39c7a89d0add2",
commit = "6c54d48ce947fb5526c50e3264bdeb35fea3a685",
remote = "https://github.com/kythe/kythe",
)

Expand Down
16 changes: 16 additions & 0 deletions kythe/cxx/extractor/proto/BUILD
@@ -0,0 +1,16 @@
cc_binary(
name = "proto_extractor",
srcs = ["proto_extractor.cc"],
visibility = ["//visibility:public"],
deps = [
"//kythe/cxx/indexer/proto:search_path",
"@com_github_gflags_gflags//:gflags",
"@com_github_google_glog//:glog",
"@com_google_absl//absl/strings",
"@com_google_protobuf//:protobuf",
"@io_kythe//kythe/cxx/common:kzip_writer",
"@io_kythe//kythe/cxx/common:lib",
"@io_kythe//kythe/cxx/common:path_utils",
"@io_kythe//kythe/proto:analysis_cc_proto",
],
)
264 changes: 264 additions & 0 deletions kythe/cxx/extractor/proto/proto_extractor.cc
@@ -0,0 +1,264 @@
/*
* Copyright 2018 The Kythe Authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Standalone extractor for protobuf. Given a proto file, builds a kzip
// containing the proto file(s) and all dependencies.
//
// Usage:
// export KYTHE_OUTPUT_FILE=foo.kzip
// proto_extractor foo.proto
// proto_extractor foo.proto bar.proto
// proto_extractor foo.proto -- --proto_path dir/with/my/deps

#include <string>

#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
#include "google/protobuf/compiler/importer.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "kythe/cxx/common/file_vname_generator.h"
#include "kythe/cxx/common/kzip_writer.h"
#include "kythe/cxx/common/path_utils.h"
#include "kythe/cxx/indexer/proto/search_path.h"
#include "kythe/proto/analysis.pb.h"

namespace kythe {
namespace {

// Error "collector" that just writes messages to log output.
class LoggingMultiFileErrorCollector
: public google::protobuf::compiler::MultiFileErrorCollector {
public:
void AddError(const std::string& filename, int line, int column,
const std::string& message) override {
LOG(ERROR) << filename << "@" << line << ":" << column << ": " << message;
}

void AddWarning(const std::string& filename, int line, int column,
const std::string& message) override {
LOG(ERROR) << filename << "@" << line << ":" << column << ": " << message;
}
};

// DiskSourceTree that records which proto files are opened while parsing the
// toplevel proto(s), allowing us to get a list of transitive dependencies.
class RecordingDiskSourceTree
: public google::protobuf::compiler::DiskSourceTree {
public:
google::protobuf::io::ZeroCopyInputStream* Open(
const std::string& filename) override {
// Record resolved/canonical path because the same proto may be Open()'d via
// multiple relative paths and we only want to record it once.
std::string canonical_path;
if (!DiskSourceTree::VirtualFileToDiskFile(filename, &canonical_path)) {
return nullptr;
}
if (opened_files_.find(canonical_path) == opened_files_.end()) {
opened_files_.insert(canonical_path);
}

return DiskSourceTree::Open(filename);
}

// A set of unique file paths that have been passed to Open().
const std::set<std::string>& opened_files() const { return opened_files_; }

private:
std::set<std::string> opened_files_;
};

// Loads all data from a file or terminates the process.
static std::string LoadFileOrDie(const std::string& file) {
FILE* handle = fopen(file.c_str(), "rb");
CHECK(handle != nullptr) << "Couldn't open input file " << file;
CHECK_EQ(fseek(handle, 0, SEEK_END), 0) << "Couldn't seek " << file;
long size = ftell(handle);
CHECK_GE(size, 0) << "Bad size for " << file;
CHECK_EQ(fseek(handle, 0, SEEK_SET), 0) << "Couldn't seek " << file;
std::string content;
content.resize(size);
CHECK_EQ(fread(&content[0], size, 1, handle), 1) << "Couldn't read " << file;
CHECK_NE(fclose(handle), EOF) << "Couldn't close " << file;
return content;
}

// If KYTHE_VNAMES environment variable is set, configures the vname generator
// with the json config file at that path.
void SetVNameConfigFromEnv(FileVNameGenerator* vname_gen) {
std::string path;
if (const char* path_cstr = getenv("KYTHE_VNAMES")) {
path = path_cstr;
}
if (path.empty()) {
return;
}

std::string json = LoadFileOrDie(path);

std::string error_text;
CHECK(vname_gen->LoadJsonString(json, &error_text))
<< "Could not parse vname generator configuration: " << error_text;
}

} // anonymous namespace

int main(int argc, char* argv[]) {
google::InitGoogleLogging(argv[0]);
gflags::SetUsageMessage(R"(Standalone extractor for the Kythe Proto indexer.
Creates a kzip containing the main proto file(s) and any dependencies.
Examples:
export KYTHE_OUTPUT_FILE=foo.kzip
proto_extractor foo.proto
proto_extractor foo.proto bar.proto
proto_extractor foo.proto -- --proto_path dir/with/my/deps")");
gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
std::vector<std::string> final_args(argv + 1, argv + argc);

std::string output_filename;
if (const char* env_output_file = getenv("KYTHE_OUTPUT_FILE")) {
output_filename = env_output_file;
}
CHECK(!output_filename.empty())
<< "Please specify an output kzip file with the KYTHE_OUTPUT_FILE "
"environment variable.";

proto::IndexedCompilation compilation;
CHECK(GetCurrentDirectory(
compilation.mutable_unit()->mutable_working_directory()));

// File paths in the output kzip should be relative to this directory.
std::string kythe_root_directory = ".";
if (const char* env_root_directory = getenv("KYTHE_ROOT_DIRECTORY")) {
kythe_root_directory = env_root_directory;
}

// Parse --proto_path and -I args into a set of path substitions (search
// paths).
std::vector<std::pair<std::string, std::string>> path_substitutions;
std::vector<std::string> unused_args;
::kythe::lang_proto::ParsePathSubstitutions(final_args, &path_substitutions,
&unused_args);

// Look through remaining args for .proto files.
std::vector<std::string> proto_filenames;
for (const std::string& arg : unused_args) {
CHECK(absl::EndsWith(arg, ".proto"))
<< "Invalid arg, expected a proto file: '" << arg << "'";
proto_filenames.push_back(arg);
compilation.mutable_unit()->add_argument(arg);
compilation.mutable_unit()->add_source_file(
RelativizePath(arg, kythe_root_directory));
}
CHECK(proto_filenames.size() >= 1) << "Expected 1+ .proto files.";

// Add path substitutions to src_tree and add as args to compilation unit.
RecordingDiskSourceTree src_tree;
src_tree.MapPath("", ""); // Add current directory to VFS.
if (path_substitutions.size()) {
compilation.mutable_unit()->add_argument("--");
for (const auto& sub : path_substitutions) {
// Add search path to proto SourceTree.
src_tree.MapPath(sub.first, sub.second);

// Record in compilation unit args so indexer gets the same paths.
compilation.mutable_unit()->add_argument("--proto_path");
if (sub.first == "") {
compilation.mutable_unit()->add_argument(sub.second);
} else {
compilation.mutable_unit()->add_argument(
absl::StrCat(sub.first, "=", sub.second));
}
}
}

// Import the toplevel proto(s), which will record paths of any transitive
// dependencies to src_tree.
{
LoggingMultiFileErrorCollector err_collector;
for (const std::string& fname : proto_filenames) {
// Note that a separate importer instance is used for each top-level
// import to avoid double-importing any subprotos, which would happen if
// two top-level protos share any transitive dependencies.
google::protobuf::compiler::Importer importer(&src_tree, &err_collector);
CHECK(importer.Import(fname) != nullptr)
<< "Failed to import file: " << fname;
}
}

StatusOr<IndexWriter> kzip_writer = KzipWriter::Create(output_filename);
CHECK(kzip_writer.ok()) << "Error opening output kzip: "
<< kzip_writer.status();

FileVNameGenerator file_vnames;
SetVNameConfigFromEnv(&file_vnames);
std::string corpus;
if (const char* env_corpus = getenv("KYTHE_CORPUS")) {
corpus = env_corpus;
}

// Write each toplevel proto and its transitive dependencies into the kzip.
for (const std::string& filename : src_tree.opened_files()) {
// Read file contents
std::string file_contents;
{
std::unique_ptr<google::protobuf::io::ZeroCopyInputStream> in_stream(
src_tree.Open(filename));
CHECK(in_stream != nullptr) << "Can't open file: " << filename;

const void* data = nullptr;
int size = 0;
while (in_stream->Next(&data, &size)) {
file_contents.append(static_cast<const char*>(data), size);
}
}

// Resolve path relative to the proto compiler's search paths.
std::string full_path;
CHECK(src_tree.VirtualFileToDiskFile(filename, &full_path))
<< "Error canonicalizing path: " << filename;
// Make path relative to KYTHE_ROOT_DIRECTORY.
full_path = RelativizePath(full_path, kythe_root_directory);

// Write file to kzip.
auto digest = kzip_writer->WriteFile(file_contents);
CHECK(digest.ok()) << digest.status();

// Record file info to compilation unit.
proto::CompilationUnit::FileInput* file_input =
compilation.mutable_unit()->add_required_input();
proto::VName vname = file_vnames.LookupVName(full_path);
if (vname.corpus().empty()) {
vname.set_corpus(corpus);
}
*file_input->mutable_v_name() = std::move(vname);
file_input->mutable_info()->set_path(full_path);
file_input->mutable_info()->set_digest(*digest);
}

auto digest = kzip_writer->WriteUnit(compilation);
CHECK(digest.ok()) << "Error writing unit to kzip: " << digest.status();

CHECK(kzip_writer->Close().ok());

return 0;
}

} // namespace kythe

int main(int argc, char* argv[]) { return kythe::main(argc, argv); }
85 changes: 85 additions & 0 deletions kythe/cxx/extractor/proto/testdata/BUILD
@@ -0,0 +1,85 @@
load(":proto_extractor_test.bzl", "proto_extraction_golden_test")

proto_extraction_golden_test(
name = "relative_imports",
extra_files = ["subdir/other.proto"],
protoc_args = [
"--proto_path",
"kythe/cxx/extractor/proto/testdata/subdir",
],
protos = ["relative_imports.proto"],
)

# In this test, other.proto is both imported by relative_imports.proto and
# specified as a top-level proto to extract. Previously this caused an error
# because the extractor tried to import it twice and created duplicate symbol
# definitions.
proto_extraction_golden_test(
name = "duplicate_imports",
protoc_args = [
"--proto_path",
"kythe/cxx/extractor/proto/testdata/subdir",
],
protos = [
"relative_imports.proto",
"subdir/other.proto",
],
)

# Similar to duplicate_imports test above, but with the two proto files passed
# to the extractor in reverse order.
proto_extraction_golden_test(
name = "duplicate_imports2",
protoc_args = [
"--proto_path",
"kythe/cxx/extractor/proto/testdata/subdir",
],
protos = [
"subdir/other.proto",
# order matters for this test - don't sort these
"relative_imports.proto",
],
)

# File paths in the output should all be relative to the testdata directory.
proto_extraction_golden_test(
name = "custom_root_directory",
extra_env = {"KYTHE_ROOT_DIRECTORY": "kythe/cxx/extractor/proto/testdata"},
extra_files = ["subdir/other.proto"],
protoc_args = [
"--proto_path",
"kythe/cxx/extractor/proto/testdata/subdir",
],
protos = ["relative_imports.proto"],
)

proto_extraction_golden_test(
name = "simple",
protos = ["simple1.proto"],
)

proto_extraction_golden_test(
name = "multiple_source_files",
protos = [
"simple1.proto",
"simple2.proto",
],
)

proto_extraction_golden_test(
name = "custom_corpus",
extra_env = {"KYTHE_CORPUS": "my_custom_corpus"},
protos = ["simple1.proto"],
)

proto_extraction_golden_test(
name = "custom_vname_config",
extra_env = {
"KYTHE_VNAMES": "kythe/cxx/extractor/proto/testdata/custom_vname_config.json",
},
extra_files = ["custom_vname_config.json"],
protos = [
"simple1.proto",
"simple2.proto",
],
)
13 changes: 13 additions & 0 deletions kythe/cxx/extractor/proto/testdata/custom_corpus.UNIT
@@ -0,0 +1,13 @@
required_input {
v_name {
corpus: "my_custom_corpus"
path: "kythe/cxx/extractor/proto/testdata/simple1.proto"
}
info {
path: "kythe/cxx/extractor/proto/testdata/simple1.proto"
digest: "36aa2295772636e8e940333faa06f0838953101f353ab52d72279389e16117be"
}
}
argument: "kythe/cxx/extractor/proto/testdata/simple1.proto"
source_file: "kythe/cxx/extractor/proto/testdata/simple1.proto"
entry_context: "hash0"

0 comments on commit c1c53ce

Please sign in to comment.