diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b9ce44e9..79647a17 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,9 +16,14 @@ jobs: steps: - uses: actions/checkout@v4 + # runner runs as userid 1001 but userid 1000 is baked into docker image. + # we could adjust this if needed via env var but this should work + - name: Adjust permissions for versitygw directories + run: chmod 777 var/vgw var/metadata_vgw + - name: Build docker image run: docker compose build - + - name: Run tests run: docker compose run test-and-cover env: diff --git a/Dockerfile b/Dockerfile index a3d7c119..8fb2c6d0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,6 +71,11 @@ ENV PERL5LIB="/extlib/lib/perl5:$FEED_HOME/lib" COPY ./src/validateCache.cpp /usr/src/validateCache.cpp RUN /usr/bin/g++ -o /usr/local/bin/validate-cache /usr/src/validateCache.cpp -lxerces-c +ENV GNUPGHOME=/tmp/gnupg +RUN mkdir $GNUPGHOME +RUN chown $UID:$GID $GNUPGHOME +RUN chmod 700 $GNUPGHOME + USER $UID:$GID WORKDIR $FEED_HOME @@ -87,8 +92,6 @@ RUN mkdir -p /tmp/stage/grin RUN mkdir -p /tmp/prep/toingest /tmp/prep/failed /tmp/prep/ingested /tmp/prep/logs /tmp/prep/toingest/emma RUN mkdir $FEED_HOME/bin $FEED_HOME/src $FEED_HOME/.gnupg -RUN chown $UID:$GID $FEED_HOME/.gnupg -RUN chmod 700 $FEED_HOME/.gnupg COPY . $FEED_HOME diff --git a/docker-compose.yml b/docker-compose.yml index ace5bb00..7b3ed668 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -25,9 +25,9 @@ services: command: prove depends_on: mariadb: *healthy - minio: *healthy pushgateway: *healthy rabbitmq: *healthy + versitygw: *healthy # Note: for permissions purposes this does NOT bind in the local development # environment, so local changes after running docker compose build will NOT @@ -36,6 +36,7 @@ services: build: . 
volumes: - ./clamav:/var/lib/clamav + - ./var/vgw:/usr/local/feed/var/vgw environment: - HTFEED_CONFIG=/usr/local/feed/etc/config_test.yml - FEED_HOME=/usr/local/feed @@ -53,9 +54,9 @@ services: command: cover -test -report Coveralls -make 'prove; exit $?' depends_on: mariadb: *healthy - minio: *healthy pushgateway: *healthy rabbitmq: *healthy + versitygw: *healthy ingest: build: . @@ -119,17 +120,21 @@ services: <<: *healthcheck-defaults test: ["CMD", "healthcheck.sh", "--su-mysql", "--connect", "--innodb_initialized"] - # S3 compatible object storage - minio: - image: minio/minio + # S3 -> filesystem gateway + versitygw: + user: "1000:1000" + image: versity/versitygw restart: always environment: - MINIO_ACCESS_KEY: TESTINGACCESSKEY - MINIO_SECRET_KEY: testingsecretkey - command: server /data + ROOT_ACCESS_KEY: TESTINGACCESSKEY + ROOT_SECRET_KEY: testingsecretkey + volumes: + - ./var/vgw:/usr/local/feed/var/vgw + - ./var/metadata_vgw:/usr/local/feed/var/metadata_vgw + command: --health /health posix --sidecar /usr/local/feed/var/metadata_vgw /usr/local/feed/var/vgw healthcheck: <<: *healthcheck-defaults - test: timeout 5s mc ready local + test: [ "CMD", "wget", "--quiet", "--tries=1", "-O", "/dev/null", "http://127.0.0.1:7070/health" ] pushgateway: image: prom/pushgateway @@ -152,4 +157,4 @@ volumes: repository_link: repository_obj: backups: - rclone: + vgw_sidecar: diff --git a/etc/config_test.yml b/etc/config_test.yml index ac060ff5..56faa46d 100644 --- a/etc/config_test.yml +++ b/etc/config_test.yml @@ -25,7 +25,7 @@ emma: namespace: test packagetype: emma bucket: emma-test-bucket - awscli: ['aws', '--endpoint-url', 'http://minio:9000'] + awscli: ['aws', '--endpoint-url', 'http://versitygw:7070'] rabbitmq: host: rabbitmq @@ -34,11 +34,10 @@ rabbitmq: queue: testqueue priority_levels: 3 -test_awscli: ['aws', '--endpoint-url', 'http://minio:9000'] +awscli: ['aws', '--endpoint-url', 'http://versitygw:7070'] pushgateway: http://pushgateway:9091 - # To configure in 
production handle: diff --git a/etc/ingest.sql b/etc/ingest.sql index 43825c9c..17e95127 100644 --- a/etc/ingest.sql +++ b/etc/ingest.sql @@ -127,6 +127,20 @@ CREATE TABLE IF NOT EXISTS `feed_backups` ( KEY `feed_backups_version` (`version`) ); +CREATE TABLE IF NOT EXISTS `feed_storage` ( + `namespace` varchar(10) NOT NULL, + `id` varchar(32) NOT NULL, + `storage_name` varchar(32) NOT NULL, + `zip_size` bigint(20) DEFAULT NULL, + `mets_size` bigint(20) DEFAULT NULL, + `saved_md5sum` char(32) DEFAULT NULL, + `deposit_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `lastchecked` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + `lastmd5check` timestamp NULL DEFAULT NULL, + `md5check_ok` tinyint(1) DEFAULT NULL, + PRIMARY KEY (`namespace`, `id`, `storage_name`) +); + CREATE TABLE IF NOT EXISTS `feed_audit_detail` ( `namespace` varchar(10) NOT NULL, `id` varchar(30) NOT NULL, diff --git a/lib/HTFeed/Stage/Collate.pm b/lib/HTFeed/Stage/Collate.pm index 81b5a587..0e5a5930 100644 --- a/lib/HTFeed/Stage/Collate.pm +++ b/lib/HTFeed/Stage/Collate.pm @@ -9,6 +9,7 @@ use Carp qw(croak); use HTFeed::Config qw(get_config); use HTFeed::Storage::LinkedPairtree; use HTFeed::Storage::LocalPairtree; +use HTFeed::Storage::PairtreeObjectStore; use HTFeed::Storage::ObjectStore; use HTFeed::Storage::PrefixedVersions; use Log::Log4perl qw(get_logger); diff --git a/lib/HTFeed/Storage/ObjectStore.pm b/lib/HTFeed/Storage/ObjectStore.pm index 683f383c..d0d9e121 100644 --- a/lib/HTFeed/Storage/ObjectStore.pm +++ b/lib/HTFeed/Storage/ObjectStore.pm @@ -96,6 +96,16 @@ sub mets_key { return $self->object_path . 
".mets.xml"; } +sub zip_size { + my $self = shift; + return $self->{filesize}{$self->zip_key}; +} + +sub mets_size { + my $self = shift; + return $self->{filesize}{$self->mets_key}; +} + sub zip_filename { my $self = shift; @@ -208,15 +218,19 @@ sub record_audit { $self->record_backup; } +sub saved_md5sum { + my $self = shift; + + my $b64_checksum = $self->{checksums}{$self->zip_key}; + return unpack("H*", decode_base64($b64_checksum)); +} + sub record_backup { my $self = shift; get_logger->trace(" starting record_backup"); my $dbh = HTFeed::DBTools::get_dbh(); - my $b64_checksum = $self->{checksums}{$self->zip_key}; - my $hex_checksum = unpack("H*", decode_base64($b64_checksum)); - my $stmt = join( " ", "INSERT INTO feed_backups", @@ -232,9 +246,9 @@ sub record_backup { $self->audit_path, $self->{timestamp}, $self->{name}, - $self->{filesize}{$self->zip_key}, - $self->{filesize}{$self->object_path . '.mets.xml'}, - $hex_checksum + $self->zip_size, + $self->mets_size, + $self->saved_md5sum ); get_logger->trace(" finished record_backup"); diff --git a/lib/HTFeed/Storage/PairtreeObjectStore.pm b/lib/HTFeed/Storage/PairtreeObjectStore.pm new file mode 100644 index 00000000..2ca42426 --- /dev/null +++ b/lib/HTFeed/Storage/PairtreeObjectStore.pm @@ -0,0 +1,60 @@ +package HTFeed::Storage::PairtreeObjectStore; + +# Stores using the S3 protocol but with pairtree paths + +use HTFeed::Storage::ObjectStore; +use base qw(HTFeed::Storage::ObjectStore); + +use HTFeed::DBTools qw(get_dbh); +use File::Pairtree qw(id2ppath s2ppchars); + +sub object_path { + my $self = shift; + + return sprintf( + '%s/%s%s/', + $self->{namespace}, + id2ppath($self->{objid}), + s2ppchars($self->{objid}) + ); +} + +sub zip_key { + my $self = shift; + + return $self->object_path . $self->{volume}->get_pt_objid() . $self->zip_suffix; + +} + +sub mets_key { + my $self = shift; + + return $self->object_path . 
$self->{volume}->get_mets_filename; +} + +sub record_audit { + my $self = shift; + + my $stmt = + "insert into feed_storage (namespace, id, storage_name, zip_size, mets_size, saved_md5sum, deposit_time, lastchecked, lastmd5check, md5check_ok) \ + values(?,?,?,?,?,?,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,CURRENT_TIMESTAMP,1) \ + ON DUPLICATE KEY UPDATE zip_size=?, mets_size=?, saved_md5sum=?, deposit_time=CURRENT_TIMESTAMP, lastchecked = CURRENT_TIMESTAMP,lastmd5check = CURRENT_TIMESTAMP, md5check_ok = 1"; + + my $storage_name = $self->{name}; + my $saved_md5sum = $self->saved_md5sum; + + my $zip_size = $self->zip_size; + my $mets_size = $self->mets_size; + + my $sth = get_dbh()->prepare($stmt); + my $res = $sth->execute( + $self->{namespace}, $self->{objid}, $storage_name, + $zip_size, $mets_size, $saved_md5sum, + # duplicate parameters for duplicate key update + $zip_size, $mets_size, $saved_md5sum + ); + + return $res; +} + +1; diff --git a/t/collate.t b/t/collate.t index 3b36ed09..a9c94e13 100644 --- a/t/collate.t +++ b/t/collate.t @@ -177,27 +177,60 @@ describe "HTFeed::Collate" => sub { local our ($bucket, $s3); my $old_storage_classes; + my %s3s; + + before all => sub { + foreach my $suffix (qw(ptobj1 ptobj2 backup)) { + $s3s{$suffix} = HTFeed::Storage::S3->new( + bucket => "$bucket-$suffix", + awscli => get_config('awscli') + ); + $s3s{$suffix}->mb; + } + }; + + after all => sub { + foreach my $s3 (values(%s3s)) { + $s3->rm('/',"--recursive"); + $s3->rb; + } + }; before each => sub { $old_storage_classes = get_config('storage_classes'); my $new_storage_classes = { + # simulating isilon 'linkedpairtree-test' => { class => 'HTFeed::Storage::LinkedPairtree', obj_dir => $tmpdirs->{obj_dir}, link_dir => $tmpdirs->{link_dir} }, + # simulating truenas (site 1) + 'pairtreeobjectstore-ptobj1' => { + class => 'HTFeed::Storage::PairtreeObjectStore', + bucket => $s3s{ptobj1}->{bucket}, + awscli => $s3s{ptobj1}->{awscli}, + }, + # simulating truenas (site 2) + 
'pairtreeobjectstore-ptobj2' => { + class => 'HTFeed::Storage::PairtreeObjectStore', + bucket => $s3s{ptobj2}->{bucket}, + awscli => $s3s{ptobj2}->{awscli}, + }, + # simulating data den 'prefixedversions-test' => { class => 'HTFeed::Storage::PrefixedVersions', obj_dir => $tmpdirs->{backup_obj_dir}, encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key" }, + # simulating glacier deep archive 'objectstore-test' => { class => 'HTFeed::Storage::ObjectStore', - bucket => $s3->{bucket}, - awscli => $s3->{awscli}, + bucket => $s3s{backup}->{bucket}, + awscli => $s3s{backup}->{awscli}, encryption_key => $tmpdirs->test_home . "/fixtures/encryption_key" } }; @@ -223,16 +256,22 @@ describe "HTFeed::Collate" => sub { is(scalar(@{$s3_backup}),1,'records a backup for object store'); my $timestamp = $versioned_backup->[0][0]; - ok(-e "$tmpdirs->{obj_dir}/test/pairtree_root/te/st/test/test.mets.xml",'copies mets to local storage'); - ok(-e "$tmpdirs->{obj_dir}/test/pairtree_root/te/st/test/test.zip",'copies zip to local storage'); + + my $pt_path = "test/pairtree_root/te/st/test"; + ok(-e "$tmpdirs->{obj_dir}/$pt_path/test.mets.xml",'copies mets to local storage'); + ok(-e "$tmpdirs->{obj_dir}/$pt_path/test.zip",'copies zip to local storage'); ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.zip.gpg","copies the encrypted zip to backup storage"); ok(-e "$tmpdirs->{backup_obj_dir}/test/tes/test.$timestamp.mets.xml","copies the mets backup storage"); my $s3_timestamp = $s3_backup->[0][0]; - ok($s3->s3_has("test.test.$s3_timestamp.zip.gpg")); - ok($s3->s3_has("test.test.$s3_timestamp.mets.xml")); + ok($s3s{ptobj1}->s3_has("$pt_path/test.mets.xml")); + ok($s3s{ptobj1}->s3_has("$pt_path/test.zip")); + ok($s3s{ptobj2}->s3_has("$pt_path/test.mets.xml")); + ok($s3s{ptobj2}->s3_has("$pt_path/test.zip")); + ok($s3s{backup}->s3_has("test.test.$s3_timestamp.zip.gpg")); + ok($s3s{backup}->s3_has("test.test.$s3_timestamp.mets.xml")); ok(! 
-e "$tmpdirs->{zip}/test/00000001.jp2","cleans up the extracted zip files"); ok(! -e "$tmpdirs->{zip}/test","cleans up the zip file tmpdir"); diff --git a/t/lib/HTFeed/Namespace/Test.pm b/t/lib/HTFeed/Namespace/ClassTest.pm similarity index 77% rename from t/lib/HTFeed/Namespace/Test.pm rename to t/lib/HTFeed/Namespace/ClassTest.pm index 20c5c3f8..a06b876e 100644 --- a/t/lib/HTFeed/Namespace/Test.pm +++ b/t/lib/HTFeed/Namespace/ClassTest.pm @@ -1,4 +1,4 @@ -package HTFeed::Namespace::Test; +package HTFeed::Namespace::ClassTest; use warnings; use strict; diff --git a/t/lib/HTFeed/Test/Class.pm b/t/lib/HTFeed/Test/Class.pm index d7318bb9..24aefb5c 100644 --- a/t/lib/HTFeed/Test/Class.pm +++ b/t/lib/HTFeed/Test/Class.pm @@ -7,10 +7,12 @@ use HTFeed::Config qw(get_config); use File::Path qw(remove_tree); # return testing class, with assumption that $class eq "$testing_class::Test" +# or for example "$testing_class::SomethingTest" + sub testing_class{ my $self = shift; my $class = ref $self; - $class =~ s/::Test$//; + $class =~ s/::\w*Test$//; return $class; } diff --git a/t/lib/HTFeed/Test/Support.pm b/t/lib/HTFeed/Test/Support.pm index b38beba0..e3c7f34b 100644 --- a/t/lib/HTFeed/Test/Support.pm +++ b/t/lib/HTFeed/Test/Support.pm @@ -56,9 +56,10 @@ my @test_classes; my $libDir = "$FindBin::Bin/lib/"; # get the path to each test classes find(sub{ - if (-f and $_ =~ /^Test\.pm$/ ){ + if (-f and $_ =~ /Test\.pm$/ ){ my $name = $File::Find::name; $name =~ s/$libDir//; + return if $name =~ /AbstractTest\.pm$/; push @test_classes, $name; } }, $libDir diff --git a/t/s3_helper.pl b/t/s3_helper.pl index b6420358..63edef93 100644 --- a/t/s3_helper.pl +++ b/t/s3_helper.pl @@ -6,7 +6,7 @@ $bucket = "bucket" . 
sprintf("%08d",rand(1000000)); $s3 = HTFeed::Storage::S3->new( bucket => $bucket, - awscli => get_config('test_awscli') + awscli => get_config('awscli') ); $ENV{AWS_MAX_ATTEMPTS} = 1; diff --git a/t/storage_pairtree_object_store.t b/t/storage_pairtree_object_store.t new file mode 100644 index 00000000..da344c28 --- /dev/null +++ b/t/storage_pairtree_object_store.t @@ -0,0 +1,111 @@ +use HTFeed::Config qw(get_config); +use Test::Spec; +use Test::Exception; +use File::Temp qw(tempdir); +use File::Basename qw(basename); +use File::Path qw(make_path remove_tree); +use HTFeed::Storage::PairtreeObjectStore; + +use strict; + +describe "HTFeed::Storage::PairtreeObjectStore" => sub { + spec_helper 'storage_helper.pl'; + spec_helper 's3_helper.pl'; + + my $vgw_home = "$ENV{FEED_HOME}/var/vgw"; + local our ($tmpdirs, $testlog, $bucket, $s3, $objdir, $bucket_dir); + + before each => sub { + $s3->rm("/","--recursive"); + }; + + before all => sub { + $bucket_dir = "$vgw_home/$bucket"; + $objdir = "$vgw_home/$bucket-obj"; + make_path($objdir); + }; + + after all => sub { + remove_tree($objdir,$bucket_dir); + }; + + sub object_storage { + my $volume = stage_volume($tmpdirs,@_); + + my $storage = HTFeed::Storage::PairtreeObjectStore->new( + name => 'pairtreeobjectstore-test', + volume => $volume, + config => { + bucket => $s3->{bucket}, + awscli => $s3->{awscli} + }, + ); + + return $storage; + } + + describe "#object_path" => sub { + it "includes the namespace, pairtree path, and pairtreeized object id" => sub { + my $storage = object_storage('test','ark:/123456/abcde'); + + is($storage->object_path, "test/pairtree_root/ar/k+/=1/23/45/6=/ab/cd/e/ark+=123456=abcde/"); + }; + }; + + describe "#move" => sub { + it "uploads zip and mets" => sub { + my $storage = object_storage('test','test'); + my $pt_path = "test/pairtree_root/te/st/test"; + $storage->move; + + # should be in the bucket and also visible in the filesystem + ok($s3->s3_has("$pt_path/test.zip")); + 
ok($s3->s3_has("$pt_path/test.mets.xml")); + ok(-s "$bucket_dir/$pt_path/test.zip"); + ok(-s "$bucket_dir/$pt_path/test.mets.xml"); + }; + + }; + + describe "#record_audit" => sub { + it "records the item info in the feed_storage table" => sub { + my $dbh = get_dbh(); + + my $storage = object_storage('test','test'); + $storage->stage; + $storage->make_object_path; + $storage->move; + $storage->record_audit; + + my $r = $dbh->selectall_arrayref("SELECT * from feed_storage WHERE namespace = 'test' and id = 'test' and storage_name='pairtreeobjectstore-test'"); + + ok($r->[0][0]); + + }; + }; + + it "writes through existing symlinks" => sub { + + my $pt_prefix = "test/pairtree_root/te/st"; + + # set things up using filesystem access rather than via s3 + make_path("$objdir/$pt_prefix/test","$bucket_dir/$pt_prefix"); + system("touch $objdir/$pt_prefix/test/test.zip"); + system("touch $objdir/$pt_prefix/test/test.mets.xml"); + system("ln -sv $objdir/$pt_prefix/test $bucket_dir/$pt_prefix/test"); + + # writes via the symlink in $bucket_dir + my $storage = object_storage('test','test'); + $storage->move; + + # started as zero size (via touch), should be nonzero size now + ok(-s "$objdir/$pt_prefix/test/test.zip"); + ok(-s "$objdir/$pt_prefix/test/test.mets.xml"); + + # should still be a link in the bucket dir + ok(-l "$bucket_dir/$pt_prefix/test"); + }; + +}; + +runtests unless caller; diff --git a/var/metadata_vgw/.keep b/var/metadata_vgw/.keep new file mode 100644 index 00000000..e69de29b diff --git a/var/vgw/.keep b/var/vgw/.keep new file mode 100644 index 00000000..e69de29b