Skip to content

Commit

Permalink
Merge pull request #127 from hathitrust/DEV-1120
Browse files Browse the repository at this point in the history
DEV-1120: Making sure image metadata field Artist gets copied
  • Loading branch information
mwarin committed Jun 11, 2024
2 parents b0d512b + 244d122 commit 2c0fc06
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 92 deletions.
123 changes: 59 additions & 64 deletions lib/HTFeed/PackageType/Simple/ImageRemediate.pm
Original file line number Diff line number Diff line change
@@ -1,84 +1,88 @@
package HTFeed::PackageType::Simple::ImageRemediate;

use warnings;
use strict;
use warnings;

use base qw(HTFeed::Stage::ImageRemediate);
use List::Util qw(max min);
use POSIX qw(ceil);

use Carp;
use File::Basename qw(basename);
use File::Copy qw(move);
use HTFeed::Config qw(get_config);
use HTFeed::Stage::Fetch;
use List::Util qw(max min);
use Log::Log4perl qw(get_logger);
use File::Basename qw(basename);
use File::Copy qw(move);
use Carp;
use POSIX qw(ceil);

my %tiff_field_map = (
# will be automatically reformatted for IFD0:ModifyDate and XMP-tiff:DateTime
capture_date => 'DateTime',
scanner_user => 'IFD0:Artist',
scanner_make => 'IFD0:Make',
capture_date => 'DateTime',
scanner_user => 'IFD0:Artist',
scanner_make => 'IFD0:Make',
scanner_model => 'IFD0:Model',
);

my %jpeg2000_field_map = (
capture_date => 'XMP-tiff:DateTime',
scanner_user => 'XMP-tiff:Artist',
scanner_make => 'XMP-tiff:Make',
capture_date => 'XMP-tiff:DateTime',
scanner_user => 'XMP-tiff:Artist',
scanner_make => 'XMP-tiff:Make',
scanner_model => 'XMP-tiff:Model',
);

sub run{
my $self = shift;
my $volume = $self->{volume};
sub run {
my $self = shift;
my $volume = $self->{volume};
my $preingest_dir = $volume->get_preingest_directory();
my $staging_dir = $volume->get_staging_directory();
my $staging_dir = $volume->get_staging_directory();

# decompress any lossless JPEG2000 images
my @jp2 = glob("$preingest_dir/*.jp2");
if(@jp2) {
$self->expand_lossless_jpeg2000($volume,$preingest_dir,[map { basename($_) } @jp2]);
if (@jp2) {
$self->expand_lossless_jpeg2000($volume, $preingest_dir, [map { basename($_) } @jp2]);
}

#remediate TIFFs
#remediate TIFFs
my @tiffs = map { basename($_) } glob("$preingest_dir/*.tif");
$self->remediate_tiffs($volume,$preingest_dir,\@tiffs,

# return extra fields to set that depend on the file
sub {
my $file = shift;

my $force_fields = {'IFD0:DocumentName' => join('/',$volume->get_objid(),$file) };
my $set_if_undefined = {};
while(my ($meta_yml_field,$tiff_field) = each(%tiff_field_map)) {
$self->set_from_meta_yml($meta_yml_field,$set_if_undefined,$tiff_field);
}

# force override resolution if it is provided in meta.yml
$self->set_from_meta_yml('bitonal_resolution_dpi',$force_fields,'Resolution');

return ( $force_fields, $set_if_undefined, $file);
}
) if @tiffs;
if (@tiffs) {
# return extra fields to set that depend on the file
my $headers_sub = sub {
my $file = shift;
my $force_fields = { 'IFD0:DocumentName' => join('/', $volume->get_objid(), $file) };
my $set_if_undefined = {};
while (my ($meta_yml_field, $tiff_field) = each(%tiff_field_map)) {
$self->set_from_meta_yml($meta_yml_field, $set_if_undefined, $tiff_field);
}
# force override resolution if it is provided in meta.yml
$self->set_from_meta_yml('bitonal_resolution_dpi', $force_fields, 'Resolution');

return ($force_fields, $set_if_undefined, $file);
};

$self->remediate_tiffs(
$volume,
$preingest_dir,
\@tiffs,
$headers_sub
)
}

# remediate JP2s

foreach my $jp2_submitted (glob("$preingest_dir/*.jp2"))
{
my $jp2_fields = $self->get_exiftool_fields($jp2_submitted);

foreach my $jp2_submitted (glob("$preingest_dir/*.jp2")) {
my $jp2_fields = $self->get_exiftool_fields($jp2_submitted);
my $staging_dir = $volume->get_staging_directory();

# there shouldn't be any JP2s for MOA material?
my $force_fields = {'XMP-dc:source' => join('/',$volume->get_objid(),basename($jp2_submitted)) };
my $force_fields = { 'XMP-dc:source' => join('/', $volume->get_objid(), basename($jp2_submitted)) };
my $set_if_undefined = {};
my $jp2_remediated = "$staging_dir/" . basename($jp2_submitted);
my $jp2_remediated = "$staging_dir/" . basename($jp2_submitted);

while(my ($meta_yml_field,$jp2_field) = each(%jpeg2000_field_map)) {
$self->set_from_meta_yml($meta_yml_field,$set_if_undefined,$jp2_field);
while (my ($meta_yml_field, $jp2_field) = each(%jpeg2000_field_map)) {
$self->set_from_meta_yml($meta_yml_field, $set_if_undefined, $jp2_field);
}

# force override resolution if it is provided in meta.yml
$self->set_from_meta_yml('contone_resolution_dpi',$force_fields,'Resolution');
$self->set_from_meta_yml('contone_resolution_dpi', $force_fields, 'Resolution');

$self->remediate_image( $jp2_submitted, $jp2_remediated, $force_fields, $set_if_undefined );
}
Expand All @@ -88,40 +92,31 @@ sub run{
# remove newlines & move OCR, supplementary files
my $fetch = HTFeed::Stage::Fetch->new(volume => $volume);
foreach my $file (glob("$preingest_dir/[0-9]*[0-9].{txt,html,xml}")) {
move($file,$staging_dir);
move($file, $staging_dir);
}
foreach my $file (glob("$preingest_dir/*.pdf")) {
move($file,$staging_dir);
move($file, $staging_dir);
}
$fetch->fix_line_endings($staging_dir);


$self->_set_done();
return $self->succeeded();

return $self->succeeded();
}

sub set_from_meta_yml {
my $self = shift;
my $meta_yml_key = shift;
my $field_output = shift;
my $self = shift;
my $meta_yml_key = shift;
my $field_output = shift;
my $metadata_field = shift;
my $require = shift;

$require = 0 if not defined $require;

my $require = shift || 0;
my $metadata_value = $self->{volume}->get_meta($meta_yml_key);

if($require and not defined $metadata_value) {
$self->set_error("MissingField",file => 'meta.yml',field=> $meta_yml_key);
if ($require and not defined $metadata_value) {
$self->set_error("MissingField", file => 'meta.yml', field => $meta_yml_key);
}

return if not defined $metadata_value;

$field_output->{$metadata_field} = $metadata_value;
}


1;

__END__
74 changes: 47 additions & 27 deletions lib/HTFeed/Stage/ImageRemediate.pm
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,9 @@ sub get_exiftool_fields {
$exifTool->Options('ScanForXMP' => 1);
$exifTool->ExtractInfo( $file, { Binary => 1 } );

foreach my $tag ( $exifTool->GetFoundTags() ) {

foreach my $tag ($exifTool->GetFoundTags()) {
# get only the groupname we'll use to update it later
my $group = $exifTool->GetGroup( $tag, "1" );
my $group = $exifTool->GetGroup( $tag, "1" );
my $tagname = Image::ExifTool::GetTagName($tag);
$fields->{"$group:$tagname"} = $exifTool->GetValue($tag);
}
Expand Down Expand Up @@ -335,7 +334,7 @@ sub _remediate_tiff {
}

# Fix the XMP, if needed
if($self->needs_xmp) {
if ($self->needs_xmp) {
# force required fields
$self->{newFields}{'XMP-tiff:BitsPerSample'} = 1;
$self->{newFields}{'XMP-tiff:Compression'} = 'T6/Group 4 Fax';
Expand Down Expand Up @@ -366,8 +365,11 @@ sub _remediate_tiff {

}

$ret = $ret
&& $self->repair_tiff_exiftool( $infile, $outfile, $self->{newFields} );
$ret = $ret && $self->repair_tiff_exiftool(
$infile,
$outfile,
$self->{newFields}
);

return $ret;
}
Expand Down Expand Up @@ -430,11 +432,28 @@ sub repair_tiff_imagemagick {
"TIFF_REPAIR: attempting to repair $infile to $outfile\n"
);

my $in_exif = Image::ExifTool->new;
my $in_meta = $in_exif->ImageInfo($infile);

# convert returns 0 on success, 1 on failure
my $imagemagick = get_config('imagemagick');
my $rval = system("$imagemagick -compress Group4 '$infile' '$outfile' > /dev/null 2>&1");
croak("failed repairing $infile\n") if $rval;

# Some metadata may be lost when imagemagick compresses infile to outfile.
# Here we are putting Artist back, or we'll crash at a later stage,
# due to missing ImageProducer (which depends on Artist).
my $out_exif = Image::ExifTool->new;
my $out_meta = $out_exif->ImageInfo($outfile);
if (defined $in_meta->{'Artist'} && !defined $out_meta->{'Artist'}) {
my ($success, $msg) = $out_exif->SetNewValue('Artist', $in_meta->{'Artist'});
if (defined $msg) {
croak("Error setting new tag Artist => $in_meta->{'Artist'}: $msg\n");
} else {
$self->update_tags($out_exif, $outfile);
}
}

$self->{job_metrics}->add("ingest_imageremediate_bytes", -s $infile);
$self->{job_metrics}->inc("ingest_imageremediate_images");

Expand Down Expand Up @@ -746,7 +765,7 @@ sub expand_lossless_jpeg2000 {
$exiftool->WriteInfo("$path/$jpeg2000_remediated");

rename("$path/$jpeg2000_remediated","$path/$jpeg2000");
unlink("$path/$tiff");
unlink("$path/$tiff");
}
},
"-m JPEG2000-hul"
Expand Down Expand Up @@ -876,10 +895,10 @@ for remediate_image (qv)
=cut

sub remediate_tiffs {

my ( $self, $volume, $tiffpath, $files, $headers_sub ) = @_;
my ($self, $volume, $tiffpath, $files, $headers_sub) = @_;
my $repStatus_xp = XML::LibXML::XPathExpression->new(
'/jhove:jhove/jhove:repInfo/jhove:status');
'/jhove:jhove/jhove:repInfo/jhove:status'
);
my $error_xp = XML::LibXML::XPathExpression->new(
'/jhove:jhove/jhove:repInfo/jhove:messages/jhove:message[@severity="error"]'
);
Expand All @@ -891,28 +910,24 @@ sub remediate_tiffs {
my $headers = $self->get_exiftool_fields("$tiffpath/$tiff");
my $needwrite = 0;
my $exiftool = new Image::ExifTool;
$exiftool->Options('ScanForXMP' => 1);

$exiftool->Options('ScanForXMP' => 1);
$exiftool->Options('IgnoreMinorErrors' => 1);
foreach my $field ( 'IFD0:ModifyDate', 'IFD0:Artist' ) {
foreach my $field ('IFD0:ModifyDate', 'IFD0:Artist') {
my $header = $headers->{$field};
eval {

# see if the header is valid ascii or UTF-8
my $decoded_header =
decode( 'utf-8', $header, Encode::FB_CROAK );
my $decoded_header = decode('utf-8', $header, Encode::FB_CROAK);
};
if ($@) {

# if not, strip it
$exiftool->SetNewValue($field);
$needwrite = 1;
}

}
}
if ($needwrite) {
$exiftool->WriteInfo("$tiffpath/$tiff");
}

}

$self->run_jhove(
Expand All @@ -922,25 +937,30 @@ sub remediate_tiffs {
sub {
my ( $volume, $file, $node ) = @_;
my $xpc = XML::LibXML::XPathContext->new($node);
my ( $force_headers, $set_if_undefined_headers, $renamed_file ) =
( undef, undef, undef );
my $force_headers = undef;
my $set_if_undefined_headers = undef;
my $renamed_file = undef;
register_namespaces($xpc);

$self->{jhoveStatus} = $xpc->findvalue($repStatus_xp);
$self->{jhoveErrors} =
[ map { $_->textContent } $xpc->findnodes($error_xp) ];
$self->{jhoveErrors} = [
map { $_->textContent } $xpc->findnodes($error_xp)
];

# get headers that may depend on the individual file
if ($headers_sub) {
( $force_headers, $set_if_undefined_headers, $renamed_file ) =
&$headers_sub($file);
($force_headers, $set_if_undefined_headers, $renamed_file) = &$headers_sub($file);
}

my $outfile = "$stage_path/$file";
$outfile = "$stage_path/$renamed_file" if ( defined $renamed_file );

$self->remediate_image( "$tiffpath/$file", $outfile, $force_headers,
$set_if_undefined_headers );
$self->remediate_image(
"$tiffpath/$file",
$outfile,
$force_headers,
$set_if_undefined_headers
);
},
"-m TIFF-hul"
);
Expand Down
Binary file added t/fixtures/simple/test/bitonal_tiff.zip
Binary file not shown.
9 changes: 8 additions & 1 deletion t/local_ingest.t
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,16 @@ describe "HTFeed::PackageType::Simple" => sub {
my $exiftool = Image::ExifTool->new();
$exiftool->ExtractInfo("$tmpdirs->{ingest}/lossless_jp2_with_xmp/00000001.jp2");
is($exiftool->GetValue("XMP-tiff:Make"),"Test scanner make");

};

# Regression test for DEV-1120 (image metadata field Artist must be copied):
# runs image remediation, source METS creation and volume validation on the
# "bitonal_tiff" fixture and expects validation to succeed.
it "does not lose artist when compressing a bitonal tiff" => sub {
my $volume = unpacked_volume("bitonal_tiff");
# Run the stages in order: remediate the images first, then build the source METS.
HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run();
HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run();
# NOTE(review): presumably validation fails if Artist was dropped during
# remediation, which is what makes this a regression check -- confirm.
my $validate = HTFeed::VolumeValidator->new(volume => $volume);
$validate->run();
ok($validate->succeeded());
};
};
};

Expand Down

0 comments on commit 2c0fc06

Please sign in to comment.