Skip to content

Commit

Permalink
added Parser and Writer for MAB2 Diskette data
Browse files Browse the repository at this point in the history
  • Loading branch information
jorol committed Dec 18, 2013
1 parent 88c3ac0 commit 28834f6
Show file tree
Hide file tree
Showing 9 changed files with 1,324 additions and 6 deletions.
6 changes: 5 additions & 1 deletion lib/Catmandu/Exporter/MAB2.pm
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package Catmandu::Exporter::MAB2;
#VERSION

use Catmandu::Sane;
use MAB2::Writer::Disk;
use MAB2::Writer::RAW;
use MAB2::Writer::XML;
use Moo;
Expand All @@ -22,6 +23,9 @@ sub _build_writer {
if ( $type eq 'raw' ) {
MAB2::Writer::RAW->new( fh => $self->fh );
}
elsif ( $type eq 'disk' ) {
MAB2::Writer::Disk->new( fh => $self->fh );
}
elsif ( $type eq 'xml' ) {
MAB2::Writer::XML->new(
fh => $self->fh,
Expand All @@ -30,7 +34,7 @@ sub _build_writer {
);
}
else {
croak("unknown type: $type");
die "unknown type: $type";
}
}

Expand Down
8 changes: 6 additions & 2 deletions lib/Catmandu/Importer/MAB2.pm
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package Catmandu::Importer::MAB2;

use Catmandu::Sane;
use Moo;
use MAB2::Parser::Disk;
use MAB2::Parser::RAW;
use MAB2::Parser::XML;

Expand All @@ -24,6 +25,9 @@ sub mab_generator {
elsif ( $type eq 'xml' ) {
$file = MAB2::Parser::XML->new( $self->fh );
}
elsif ( $type eq 'disk' ) {
$file = MAB2::Parser::Disk->new( $self->fh );
}
else {
die "unknown format";
}
Expand All @@ -41,7 +45,7 @@ sub generator {
my ($self) = @_;

my $type = lc($self->type);
if ( $type =~ /raw|xml$/ ) {
if ( $type =~ /raw|xml|disk$/ ) {
return $self->mab_generator;
}
else {
Expand Down Expand Up @@ -94,7 +98,7 @@ identifier of the record) and 'record' containing an ARRAY of ARRAYs for every f
=head2 new(file => $filename,type=>$type,[id=>$id_field])
Create a new MAB2 importer for $filename. Use STDIN when no filename is given. Type
describes the sytax of the MAB records. Currently we support: RAW and XML.
describes the sytax of the MAB records. Currently we support: RAW, XML and Disk.
Optionally provide an 'id' option pointing to the identifier field of the MAB record
(default 001).
Expand Down
147 changes: 147 additions & 0 deletions lib/MAB2/Parser/Disk.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
package MAB2::Parser::Disk;

# ABSTRACT: MAB2 RAW format parser
# VERSION

use strict;
use warnings;
use charnames qw< :full >;
use Carp qw(croak);
use Readonly;

Readonly my $SUBFIELD_INDICATOR => qq{\N{INFORMATION SEPARATOR ONE}};
Readonly my $END_OF_FIELD => qq{\n};
Readonly my $END_OF_RECORD => q{};

=head1 SYNOPSIS
L<MAB2::Parser::Disk> is a parser for MAB2-Diskette records.
L<MAB2::Parser::Disk> expects UTF-8 encoded files as input. Otherwise provide a
filehande with a specified I/O layer.
Catmandu...
use MAB2::Parser::Disk;
my $parser = MAB2::Parser::Disk->new( $filename );
while ( my $record_hash = $parser->next() ) {
# do something
}
=head1 SUBROUTINES/METHODS
=head2 new
=cut

# ToDo: use Moo

sub new {
my $class = shift;
my $file = shift;

my $self = {
filename => undef,
rec_number => 0,
reader => undef,
};

# check for file or filehandle
my $ishandle = eval { fileno($file); };
if ( !$@ && defined $ishandle ) {
$self->{filename} = scalar $file;
$self->{reader} = $file;
}
elsif ( -e $file ) {
open $self->{reader}, '<:encoding(UTF-8)', $file
or croak "cannot read from file $file\n";
$self->{filename} = $file;
}
else {
croak "file or filehande $file does not exists";
}
return ( bless $self, $class );
}

=head2 next()
Reads the next record from MAB2 input stream. Returns a Perl hash.
=cut

sub next {
my $self = shift;
local $/ = $END_OF_RECORD;
if ( my $data = $self->{reader}->getline() ) {
$self->{rec_number}++;
my $record = _decode($data);

# get last subfield from 001 as id
my ($id) = map { $_->[-1] } grep { $_->[0] =~ '001' } @{$record};
return { _id => $id, record => $record };
}
return;
}

=head2 _decode()
Deserialize a raw MAB2 record to an array of field arrays.
=cut

sub _decode {
my $reader = shift;
chomp($reader);

my @record;

my @fields = split($END_OF_FIELD, $reader);

my $leader = shift @fields;
if( $leader =~ m/^\N{NUMBER SIGN}{3}\s(\d{5}[cdnpu]M2.0\d{7}\s{6}\w)/xms ){
push( @record, [ 'LDR', '', '_', $1 ] );
}
else{
croak "record leader not valid: $leader";
}

# ToDo: skip faulty fields
foreach my $field (@fields) {
croak "incomplete field: \"$field\"" if length($field) <= 4;
my $tag = substr( $field, 0, 3 );
my $ind = substr( $field, 3, 1 );
my $data = substr( $field, 4 );

# check for a 3-digit numeric tag
( $tag =~ m/^[0-9]{3}$/xms ) or croak "Invalid tag: \"$tag\"";

# check if indicator is an single alphabetic character
( $ind =~ m/^[a-z\s]$/xms ) or croak "Invalid indicator: \"$ind\"";

# check if data contains subfields
if ( $data =~ $SUBFIELD_INDICATOR ) {

# check if data starts with a SUBFIELD_INDICATOR
( substr( $data, 0, 1 ) eq $SUBFIELD_INDICATOR ) or croak "Invalid subfield structure at: \"$tag$ind\"";
my @subfields = split( $SUBFIELD_INDICATOR, substr( $data, 1 ) );
( @subfields ) or croak "no subfield data found: \"$tag$ind$data\"";
push(
@record,
[ $tag,
$ind,
map { substr( $_, 0, 1 ), substr( $_, 1 ) } @subfields
]
);
}
else {
push( @record, [ $tag, $ind, '_', $data ] );
}
}
return \@record;
}

=cut

1; # End of MAB2::Parser::Disk
92 changes: 92 additions & 0 deletions lib/MAB2/Writer/Disk.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package MAB2::Writer::Disk;

#ABSTRACT: MAB2 Diskette format serializer
#VERSION

use strict;
use Moo;
with 'MAB2::Writer::Handle';

use charnames ':full';
use Readonly;

Readonly my $SUBFIELD_INDICATOR => qq{\N{INFORMATION SEPARATOR ONE}};
Readonly my $END_OF_FIELD => qq{\n};
Readonly my $END_OF_RECORD => qq{\n};

=head1 SYNOPSIS
L<MAB2::Writer::Disk> is a MAB2 Diskette serializer.
use MAB2::Writer::Disk;
my @mab_records = (
[
['001', ' ', '_', '2415107-5'],
['331', ' ', '_', 'Code4Lib journal'],
['655', 'e', 'u', 'http://journal.code4lib.org/', 'z', 'kostenfrei'],
...
],
{
record => [
['001', ' ', '_', '2415107-5'],
['331', ' ', '_', 'Code4Lib journal'],
['655', 'e', 'u', 'http://journal.code4lib.org/', 'z', 'kostenfrei'],
...
]
}
);
$writer = MAB2::Writer::Disk->new( fh => $fh );
foreach my $record (@mab_records) {
$writer->write($record);
}
=head1 SUBROUTINES/METHODS
=head2 new()
=cut

sub BUILD {
my ($self) = @_;
}

=head2 _write_record()
=cut

sub _write_record {
my ( $self, $record ) = @_;
my $fh = $self->fh;

if ( $record->[0][0] eq 'LDR' ) {
my $leader = shift( @{$record} );
print $fh "### ", $leader->[3], $END_OF_FIELD;
}
else {
# set default record leader
print $fh "### 99999nM2.01200024 h", $END_OF_FIELD;
}

foreach my $field (@$record) {

if ( $field->[2] eq '_' ) {
print $fh $field->[0], $field->[1], $field->[3], $END_OF_FIELD;
}
else {
print $fh $field->[0], $field->[1];
for ( my $i = 2; $i < scalar @$field; $i += 2 ) {
my $subfield_code = $field->[ $i ];
my $value = $field->[ $i + 1 ];
print $fh $SUBFIELD_INDICATOR, $subfield_code, $value;
}
print $fh $END_OF_FIELD;
}
}
print $fh $END_OF_RECORD;
}

1;
11 changes: 10 additions & 1 deletion t/01-parser.t
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,13 @@ ok($record->{record}->[0][3] eq '02020nM2.01200024 h', 'record leader' );
is_deeply($record->{record}->[1], ['001', ' ', '_', '47918-4'], 'first field');
ok($parser->next()->{_id} eq '54251-9', 'next record');

done_testing();
use MAB2::Parser::Disk;
$parser = MAB2::Parser::Disk->new( './t/mab2disk.dat' );
isa_ok( $parser, 'MAB2::Parser::Disk' );
$record = $parser->next();
ok($record->{_id} eq '47918-4', 'record _id' );
ok($record->{record}->[0][3] eq '02020nM2.01200024 h', 'record leader' );
is_deeply($record->{record}->[1], ['001', ' ', '_', '47918-4'], 'first field');
ok($parser->next()->{_id} eq '54251-9', 'next record');

done_testing();
14 changes: 14 additions & 0 deletions t/02-importer.t
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,18 @@ is_deeply( $records[0]->{'record'}->[0], ['001', ' ', '_', '47918-4'],,
'record field'
);


$importer = Catmandu::Importer::MAB2->new(file => "./t/mab2disk.dat", type=> "Disk");
@records = ();
$importer->each(
sub {
push( @records, $_[0] );
}
);
ok(scalar @records == 20, 'records');
ok( $records[0]->{'_id'} eq '47918-4', 'record _id' );
is_deeply( $records[0]->{'record'}->[0], ['LDR', '', '_', '02020nM2.01200024 h'],,
'record field'
);

done_testing();
29 changes: 27 additions & 2 deletions t/04-exporter.t
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,37 @@ $exporter->commit();

close($fh);

open my $fh_utf8, '<:encoding(UTF-8)', $filename or die $!;
$out = do { local $/; <$fh_utf8> };
open $fh, '<:encoding(UTF-8)', $filename or die $!;
$out = do { local $/; <$fh> };

is $out, <<'MABRAW';
99999nM2.01200024 h001 47918-4310 Daß Ümläüt406bj1983
99999nM2.01200024 h406aj1990k2000
MABRAW

( $fh, $filename ) = tempfile();
$exporter = Catmandu::Exporter::MAB2->new( file => $filename, type => 'DISK' );

for my $record (@mab_records) {
$exporter->add($record);
}

$exporter->commit();

close($fh);

open $fh, '<:encoding(UTF-8)', $filename or die $!;
$out = do { local $/; <$fh> };

is $out, <<'MABDISK';
### 99999nM2.01200024 h
001 47918-4
310 Daß Ümläüt
406bj1983
### 99999nM2.01200024 h
406aj1990k2000
MABDISK

done_testing;
Loading

0 comments on commit 28834f6

Please sign in to comment.