Skip to content
This repository has been archived by the owner on Dec 28, 2023. It is now read-only.

Commit

Permalink
Added OLE file checking to identify xlsx and encrypted files.
Browse files Browse the repository at this point in the history
  • Loading branch information
jmcnamara committed Feb 11, 2012
1 parent a486843 commit ee9ec0c
Show file tree
Hide file tree
Showing 20 changed files with 364 additions and 46 deletions.
137 changes: 116 additions & 21 deletions lib/Excel/Reader/XLSX.pm
Expand Up @@ -16,6 +16,7 @@ use strict;
use warnings;
use Exporter;
use Archive::Zip;
use OLE::Storage_Lite;
use File::Temp qw(tempdir);
use Excel::Reader::XLSX::Workbook;
use Excel::Reader::XLSX::Package::ContentTypes;
Expand All @@ -31,18 +32,24 @@ our $VERSION = '0.00';
# Error codes for some common errors.
our $ERROR_none = 0;
our $ERROR_file_not_found = 1;
our $ERROR_file_zip_error = 2;
our $ERROR_file_missing_subfile = 3;
our $ERROR_file_has_no_content_types = 4;
our $ERROR_file_missing_workbook = 5;
our $ERROR_file_is_xls = 2;
our $ERROR_file_is_encrypted = 3;
our $ERROR_file_is_unknown_ole = 4;
our $ERROR_file_zip_error = 5;
our $ERROR_file_missing_subfile = 6;
our $ERROR_file_has_no_content_types = 7;
our $ERROR_file_missing_workbook = 8;

our @error_strings = (
'', # 0
'File not found', # 1
'File has zip error', # 2
'File missing subfile', # 3
'File has no [Content_Types].xml', # 4
'File is missing workbook.xml', # 5
'File is xls not xlsx', # 2
'File is encrypted xlsx', # 3
'File is unknown OLE doc type', # 4
'File has zip error', # 5
'File missing subfile', # 6
'File has no [Content_Types].xml', # 7
'File is missing workbook.xml', # 8
);


Expand Down Expand Up @@ -90,6 +97,14 @@ sub read_file {
return;
}

# Check for xls or encrypted OLE files.
my $ole_file = $self->_check_if_ole_file( $filename );
if ( $ole_file ) {
$self->{_error_status} = $ole_file;
$self->{_error_extra_text} = $filename;
return;
}

# Create a, locally scoped, temp dir to unzip the XLSX file into.
my $tempdir = File::Temp->newdir( DIR => $self->{_tempdir} );

Expand Down Expand Up @@ -130,19 +145,22 @@ sub read_file {
# Read the filenames from the [Content_Types].
my %files = $content_types->_get_files();

# Create a reader object to read the sharedStrings.xml file.

# Check that the files actually exist.
my $files_exist = $self->_check_files_exist( $tempdir, %files );

if ( !$files_exist ) {
$self->{_error_status} = $ERROR_file_missing_subfile;
return;
}


# Create a reader object to read the sharedStrings.xml file.
my $shared_strings = Excel::Reader::XLSX::Package::SharedStrings->new();

# Read the sharedStrings if present. Only files with strings have one.
if ( $files{_shared_strings} ) {

# Check that the file exists even if it is listed in [Content_Types].
if ( !-e $tempdir . $files{_shared_strings} ) {
$self->{_error_status} = $ERROR_file_missing_subfile;
return;
}

$shared_strings->_read_file( $tempdir . $files{_shared_strings} );
$shared_strings->_read_all_nodes();
}
Expand All @@ -155,11 +173,6 @@ sub read_file {

);

# Check that the file exists even if it is listed in [Content_Types].
if ( !-e $tempdir . $files{_workbook} ) {
$self->{_error_status} = $ERROR_file_missing_subfile;
return;
}

# Read data from the workbook.xml file.
$workbook->_read_file( $tempdir . $files{_workbook} );
Expand All @@ -175,6 +188,88 @@ sub read_file {
}


###############################################################################
#
# _check_files_exist()
#
# Verify that the subfiles read from the Content_Types actually exist.
#
sub _check_files_exist {

    # Confirm that every subfile named in the files hash is present on disk.
    #
    # Arguments:
    #   $self    - the reader object; only its _error_extra_text slot is
    #              written, and only on failure.
    #   $tempdir - prefix that is concatenated, as-is, in front of each name.
    #   %files   - key/value pairs whose values are either a single filename
    #              or an array ref of filenames (used for worksheets).
    #
    # Returns 1 when all of the files exist. On the first missing file the
    # name is stored in $self->{_error_extra_text} and a bare return is made
    # (undef in scalar context, empty list in list context).
    my ( $self, $tempdir, %files ) = @_;

    # Flatten the hash values into one list, expanding any array refs.
    my @subfiles = map { ref $_ ? @{$_} : $_ } values %files;

    for my $subfile ( @subfiles ) {
        next if -e $tempdir . $subfile;

        # Record which file was missing so the caller can report it.
        $self->{_error_extra_text} = $subfile;
        return;
    }

    return 1;
}


###############################################################################
#
# _check_if_ole_file()
#
# Check if the file is an OLE compound doc. This can happen in a few cases.
# The first is when the file is xls and not xlsx. The second is when the
# file is an encrypted xlsx file. We also handle the case of unknown OLE
# file types.
#
# Porting note. As a lightweight test you can check for OLE files by looking
# for the magic number 0xD0CF11E0 (docfile0) at the start of the file.
#
sub _check_if_ole_file {

    # Classify a file that may be an OLE compound document.
    #
    # Arguments:
    #   $self     - the reader object (not otherwise used here).
    #   $filename - path of the file to inspect.
    #
    # Returns nothing (bare return) when no PPS tree can be read, i.e. the
    # file isn't treated as OLE. Otherwise returns one of the error codes
    # $ERROR_file_is_xls, $ERROR_file_is_encrypted or
    # $ERROR_file_is_unknown_ole.
    my ( $self, $filename ) = @_;

    my $ole_doc  = OLE::Storage_Lite->new( $filename );
    my $pps_root = $ole_doc->getPpsTree();

    # No PPS tree means that this isn't an OLE file.
    return if !$pps_root;

    # Stream names, directly below the root, that identify the file type.
    my %error_for = (
        'Workbook'         => $ERROR_file_is_xls,          # Excel xls file.
        'Book'             => $ERROR_file_is_xls,          # Excel xls file.
        'EncryptedPackage' => $ERROR_file_is_encrypted,    # Encrypted xlsx.
    );

    # The first recognised child of the root decides the result.
    for my $entry ( @{ $pps_root->{Child} } ) {

        my $stream_name = OLE::Storage_Lite::Ucs2Asc( $entry->{Name} );

        return $error_for{$stream_name} if exists $error_for{$stream_name};
    }

    # It is an OLE file but none of the known streams were found.
    return $ERROR_file_is_unknown_ole;
}




###############################################################################
#
# error().
Expand All @@ -188,7 +283,7 @@ sub error {
my $error = $error_strings[$error_index];

if ($self->{_error_extra_text}) {
$error .= '. ' . $self->{_error_extra_text};
$error .= ': ' . $self->{_error_extra_text};
}

return $error;
Expand Down
12 changes: 11 additions & 1 deletion lib/Excel/Reader/XLSX/Package/ContentTypes.pm
Expand Up @@ -140,6 +140,9 @@ sub _read_node {
my $content_type = $node->getAttribute('ContentType');


# Strip leading directory separator from filename.
$part_name =~ s{^/}{};

if ( $part_name =~ /app\.xml$/ ) {
$self->{_files}->{_app} = $part_name;
return;
Expand All @@ -161,7 +164,14 @@ sub _read_node {
}

if ( $part_name =~ /workbook\.xml$/ ) {
$self->{_files}->{_workbook} = $part_name;

# The workbook.xml.rels file isn't included in the ContentTypes but
# it is usually in the _rels dir at the same level as the workbook.xml.
my $workbook_rels = $part_name;
$workbook_rels =~ s{(workbook.xml)}{_rels/$1.rels};

$self->{_files}->{_workbook} = $part_name;
$self->{_files}->{_workbook_rels} = $workbook_rels;
return;
}

Expand Down
13 changes: 7 additions & 6 deletions t/package/content_types/content_types01.t
Expand Up @@ -34,12 +34,13 @@ $reader->_read_all_nodes();
$caption = " \tContentTypes: _strings";

$expected = {
'_workbook' => '/xl/workbook.xml',
'_app' => '/docProps/app.xml',
'_styles' => '/xl/styles.xml',
'_worksheets' => [ '/xl/worksheets/sheet1.xml' ],
'_core' => '/docProps/core.xml',
'_shared_strings' => '/xl/sharedStrings.xml'
'_workbook' => 'xl/workbook.xml',
'_workbook_rels' => 'xl/_rels/workbook.xml.rels',
'_app' => 'docProps/app.xml',
'_styles' => 'xl/styles.xml',
'_worksheets' => [ 'xl/worksheets/sheet1.xml' ],
'_core' => 'docProps/core.xml',
'_shared_strings' => 'xl/sharedStrings.xml'
};


Expand Down
13 changes: 7 additions & 6 deletions t/package/content_types/content_types02.t
Expand Up @@ -35,12 +35,13 @@ $reader->_read_all_nodes();
$caption = " \tContentTypes: _strings";

$expected = {
'_workbook' => '/xl/workbook.xml',
'_app' => '/docProps/app.xml',
'_styles' => '/xl/styles.xml',
'_worksheets' => [ '/xl/worksheets/sheet1.xml' ],
'_core' => '/docProps/core.xml',
'_shared_strings' => '/xl/sharedStrings.xml'
'_workbook' => 'xl/workbook.xml',
'_workbook_rels' => 'xl/_rels/workbook.xml.rels',
'_app' => 'docProps/app.xml',
'_styles' => 'xl/styles.xml',
'_worksheets' => [ 'xl/worksheets/sheet1.xml' ],
'_core' => 'docProps/core.xml',
'_shared_strings' => 'xl/sharedStrings.xml'
};


Expand Down
20 changes: 11 additions & 9 deletions t/package/content_types/content_types03.t
Expand Up @@ -34,16 +34,18 @@ $reader->_read_all_nodes();
$caption = " \tContentTypes: _strings";

$expected = {
'_workbook' => '/xl/workbook.xml',
'_app' => '/docProps/app.xml',
'_styles' => '/xl/styles.xml',
'_worksheets' => [
'/xl/worksheets/sheet1.xml',
'/xl/worksheets/sheet2.xml',
'/xl/worksheets/sheet3.xml'
'_workbook' => 'xl/workbook.xml',
'_workbook_rels' => 'xl/_rels/workbook.xml.rels',
'_app' => 'docProps/app.xml',
'_styles' => 'xl/styles.xml',
'_worksheets' => [
'xl/worksheets/sheet1.xml',
'xl/worksheets/sheet2.xml',
'xl/worksheets/sheet3.xml'

],
'_core' => '/docProps/core.xml',
'_shared_strings' => '/xl/sharedStrings.xml'
'_core' => 'docProps/core.xml',
'_shared_strings' => 'xl/sharedStrings.xml'
};


Expand Down
2 changes: 1 addition & 1 deletion t/regression/json_files/read_error01.json
Expand Up @@ -4,6 +4,6 @@
"xlsx_file" : "error01.xlsx",
"expected" : {
"error_code" : 1,
"error_text" : "File not found. t/regression/xlsx_files/error01.xlsx"
"error_text" : "File not found: t/regression/xlsx_files/error01.xlsx"
}
}
4 changes: 2 additions & 2 deletions t/regression/json_files/read_error02.json
@@ -1,9 +1,9 @@
{
"caption" : "Test error handling.",
"description" : "Returns a zip error when not a valid zip file.",
"description" : "Returns an error if the file is an xls file.",
"xlsx_file" : "error02.xlsx",
"expected" : {
"error_code" : 2,
"error_text" : "File has zip error. format error: can\'t find EOCD signature "
"error_text" : "File is xls not xlsx: t/regression/xlsx_files/error02.xlsx"
}
}
9 changes: 9 additions & 0 deletions t/regression/json_files/read_error03.json
@@ -0,0 +1,9 @@
{
"caption" : "Test error handling.",
"description" : "Returns an error when the file is encrypted xlsx.",
"xlsx_file" : "error03.xlsx",
"expected" : {
"error_code" : 3,
"error_text" : "File is encrypted xlsx: t/regression/xlsx_files/error03.xlsx"
}
}
9 changes: 9 additions & 0 deletions t/regression/json_files/read_error04.json
@@ -0,0 +1,9 @@
{
"caption" : "Test error handling.",
"description" : "Returns an error if the file is an unknown OLE doc.",
"xlsx_file" : "error04.xlsx",
"expected" : {
"error_code" : 4,
"error_text" : "File is unknown OLE doc type: t/regression/xlsx_files/error04.xlsx"
}
}
9 changes: 9 additions & 0 deletions t/regression/json_files/read_error05.json
@@ -0,0 +1,9 @@
{
"caption" : "Test error handling.",
"description" : "Returns a zip error when not a valid zip file.",
"xlsx_file" : "error05.xlsx",
"expected" : {
"error_code" : 5,
"error_text" : "File has zip error: format error: can\'t find EOCD signature "
}
}
9 changes: 9 additions & 0 deletions t/regression/json_files/read_error06.json
@@ -0,0 +1,9 @@
{
"caption" : "Test error handling.",
"description" : "Returns an error when a subfile is missing.",
"xlsx_file" : "error06.xlsx",
"expected" : {
"error_code" : 6,
"error_text" : "File missing subfile: docProps/core.xml"
}
}
43 changes: 43 additions & 0 deletions t/regression/read_error03.t
@@ -0,0 +1,43 @@
###############################################################################
#
# Tests for Excel::Reader::XLSX.
#
# reverse('©'), February 2012, John McNamara, jmcnamara@cpan.org
#

use strict;
use warnings;

use lib 't/lib';
use TestFunctions qw(_is_deep_diff _read_json);
use Excel::Reader::XLSX;

use Test::More tests => 1;

###############################################################################
#
# Test setup.
#
# The JSON file supplies the caption, the name of the xlsx file to read and
# the error code/text that the reader is expected to report for it.
#
my $test_case = _read_json( 't/regression/json_files/read_error03.json' );
my $xlsx_path = 't/regression/xlsx_files/' . $test_case->{xlsx_file};


###############################################################################
#
# Test error handling when reading data from an Excel file.
#
my $reader = Excel::Reader::XLSX->new();

# The return value isn't checked; only the error state left on the reader is.
my $workbook = $reader->read_file( $xlsx_path );

my $got = {
    error_code => $reader->error_code(),
    error_text => $reader->error(),
};

# Compare the reported error with the expected one.
_is_deep_diff( $got, $test_case->{expected}, $test_case->{caption} );

0 comments on commit ee9ec0c

Please sign in to comment.