Skip to content

Commit

Permalink
Dev: testing different methods (array, SPL array) of loading shared s…
Browse files Browse the repository at this point in the history
…trings, because the default one (files) is too slow on large files.
  • Loading branch information
kewlar committed Jul 6, 2015
1 parent 3edb056 commit bc72e2c
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 14 deletions.
15 changes: 9 additions & 6 deletions src/Spout/Reader/AbstractReader.php
Expand Up @@ -33,9 +33,11 @@ abstract class AbstractReader implements ReaderInterface
* Opens the file at the given file path to make it ready to be read
*
* @param string $filePath Path of the file to be read
* @param int $readerMethod Shared strings storage strategy to use. See SharedStringsHelper::USE_* constants.
*
* @return void
*/
abstract protected function openReader($filePath);
abstract protected function openReader($filePath, $readerMethod);

/**
* Reads and returns next row if available.
Expand Down Expand Up @@ -65,11 +67,12 @@ public function setGlobalFunctionsHelper($globalFunctionsHelper)
* Prepares the reader to read the given file. It also makes sure
* that the file exists and is readable.
*
* @param string $filePath Path of the file to be read
* @return void
* @throws \Box\Spout\Common\Exception\IOException If the file at the given path does not exist, is not readable or is corrupted
* @param string $filePath Path of the file to be read
* @param int|null $readerMethod Shared strings storage strategy to use. See SharedStringsHelper::USE_* constants. When null, file-based storage is used.
*
* @throws IOException If the file at the given path does not exist, is not readable or is corrupted
*/
public function open($filePath)
public function open($filePath, $readerMethod = null)
{
if (!$this->isPhpStream($filePath)) {
// we skip the checks if the provided file path points to a PHP stream
Expand All @@ -84,7 +87,7 @@ public function open($filePath)
$this->hasReachedEndOfFile = false;

try {
$this->openReader($filePath);
$this->openReader($filePath, $readerMethod);
$this->isStreamOpened = true;
} catch (\Exception $exception) {
throw new IOException('Could not open ' . $filePath . ' for reading! (' . $exception->getMessage() . ')');
Expand Down
56 changes: 50 additions & 6 deletions src/Spout/Reader/Helper/XLSX/SharedStringsHelper.php
Expand Up @@ -34,6 +34,10 @@ class SharedStringsHelper
/** Value to use to escape the line feed character ("\n") */
const ESCAPED_LINE_FEED_CHARACTER = '_x000A_';

/** Storage strategy: shared strings are written to temporary files (default strategy) */
const USE_FILES = 0;
/** Storage strategy: shared strings are kept in memory, in a regular PHP array */
const USE_ARRAY = 1;
/** Storage strategy: shared strings are kept in memory, in an \SplFixedArray */
const USE_SPL_ARRAY = 2;

/** @var string Path of the XLSX file being read */
protected $filePath;

Expand All @@ -58,17 +62,22 @@ class SharedStringsHelper
*/
protected $inMemoryTempFileContents;

/** @var int Storage strategy in use for the shared strings; one of the self::USE_* constants */
protected $stringStorageMethod = self::USE_FILES;
/** @var array|\SplFixedArray Unescaped shared strings, keyed by shared string index (only filled by the in-memory strategies) */
protected $inMemoryStrings = [];
/** @var int Number of entries in $inMemoryStrings, computed once extraction is done; used for bounds checking on lookup */
protected $inMemoryStringCount = 0;

/**
* @param string $filePath Path of the XLSX file being read
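* @param string|null $tempFolder Temporary folder where the temporary files to store shared strings will be stored
* @param int $stringStorageMethod Shared strings storage strategy to use. See the USE_* constants. Defaults to USE_FILES.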
*/
public function __construct($filePath, $tempFolder = null)
public function __construct($filePath, $tempFolder = null, $stringStorageMethod = self::USE_FILES)
{
$this->filePath = $filePath;

// Fall back to the system temp directory when no (or an empty) temp folder is given
$rootTempFolder = ($tempFolder) ?: sys_get_temp_dir();
$this->fileSystemHelper = new FileSystemHelper($rootTempFolder);
// A dedicated sub-folder is created for this instance, regardless of the storage strategy selected
$this->tempFolder = $this->fileSystemHelper->createFolder($rootTempFolder, uniqid('sharedstrings'));
// Any falsy value (null, 0, ...) selects file-based storage, as USE_FILES is 0.
// NOTE(review): the value is not cast to int; a numeric string would fail the strict (===) check
// in extractSharedStrings() while still matching its switch cases - confirm callers pass ints.
$this->stringStorageMethod = $stringStorageMethod ?: self::USE_FILES;
}

/**
Expand Down Expand Up @@ -120,6 +129,10 @@ public function extractSharedStrings()
// do nothing until a 'si' tag is reached
}

if ($this->stringStorageMethod === self::USE_SPL_ARRAY) {
$this->inMemoryStrings = new \SplFixedArray();
}

while ($xmlReader->name === 'si') {
$node = new \SimpleXMLElement($xmlReader->readOuterXml());
$node->registerXPathNamespace('ns', self::MAIN_NAMESPACE_FOR_SHARED_STRINGS_XML);
Expand All @@ -141,11 +154,22 @@ public function extractSharedStrings()

$unescapedTextValue = $escaper->unescape($textValue);

// The shared string retrieval logic expects each cell data to be on one line only
// Encoding the line feed character allows to preserve this assumption
$lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue);

$this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex);
switch ($this->stringStorageMethod) {
case self::USE_ARRAY:
$this->inMemoryStrings[$sharedStringIndex] = $unescapedTextValue;
break;
case self::USE_SPL_ARRAY:
$this->inMemoryStrings->setSize($sharedStringIndex + 1);
$this->inMemoryStrings[$sharedStringIndex] = $unescapedTextValue;
break;
case self::USE_FILES:
default:
// The shared string retrieval logic expects each cell data to be on one line only
// Encoding the line feed character allows to preserve this assumption
$lineFeedEncodedTextValue = $this->escapeLineFeed($unescapedTextValue);
$this->writeSharedStringToTempFile($lineFeedEncodedTextValue, $sharedStringIndex);
break;
}

$sharedStringIndex++;

Expand All @@ -158,6 +182,15 @@ public function extractSharedStrings()
fclose($this->tempFilePointer);
}

switch ($this->stringStorageMethod) {
case self::USE_ARRAY:
$this->inMemoryStringCount = count($this->inMemoryStrings);
break;
case self::USE_SPL_ARRAY:
$this->inMemoryStringCount = $this->inMemoryStrings->getSize();
break;
}

$xmlReader->close();
}

Expand Down Expand Up @@ -262,6 +295,17 @@ protected function getSharedStringTempFilePath($sharedStringIndex)
*/
public function getStringAtIndex($sharedStringIndex)
{
switch ($this->stringStorageMethod) {
case self::USE_ARRAY:
case self::USE_SPL_ARRAY:
if ($sharedStringIndex < $this->inMemoryStringCount) {
return $this->inMemoryStrings[$sharedStringIndex];
}

throw new SharedStringNotFoundException("Shared string not found for index: $sharedStringIndex");
break;
}

$tempFilePath = $this->getSharedStringTempFilePath($sharedStringIndex);
$indexInFile = $sharedStringIndex % self::MAX_NUM_STRINGS_PER_TEMP_FILE;

Expand Down
6 changes: 4 additions & 2 deletions src/Spout/Reader/XLSX.php
Expand Up @@ -70,17 +70,19 @@ public function setTempFolder($tempFolder)
* and fetches all the available worksheets.
*
* @param string $filePath Path of the file to be read
* @param int $readerMethod Shared strings storage strategy to use. See SharedStringsHelper::USE_* constants.
*
* @return void
* @throws \Box\Spout\Common\Exception\IOException If the file at the given path or its content cannot be read
* @throws Exception\NoWorksheetsFoundException If there are no worksheets in the file
*/
protected function openReader($filePath)
protected function openReader($filePath, $readerMethod)
{
$this->filePath = $filePath;
$this->zip = new \ZipArchive();

if ($this->zip->open($filePath) === true) {
$this->sharedStringsHelper = new SharedStringsHelper($filePath, $this->tempFolder);
$this->sharedStringsHelper = new SharedStringsHelper($filePath, $this->tempFolder, $readerMethod);

if ($this->sharedStringsHelper->hasSharedStrings()) {
// Extracts all the strings from the worksheets for easy access in the future
Expand Down

0 comments on commit bc72e2c

Please sign in to comment.