Skip to content

Commit

Permalink
Merge pull request #196 from Kdecherf/jsonld-ignore
Browse files Browse the repository at this point in the history
Rework JsonLd extraction: ignore some objects and some names
  • Loading branch information
j0k3r committed Feb 13, 2019
2 parents c9e85d2 + 879ef69 commit 6c1506f
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 3 deletions.
22 changes: 20 additions & 2 deletions src/Extractor/ContentExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1145,9 +1145,19 @@ private function extractJsonLdInformation($html)
return;
}

$ignoreNames = [];
$candidateNames = [];

foreach ($matches[1] as $matche) {
$data = json_decode(trim($matche), true);

if (isset($data['@type']) && \in_array($data['@type'], ['Organization', 'WebSite', 'Person'], true)) {
if (isset($data['name'])) {
$ignoreNames[] = $data['name'];
}
continue;
}

$this->logger->info('JSON-LD data: {JsonLdData}', ['JsonLdData' => $data]);

// just in case datePublished isn't defined, we use the modified one at first
Expand All @@ -1171,11 +1181,11 @@ private function extractJsonLdInformation($html)
}

if (isset($data['headline'])) {
$this->title = $data['headline'];
$candidateNames[] = $data['headline'];
}

if (isset($data['name'])) {
$this->title = $data['name'];
$candidateNames[] = $data['name'];
}

if (isset($data['author']['name'])) {
Expand All @@ -1190,5 +1200,13 @@ private function extractJsonLdInformation($html)
}
}
}

if (\is_array($candidateNames) && \count($candidateNames) > 0) {
foreach ($candidateNames as $name) {
if (!\in_array($name, $ignoreNames, true)) {
$this->title = $name;
}
}
}
}
}
2 changes: 1 addition & 1 deletion src/SiteConfig/ConfigBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public function getCachedVersion($key)
$key = substr($key, 4);
}

if (array_key_exists($key, $this->cache)) {
if (\array_key_exists($key, $this->cache)) {
return $this->cache[$key];
}

Expand Down
29 changes: 29 additions & 0 deletions tests/Extractor/ContentExtractorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,35 @@ public function testJsonLd()
$this->assertContains('<p>hihi</p>', $content_block->ownerDocument->saveXML($content_block));
}

/**
 * Feeds a page holding a NewsArticle JSON-LD object followed by a standalone
 * Organization JSON-LD object, then checks that the extracted title is the
 * article headline and that the authors include the article author.
 */
public function testJsonLdIgnoreList()
{
    $extractor = new ContentExtractor(self::$contentExtractorConfig);

    // Two JSON-LD scripts: the article itself, then the publishing organization.
    $html = '<html><body><script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "NewsArticle", "publisher": { "@type": "Organization", "name": "Foobar Company" }, "description": "A method for fooling tools", "mainEntityOfPage": { "@type": "WebPage", "@id": "https:\/\/www.example.com/foobar" }, "headline": "The Foobar Company is launching globally", "datePublished": "2019-01-14T16:02:00.000+00:00", "dateModified": "2019-01-14T13:25:09.980+00:00", "author": { "@type": "Person", "name": "Foobar CEO" } }</script> <script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Organization", "name": "Foobar Company", "url": "https:\/\/www.example.com" }</script><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>';

    $processed = $extractor->process($html, 'https://example.com/jsonld');

    $this->assertTrue($processed, 'Extraction went well');

    // The headline wins over the organization name; the article author is kept.
    $this->assertSame('The Foobar Company is launching globally', $extractor->getTitle());
    $this->assertContains('Foobar CEO', $extractor->getAuthors());
}

/**
 * Feeds a page whose Periodical JSON-LD object carries the same name as a
 * standalone Organization JSON-LD object, then checks that the extracted
 * title is the text of the page's <h1> rather than that shared name.
 */
public function testJsonLdIgnoreListWithPeriodical()
{
    $extractor = new ContentExtractor(self::$contentExtractorConfig);

    // The Periodical "name" equals the Organization "name" ("Foobar Company").
    $html = '<html><body><script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Periodical", "publisher": { "@type": "Organization", "name": "Foobar Company" }, "description": "A method for fooling tools", "mainEntityOfPage": { "@type": "WebPage", "@id": "https:\/\/www.example.com/foobar" }, "name": "Foobar Company", "datePublished": "2019-01-14T16:02:00.000+00:00", "dateModified": "2019-01-14T13:25:09.980+00:00", "author": { "@type": "Person", "name": "Foobar CEO" } }</script> <script type="application/ld+json">{ "@context": "http:\/\/schema.org", "@type": "Organization", "name": "Foobar Company", "url": "https:\/\/www.example.com" }</script><h1>Hello world, this is title</h1><p>' . str_repeat('this is the best part of the show', 10) . '</p></body></html>';

    $processed = $extractor->process($html, 'https://example.com/jsonld');

    $this->assertTrue($processed, 'Extraction went well');

    $this->assertSame('Hello world, this is title', $extractor->getTitle());
}

public function testJsonLdSkipper()
{
$contentExtractor = new ContentExtractor(self::$contentExtractorConfig);
Expand Down

0 comments on commit 6c1506f

Please sign in to comment.