/
SplitIntoParagraphs.php
179 lines (163 loc) · 4.93 KB
/
SplitIntoParagraphs.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
<?php
namespace Drupal\atom_migrate\Plugin\migrate\process;
use DOMDocument;
use DOMXpath;
use Drupal\Core\Language\Language;
use Drupal\media\Entity\Media;
use Drupal\migrate\MigrateExecutableInterface;
use Drupal\migrate\ProcessPluginBase;
use Drupal\migrate\Row;
use Drupal\paragraphs\Entity\Paragraph;
/**
* Split an HTML blob into paragraphs.
*
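* This version assumes that the HTML is a sequence of top-level elements,
* typically <p>. Each top-level node that contains an <img> element becomes a
* Media paragraph (only its first image is used). Consecutive nodes without an
* image are concatenated into a single Text Area paragraph.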
*
* Return an array of arrays. The inner arrays are keyed by 'target_id' and
* 'target_revision_id', suitable for passing into a Paragraph field.
*
* Example:
*
* @code
* process:
* field_paragraph:
* plugin: split_into_paragraphs
* source: html_blob
* @endcode
*
* @MigrateProcessPlugin(
* id = "split_into_paragraphs"
* )
*/
class SplitIntoParagraphs extends ProcessPluginBase {

  /**
   * Options for DOMDocument::loadHTML().
   *
   * Do not add a DTD when loading HTML into DOMDocument.
   *
   * @var int
   */
  const DOM_OPTIONS = LIBXML_HTML_NODEFDTD;

  /**
   * {@inheritdoc}
   *
   * Walks the top-level nodes of the HTML in $value. Runs of nodes without an
   * <img> are merged into one Text Area paragraph; each node containing an
   * <img> becomes its own Media paragraph.
   *
   * @return array
   *   A list of arrays keyed by 'target_id' and 'target_revision_id'. Empty
   *   when $value is not a string, is blank, or yields no <body> element.
   */
  public function transform($value, MigrateExecutableInterface $migrate_executable, Row $row, $destination_property) {
    // DOMDocument::loadHTML() only accepts non-empty strings, so there is
    // nothing to split for NULL, non-string, or blank values.
    if (!is_string($value) || trim($value) === '') {
      return [];
    }

    $post = static::loadHtmlFragment($value);
    $body = $post->getElementsByTagName('body')->item(0);
    if ($body === NULL) {
      // The parser did not produce a <body> element; no nodes to walk.
      return [];
    }

    $paragraphs = [];
    // Accumulates the markup of consecutive nodes that contain no image.
    $current = '';
    foreach ($body->childNodes as $child) {
      $fragment = $post->saveHTML($child);
      if (!static::hasDescendantTag($fragment, 'img')) {
        $current .= $fragment;
        continue;
      }
      // The node contains an image: flush the pending text first so that the
      // output keeps the document order. Pending text made only of whitespace
      // (e.g. newlines between elements) is discarded instead of becoming an
      // empty-looking paragraph.
      if (trim($current) !== '') {
        $paragraphs[] = static::createTextParagraph($current);
      }
      $current = '';
      // Assume there is nothing else interesting in the node besides the
      // image.
      $paragraphs[] = static::createMediaParagraph($fragment);
    }
    if (trim($current) !== '') {
      $paragraphs[] = static::createTextParagraph($current);
    }
    return $paragraphs;
  }

  /**
   * Parse an HTML fragment into a DOMDocument, treating the input as UTF-8.
   *
   * DOMDocument::loadHTML() assumes ISO-8859-1 when the markup does not
   * declare an encoding, which corrupts non-ASCII text. The XML declaration
   * prefix tells the parser that the fragment is UTF-8. Parser diagnostics
   * (for example about tags libxml does not know) are collected internally and
   * cleared so that they do not surface as PHP warnings.
   *
   * @param string $html
   *   The HTML fragment to parse.
   *
   * @return \DOMDocument
   *   The parsed document.
   */
  protected static function loadHtmlFragment($html) {
    $dom = new DOMDocument();
    $previous = libxml_use_internal_errors(TRUE);
    $dom->loadHTML('<?xml encoding="UTF-8">' . $html, static::DOM_OPTIONS);
    libxml_clear_errors();
    // Restore whatever error-handling mode the caller had.
    libxml_use_internal_errors($previous);
    return $dom;
  }

  /**
   * Check whether an HTML blob includes a given tag.
   *
   * The blob is re-parsed rather than queried with DOMXpath: the original
   * author noted that the XPath approach did not work here.
   *
   * @param string $blob
   *   The HTML string to check.
   * @param string $tag
   *   The tag name.
   *
   * @return bool
   *   TRUE if the blob contains at least one element named $tag. FALSE
   *   otherwise, including when the blob is blank or not a string.
   */
  protected static function hasDescendantTag($blob, $tag) {
    if (!is_string($blob) || trim($blob) === '') {
      return FALSE;
    }
    $nodes = static::loadHtmlFragment($blob)->getElementsByTagName($tag);
    return $nodes->length > 0;
  }

  /**
   * Create a Text Area paragraph.
   *
   * @param string $blob
   *   The HTML string to use as the main text field.
   *
   * @return int[]
   *   An array of entity/revision IDs keyed by 'target_id' and
   *   'target_revision_id'.
   */
  protected static function createTextParagraph($blob) {
    $paragraph = Paragraph::create([
      'type' => 'text_area',
      'field_text_area' => [
        'value' => $blob,
        'format' => 'basic_html',
      ],
    ]);
    $paragraph->save();
    return [
      'target_id' => $paragraph->id(),
      'target_revision_id' => $paragraph->getRevisionId(),
    ];
  }

  /**
   * Create a Media paragraph.
   *
   * Save the file locally, creating a File entity of type image.
   * Create a Media entity of type image referencing that File entity.
   * Create a Media paragraph referencing that Media entity.
   *
   * If the image cannot be stored (no usable src attribute, the destination
   * directory cannot be created, or the download fails), the blob is saved as
   * a Text Area paragraph instead, so the original markup is not lost.
   *
   * @param string $blob
   *   The HTML string to process. It should contain an <img> element. If there
   *   is more than one, only the first is used.
   *
   * @return int[]
   *   An array of entity/revision IDs keyed by 'target_id' and
   *   'target_revision_id'.
   */
  protected static function createMediaParagraph($blob) {
    $dom = static::loadHtmlFragment($blob);
    $node = $dom->getElementsByTagName('img')->item(0);
    $src = ($node === NULL) ? '' : trim($node->getAttribute('src'));
    if ($src === '') {
      return static::createTextParagraph($blob);
    }
    $alt_text = $node->getAttribute('alt');

    // Download and save the file. The directory has to exist beforehand:
    // system_retrieve_file() only appends the remote file name when the
    // destination is an existing directory, otherwise it uses the destination
    // itself as the file name.
    $destination = 'public://' . date('Y-m');
    if (!is_dir($destination) && !mkdir($destination, 0775, TRUE) && !is_dir($destination)) {
      return static::createTextParagraph($blob);
    }
    $file = system_retrieve_file($src, $destination, TRUE);
    if (!$file) {
      // system_retrieve_file() returns FALSE on failure.
      return static::createTextParagraph($blob);
    }

    // Create the Media entity.
    $media = Media::create([
      'bundle' => 'image',
      // NOTE(review): hard-coded owner; confirm that user 9 exists on the
      // target site.
      'uid' => '9',
      'field_media_image' => [
        'target_id' => $file->id(),
        'alt' => $alt_text,
      ],
    ]);
    $media->save();

    // Create the Media paragraph entity.
    $paragraph = Paragraph::create([
      'type' => 'media',
      'field_media' => $media->id(),
    ]);
    $paragraph->save();
    return [
      'target_id' => $paragraph->id(),
      'target_revision_id' => $paragraph->getRevisionId(),
    ];
  }

}