-
Notifications
You must be signed in to change notification settings - Fork 476
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support for sitemaps #4261 #5084
Merged
Merged
Changes from all commits
Commits
Show all changes
22 commits
Select commit
Hold shift + click to select a range
d2ccf59
add doc stub for sitemaps #4261
pdurbin afe3d0f
stub out sitemap code and tests #4261
pdurbin 61231b2
write sitemap to docroot #4261
pdurbin 45bde32
serve from /sitemap.xml #4261
pdurbin e436355
add datasets to sitemap #4261
pdurbin 125d163
add test to assert that XML is well formed #4261
pdurbin f3d9b31
validate sitemap against the schema #4261
pdurbin d96f8cb
add dataverses to sitemap #4261
pdurbin 9934fdf
fix test (dv must be published to appear in sitemap) #4261
pdurbin b742ce4
consistent "lastmod" for dataverses and datasets #4261
pdurbin b0c9401
add todo to support more than 50,000 URLs in sitemap #4261
pdurbin 63d7a4f
improve docs, explain what's in sitemap, cron #4261
pdurbin 6eecbf1
Merge branch 'develop' into 4261-sitemap #4261
pdurbin e73595f
Merge branch '5122-fix-netbeans-compat' into 4261-sitemap #4261
pdurbin bd54ba0
add BEGIN and END lines to log #4261
pdurbin c4116a1
explain that logos and sitemaps are written per server #4261
pdurbin b574f27
stage sitemap before writing to final file #4261
pdurbin 3b9bbf1
add validation to main routine, s/copy/move/ #4261
pdurbin 11f6fca
make async, report error if staged file exists #4261
pdurbin d3531c5
Merge branch 'develop' into 4261-sitemap #4261
pdurbin c41fc16
Merge branch 'develop' into 4261-sitemap #4261
pdurbin c80dc43
typo: wrong directory for sitemap was documented #4261
pdurbin File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
package edu.harvard.iq.dataverse.api; | ||
|
||
import edu.harvard.iq.dataverse.sitemap.SiteMapServiceBean; | ||
import edu.harvard.iq.dataverse.sitemap.SiteMapUtil; | ||
import javax.ejb.EJB; | ||
import javax.ejb.Stateless; | ||
import javax.ws.rs.POST; | ||
import javax.ws.rs.Path; | ||
import javax.ws.rs.Produces; | ||
import javax.ws.rs.core.MediaType; | ||
import javax.ws.rs.core.Response; | ||
|
||
/** | ||
 * Admin API resource, mounted at "admin/sitemap", that kicks off a sitemap update. | ||
 */ | ||
@Stateless | ||
@Path("admin/sitemap") | ||
public class SiteMap extends AbstractApiBean { | ||
|
||
@EJB | ||
SiteMapServiceBean siteMapSvc; | ||
|
||
/** | ||
 * Starts a sitemap update and returns without waiting for it to finish | ||
 * (the service bean's updateSiteMap is annotated as asynchronous). | ||
 * | ||
 * @return a BAD_REQUEST error response when a staged sitemap file already | ||
 * exists, otherwise an OK response saying that the update has begun | ||
 */ | ||
@POST | ||
@Produces(MediaType.APPLICATION_JSON) | ||
public Response updateSiteMap() { | ||
// A staged file on disk means an earlier run has not completed (or failed), so refuse to start another. | ||
if (SiteMapUtil.stageFileExists()) { | ||
return error(Response.Status.BAD_REQUEST, "Sitemap cannot be updated because staged file exists."); | ||
} | ||
// NOTE(review): dataverseSvc and datasetSvc are presumably inherited from AbstractApiBean - confirm. | ||
siteMapSvc.updateSiteMap(dataverseSvc.findAll(), datasetSvc.findAll()); | ||
return ok("Sitemap update has begun. Check logs for status."); | ||
} | ||
|
||
} |
17 changes: 17 additions & 0 deletions
17
src/main/java/edu/harvard/iq/dataverse/sitemap/SiteMapServiceBean.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
package edu.harvard.iq.dataverse.sitemap; | ||
|
||
import edu.harvard.iq.dataverse.Dataset; | ||
import edu.harvard.iq.dataverse.Dataverse; | ||
import java.util.List; | ||
import javax.ejb.Asynchronous; | ||
import javax.ejb.Stateless; | ||
|
||
/** | ||
 * Stateless EJB that exposes sitemap generation as an asynchronous operation. | ||
 */ | ||
@Stateless | ||
public class SiteMapServiceBean { | ||
|
||
/** | ||
 * Hands both lists straight to SiteMapUtil.updateSiteMap. The method is | ||
 * annotated with Asynchronous, so callers are not expected to wait for the | ||
 * sitemap to be written; no result is returned to them. | ||
 * | ||
 * @param dataverses dataverses passed through to SiteMapUtil.updateSiteMap | ||
 * @param datasets datasets passed through to SiteMapUtil.updateSiteMap | ||
 */ | ||
@Asynchronous | ||
public void updateSiteMap(List<Dataverse> dataverses, List<Dataset> datasets) { | ||
SiteMapUtil.updateSiteMap(dataverses, datasets); | ||
} | ||
|
||
} |
225 changes: 225 additions & 0 deletions
225
src/main/java/edu/harvard/iq/dataverse/sitemap/SiteMapUtil.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
package edu.harvard.iq.dataverse.sitemap; | ||
|
||
import edu.harvard.iq.dataverse.Dataset; | ||
import edu.harvard.iq.dataverse.Dataverse; | ||
import edu.harvard.iq.dataverse.DvObjectContainer; | ||
import edu.harvard.iq.dataverse.util.SystemConfig; | ||
import edu.harvard.iq.dataverse.util.xml.XmlValidator; | ||
import java.io.File; | ||
import java.io.IOException; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.nio.file.StandardCopyOption; | ||
import java.text.SimpleDateFormat; | ||
import java.util.List; | ||
import java.util.logging.Logger; | ||
import javax.xml.parsers.DocumentBuilder; | ||
import javax.xml.parsers.DocumentBuilderFactory; | ||
import javax.xml.parsers.ParserConfigurationException; | ||
import javax.xml.transform.OutputKeys; | ||
import javax.xml.transform.Transformer; | ||
import javax.xml.transform.TransformerConfigurationException; | ||
import javax.xml.transform.TransformerException; | ||
import javax.xml.transform.TransformerFactory; | ||
import javax.xml.transform.dom.DOMSource; | ||
import javax.xml.transform.stream.StreamResult; | ||
import org.w3c.dom.Document; | ||
import org.w3c.dom.Element; | ||
import org.xml.sax.SAXException; | ||
|
||
public class SiteMapUtil { | ||
|
||
private static final Logger logger = Logger.getLogger(SiteMapUtil.class.getCanonicalName()); | ||
|
||
static final String SITEMAP_FILENAME_FINAL = "sitemap.xml"; | ||
static final String SITEMAP_FILENAME_STAGED = "sitemap.xml.staged"; | ||
|
||
/** | ||
 * Regenerates the sitemap from the given dataverses and datasets. | ||
 * | ||
 * The XML is written to a staged file, checked for well-formedness, validated | ||
 * against the sitemaps.org schema and only then moved over the final file. Each | ||
 * handled failure is logged as a warning and ends the run. If a staged file from | ||
 * a previous run is already present, nothing is written. | ||
 * | ||
 * Unreleased dataverses are left out, as are datasets that are unreleased, | ||
 * harvested or deaccessioned. | ||
 * | ||
 * TODO: Handle more than 50,000 entries in the sitemap. | ||
 * | ||
 * (As of this writing Harvard Dataverse only has ~3000 dataverses and | ||
 * ~30,000 datasets.) | ||
 * | ||
 * "each Sitemap file that you provide must have no more than 50,000 URLs" | ||
 * https://www.sitemaps.org/protocol.html | ||
 * | ||
 * Consider using a third party library: "One sitemap can contain a maximum | ||
 * of 50,000 URLs. (Some sitemaps, like Google News sitemaps, can contain | ||
 * only 1,000 URLs.) If you need to put more URLs than that in a sitemap, | ||
 * you'll have to use a sitemap index file. Fortunately, WebSitemapGenerator | ||
 * can manage the whole thing for you." | ||
 * https://github.com/dfabulich/sitemapgen4j | ||
 * | ||
 * @param dataverses dataverses to consider for inclusion | ||
 * @param datasets datasets to consider for inclusion | ||
 */ | ||
public static void updateSiteMap(List<Dataverse> dataverses, List<Dataset> datasets) { | ||
|
||
logger.info("BEGIN updateSiteMap"); | ||
|
||
String sitemapPathString = getSitemapPathString(); | ||
String stagedSitemapPathAndFileString = sitemapPathString + File.separator + SITEMAP_FILENAME_STAGED; | ||
String finalSitemapPathAndFileString = sitemapPathString + File.separator + SITEMAP_FILENAME_FINAL; | ||
|
||
// A leftover staged file means a previous run did not finish cleanly; it must be removed by hand. | ||
Path stagedPath = Paths.get(stagedSitemapPathAndFileString); | ||
if (Files.exists(stagedPath)) { | ||
logger.warning("Unable to update sitemap! The staged file from a previous run already existed. Delete " + stagedSitemapPathAndFileString + " and try again."); | ||
return; | ||
} | ||
|
||
DocumentBuilderFactory documentBuilderFactory = DocumentBuilderFactory.newInstance(); | ||
DocumentBuilder documentBuilder = null; | ||
try { | ||
documentBuilder = documentBuilderFactory.newDocumentBuilder(); | ||
} catch (ParserConfigurationException ex) { | ||
logger.warning("Unable to update sitemap! ParserConfigurationException: " + ex.getLocalizedMessage()); | ||
return; | ||
} | ||
Document document = documentBuilder.newDocument(); | ||
|
||
Element urlSet = document.createElement("urlset"); | ||
urlSet.setAttribute("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9"); | ||
urlSet.setAttribute("xmlns:xhtml", "http://www.w3.org/1999/xhtml"); | ||
document.appendChild(urlSet); | ||
|
||
for (Dataverse dataverse : dataverses) { | ||
if (!dataverse.isReleased()) { | ||
continue; | ||
} | ||
String dataverseAlias = dataverse.getAlias(); | ||
String location = SystemConfig.getDataverseSiteUrlStatic() + "/dataverse/" + dataverseAlias; | ||
appendUrlElement(document, urlSet, location, getLastModDate(dataverse)); | ||
} | ||
|
||
for (Dataset dataset : datasets) { | ||
if (!dataset.isReleased()) { | ||
continue; | ||
} | ||
if (dataset.isHarvested()) { | ||
continue; | ||
} | ||
// The deaccessioned check is last because it has to iterate through dataset versions. | ||
if (dataset.isDeaccessioned()) { | ||
continue; | ||
} | ||
String datasetPid = dataset.getGlobalId().asString(); | ||
String location = SystemConfig.getDataverseSiteUrlStatic() + "/dataset.xhtml?persistentId=" + datasetPid; | ||
appendUrlElement(document, urlSet, location, getLastModDate(dataset)); | ||
} | ||
|
||
TransformerFactory transformerFactory = TransformerFactory.newInstance(); | ||
Transformer transformer = null; | ||
try { | ||
transformer = transformerFactory.newTransformer(); | ||
} catch (TransformerConfigurationException ex) { | ||
logger.warning("Unable to update sitemap! TransformerConfigurationException: " + ex.getLocalizedMessage()); | ||
return; | ||
} | ||
transformer.setOutputProperty(OutputKeys.INDENT, "yes"); | ||
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "2"); | ||
DOMSource source = new DOMSource(document); | ||
File directory = new File(sitemapPathString); | ||
// Create missing parent directories too, and stop early with a clear message if that fails. | ||
// The trailing isDirectory() check tolerates another process creating the directory concurrently. | ||
if (!directory.exists() && !directory.mkdirs() && !directory.isDirectory()) { | ||
logger.warning("Unable to update sitemap! Unable to create directory " + sitemapPathString); | ||
return; | ||
} | ||
|
||
boolean debug = false; | ||
if (debug) { | ||
logger.info("Writing sitemap to console/logs"); | ||
StreamResult consoleResult = new StreamResult(System.out); | ||
try { | ||
transformer.transform(source, consoleResult); | ||
} catch (TransformerException ex) { | ||
logger.warning("Unable to print sitemap to the console: " + ex.getLocalizedMessage()); | ||
} | ||
} | ||
|
||
logger.info("Writing staged sitemap to " + stagedSitemapPathAndFileString); | ||
StreamResult result = new StreamResult(new File(stagedSitemapPathAndFileString)); | ||
try { | ||
transformer.transform(source, result); | ||
} catch (TransformerException ex) { | ||
logger.warning("Unable to update sitemap! Unable to write staged sitemap to " + stagedSitemapPathAndFileString + ". TransformerException: " + ex.getLocalizedMessage()); | ||
return; | ||
} | ||
|
||
logger.info("Checking staged sitemap for well-formedness. The staged file is " + stagedSitemapPathAndFileString); | ||
try { | ||
XmlValidator.validateXmlWellFormed(stagedSitemapPathAndFileString); | ||
} catch (Exception ex) { | ||
logger.warning("Unable to update sitemap! Staged sitemap file is not well-formed XML! The exception for " + stagedSitemapPathAndFileString + " is " + ex.getLocalizedMessage()); | ||
return; | ||
} | ||
|
||
logger.info("Checking staged sitemap against XML schema. The staged file is " + stagedSitemapPathAndFileString); | ||
URL schemaUrl; | ||
try { | ||
schemaUrl = new URL("https://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"); | ||
} catch (MalformedURLException ex) { | ||
// The URL is hard coded, so this should never happen; log it rather than validate against a null schema. | ||
logger.warning("Unable to update sitemap! Malformed XML schema URL: " + ex.getLocalizedMessage()); | ||
return; | ||
} | ||
try { | ||
XmlValidator.validateXmlSchema(stagedSitemapPathAndFileString, schemaUrl); | ||
} catch (SAXException | IOException ex) { | ||
logger.warning("Unable to update sitemap! Exception caught while checking XML staged file (" + stagedSitemapPathAndFileString + " ) against XML schema: " + ex.getLocalizedMessage()); | ||
return; | ||
} | ||
|
||
// Only a validated staged file replaces the published sitemap. | ||
Path finalPath = Paths.get(finalSitemapPathAndFileString); | ||
logger.info("Moving staged sitemap from " + stagedSitemapPathAndFileString + " to " + finalSitemapPathAndFileString); | ||
try { | ||
Files.move(stagedPath, finalPath, StandardCopyOption.REPLACE_EXISTING); | ||
} catch (IOException ex) { | ||
logger.warning("Unable to update sitemap! Unable to move staged sitemap from " + stagedSitemapPathAndFileString + " to " + finalSitemapPathAndFileString + ". IOException: " + ex.getLocalizedMessage()); | ||
return; | ||
} | ||
|
||
logger.info("END updateSiteMap"); | ||
} | ||
|
||
/** | ||
 * Appends one "url" entry, with "loc" and "lastmod" children, to the given "urlset" element. | ||
 */ | ||
private static void appendUrlElement(Document document, Element urlSet, String location, String lastModified) { | ||
Element url = document.createElement("url"); | ||
urlSet.appendChild(url); | ||
|
||
Element loc = document.createElement("loc"); | ||
loc.appendChild(document.createTextNode(location)); | ||
url.appendChild(loc); | ||
|
||
Element lastmod = document.createElement("lastmod"); | ||
lastmod.appendChild(document.createTextNode(lastModified)); | ||
url.appendChild(lastmod); | ||
} | ||
|
||
/** | ||
 * Renders the modification time of the given dataverse or dataset as a | ||
 * "yyyy-MM-dd" string for use in a sitemap "lastmod" element. | ||
 */ | ||
private static String getLastModDate(DvObjectContainer dvObjectContainer) { | ||
// TODO: Decide if YYYY-MM-DD is enough. https://www.sitemaps.org/protocol.html | ||
// says "The date of last modification of the file. This date should be in W3C Datetime format. | ||
// This format allows you to omit the time portion, if desired, and use YYYY-MM-DD." | ||
// A fresh formatter per call, because SimpleDateFormat instances are not safe to share between threads. | ||
SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd"); | ||
return dayFormat.format(dvObjectContainer.getModificationTime()); | ||
} | ||
|
||
public static boolean stageFileExists() { | ||
String sitemapPathString = getSitemapPathString(); | ||
String stagedSitemapPathAndFileString = sitemapPathString + File.separator + SITEMAP_FILENAME_STAGED; | ||
Path stagedPath = Paths.get(stagedSitemapPathAndFileString); | ||
if (Files.exists(stagedPath)) { | ||
logger.warning("Unable to update sitemap! The staged file from a previous run already existed. Delete " + stagedSitemapPathAndFileString + " and try again."); | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
/** | ||
 * Returns the directory that sitemap files are written to: the "sitemap" | ||
 * directory under the docroot of the instance root named by the | ||
 * "com.sun.aas.instanceRoot" system property, or "/tmp" when that property | ||
 * is not set. | ||
 */ | ||
private static String getSitemapPathString() { | ||
// i.e. /usr/local/glassfish4/glassfish/domains/domain1 | ||
String domainRoot = System.getProperty("com.sun.aas.instanceRoot"); | ||
if (domainRoot == null) { | ||
return "/tmp"; | ||
} | ||
// Note that we write to a directory called "sitemap" but we serve just "/sitemap.xml" using PrettyFaces. | ||
return domainRoot + File.separator + "docroot" + File.separator + "sitemap"; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package edu.harvard.iq.dataverse.api; | ||
|
||
import com.jayway.restassured.RestAssured; | ||
import org.junit.BeforeClass; | ||
import org.junit.Test; | ||
import com.jayway.restassured.response.Response; | ||
|
||
/** | ||
 * Integration test that calls the sitemap update and download helpers in UtilIT. | ||
 */ | ||
public class SiteMapIT { | ||
|
||
@BeforeClass | ||
public static void setUpClass() { | ||
RestAssured.baseURI = UtilIT.getRestAssuredBaseUri(); | ||
} | ||
|
||
/** | ||
 * Smoke test: triggers a sitemap update, then downloads the sitemap, and | ||
 * prints both responses. No assertions are made on either response. | ||
 */ | ||
@Test | ||
public void testSiteMap() { | ||
UtilIT.sitemapUpdate().prettyPrint(); | ||
UtilIT.sitemapDownload().prettyPrint(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems like this generation step could benefit from schema validation via DocumentBuilderFactory.setSchema(schema)?