Skip to content

Commit

Permalink
Merge ef92564 into 5486030
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jun 2, 2021
2 parents 5486030 + ef92564 commit 5cc2b1f
Show file tree
Hide file tree
Showing 6 changed files with 592 additions and 363 deletions.
Expand Up @@ -105,6 +105,37 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
return toTEIHeader(biblio, SchemaDeclaration.XSD, defaultPublicationStatement, bds, config);
}

/**
 * Renders the numeric components of a date as {@code YYYY}, {@code YYYY-MM} or
 * {@code YYYY-MM-DD}, depending on which components are available.
 *
 * A component equal to {@code -1} is treated as absent. The month is only written
 * when the year is present, and the day only when the month is present. The year is
 * left-padded with zeros up to four characters, month and day up to two.
 *
 * @param date the date whose year, month and day values are read
 * @return the formatted string, or an empty string when the year is absent
 */
public static String toISOString(Date date) {
    final int yearValue = date.getYear();
    final int monthValue = date.getMonth();
    final int dayValue = date.getDay();

    // Without a year nothing can be emitted at all.
    if (yearValue == -1) {
        return "";
    }

    final StringBuilder iso = new StringBuilder();

    // Zero padding chosen by magnitude thresholds, so the year is at least four characters wide.
    final String yearPadding;
    if (yearValue <= 9) {
        yearPadding = "000";
    } else if (yearValue <= 99) {
        yearPadding = "00";
    } else if (yearValue <= 999) {
        yearPadding = "0";
    } else {
        yearPadding = "";
    }
    iso.append(yearPadding).append(yearValue);

    // The day is only meaningful when a month precedes it.
    if (monthValue == -1) {
        return iso.toString();
    }
    iso.append(monthValue <= 9 ? "-0" : "-").append(monthValue);

    if (dayValue != -1) {
        iso.append(dayValue <= 9 ? "-0" : "-").append(dayValue);
    }
    return iso.toString();
}

public StringBuilder toTEIHeader(BiblioItem biblio,
SchemaDeclaration schemaDeclaration,
String defaultPublicationStatement,
Expand Down Expand Up @@ -188,36 +219,15 @@ public StringBuilder toTEIHeader(BiblioItem biblio,

if (biblio.getNormalizedPublicationDate() != null) {
Date date = biblio.getNormalizedPublicationDate();
int year = date.getYear();
int month = date.getMonth();
int day = date.getDay();

String when = "";
if (year != -1) {
if (year <= 9)
when += "000" + year;
else if (year <= 99)
when += "00" + year;
else if (year <= 999)
when += "0" + year;
else
when += year;
if (month != -1) {
if (month <= 9)
when += "-0" + month;
else
when += "-" + month;
if (day != -1) {
if (day <= 9)
when += "-0" + day;
else
when += "-" + day;
}
}
String when = toISOString(date);
if (StringUtils.isNotBlank(when)) {
tei.append("\t\t\t\t<date type=\"published\" when=\"");
tei.append(when + "\">");
} else
tei.append(when).append("\">");
} else {
tei.append("\t\t\t\t<date>");
}

if (biblio.getPublicationDate() != null) {
tei.append(TextUtilities.HTMLEncode(biblio.getPublicationDate()));
} else {
Expand Down Expand Up @@ -516,32 +526,9 @@ else if (meeting != null) {

if (biblio.getNormalizedPublicationDate() != null) {
Date date = biblio.getNormalizedPublicationDate();
int year = date.getYear();
int month = date.getMonth();
int day = date.getDay();

String when = "";
if (year != -1) {
if (year <= 9)
when += "000" + year;
else if (year <= 99)
when += "00" + year;
else if (year <= 999)
when += "0" + year;
else
when += year;
if (month != -1) {
if (month <= 9)
when += "-0" + month;
else
when += "-" + month;
if (day != -1) {
if (day <= 9)
when += "-0" + day;
else
when += "-" + day;
}
}
String when = toISOString(date);
if (StringUtils.isNotBlank(when)) {
if (biblio.getPublicationDate() != null) {
tei.append("\t\t\t\t\t\t\t<date type=\"published\" when=\"");
tei.append(when + "\">");
Expand Down
133 changes: 95 additions & 38 deletions grobid-core/src/main/java/org/grobid/core/engines/DateParser.java
@@ -1,18 +1,23 @@
package org.grobid.core.engines;

import org.apache.commons.lang3.StringUtils;
import org.grobid.core.GrobidModel;
import org.grobid.core.GrobidModels;
import org.grobid.core.data.Date;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeaturesVectorDate;
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.lang.Language;
import org.grobid.core.utilities.TextUtilities;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.commons.lang3.StringUtils.isNotBlank;

/**
* @author Patrice Lopez
*/
Expand All @@ -22,6 +27,10 @@ public DateParser() {
super(GrobidModels.DATE);
}

/**
 * Package-private constructor that forwards the supplied model to the superclass
 * constructor, instead of the fixed {@code GrobidModels.DATE} used by the public
 * no-argument constructor.
 *
 * NOTE(review): presumably intended for tests or alternative models; confirm against callers.
 *
 * @param model the model handed to the superclass constructor
 */
DateParser(GrobidModel model) {
super(model);
}

/**
* Processing of dates in header
*/
Expand Down Expand Up @@ -61,8 +70,8 @@ public List<Date> processing(String input) {
if (date.isNotNull()) {
if (dates == null)
dates = new ArrayList<Date>();
normalize(date);
dates.add(date);
Date normalisedDate = normalize(date);
dates.add(normalisedDate);
}
date = new Date();
continue;
Expand All @@ -82,17 +91,17 @@ public List<Date> processing(String input) {
i++;
}

if (s1.equals("<year>") || s1.equals("I-<year>")) {
if ("<year>".equals(s1) || "I-<year>".equals(s1)) {
if (date.getYearString() != null) {
if ((s1.equals("I-<year>")) ||
(!s1.equals(lastTag) && !lastTag.equals("I-<year>"))
(!s1.equals(lastTag) && !"I-<year>".equals(lastTag))
) {
// new date
if (date.isNotNull()) {
if (dates == null)
dates = new ArrayList<Date>();
normalize(date);
dates.add(date);
Date normalisedDate = normalize(date);
dates.add(normalisedDate);
}

date = new Date();
Expand All @@ -109,17 +118,17 @@ else if ((date.getYearString().charAt(date.getYearString().length() - 1) == '-')
} else {
date.setYearString(s2);
}
} else if (s1.equals("<month>") || s1.equals("I-<month>")) {
} else if ("<month>".equals(s1) || "I-<month>".equals(s1)) {
if (date.getMonthString() != null) {
if ((s1.equals("I-<month>")) ||
(!s1.equals(lastTag) && !lastTag.equals("I-<month>"))
(!s1.equals(lastTag) && !"I-<month>".equals(lastTag))
) {
// new date
if (date.isNotNull()) {
if (dates == null)
dates = new ArrayList<Date>();
normalize(date);
dates.add(date);
Date normalizedDate = normalize(date);
dates.add(normalizedDate);
}

date = new Date();
Expand All @@ -136,17 +145,17 @@ else if ((date.getMonthString().charAt(date.getMonthString().length() - 1) == '-
} else {
date.setMonthString(s2);
}
} else if (s1.equals("<day>") || s1.equals("I-<day>")) {
} else if ("<day>".equals(s1) || "I-<day>".equals(s1)) {
if (date.getDayString() != null) {
if ((s1.equals("I-<day>")) ||
(!s1.equals(lastTag) && !lastTag.equals("I-<day>"))
(!s1.equals(lastTag) && !"I-<day>".equals(lastTag))
) {
// new date
if (date.isNotNull()) {
if (dates == null)
dates = new ArrayList<Date>();
normalize(date);
dates.add(date);
Date normalizedDate = normalize(date);
dates.add(normalizedDate);
}

date = new Date();
Expand All @@ -170,9 +179,9 @@ else if ((date.getDayString().charAt(date.getDayString().length() - 1) == '-')
}
if (date.isNotNull()) {
if (dates == null)
dates = new ArrayList<Date>();
normalize(date);
dates.add(date);
dates = new ArrayList<>();
Date normalizedDate = normalize(date);
dates.add(normalizedDate);
}

} catch (Exception e) {
Expand Down Expand Up @@ -209,65 +218,71 @@ else if ((date.getDayString().charAt(date.getDayString().length() - 1) == '-')

public static final Pattern[] months = {jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec};

public void normalize(Date date) {
public Date normalize(Date date) {
Date normalizedDate = new Date();

// normalize day
if (date.getDayString() != null) {
String dayStringBis = "";
if (isNotBlank(date.getDayString())) {
StringBuilder dayStringBis = new StringBuilder();
String dayString = date.getDayString().trim();
normalizedDate.setDayString(dayString);
for (int n = 0; n < dayString.length(); n++) {
char c = dayString.charAt(n);
if (Character.isDigit(c)) {
dayStringBis += c;
dayStringBis.append(c);
}
}
try {
int day = Integer.parseInt(dayStringBis);
date.setDay(day);
int day = Integer.parseInt(dayStringBis.toString());
normalizedDate.setDay(day);
} catch (Exception e) {
//e.printStackTrace();
}
}

//normalize month
if (date.getMonthString() != null) {
if (isNotBlank(date.getMonthString())) {
String month = date.getMonthString().trim();
normalizedDate.setMonthString(month);
int n = 0;
while (n < 12) {
Matcher ma = months[n].matcher(month);
if (ma.find()) {
date.setMonth(n + 1);
normalizedDate.setMonth(n + 1);
break;
}
n++;
}
}

if (date.getYearString() != null) {
String yearStringBis = "";
if (StringUtils.isNotBlank(date.getYearString())) {
StringBuilder yearStringBis = new StringBuilder();
String yearString = date.getYearString().trim();
normalizedDate.setYearString(yearString);
for (int n = 0; n < yearString.length(); n++) {
char c = yearString.charAt(n);
if (Character.isDigit(c)) {
yearStringBis += c;
yearStringBis.append(c);
}
}
try {
int year = Integer.parseInt(yearStringBis);
int year = Integer.parseInt(yearStringBis.toString());
if ((year >= 20) && (year < 100)) {
year = year + 1900;
} else if ((year >= 0) && (year < 20)) {
year = year + 2000;
}
date.setYear(year);
normalizedDate.setYear(year);
} catch (Exception e) {
//e.printStackTrace();
}
}

// if we don't have day and month, but a year with 8 digits, we might have a YYYYMMDD pattern
if (date.getDay() == -1 && date.getMonth() == -1 && date.getYear() != -1 && date.getYear() > 19000000 && date.getYear() < 20251231) {
int maxYear = Calendar.getInstance().getWeekYear() + 4;
if (date.getDay() == -1 && date.getMonth() == -1 && date.getYear() != -1 && date.getYear() > 19000000 && date.getYear() < maxYear * 10000+1231) {
int yearPart = date.getYear() / 10000;
if (yearPart > 1900 && yearPart < 2025) {
if (yearPart > 1900 && yearPart < maxYear) {
String yearString = ""+date.getYear();
String theMonthString = yearString.substring(4,6);
String theDayString = yearString.substring(6,8);
Expand All @@ -288,16 +303,58 @@ public void normalize(Date date) {

if (dayPart != -1 && monthPart != -1) {
if (dayPart > 0 && dayPart < 32 && monthPart > 0 && monthPart < 13) {
date.setDay(dayPart);
date.setDayString(theDayString);
date.setMonth(monthPart);
date.setMonthString(theMonthString);
date.setYear(yearPart);
normalizedDate.setDay(dayPart);
normalizedDate.setDayString(theDayString);
normalizedDate.setMonth(monthPart);
normalizedDate.setMonthString(theMonthString);
normalizedDate.setYear(yearPart);
}
}
}
}

Date validatedDate = postValidate(normalizedDate);

return validatedDate;

}

/**
 * Simple, deliberately loose date validation. A component is carried over to the
 * result only when its numeric value is non-negative and short enough:
 * - the year has no more than 4 digits
 * - the month and the day have no more than 2 digits each
 *
 * Incomplete dates of any form may be passed in; each component is checked
 * independently, and only the components that are "out of bounds" are dropped.
 * A dropped component keeps whatever value a freshly constructed {@code Date} holds,
 * and its string form is not copied either.
 *
 * @param originalDate the date to check; it is only read, never modified
 * @return a new date containing only the components that passed the check
 */
public static Date postValidate(Date originalDate) {
    final Date result = new Date();

    // For non-negative integers, "at most two digits" is the same as "at most 99".
    final int day = originalDate.getDay();
    if (day > -1 && day <= 99) {
        result.setDay(day);
        result.setDayString(originalDate.getDayString());
    }

    final int month = originalDate.getMonth();
    if (month > -1 && month <= 99) {
        result.setMonth(month);
        result.setMonthString(originalDate.getMonthString());
    }

    // Likewise, "at most four digits" is the same as "at most 9999".
    final int year = originalDate.getYear();
    if (year > -1 && year <= 9999) {
        result.setYear(year);
        result.setYearString(originalDate.getYearString());
    }

    return result;
}



/**
Expand Down

0 comments on commit 5cc2b1f

Please sign in to comment.