Skip to content

Commit

Permalink
cosmetic
Browse files Browse the repository at this point in the history
  • Loading branch information
detonator413 committed Jan 14, 2016
1 parent 82dcbd8 commit b04945a
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 18 deletions.
Expand Up @@ -8,6 +8,11 @@
* @author Patrice Lopez
*/
public class BibDataSet {
/**
 * Names of the citation-related counters declared on {@code BibDataSet}.
 *
 * NOTE(review): the code that increments these counters is not visible here;
 * the per-constant descriptions below are inferred from the constant names only
 * and should be confirmed against the increment sites.
 */
public enum Counters {
// presumably every citation encountered -- TODO confirm
CITATIONS_CNT,
// presumably citations for which a citing context was found -- TODO confirm
CITATIONS_WITH_CONTEXT_CNT,
// presumably citations for which no citing context was found -- TODO confirm
CITATIONS_WITHOUT_CONTEXT_CNT
}

private BiblioItem resBib = null; // identified parsed bibliographical item
private List<String> sourceBib = null;
Expand Down
Expand Up @@ -1810,11 +1810,14 @@ private void toTEI(Document doc,
} catch (Exception e) {
throw new GrobidException("An exception occurred while running Grobid.", e);
}
doc.setTei(
XmlBuilderUtils.toPrettyXml(
XmlBuilderUtils.fromString(tei.toString())
)
);
doc.setTei(tei.toString());

//TODO: reevaluate
// doc.setTei(
// XmlBuilderUtils.toPrettyXml(
// XmlBuilderUtils.fromString(tei.toString())
// )
// );
}

@Override
Expand Down
Expand Up @@ -455,6 +455,9 @@ public void endElement(java.lang.String uri, java.lang.String localName,

for(String tok : subTokenizations) {

if (tok.contains("Minnich")) {
int f = 0;
}
diaresis = false;
accent = false;

Expand Down Expand Up @@ -543,6 +546,8 @@ public void endElement(java.lang.String uri, java.lang.String localName,
localTok.setPage(currentPage);
addToken(localTok);

// addToken(previousTok);

//System.out.println("add token layout: " + previousTok.getText());
//System.out.println("add tokenizations: " + previousTok.getText());
}
Expand Down
Expand Up @@ -115,6 +115,17 @@ public static int tokenPos(List<LayoutToken> toks, String text) {
return -1;
}

/**
 * Locates the first token whose {@code t()} value is matched in full by the given pattern.
 *
 * @param toks tokens to scan, in iteration order
 * @param p    pattern that has to match the whole {@code t()} value of a token
 * @return zero-based position of the first matching token, or {@code -1} when no token matches
 */
public static int tokenPos(List<LayoutToken> toks, Pattern p) {
    int index = -1;
    for (LayoutToken token : toks) {
        index++;
        // matches() requires the entire token value to match, not just a part of it
        boolean wholeTokenMatches = p.matcher(token.t()).matches();
        if (wholeTokenMatches) {
            return index;
        }
    }
    // ran through all tokens without a hit
    return -1;
}

// public static List<List<LayoutToken>> split(List<LayoutToken> toks, Pattern p) {
// return split(toks, p, false);
// }
Expand Down
Expand Up @@ -30,12 +30,13 @@ public class ReferenceMarkerMatcher {

public static final Pattern YEAR_PATTERN = Pattern.compile("[12][0-9]{3,3}[a-d]?");
public static final Pattern AUTHOR_NAME_PATTERN = Pattern.compile("[A-Z][A-Za-z]+");
private static final Pattern NUMBERED_CITATION_PATTERN = Pattern.compile(" *[\\(\\[]? *(?:\\d+[-–]\\d+,|\\d+, *)*(?:\\d+[-–]\\d+|\\d+)[\\)\\]]? *");
private static final Pattern NUMBERED_CITATION_PATTERN = Pattern.compile(" *[\\(\\[]? *(?:\\d+[-–]\\d+,|\\d+, *)*[ ]*(?:\\d+[-–]\\d+|\\d+)[\\)\\]]? *");
public static final Pattern AUTHOR_SEPARATOR_PATTERN = Pattern.compile(";");
public static final ClassicAnalyzer ANALYZER = new ClassicAnalyzer(Version.LUCENE_45);
public static final int MAX_RANGE = 20;
public static final Pattern NUMBERED_CITATIONS_SPLIT_PATTERN = Pattern.compile("[,;]");
public static final Pattern AND_WORD_PATTERN = Pattern.compile("and");
public static final Pattern DASH_PATTERN = Pattern.compile("[–-]");

public enum Counters {
MATCHED_REF_MARKERS,
Expand Down Expand Up @@ -188,7 +189,7 @@ private static List<Pair<String, List<LayoutToken>>> getNumberedLabels(List<Layo
// Splitter.on(NUMBERED_CITATIONS_SPLIT_PATTERN).omitEmptyStrings().splitToList(text);
List<Pair<String, List<LayoutToken>>> res = new ArrayList<>();
for (List<LayoutToken> s : split) {
int minusPos = LayoutTokensUtil.tokenPos(s, "-");
int minusPos = LayoutTokensUtil.tokenPos(s, DASH_PATTERN);
if (minusPos < 0) {
res.add(new Pair<>(LayoutTokensUtil.toText(s), s));
} else {
Expand Down Expand Up @@ -268,10 +269,12 @@ private List<MatchResult> matchAuthorCitation(String text, List<LayoutToken> ref
cntManager.i(Counters.NO_CANDIDATES_AFTER_POST_FILTERING);
} else {
cntManager.i(Counters.MANY_CANDIDATES_AFTER_POST_FILTERING);
System.out.println("MANY CANDIDATES: " + text + "\n" + c + "\n");
System.out.println("MANY CANDIDATES: " + text + "\n-----\n" + c + "\n");
for (BibDataSet bds : matches) {
System.out.println("+++++");
System.out.println(" " + bds.getRawBib());
}
System.out.println("===============");
}
}
} else {
Expand Down Expand Up @@ -348,14 +351,37 @@ private static int matchCount(List<LayoutToken> toks, Pattern p) {

//if we match more than 1 citation based on name, then we leave only those citations that have author name first
private List<BibDataSet> postFilterMatches(String c, List<BibDataSet> matches) {
String[] sp = c.trim().split(" ");
final String author = sp[0].toLowerCase();
return Lists.newArrayList(Iterables.filter(matches, new Predicate<BibDataSet>() {
@Override
public boolean apply(BibDataSet bibDataSet) {
return bibDataSet.getRawBib().trim().toLowerCase().startsWith(author);
if (c.toLowerCase().contains("et al") || c.toLowerCase().contains(" and ")) {
String[] sp = c.trim().split(" ");
final String author = sp[0].toLowerCase();

ArrayList<BibDataSet> bibDataSets = Lists.newArrayList(Iterables.filter(matches, new Predicate<BibDataSet>() {
@Override
public boolean apply(BibDataSet bibDataSet) {
return bibDataSet.getRawBib().trim().toLowerCase().startsWith(author);
}
}));

if (bibDataSets.size() <= 1) {
return bibDataSets;
}
}));

//cases like c = "Smith et al, 2015" and Bds = <"Smith, Hoffmann, 2015", "Smith, 2015"> -- should prefer first one
return Lists.newArrayList(Iterables.filter(bibDataSets, new Predicate<BibDataSet>() {
@Override
public boolean apply(BibDataSet bibDataSet) {
return (bibDataSet.getResBib().getFullAuthors() != null && bibDataSet.getResBib().getFullAuthors().size() > 1);
}
}));
} else {
//cases like c = "Smith, 2015" and Bds = <"Smith, Hoffmann, 2015", "Smith, 2015"> -- should prefer second one
return Lists.newArrayList(Iterables.filter(matches, new Predicate<BibDataSet>() {
@Override
public boolean apply(BibDataSet bibDataSet) {
return bibDataSet.getResBib().getFullAuthors() != null && bibDataSet.getResBib().getFullAuthors().size() == 1;
}
}));
}
}

public static void main(String[] args) {
Expand Down
Expand Up @@ -39,8 +39,14 @@ public class CitationsVisualizer {

public static void main(String args[]) {
try {
// /Work/temp/context/1000k/AS_101465473421322_1401202662564.pdf

// /Work/temp/context/1000k/AS_104748833312772_1401985480367.pdf - invalid byte
//
// File input = new File("/Work/temp/pub_citation_styles/1994FEBSLett350_235Hadden.pdf");
File input = new File("/Work/temp/context/coords/2.pdf");
// File input = new File("/Work/temp/context/1000k/AS_99223336914944_1400668095132.pdf");
File input = new File("/tmp/AS_100005549445135_1400854589869.pdf"); // not all tokens
// File input = new File("/Work/temp/context/coords/1.pdf");
// File input = new File("/Work/temp/context/tilo/4.pdf");

// File input = new File("/Work/temp/pub_citation_styles/1996ParPrecConfProc00507369.pdf");
Expand Down Expand Up @@ -217,7 +223,7 @@ private static void annotatePage(PDDocument document, String coords, long seed,

// ADDING LINE TO THE REFERENCE
PDPageContentStream stream = new PDPageContentStream(document, page, true, false);
Random r = new Random(seed);
Random r = new Random(seed + 1);


// stream.setStrokingColor(85, 177, 245);
Expand Down
Expand Up @@ -866,20 +866,29 @@ public void testFulltexts() throws Exception {
matchingMode(1)
.build();

for (File f : new File("/Work/temp/pub_citation_styles").listFiles(new FileFilter() {
int cnt = 0;
// for (File f : new File("/Work/temp/pub_citation_styles").listFiles(new FileFilter() {
// @Override
// public boolean accept(File pathname) {
for (File f : new File("/Work/temp/context/1000k").listFiles(new FileFilter() {
@Override
public boolean accept(File pathname) {
return pathname.getName().endsWith(".pdf");
}
})) {
try {
Engine.getCntManager().i("PDFS", "INPUT_CNT");
System.out.println("Processing: " + f);
String tei = engine.fullTextToTEI(f, config);
System.out.println(tei.length());
} catch (Exception e) {
e.printStackTrace();
Engine.getCntManager().i("FAILED", e.getClass().getSimpleName());
}
if (++cnt % 10 == 0) {
System.out.println("Processed: " + cnt);
System.out.println(Engine.getCntManager());
}
}

// System.out.println(engine.fullTextToTEI(new File("/Users/zholudev/Work/workspace/pdf-analysis/pdf-analysis-service/src/test/resources/net/researchgate/pdfanalysisservice/papers.bad.input/40th_Conf_unprotected.pdf"), GrobidAnalysisConfig.defaultInstance()));
Expand Down

0 comments on commit b04945a

Please sign in to comment.