Skip to content

Commit

Permalink
Refactor char class parsing and enable macros in char classes
Browse files Browse the repository at this point in the history
Addresses issue #216.
  • Loading branch information
lsf37 committed Dec 4, 2019
1 parent 5253071 commit 6f045bd
Show file tree
Hide file tree
Showing 12 changed files with 709 additions and 866 deletions.
1,026 changes: 339 additions & 687 deletions jflex/src/main/cup/LexParse.cup

Large diffs are not rendered by default.

13 changes: 12 additions & 1 deletion jflex/src/main/java/jflex/chars/Interval.java
Expand Up @@ -24,7 +24,7 @@ public final class Interval {
public int end;

/**
* Construct a new interval from {@code start} to {@code end}.
* Construct a new interval from {@code start} to {@code end}, including both end points.
*
* @param start first character the interval should contain
* @param end last character the interval should contain
Expand All @@ -34,6 +34,17 @@ public Interval(int start, int end) {
this.end = end;
}

/**
 * Construct a new interval that covers exactly one character.
 *
 * @param content the only character in the interval; it is used as both start and end
 */
public Interval(int content) {
  this(content, content);
}

/** Copy constructor */
public Interval(Interval other) {
this.start = other.start;
this.end = other.end;
Expand Down
27 changes: 3 additions & 24 deletions jflex/src/main/java/jflex/core/CharClasses.java
Expand Up @@ -121,6 +121,8 @@ public int getNumClasses() {
* @param caseless if true upper/lower/title case are considered equivalent
*/
public void makeClass(IntCharSet set, boolean caseless) {
set = set.copy(); // avoid destructively updating the original

if (caseless) set = set.getCaseless(unicodeProps);

if (DEBUG) {
Expand Down Expand Up @@ -275,7 +277,7 @@ public void makeClassNot(List<Interval> l, boolean caseless) {
* Returns an array that contains the character class codes of all characters in the specified set
* of input characters.
*/
private int[] getClassCodes(IntCharSet set, boolean negate) {
public int[] getClassCodes(IntCharSet set, boolean negate) {

if (DEBUG) {
Out.dump("getting class codes for " + set);
Expand Down Expand Up @@ -309,29 +311,6 @@ private int[] getClassCodes(IntCharSet set, boolean negate) {
return result;
}

/**
 * Looks up the character class codes of all characters in the given list of intervals.
 *
 * @param intervalList a List of Intervals, the set of characters to get the class codes for
 * @return an array with the class codes for intervalList
 */
public int[] getClassCodes(List<Interval> intervalList) {
  IntCharSet chars = new IntCharSet(intervalList);
  return getClassCodes(chars, false);
}

/**
 * Looks up the character class codes of all characters that are <strong>not</strong> in the
 * given list of intervals.
 *
 * @param intervalList a List of Intervals, the complement of the set of characters to get the
 *     class codes for
 * @return an array with the class codes for the complement of intervalList
 */
public int[] getNotClassCodes(List<Interval> intervalList) {
  IntCharSet excluded = new IntCharSet(intervalList);
  return getClassCodes(excluded, true);
}

/**
* Check consistency of the stored classes [debug].
*
Expand Down
60 changes: 52 additions & 8 deletions jflex/src/main/java/jflex/core/IntCharSet.java
Expand Up @@ -31,14 +31,21 @@ public final class IntCharSet implements Comparable<IntCharSet> {

/* invariant: all intervals are disjoint, ordered */
private List<Interval> intervals = new ArrayList<>();

/** for iterating over the char set */
private int pos;

/** Creates an empty char set. */
public IntCharSet() {}

/** Creates a char set that contains only the given character. */
public IntCharSet(int c) {
this(new Interval(c, c));
this(new Interval(c));
}

/**
 * Creates a char set that contains only the given interval.
 *
 * @param start first character of the interval
 * @param end last character of the interval
 */
public IntCharSet(int start, int end) {
this(new Interval(start, end));
}

/** Creates a charset that contains only one interval. */
Expand All @@ -58,6 +65,28 @@ public IntCharSet(List<Interval> chars) {
for (Interval interval : chars) add(interval);
}

/**
 * Builds the set of all characters, i.e. the single interval from 0 to
 * {@code CharClasses.maxChar}.
 *
 * @return a new IntCharSet that contains all characters.
 */
public static IntCharSet allChars() {
  Interval everything = new Interval(0, CharClasses.maxChar);
  return new IntCharSet(everything);
}

/**
 * Builds the set of new-line characters: the range U+000A to U+000D, the single character
 * U+0085, and the range U+2028 to U+2029.
 *
 * @return a new IntCharSet that contains all characters that are considered a new-line char in
 *     Java.
 */
public static IntCharSet nlChars() {
  IntCharSet newlines = new IntCharSet('\n', '\r');
  newlines.add(new Interval('\u0085', '\u0085'));
  newlines.add(new Interval('\u2028', '\u2029'));
  return newlines;
}

/**
* returns the index of the interval that contains the character c, -1 if there is no such
* interval
Expand Down Expand Up @@ -150,7 +179,7 @@ public void add(Interval interval) {
}

/**
* add.
* Add a single character.
*
* @param c the character to add, as an int.
*/
Expand Down Expand Up @@ -191,23 +220,38 @@ public void add(int c) {
}

// end reached but nothing found -> append at end
intervals.add(new Interval(c, c));
intervals.add(new Interval(c));
}

/**
* contains.
* Check whether this set contains a given character.
*
* @param singleChar a int.
* @return a boolean.
* @param singleChar a single character (int).
* @return true iff singleChar is contained in the set.
*/
public boolean contains(int singleChar) {
  // indexOf reports -1 when no interval holds the character
  int intervalIndex = indexOf(singleChar);
  return intervalIndex >= 0;
}

/**
 * Check whether this set contains another set.
 *
 * <p>A {@code null} argument is treated as the empty set, which every set contains.
 *
 * @param other an IntCharSet.
 * @return true iff all characters of {@code other} are contained in this set.
 */
public boolean contains(IntCharSet other) {
  if (other == null) {
    // null stands for the empty set
    return true;
  }

  // remove from a copy of other everything it shares with this set;
  // other is a subset exactly when nothing is left over
  IntCharSet leftover = other.copy();
  leftover.sub(this.and(other));
  return !leftover.containsElements();
}

/**
* {@inheritDoc}
*
* <p>o instanceof Interval
* <p>o instanceof IntCharSet
*/
@Override
public boolean equals(Object o) {
Expand All @@ -219,6 +263,7 @@ public boolean equals(Object o) {
return Objects.equals(intervals, set.intervals);
}

/** {@inheritDoc} */
@Override
public int hashCode() {
int h = 1;
Expand All @@ -229,7 +274,6 @@ public int hashCode() {
return h;
}

/* intersection */
/**
* Intersects two sets.
*
Expand Down
24 changes: 20 additions & 4 deletions jflex/src/main/java/jflex/core/Macros.java
Expand Up @@ -124,8 +124,7 @@ public RegExp getDefinition(String name) {
public void expand() throws jflex.exceptions.MacroException {
  for (String name : macros.keySet()) {
    // only macros that are marked as used get expanded
    if (!isUsed(name)) {
      continue;
    }

    RegExp expanded = expandMacro(name, getDefinition(name));
    // "name" is already a key of the map, so this put only replaces
    // the value stored for it and does not add a new key
    macros.put(name, expanded);
  }
}

Expand All @@ -138,6 +137,7 @@ public void expand() throws jflex.exceptions.MacroException {
* @throws jflex.exceptions.MacroException when an error (such as a cyclic definition) occurs
* during expansion
*/
@SuppressWarnings("unchecked")
private RegExp expandMacro(String name, RegExp definition)
throws jflex.exceptions.MacroException {

Expand Down Expand Up @@ -181,14 +181,30 @@ private RegExp expandMacro(String name, RegExp definition)
case sym.STRING_I:
case sym.CHAR:
case sym.CHAR_I:
case sym.PRIMCLASS:
return definition;

case sym.CCLASS:
case sym.CCLASSNOT:
return definition;
RegExp1 cclass = (RegExp1) definition;
List<RegExp> classes = (List<RegExp>) cclass.content;
List<RegExp> newClasses = new ArrayList<RegExp>();
for (RegExp r : classes) {
newClasses.add(expandMacro(name, r));
}
cclass.content = newClasses;
return cclass;

case sym.CCLASSOP:
RegExp2 cclassOp = (RegExp2) ((RegExp1) definition).content;
cclassOp.r1 = expandMacro(name, cclassOp.r1);
cclassOp.r2 = expandMacro(name, cclassOp.r2);
return cclassOp;

default:
throw new MacroException(
"unknown expression type "
+ definition.type
+ sym.terminalNames[definition.type]
+ " in macro expansion"); // $NON-NLS-1$ //$NON-NLS-2$
}
}
Expand Down
45 changes: 11 additions & 34 deletions jflex/src/main/java/jflex/core/NFA.java
Expand Up @@ -188,7 +188,7 @@ public void addRegExp(int regExpNum) {
// base forward pass
IntPair forward = insertNFA(r1);
// lookahead backward pass
IntPair backward = insertNFA(r2.rev(macros));
IntPair backward = insertNFA(r2.rev());

isFinal[forward.end()] = true;
action[forward.end()] = new Action(Action.FORWARD_ACTION);
Expand Down Expand Up @@ -222,9 +222,6 @@ private void insertLookAheadChoices(int baseEnd, Action a, RegExp lookAhead) {
RegExp2 r = (RegExp2) lookAhead;
insertLookAheadChoices(baseEnd, a, r.r1);
insertLookAheadChoices(baseEnd, a, r.r2);
} else if (lookAhead.type == sym.MACROUSE) {
RegExp1 r = (RegExp1) lookAhead;
insertLookAheadChoices(baseEnd, a, macros.getDefinition((String) r.content));
} else {
int len = SemCheck.length(lookAhead);

Expand All @@ -245,9 +242,9 @@ private void insertLookAheadChoices(int baseEnd, Action a, RegExp lookAhead) {
// should never happen
throw new Error(
"When inserting lookahead expression: unknown expression type "
+ lookAhead.type
+ lookAhead.typeName()
+ " in "
+ lookAhead); // $NON-NLS-1$ //$NON-NLS-2$
+ lookAhead);
}
}
}
Expand Down Expand Up @@ -682,16 +679,8 @@ private IntPair insertStringNFA(boolean caseless, String str) {
return IntPair.create(start, i + start);
}

private void insertClassNFA(List<Interval> intervals, int start, int end) {
  // a null interval list stands for an empty char class, which is ok: add nothing
  if (intervals != null) {
    for (int classCode : classes.getClassCodes(intervals)) {
      addTransition(start, classCode, end);
    }
  }
}

private void insertNotClassNFA(List<Interval> intervals, int start, int end) {

for (int input : classes.getNotClassCodes(intervals)) addTransition(start, input, end);
private void insertClassNFA(IntCharSet set, int start, int end) {
for (int aCl : classes.getClassCodes(set, false)) addTransition(start, aCl, end);
}

/**
Expand Down Expand Up @@ -915,7 +904,6 @@ private void removeDead(int start, int end) {
* @param regExp the regular expression to construct the NFA for
* @return a pair of integers denoting the index of start and end state of the NFA.
*/
@SuppressWarnings("unchecked") // for List<Interval> casts
private void insertCCLNFA(RegExp regExp, int start, int end) {
switch (regExp.type) {
case sym.BAR:
Expand All @@ -924,12 +912,8 @@ private void insertCCLNFA(RegExp regExp, int start, int end) {
insertCCLNFA(r.r2, start, end);
return;

case sym.CCLASS:
insertClassNFA((List<Interval>) ((RegExp1) regExp).content, start, end);
return;

case sym.CCLASSNOT:
insertNotClassNFA((List<Interval>) ((RegExp1) regExp).content, start, end);
case sym.PRIMCLASS:
insertClassNFA((IntCharSet) ((RegExp1) regExp).content, start, end);
return;

case sym.CHAR:
Expand All @@ -939,13 +923,9 @@ private void insertCCLNFA(RegExp regExp, int start, int end) {
case sym.CHAR_I:
insertLetterNFA(true, (Integer) ((RegExp1) regExp).content, start, end);
return;

case sym.MACROUSE:
insertCCLNFA(macros.getDefinition((String) ((RegExp1) regExp).content), start, end);
return;
}

throw new Error("Unknown expression type " + regExp.type + " in NFA construction");
throw new Error("Unknown expression type " + regExp.typeName() + " in NFA construction");
}

/**
Expand All @@ -967,7 +947,7 @@ public IntPair insertNFA(RegExp regExp) {
Out.debug("Inserting RegExp : " + regExp);
}

if (regExp.isCharClass(macros)) {
if (regExp.isCharClass()) {
start = numStates;
end = numStates + 1;

Expand Down Expand Up @@ -1044,19 +1024,16 @@ public IntPair insertNFA(RegExp regExp) {
return complement(insertNFA((RegExp) ((RegExp1) regExp).content));

case sym.TILDE:
return insertNFA(regExp.resolveTilde(macros));
return insertNFA(regExp.resolveTilde());

case sym.STRING:
return insertStringNFA(false, (String) ((RegExp1) regExp).content);

case sym.STRING_I:
return insertStringNFA(true, (String) ((RegExp1) regExp).content);

case sym.MACROUSE:
return insertNFA(macros.getDefinition((String) ((RegExp1) regExp).content));
}

throw new Error("Unknown expression type " + regExp.type + " in NFA construction");
throw new Error("Unknown expression type " + regExp.typeName() + " in NFA construction");
}

public int numStates() {
Expand Down

0 comments on commit 6f045bd

Please sign in to comment.