Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[uk] adj/noun inflection agreement rule is on by default
- Loading branch information
Showing
11 changed files
with
2,989 additions
and
582 deletions.
There are no files selected for viewing
46 changes: 46 additions & 0 deletions
46
...ool-language-modules/uk/src/main/java/org/languagetool/rules/uk/CaseGovernmentHelper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
package org.languagetool.rules.uk; | ||
This comment has been minimized.
Sorry, something went wrong. |
||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.Arrays; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Map; | ||
import java.util.Scanner; | ||
import java.util.Set; | ||
|
||
import org.languagetool.AnalyzedToken; | ||
import org.languagetool.AnalyzedTokenReadings; | ||
import org.languagetool.JLanguageTool; | ||
|
||
public class CaseGovernmentHelper { | ||
|
||
static final Map<String, Set<String>> CASE_GOVERNMENT_MAP = loadMap("/uk/case_government.txt"); | ||
|
||
private static Map<String, Set<String>> loadMap(String path) { | ||
Map<String, Set<String>> result = new HashMap<>(); | ||
try (InputStream is = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path); | ||
Scanner scanner = new Scanner(is, "UTF-8")) { | ||
while (scanner.hasNextLine()) { | ||
String line = scanner.nextLine(); | ||
String[] parts = line.split(" "); | ||
String[] vidm = parts[1].split(":"); | ||
result.put(parts[0], new HashSet<String>(Arrays.asList(vidm))); | ||
} | ||
// System.err.println("Found case governments: " + result.size()); | ||
return result; | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
This comment has been minimized.
Sorry, something went wrong.
danielnaber
Member
|
||
} | ||
} | ||
|
||
public static boolean hasCaseGovernment(AnalyzedTokenReadings analyzedTokenReadings, String rvCase) { | ||
for(AnalyzedToken token: analyzedTokenReadings.getReadings()) { | ||
if( CASE_GOVERNMENT_MAP.containsKey(token.getLemma()) | ||
&& CASE_GOVERNMENT_MAP.get(token.getLemma()).contains(rvCase) ) | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
} |
147 changes: 147 additions & 0 deletions
147
...agetool-language-modules/uk/src/main/java/org/languagetool/rules/uk/InflectionHelper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
package org.languagetool.rules.uk; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.regex.Matcher; | ||
|
||
import org.languagetool.AnalyzedToken; | ||
|
||
class InflectionHelper { | ||
|
||
static class Inflection implements Comparable<Inflection> { | ||
final String gender; | ||
final String _case; | ||
final String animTag; | ||
|
||
public Inflection(String gender, String _case, String animTag) { | ||
this.gender = gender; | ||
this._case = _case; | ||
this.animTag = animTag; | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
final int prime = 31; | ||
This comment has been minimized.
Sorry, something went wrong.
danielnaber
Member
|
||
int result = 1; | ||
result = prime * result + ((_case == null) ? 0 : _case.hashCode()); | ||
result = prime * result + ((animTag == null) ? 0 : animTag.hashCode()); | ||
result = prime * result + ((gender == null) ? 0 : gender.hashCode()); | ||
return result; | ||
} | ||
|
||
@Override | ||
public boolean equals(Object obj) { | ||
if (this == obj) | ||
This comment has been minimized.
Sorry, something went wrong. |
||
return true; | ||
if (obj == null) | ||
return false; | ||
if (getClass() != obj.getClass()) | ||
return false; | ||
|
||
Inflection other = (Inflection) obj; | ||
return gender.equals(other.gender) | ||
&& _case.equals(other._case) | ||
&& (animTag == null || other.animTag == null | ||
|| ! animMatters() || ! other.isAnimalSensitive() || animTag.equals(other.animTag)); | ||
} | ||
|
||
public boolean equalsIgnoreGender(Inflection other) { | ||
return //gender.equals(other.gender) | ||
_case.equals(other._case) | ||
&& (animTag == null || other.animTag == null | ||
|| ! animMatters() || animTag.equals(other.animTag)); | ||
} | ||
|
||
boolean animMatters() { | ||
return _case.equals("v_zna") && isAnimalSensitive(); | ||
} | ||
|
||
private boolean isAnimalSensitive() { | ||
return "mp".contains(gender); | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return ":" + gender + ":" + _case | ||
+ (animMatters() ? "_"+animTag : ""); | ||
} | ||
|
||
@Override | ||
public int compareTo(Inflection o) { | ||
int compared = GEN_ORDER.get(gender).compareTo(GEN_ORDER.get(o.gender)); | ||
if( compared != 0 ) | ||
return compared; | ||
|
||
compared = VIDM_ORDER.get(_case).compareTo(VIDM_ORDER.get(o._case)); | ||
return compared; | ||
} | ||
|
||
} | ||
|
||
static List<Inflection> getAdjInflections(List<AnalyzedToken> adjTokenReadings) { | ||
List<Inflection> masterInflections = new ArrayList<>(); | ||
for (AnalyzedToken token: adjTokenReadings) { | ||
String posTag = token.getPOSTag(); | ||
|
||
if( posTag == null || ! posTag.startsWith("adj") ) | ||
continue; | ||
|
||
Matcher matcher = TokenInflectionAgreementRule.ADJ_INFLECTION_PATTERN.matcher(posTag); | ||
matcher.find(); | ||
|
||
String gen = matcher.group(1); | ||
String vidm = matcher.group(2); | ||
String animTag = null; | ||
if (matcher.group(3) != null) { | ||
animTag = matcher.group(3).substring(2); // :rinanim/:ranim | ||
} | ||
|
||
masterInflections.add(new Inflection(gen, vidm, animTag)); | ||
} | ||
return masterInflections; | ||
} | ||
|
||
static List<Inflection> getNounInflections(List<AnalyzedToken> nounTokenReadings) { | ||
List<Inflection> slaveInflections = new ArrayList<>(); | ||
for (AnalyzedToken token: nounTokenReadings) { | ||
String posTag2 = token.getPOSTag(); | ||
if( posTag2 == null ) | ||
continue; | ||
|
||
Matcher matcher = TokenInflectionAgreementRule.NOUN_INFLECTION_PATTERN.matcher(posTag2); | ||
if( ! matcher.find() ) { | ||
// System.err.println("Failed to find slave inflection tag in " + posTag2 + " for " + nounTokenReadings); | ||
continue; | ||
} | ||
String gen = matcher.group(2); | ||
String vidm = matcher.group(3); | ||
String animTag = matcher.group(1); | ||
|
||
slaveInflections.add(new Inflection(gen, vidm, animTag)); | ||
} | ||
return slaveInflections; | ||
} | ||
|
||
static final Map<String,Integer> GEN_ORDER = new HashMap<>(); | ||
static final Map<String,Integer> VIDM_ORDER = new HashMap<>(); | ||
|
||
static { | ||
GEN_ORDER.put("m", 0); | ||
GEN_ORDER.put("f", 1); | ||
GEN_ORDER.put("n", 3); | ||
// GEN_ORDER.put("s", 4); | ||
GEN_ORDER.put("p", 5); | ||
|
||
VIDM_ORDER.put("v_naz", 10); | ||
VIDM_ORDER.put("v_rod", 20); | ||
VIDM_ORDER.put("v_dav", 30); | ||
VIDM_ORDER.put("v_zna", 40); | ||
VIDM_ORDER.put("v_oru", 50); | ||
VIDM_ORDER.put("v_mis", 60); | ||
VIDM_ORDER.put("v_kly", 70); | ||
|
||
} | ||
|
||
} |
153 changes: 153 additions & 0 deletions
153
languagetool-language-modules/uk/src/main/java/org/languagetool/rules/uk/LemmaHelper.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
package org.languagetool.rules.uk; | ||
|
||
import java.util.List; | ||
import java.util.regex.Pattern; | ||
|
||
import org.languagetool.AnalyzedToken; | ||
import org.languagetool.AnalyzedTokenReadings; | ||
import org.languagetool.tagging.uk.PosTagHelper; | ||
|
||
public abstract class LemmaHelper { | ||
|
||
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, List<String> lemmas) { | ||
List<AnalyzedToken> readings = analyzedTokenReadings.getReadings(); | ||
return hasLemma(readings, lemmas); | ||
} | ||
|
||
public static boolean hasLemma(List<AnalyzedToken> readings, List<String> lemmas) { | ||
for(AnalyzedToken analyzedToken: readings) { | ||
if( lemmas.contains(analyzedToken.getLemma()) ) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, List<String> lemmas, String partPos) { | ||
for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) { | ||
for(String lemma: lemmas) { | ||
if( lemma.equals(analyzedToken.getLemma()) | ||
&& analyzedToken.getPOSTag() != null && analyzedToken.getPOSTag().contains(partPos) ) { | ||
return true; | ||
} | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, List<String> lemmas, Pattern posRegex) { | ||
for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) { | ||
for(String lemma: lemmas) { | ||
if( lemma.equals(analyzedToken.getLemma()) | ||
&& analyzedToken.getPOSTag() != null | ||
&& posRegex.matcher(analyzedToken.getPOSTag()).matches()) { | ||
return true; | ||
} | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
public static boolean hasLemma(AnalyzedToken token, List<String> asList, String partPos) { | ||
return asList.contains(token.getLemma()) | ||
&& token.getPOSTag() != null && token.getPOSTag().contains(partPos); | ||
} | ||
|
||
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, String lemmas) { | ||
for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) { | ||
if( lemmas.equals(analyzedToken.getLemma()) ) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, Pattern pattern) { | ||
for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) { | ||
String lemma = analyzedToken.getLemma(); | ||
if( lemma != null && pattern.matcher(lemma).matches() ) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, Pattern pattern, Pattern posTagRegex) { | ||
for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) { | ||
String lemma = analyzedToken.getLemma(); | ||
if( lemma != null && pattern.matcher(lemma).matches() | ||
&& posTagRegex != null && analyzedToken.getPOSTag() != null && posTagRegex.matcher(analyzedToken.getPOSTag()).matches() ) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
static boolean reverseSeach(AnalyzedTokenReadings[] tokens, int pos, int depth, Pattern lemma, Pattern postag) { | ||
for(int i=pos; i>pos-depth && i>=0; i--) { | ||
if( (lemma == null || hasLemma(tokens[i], lemma)) | ||
&& (postag == null || PosTagHelper.hasPosTag(tokens[i], postag)) ) | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
static boolean forwardPosTagSearch(AnalyzedTokenReadings[] tokens, int pos, String posTag, int maxSkip) { | ||
for(int i=pos; i < tokens.length && i <= pos + maxSkip; i++) { | ||
if( PosTagHelper.hasPosTagPart(tokens[i], posTag) ) | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
enum Dir {FORWARD, REVERSE} | ||
|
||
private static final Pattern QUOTES = Pattern.compile("[«»„“]"); | ||
|
||
static int tokenSearch(AnalyzedTokenReadings[] tokens, int pos, String posTag, Pattern token, Pattern posTagsToIgnore, Dir dir) { | ||
int step = dir == Dir.FORWARD ? 1 : -1; | ||
|
||
for(int i = pos; i < tokens.length && i > 0; i += step) { | ||
if( (posTag == null || PosTagHelper.hasPosTagPart(tokens[i], posTag)) | ||
&& (token == null || token.matcher(tokens[i].getToken()).matches()) ) | ||
return i; | ||
|
||
if( ! PosTagHelper.hasPosTag(tokens[i], posTagsToIgnore) | ||
&& ! QUOTES.matcher(tokens[i].getToken()).matches() ) | ||
break; | ||
} | ||
|
||
return -1; | ||
} | ||
|
||
static boolean revSearch(AnalyzedTokenReadings[] tokens, int startPos, Pattern lemma, String postagRegex) { | ||
return LemmaHelper.revSearchIdx(tokens, startPos, lemma, postagRegex) != -1; | ||
} | ||
|
||
static int revSearchIdx(AnalyzedTokenReadings[] tokens, int startPos, Pattern lemma, String postagRegex) { | ||
if( startPos > 0 && PosTagHelper.hasPosTag(tokens[startPos], "part.*") ) { | ||
// if( startPos > 0 && LemmaHelper.hasLemma(tokens[startPos], Arrays.asList("б", "би")) ) { | ||
startPos -= 1; | ||
} | ||
|
||
if( startPos > 0 && PosTagHelper.hasPosTag(tokens[startPos], "adv(:.*)?|.*pron.*") ) { | ||
startPos -= 1; | ||
} | ||
|
||
if( startPos > 0 && PosTagHelper.hasPosTag(tokens[startPos], "part.*") ) { | ||
startPos -= 1; | ||
} | ||
|
||
if( startPos > 0 ) { | ||
if( lemma != null && ! hasLemma(tokens[startPos], lemma) ) | ||
return -1; | ||
if( postagRegex != null && ! PosTagHelper.hasPosTag(tokens[startPos], postagRegex) ) | ||
return -1; | ||
|
||
return startPos; | ||
} | ||
|
||
return -1; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Please add a copyright header to all new files.