Skip to content

Commit

Permalink
[uk] adj/noun inflection agreement rule is on by default
Browse files Browse the repository at this point in the history
  • Loading branch information
arysin committed Dec 17, 2016
1 parent 7589eef commit 8ac5740
Show file tree
Hide file tree
Showing 11 changed files with 2,989 additions and 582 deletions.
@@ -0,0 +1,46 @@
package org.languagetool.rules.uk;

This comment has been minimized.

Copy link
@danielnaber

danielnaber Dec 17, 2016

Member

Please add a copyright header to all new files.


import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;

import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.JLanguageTool;

public class CaseGovernmentHelper {

static final Map<String, Set<String>> CASE_GOVERNMENT_MAP = loadMap("/uk/case_government.txt");

private static Map<String, Set<String>> loadMap(String path) {
Map<String, Set<String>> result = new HashMap<>();
try (InputStream is = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path);
Scanner scanner = new Scanner(is, "UTF-8")) {
while (scanner.hasNextLine()) {
String line = scanner.nextLine();
String[] parts = line.split(" ");
String[] vidm = parts[1].split(":");
result.put(parts[0], new HashSet<String>(Arrays.asList(vidm)));
}
// System.err.println("Found case governments: " + result.size());
return result;
} catch (IOException e) {
throw new RuntimeException(e);

This comment has been minimized.

Copy link
@danielnaber

danielnaber Dec 17, 2016

Member

I suggest to use throw new RuntimeException("Could not load " + path, e) instead to get a better error message.

}
}

public static boolean hasCaseGovernment(AnalyzedTokenReadings analyzedTokenReadings, String rvCase) {
for(AnalyzedToken token: analyzedTokenReadings.getReadings()) {
if( CASE_GOVERNMENT_MAP.containsKey(token.getLemma())
&& CASE_GOVERNMENT_MAP.get(token.getLemma()).contains(rvCase) )
return true;
}
return false;
}

}
@@ -0,0 +1,147 @@
package org.languagetool.rules.uk;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;

import org.languagetool.AnalyzedToken;

class InflectionHelper {

static class Inflection implements Comparable<Inflection> {
final String gender;
final String _case;
final String animTag;

public Inflection(String gender, String _case, String animTag) {
this.gender = gender;
this._case = _case;
this.animTag = animTag;
}

@Override
public int hashCode() {
final int prime = 31;

This comment has been minimized.

Copy link
@danielnaber

danielnaber Dec 17, 2016

Member

java.util.Objects.hash() should probably be used for a more compact implementation.

int result = 1;
result = prime * result + ((_case == null) ? 0 : _case.hashCode());
result = prime * result + ((animTag == null) ? 0 : animTag.hashCode());
result = prime * result + ((gender == null) ? 0 : gender.hashCode());
return result;
}

@Override
public boolean equals(Object obj) {
if (this == obj)

This comment has been minimized.

Copy link
@danielnaber

danielnaber Dec 17, 2016

Member

Please consider using java.util.Objects.equals()

return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;

Inflection other = (Inflection) obj;
return gender.equals(other.gender)
&& _case.equals(other._case)
&& (animTag == null || other.animTag == null
|| ! animMatters() || ! other.isAnimalSensitive() || animTag.equals(other.animTag));
}

public boolean equalsIgnoreGender(Inflection other) {
return //gender.equals(other.gender)
_case.equals(other._case)
&& (animTag == null || other.animTag == null
|| ! animMatters() || animTag.equals(other.animTag));
}

boolean animMatters() {
return _case.equals("v_zna") && isAnimalSensitive();
}

private boolean isAnimalSensitive() {
return "mp".contains(gender);
}

@Override
public String toString() {
return ":" + gender + ":" + _case
+ (animMatters() ? "_"+animTag : "");
}

@Override
public int compareTo(Inflection o) {
int compared = GEN_ORDER.get(gender).compareTo(GEN_ORDER.get(o.gender));
if( compared != 0 )
return compared;

compared = VIDM_ORDER.get(_case).compareTo(VIDM_ORDER.get(o._case));
return compared;
}

}

/**
 * Collects the inflections of all adjective readings.
 *
 * <p>Readings without a POS tag, with a tag not starting with {@code "adj"},
 * or whose tag does not match
 * {@code TokenInflectionAgreementRule.ADJ_INFLECTION_PATTERN} are skipped.
 *
 * @param adjTokenReadings readings to inspect
 * @return one {@code Inflection} per matching reading, in input order
 */
static List<Inflection> getAdjInflections(List<AnalyzedToken> adjTokenReadings) {
  List<Inflection> masterInflections = new ArrayList<>();
  for (AnalyzedToken token : adjTokenReadings) {
    String posTag = token.getPOSTag();

    if (posTag == null || !posTag.startsWith("adj")) {
      continue;
    }

    Matcher matcher = TokenInflectionAgreementRule.ADJ_INFLECTION_PATTERN.matcher(posTag);
    // group() throws IllegalStateException when find() failed, so skip
    // non-matching tags the same way getNounInflections does.
    if (!matcher.find()) {
      continue;
    }

    String gen = matcher.group(1);
    String vidm = matcher.group(2);
    String animTag = null;
    if (matcher.group(3) != null) {
      animTag = matcher.group(3).substring(2);  // :rinanim/:ranim
    }

    masterInflections.add(new Inflection(gen, vidm, animTag));
  }
  return masterInflections;
}

/**
 * Collects the inflections of all noun readings whose POS tag matches
 * {@code TokenInflectionAgreementRule.NOUN_INFLECTION_PATTERN}.
 * Readings without a POS tag or without a match are skipped.
 *
 * @param nounTokenReadings readings to inspect
 * @return one {@code Inflection} per matching reading, in input order
 */
static List<Inflection> getNounInflections(List<AnalyzedToken> nounTokenReadings) {
  List<Inflection> found = new ArrayList<>();
  for (AnalyzedToken reading : nounTokenReadings) {
    String tag = reading.getPOSTag();
    if (tag != null) {
      Matcher m = TokenInflectionAgreementRule.NOUN_INFLECTION_PATTERN.matcher(tag);
      if (m.find()) {
        // group 1 carries the animacy tag, group 2 the gender, group 3 the case
        found.add(new Inflection(m.group(2), m.group(3), m.group(1)));
      }
    }
  }
  return found;
}

/** Sort rank of each gender tag, used by Inflection.compareTo. */
static final Map<String,Integer> GEN_ORDER = new HashMap<>();
/** Sort rank of each case tag, used by Inflection.compareTo. */
static final Map<String,Integer> VIDM_ORDER = new HashMap<>();

static {
  // Gender ranks are not contiguous; rank 4 belonged to the disabled "s" entry.
  GEN_ORDER.put("m", 0);
  GEN_ORDER.put("f", 1);
  GEN_ORDER.put("n", 3);
  GEN_ORDER.put("p", 5);

  // Case ranks go 10, 20, ... 70 in the order listed here.
  String[] casesInOrder = {"v_naz", "v_rod", "v_dav", "v_zna", "v_oru", "v_mis", "v_kly"};
  for (int i = 0; i < casesInOrder.length; i++) {
    VIDM_ORDER.put(casesInOrder[i], (i + 1) * 10);
  }
}

}
@@ -0,0 +1,153 @@
package org.languagetool.rules.uk;

import java.util.List;
import java.util.regex.Pattern;

import org.languagetool.AnalyzedToken;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.tagging.uk.PosTagHelper;

public abstract class LemmaHelper {

/**
 * Tells whether any reading of the token has a lemma contained in {@code lemmas}.
 * Delegates to the overload that takes the list of readings.
 */
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, List<String> lemmas) {
  return hasLemma(analyzedTokenReadings.getReadings(), lemmas);
}

/**
 * Tells whether any of the given readings has a lemma contained in {@code lemmas}.
 *
 * @param readings readings to inspect
 * @param lemmas accepted lemmas
 * @return {@code true} as soon as one reading's lemma is found in {@code lemmas}
 */
public static boolean hasLemma(List<AnalyzedToken> readings, List<String> lemmas) {
  boolean found = false;
  for (AnalyzedToken reading : readings) {
    if (lemmas.contains(reading.getLemma())) {
      found = true;
      break;
    }
  }
  return found;
}

/**
 * Tells whether some reading has one of the given lemmas and a POS tag that
 * contains {@code partPos} as a substring.
 *
 * @param analyzedTokenReadings token whose readings are inspected
 * @param lemmas accepted lemmas (elements must be non-null)
 * @param partPos substring the reading's POS tag must contain
 */
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, List<String> lemmas, String partPos) {
  for (AnalyzedToken reading : analyzedTokenReadings.getReadings()) {
    String readingLemma = reading.getLemma();
    String posTag = reading.getPOSTag();
    for (String candidate : lemmas) {
      if (!candidate.equals(readingLemma)) {
        continue;
      }
      if (posTag != null && posTag.contains(partPos)) {
        return true;
      }
    }
  }
  return false;
}

/**
 * Tells whether some reading has one of the given lemmas and a POS tag that
 * fully matches {@code posRegex}.
 *
 * @param analyzedTokenReadings token whose readings are inspected
 * @param lemmas accepted lemmas (elements must be non-null)
 * @param posRegex pattern the whole POS tag must match
 */
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, List<String> lemmas, Pattern posRegex) {
  for (AnalyzedToken reading : analyzedTokenReadings.getReadings()) {
    String readingLemma = reading.getLemma();
    String posTag = reading.getPOSTag();
    for (String candidate : lemmas) {
      if (!candidate.equals(readingLemma)) {
        continue;
      }
      if (posTag != null && posRegex.matcher(posTag).matches()) {
        return true;
      }
    }
  }
  return false;
}

/**
 * Tells whether a single reading has a lemma contained in {@code asList} and a
 * POS tag that contains {@code partPos} as a substring.
 */
public static boolean hasLemma(AnalyzedToken token, List<String> asList, String partPos) {
  if (!asList.contains(token.getLemma())) {
    return false;
  }
  String posTag = token.getPOSTag();
  return posTag != null && posTag.contains(partPos);
}

/**
 * Tells whether any reading of the token has exactly the given lemma.
 *
 * @param analyzedTokenReadings token whose readings are inspected
 * @param lemmas the single lemma to look for (despite the plural name)
 */
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, String lemmas) {
  boolean found = false;
  for (AnalyzedToken reading : analyzedTokenReadings.getReadings()) {
    if (lemmas.equals(reading.getLemma())) {
      found = true;
      break;
    }
  }
  return found;
}

/**
 * Tells whether any reading of the token has a non-null lemma that fully
 * matches {@code pattern}.
 */
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, Pattern pattern) {
  for (AnalyzedToken reading : analyzedTokenReadings.getReadings()) {
    String candidate = reading.getLemma();
    if (candidate == null) {
      continue;
    }
    if (pattern.matcher(candidate).matches()) {
      return true;
    }
  }
  return false;
}

/**
 * Tells whether some reading has a lemma fully matching {@code pattern} and a
 * POS tag fully matching {@code posTagRegex}.
 * A {@code null} {@code posTagRegex} never matches, so the result is then {@code false}.
 */
public static boolean hasLemma(AnalyzedTokenReadings analyzedTokenReadings, Pattern pattern, Pattern posTagRegex) {
  for (AnalyzedToken reading : analyzedTokenReadings.getReadings()) {
    String candidate = reading.getLemma();
    if (candidate == null || !pattern.matcher(candidate).matches()) {
      continue;
    }
    if (posTagRegex == null) {
      continue;
    }
    String posTag = reading.getPOSTag();
    if (posTag != null && posTagRegex.matcher(posTag).matches()) {
      return true;
    }
  }
  return false;
}

/**
 * Scans backwards from {@code pos} over at most {@code depth} tokens (never
 * below index 0) for a token that satisfies both filters.
 *
 * @param tokens sentence tokens
 * @param pos index to start at (inclusive)
 * @param depth maximum number of tokens to inspect
 * @param lemma lemma pattern, or {@code null} to accept any lemma
 * @param postag POS tag pattern, or {@code null} to accept any tag
 * @return {@code true} if a token passing both filters was found
 */
static boolean reverseSeach(AnalyzedTokenReadings[] tokens, int pos, int depth, Pattern lemma, Pattern postag) {
  int idx = pos;
  while (idx >= 0 && idx > pos - depth) {
    AnalyzedTokenReadings candidate = tokens[idx];
    boolean lemmaOk = lemma == null || hasLemma(candidate, lemma);
    if (lemmaOk && (postag == null || PosTagHelper.hasPosTag(candidate, postag))) {
      return true;
    }
    idx--;
  }
  return false;
}

/**
 * Scans forward from {@code pos} through index {@code pos + maxSkip} (bounded
 * by the array length) for a token for which
 * {@code PosTagHelper.hasPosTagPart(token, posTag)} holds.
 */
static boolean forwardPosTagSearch(AnalyzedTokenReadings[] tokens, int pos, String posTag, int maxSkip) {
  int idx = pos;
  while (idx < tokens.length && idx <= pos + maxSkip) {
    if (PosTagHelper.hasPosTagPart(tokens[idx], posTag)) {
      return true;
    }
    idx++;
  }
  return false;
}

/** Direction in which tokenSearch walks the token array. */
enum Dir {FORWARD, REVERSE}

// Matches a token consisting of exactly one of these quotation marks: « » „ “
private static final Pattern QUOTES = Pattern.compile("[«»„“]");

/**
 * Walks from {@code pos} in the given direction looking for a token that
 * passes the POS-tag and token-text filters.
 *
 * <p>Tokens that do not match may only be stepped over if they carry a POS tag
 * matching {@code posTagsToIgnore} or are a single quotation mark; any other
 * token ends the search. Index 0 is never inspected.
 *
 * @param tokens sentence tokens
 * @param pos index to start at (inclusive)
 * @param posTag POS tag part to look for, or {@code null} for any
 * @param token pattern the token text must fully match, or {@code null} for any
 * @param posTagsToIgnore POS tags of tokens that may be skipped
 * @param dir search direction
 * @return index of the first matching token, or -1 if none was found
 */
static int tokenSearch(AnalyzedTokenReadings[] tokens, int pos, String posTag, Pattern token, Pattern posTagsToIgnore, Dir dir) {
  final int delta = (dir == Dir.FORWARD) ? 1 : -1;

  int idx = pos;
  while (idx > 0 && idx < tokens.length) {
    AnalyzedTokenReadings current = tokens[idx];

    boolean posTagOk = posTag == null || PosTagHelper.hasPosTagPart(current, posTag);
    if (posTagOk && (token == null || token.matcher(current.getToken()).matches())) {
      return idx;
    }

    boolean skippable = PosTagHelper.hasPosTag(current, posTagsToIgnore)
        || QUOTES.matcher(current.getToken()).matches();
    if (!skippable) {
      return -1;
    }

    idx += delta;
  }

  return -1;
}

/**
 * Boolean form of {@code revSearchIdx}: {@code true} when that search yields an
 * index other than -1.
 */
static boolean revSearch(AnalyzedTokenReadings[] tokens, int startPos, Pattern lemma, String postagRegex) {
  int foundAt = LemmaHelper.revSearchIdx(tokens, startPos, lemma, postagRegex);
  return foundAt != -1;
}

/**
 * Steps back from {@code startPos} over a short run of optional tokens and then
 * checks the token it lands on.
 *
 * <p>At most three tokens are stepped over, in this fixed order and each only
 * if {@code PosTagHelper.hasPosTag} reports a match: one with tag regex
 * {@code "part.*"}, then one with {@code "adv(:.*)?|.*pron.*"}, then another
 * with {@code "part.*"}. The step order matters because each check looks at the
 * position left by the previous one.
 *
 * @param tokens sentence tokens
 * @param startPos index to start from
 * @param lemma lemma pattern the landing token must have, or {@code null} for any
 * @param postagRegex POS tag regex the landing token must have, or {@code null} for any
 * @return the landing index (always greater than 0) if it passes both filters, otherwise -1
 */
static int revSearchIdx(AnalyzedTokenReadings[] tokens, int startPos, Pattern lemma, String postagRegex) {
// step 1: optionally step over one "part.*" token
if( startPos > 0 && PosTagHelper.hasPosTag(tokens[startPos], "part.*") ) {
// if( startPos > 0 && LemmaHelper.hasLemma(tokens[startPos], Arrays.asList("б", "би")) ) {
startPos -= 1;
}

// step 2: optionally step over one adverb / pronoun-tagged token
if( startPos > 0 && PosTagHelper.hasPosTag(tokens[startPos], "adv(:.*)?|.*pron.*") ) {
startPos -= 1;
}

// step 3: optionally step over one more "part.*" token
if( startPos > 0 && PosTagHelper.hasPosTag(tokens[startPos], "part.*") ) {
startPos -= 1;
}

// index 0 is never accepted as a result
if( startPos > 0 ) {
if( lemma != null && ! hasLemma(tokens[startPos], lemma) )
return -1;
if( postagRegex != null && ! PosTagHelper.hasPosTag(tokens[startPos], postagRegex) )
return -1;

return startPos;
}

return -1;
}

}
Expand Up @@ -84,13 +84,6 @@ public String getDescription() {
public String getShort() {
return "Узгодження слів у реченні";
}
/**
* Indicates if the rule is case-sensitive.
* @return true if the rule is case-sensitive, false otherwise.
*/
public boolean isCaseSensitive() {
return false;
}

@Override
public final RuleMatch[] match(AnalyzedSentence text) {
Expand Down Expand Up @@ -206,7 +199,7 @@ public final RuleMatch[] match(AnalyzedSentence text) {
}

// System.out.println("For " + tokenReadings + " to match " + posTagsToFind + " of " + reqTokenReadings.getToken());
if( ! getReadingWithVidmPosTag(posTagsToFind, tokenReadings) ) {
if( ! hasVidmPosTag(posTagsToFind, tokenReadings) ) {
if( isTokenToSkip(tokenReadings) )
continue;

Expand Down Expand Up @@ -335,14 +328,6 @@ else if( prep.equalsIgnoreCase("до") ) {
// reqTokenReadings = null;
continue;
}
// // спиралося на місячної давнини рішення
// if (prep.equalsIgnoreCase("на") && posTag.matches("adj.*:[mfn]:v_rod.*")) {
// String gender = PosTagHelper.getGender(posTag);
// if ( hasPosTag(tokens[i+1], "noun.*:"+gender+":v_rod.*")) {
// i+=1;
// continue;
// }
// }
}
}

Expand Down Expand Up @@ -395,7 +380,7 @@ private boolean isTokenToSkip(AnalyzedTokenReadings tokenReadings) {
// return false;
// }

private boolean getReadingWithVidmPosTag(Collection<String> posTagsToFind, AnalyzedTokenReadings tokenReadings) {
static boolean hasVidmPosTag(Collection<String> posTagsToFind, AnalyzedTokenReadings tokenReadings) {
boolean vidminokFound = false; // because POS dictionary is not complete

for(AnalyzedToken token: tokenReadings) {
Expand Down

0 comments on commit 8ac5740

Please sign in to comment.