Skip to content

Commit

Permalink
[uk] sentence and word tokenization improvement; new simple replaceme…
Browse files Browse the repository at this point in the history
…nt suggestions; new compound prefixes
  • Loading branch information
arysin committed Mar 28, 2015
1 parent db37646 commit 4257280
Show file tree
Hide file tree
Showing 15 changed files with 301 additions and 36 deletions.
Expand Up @@ -11,6 +11,7 @@
import org.languagetool.rules.RuleMatch; import org.languagetool.rules.RuleMatch;
import org.languagetool.rules.WordRepeatRule; import org.languagetool.rules.WordRepeatRule;
import org.languagetool.tagging.uk.IPOSTag; import org.languagetool.tagging.uk.IPOSTag;
import org.languagetool.tagging.uk.PosTagHelper;


/** /**
* @since 2.9 * @since 2.9
Expand Down Expand Up @@ -48,11 +49,13 @@ public boolean ignore(AnalyzedTokenReadings[] tokens, int position) {
if( REPEAT_ALLOWED_CAPS_SET.contains(token) ) if( REPEAT_ALLOWED_CAPS_SET.contains(token) )
return true; return true;


if( PosTagHelper.hasPosTag(analyzedTokenReadings, "date|time|number") )
return true;

for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) { for(AnalyzedToken analyzedToken: analyzedTokenReadings.getReadings()) {
String posTag = analyzedToken.getPOSTag(); String posTag = analyzedToken.getPOSTag();
if( posTag != null ) { if( posTag != null ) {
if (! posTag.equals(IPOSTag.number.getText()) if ( ! isInitial(analyzedToken, tokens, position)
&& ! isInitial(analyzedToken, tokens, position)
// && ! posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME) // && ! posTag.equals(JLanguageTool.SENTENCE_START_TAGNAME)
&& ! posTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) ) && ! posTag.equals(JLanguageTool.SENTENCE_END_TAGNAME) )
return false; return false;
Expand Down
Expand Up @@ -29,6 +29,7 @@ public enum IPOSTag {
numr("numr"), numr("numr"),
number("number"), number("number"),
date("date"), date("date"),
time("time"),
advp("advp"), advp("advp"),
predic("predic"), predic("predic"),
insert("insert"), insert("insert"),
Expand Down
Expand Up @@ -67,6 +67,7 @@ public class UkrainianTagger extends BaseTagger {
private static final Pattern NOUN_SING_V_ROD_REGEX = Pattern.compile("noun:[mfn]:v_rod.*"); private static final Pattern NOUN_SING_V_ROD_REGEX = Pattern.compile("noun:[mfn]:v_rod.*");
private static final Pattern NOUN_V_NAZ_REGEX = Pattern.compile("noun:.:v_naz.*"); private static final Pattern NOUN_V_NAZ_REGEX = Pattern.compile("noun:.:v_naz.*");
private static final Pattern SING_REGEX_F = Pattern.compile(":[mfn]:"); private static final Pattern SING_REGEX_F = Pattern.compile(":[mfn]:");
private static final Pattern O_ADJ_PATTERN = Pattern.compile(".*(о|[чшщ]е)");


// private static final String VERB_TAG_FOR_REV_IMPR = IPOSTag.verb.getText()+":rev:impr"; // private static final String VERB_TAG_FOR_REV_IMPR = IPOSTag.verb.getText()+":rev:impr";
// private static final String VERB_TAG_FOR_IMPR = IPOSTag.verb.getText()+":impr"; // private static final String VERB_TAG_FOR_IMPR = IPOSTag.verb.getText()+":impr";
Expand All @@ -75,14 +76,15 @@ public class UkrainianTagger extends BaseTagger {
// full latin number regex: M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3}) // full latin number regex: M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})
private static final Pattern NUMBER = Pattern.compile("[+-±]?[€₴\\$]?[0-9]+(,[0-9]+)?([-–—][0-9]+(,[0-9]+)?)?(%|°С?)?|(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"); private static final Pattern NUMBER = Pattern.compile("[+-±]?[€₴\\$]?[0-9]+(,[0-9]+)?([-–—][0-9]+(,[0-9]+)?)?(%|°С?)?|(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})");
private static final Pattern DATE = Pattern.compile("[\\d]{2}\\.[\\d]{2}\\.[\\d]{4}"); private static final Pattern DATE = Pattern.compile("[\\d]{2}\\.[\\d]{2}\\.[\\d]{4}");
private static final Pattern TIME = Pattern.compile("([01]?[0-9]|2[0-3])[.:][0-5][0-9]");
private static final String stdNounTag = IPOSTag.noun.getText() + ":.:v_"; private static final String stdNounTag = IPOSTag.noun.getText() + ":.:v_";
private static final int stdNounTagLen = stdNounTag.length(); private static final int stdNounTagLen = stdNounTag.length();
private static final Pattern stdNounTagRegex = Pattern.compile(stdNounTag + ".*"); private static final Pattern stdNounTagRegex = Pattern.compile(stdNounTag + ".*");
// private static final Pattern stdNounNvTagRegex = Pattern.compile(IPOSTag.noun.getText() + ".*:nv.*"); // private static final Pattern stdNounNvTagRegex = Pattern.compile(IPOSTag.noun.getText() + ".*:nv.*");
private static final Set<String> dashPrefixes; private static final Set<String> dashPrefixes;
private static final Set<String> leftMasterSet; private static final Set<String> leftMasterSet;
private static final Set<String> cityAvenue = new HashSet<>(Arrays.asList("сіті", "авеню", "стріт", "штрассе")); private static final Set<String> cityAvenue = new HashSet<>(Arrays.asList("сіті", "авеню", "стріт", "штрассе"));
private static final Map<String, String> rightPartsWithLeftTagMap = new HashMap<>(); private static final Map<String, Pattern> rightPartsWithLeftTagMap = new HashMap<>();
private static final Set<String> slaveSet; private static final Set<String> slaveSet;


public static final Map<String, String> VIDMINKY_MAP; public static final Map<String, String> VIDMINKY_MAP;
Expand Down Expand Up @@ -120,11 +122,11 @@ public class UkrainianTagger extends BaseTagger {
// map2.put("тих", Arrays.asList(":p:v_rod", ":p:v_zna")); // map2.put("тих", Arrays.asList(":p:v_rod", ":p:v_zna"));
NUMR_ENDING_MAP = Collections.unmodifiableMap(map2); NUMR_ENDING_MAP = Collections.unmodifiableMap(map2);


rightPartsWithLeftTagMap.put("бо", "(verb(:rev)?:impr|.*pron|noun|adv|excl|part|predic).*"); rightPartsWithLeftTagMap.put("бо", Pattern.compile("(verb(:rev)?:impr|.*pron|noun|adv|excl|part|predic).*"));
rightPartsWithLeftTagMap.put("но", "(verb(:rev)?:(impr|futr)|excl).*"); rightPartsWithLeftTagMap.put("но", Pattern.compile("(verb(:rev)?:(impr|futr)|excl).*"));
rightPartsWithLeftTagMap.put("от", "(.*pron|adv|part).*"); rightPartsWithLeftTagMap.put("от", Pattern.compile("(.*pron|adv|part).*"));
rightPartsWithLeftTagMap.put("то", "(.*pron|noun|adv|part|conj).*"); rightPartsWithLeftTagMap.put("то", Pattern.compile("(.*pron|noun|adv|part|conj).*"));
rightPartsWithLeftTagMap.put("таки", "(verb(:rev)?:(futr|past|pres)|.*pron|noun|part|predic|insert).*"); rightPartsWithLeftTagMap.put("таки", Pattern.compile("(verb(:rev)?:(futr|past|pres)|.*pron|noun|part|predic|insert).*"));


dashPrefixes = loadSet("/uk/dash_prefixes.txt"); dashPrefixes = loadSet("/uk/dash_prefixes.txt");
leftMasterSet = loadSet("/uk/dash_left_master.txt"); leftMasterSet = loadSet("/uk/dash_left_master.txt");
Expand Down Expand Up @@ -159,13 +161,19 @@ public UkrainianTagger() {
@Override @Override
public List<AnalyzedToken> additionalTags(String word, WordTagger wordTagger) { public List<AnalyzedToken> additionalTags(String word, WordTagger wordTagger) {
if ( NUMBER.matcher(word).matches() ) { if ( NUMBER.matcher(word).matches() ) {
List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>(); List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
additionalTaggedTokens.add(new AnalyzedToken(word, IPOSTag.number.getText(), word)); additionalTaggedTokens.add(new AnalyzedToken(word, IPOSTag.number.getText(), word));
return additionalTaggedTokens; return additionalTaggedTokens;
} }


if ( TIME.matcher(word).matches() ) {
List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
additionalTaggedTokens.add(new AnalyzedToken(word, IPOSTag.time.getText(), word));
return additionalTaggedTokens;
}

if ( DATE.matcher(word).matches() ) { if ( DATE.matcher(word).matches() ) {
List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>(); List<AnalyzedToken> additionalTaggedTokens = new ArrayList<>();
additionalTaggedTokens.add(new AnalyzedToken(word, IPOSTag.date.getText(), word)); additionalTaggedTokens.add(new AnalyzedToken(word, IPOSTag.date.getText(), word));
return additionalTaggedTokens; return additionalTaggedTokens;
} }
Expand Down Expand Up @@ -203,13 +211,13 @@ private List<AnalyzedToken> guessCompoundTag(String word) {
if( leftWdList.isEmpty() ) if( leftWdList.isEmpty() )
return null; return null;


String leftTagRegex = rightPartsWithLeftTagMap.get(rightWord); Pattern leftTagRegex = rightPartsWithLeftTagMap.get(rightWord);


List<AnalyzedToken> leftAnalyzedTokens = asAnalyzedTokenListForTaggedWords(leftWord, leftWdList); List<AnalyzedToken> leftAnalyzedTokens = asAnalyzedTokenListForTaggedWords(leftWord, leftWdList);
List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(leftAnalyzedTokens.size()); List<AnalyzedToken> newAnalyzedTokens = new ArrayList<>(leftAnalyzedTokens.size());
for (AnalyzedToken analyzedToken : leftAnalyzedTokens) { for (AnalyzedToken analyzedToken : leftAnalyzedTokens) {
String posTag = analyzedToken.getPOSTag(); String posTag = analyzedToken.getPOSTag();
if( posTag.matches(leftTagRegex) ) { if( posTag != null && leftTagRegex.matcher(posTag).matches() ) {
newAnalyzedTokens.add(new AnalyzedToken(word, posTag, analyzedToken.getLemma())); newAnalyzedTokens.add(new AnalyzedToken(word, posTag, analyzedToken.getLemma()));
} }
} }
Expand Down Expand Up @@ -301,7 +309,7 @@ else if( rightWord.endsWith("ський") ) {
} }
} }


if( leftWord.endsWith("о") ) { if( O_ADJ_PATTERN.matcher(leftWord).matches() ) {
return oAdjMatch(word, rightAnalyzedTokens, leftWord); return oAdjMatch(word, rightAnalyzedTokens, leftWord);
} }


Expand Down
Expand Up @@ -39,18 +39,34 @@ public class UkrainianWordTokenizer implements Tokenizer {
+ "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f" + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
+ "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d" + "\u205F\u2060\u2061\u2062\u2063\u206A\u206b\u206c\u206d"
+ "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb" + "\u206E\u206F\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
+ ",.;()[]{}<>!?:/|\\\"«»„”“`´‘‛′…¿¡\t\n\r"; + ",.;()[]{}<>!?:/|\\\"«»„”“`´‘‛′…¿¡\t\n\r\uE100";


// decimal comma between digits // decimal comma between digits
private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); private static final Pattern DECIMAL_COMMA_PATTERN = Pattern.compile("([\\d]),([\\d])", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
private static final char DECIMAL_COMMA_SUBST = '\uE001'; // some unused character to hide comma in decimal number temporary for tokenizer run private static final char DECIMAL_COMMA_SUBST = '\uE001'; // some unused character to hide comma in decimal number temporary for tokenizer run
// space between digits
// private static final Pattern DECIMAL_SPACE_PATTERN = Pattern.compile("([\\d]{1,3})( ([\\d]{3}))+", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
// private static final char DECIMAL_SPACE_SUBST = '\uE008';
// dots in numbers
private static final Pattern DOTTED_NUMBERS_PATTERN = Pattern.compile("([\\d])\\.([\\d])", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
private static final char NUMBER_DOT_SUBST = '\uE002';
// colon in numbers
private static final Pattern COLON_NUMBERS_PATTERN = Pattern.compile("([\\d]):([\\d])", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
private static final char COLON_DOT_SUBST = '\uE003';
// dates // dates
private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); private static final Pattern DATE_PATTERN = Pattern.compile("([\\d]{2})\\.([\\d]{2})\\.([\\d]{4})|([\\d]{4})\\.([\\d]{2})\\.([\\d]{2})|([\\d]{4})-([\\d]{2})-([\\d]{2})", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
private static final char DATE_DOT_SUBST = '\uE002'; // some unused character to hide dot in date temporary for tokenizer run private static final char DATE_DOT_SUBST = '\uE004'; // some unused character to hide dot in date temporary for tokenizer run
// braces in words // braces in words
private static final Pattern BRACE_IN_WORD_PATTERN = Pattern.compile("([а-яіїєґ'])\\(([а-яіїєґ']+)\\)", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE); private static final Pattern BRACE_IN_WORD_PATTERN = Pattern.compile("([а-яіїєґ'])\\(([а-яіїєґ']+)\\)", Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
private static final char LEFT_BRACE_SUBST = '\uE003'; private static final char LEFT_BRACE_SUBST = '\uE005';
private static final char RIGHT_BRACE_SUBST = '\uE004'; private static final char RIGHT_BRACE_SUBST = '\uE006';
// abbreviation dot
//TODO: also use abbreviation list to allow next letter to be capital
private static final Pattern ABBR_DOT_PATTERN = Pattern.compile("([а-яіїєґ])\\. ([а-яіїєґ])");
private static final char ABBR_DOT_SUBST = '\uE007';
// ellipsis
private static final String ELLIPSIS = "...";
private static final String ELLIPSIS_SUBST = "\uE100";




public UkrainianWordTokenizer() { public UkrainianWordTokenizer() {
Expand All @@ -66,11 +82,25 @@ public List<String> tokenize(String text) {


if( text.contains(".") ) { if( text.contains(".") ) {
text = DATE_PATTERN.matcher(text).replaceAll("$1" + DATE_DOT_SUBST + "$2" + DATE_DOT_SUBST + "$3"); text = DATE_PATTERN.matcher(text).replaceAll("$1" + DATE_DOT_SUBST + "$2" + DATE_DOT_SUBST + "$3");
text = DOTTED_NUMBERS_PATTERN.matcher(text).replaceAll("$1" + NUMBER_DOT_SUBST + "$2");
text = ABBR_DOT_PATTERN.matcher(text).replaceAll("$1" + ABBR_DOT_SUBST + " $2");
}

if( text.contains(":") ) {
text = COLON_NUMBERS_PATTERN.matcher(text).replaceAll("$1" + COLON_DOT_SUBST + "$2");
} }


if( text.contains("(") ) { if( text.contains("(") ) {
text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll("$1" + LEFT_BRACE_SUBST + "$2" + RIGHT_BRACE_SUBST); text = BRACE_IN_WORD_PATTERN.matcher(text).replaceAll("$1" + LEFT_BRACE_SUBST + "$2" + RIGHT_BRACE_SUBST);
} }

// if( text.contains(" ") ) {
// text = DECIMAL_SPACE_PATTERN.matcher(text).replaceAll("$1" + DECIMAL_SPACE_SUBST + "$2");
// }

if( text.contains(ELLIPSIS) ) {
text = text.replace(ELLIPSIS, ELLIPSIS_SUBST);
}


List<String> tokenList = new ArrayList<>(); List<String> tokenList = new ArrayList<>();
StringTokenizer st = new StringTokenizer(text, SPLIT_CHARS, true); StringTokenizer st = new StringTokenizer(text, SPLIT_CHARS, true);
Expand All @@ -79,9 +109,16 @@ public List<String> tokenize(String text) {
String token = st.nextToken(); String token = st.nextToken();


token = token.replace(DECIMAL_COMMA_SUBST, ','); token = token.replace(DECIMAL_COMMA_SUBST, ',');

//TODO: merge all dots to speed things up ???
token = token.replace(DATE_DOT_SUBST, '.'); token = token.replace(DATE_DOT_SUBST, '.');
token = token.replace(NUMBER_DOT_SUBST, '.');
token = token.replace(ABBR_DOT_SUBST, '.');

token = token.replace(COLON_DOT_SUBST, ':');
token = token.replace(LEFT_BRACE_SUBST, '('); token = token.replace(LEFT_BRACE_SUBST, '(');
token = token.replace(RIGHT_BRACE_SUBST, ')'); token = token.replace(RIGHT_BRACE_SUBST, ')');
token = token.replaceAll(ELLIPSIS_SUBST, ELLIPSIS);


tokenList.add( token ); tokenList.add( token );
} }
Expand Down
Expand Up @@ -10,13 +10,15 @@ CD
CDMA CDMA
CFI CFI
CNG CNG
css
DDoS DDoS
DNS DNS
DoS DoS
DSL DSL
dvd dvd
e e
fashion fashion
feed
FM FM
ftp ftp
G G
Expand All @@ -28,10 +30,13 @@ GSM
HD HD
HR HR
HSDPA HSDPA
HTML
ID ID
IMEA IMEA
IP IP
IT IT
java
javascript
led led
LCD LCD
LNG LNG
Expand All @@ -43,6 +48,7 @@ n
OSB OSB
pdf pdf
PhD PhD
PHP
PIN PIN
POS POS
pr pr
Expand Down
Expand Up @@ -905,6 +905,27 @@ $Id$
<example>рахуючись місцями</example> <example>рахуючись місцями</example>
</rule> </rule>
--> -->
<rule>
<pattern>
<token regexp="yes" inflected="yes">стикатися|зіткнутися|стикнутися|зіткнення</token>
<token>з</token>
<token postag_regexp="yes" postag="adv.*" min="0"/>
<token>труднощами</token>
</pattern>
<message>Правильно: <suggestion>натрапляти на труднощі</suggestion>, <suggestion>поставати перед труднощами</suggestion></message>
<example correction="натрапляти на труднощі|поставати перед труднощами">ми <marker>зіткнулися з труднощами</marker></example>
</rule>

<rule>
<pattern>
<token regexp="yes" inflected="yes">стикатися|зіткнутися|стикнутися|зіткнення</token>
<token>з</token>
<token postag_regexp="yes" postag="adv.*" min="0"/>
<token regexp="yes">фактом|фактами</token>
</pattern>
<message>Правильно: <suggestion>подибувати факт</suggestion>, <suggestion>натрапляти на факт</suggestion></message>
<example correction="подибувати факт|натрапляти на факт">ми знову <marker>зіткнулися з фактом</marker></example>
</rule>
</rulegroup> </rulegroup>


<rulegroup id="NON_LEXEM_BORROWING_ADJ" name="Кальки: прикметники"> <rulegroup id="NON_LEXEM_BORROWING_ADJ" name="Кальки: прикметники">
Expand Down
Expand Up @@ -173,8 +173,8 @@ $Id$
<token case_sensitive="yes" postag_regexp="yes" postag="noun:m:v_rod.*" regexp="yes">[А-ЯІЇЄҐ].*</token> <token case_sensitive="yes" postag_regexp="yes" postag="noun:m:v_rod.*" regexp="yes">[А-ЯІЇЄҐ].*</token>
</antipattern> </antipattern>
<pattern> <pattern>
<token regexp="yes">два|дві|обидва|обидві|три|чотири|[^,-]*[2-9]?[234] <token regexp="yes">два|дві|обидва|обидві|три|чотири|[^,.№:-]*[2-9]?[234]
<exception postag="date"/> <exception postag_regexp="yes" postag="date|time"/>
<exception scope="previous" regexp="yes">[:/№]|статт[яі]|пункті?|частин[аи]|\.</exception> <!-- ст.,ч.,п. тощо --> <exception scope="previous" regexp="yes">[:/№]|статт[яі]|пункті?|частин[аи]|\.</exception> <!-- ст.,ч.,п. тощо -->
</token> </token>
<token postag_regexp="yes" postag="adj.*" min="0"> <token postag_regexp="yes" postag="adj.*" min="0">
Expand Down
Expand Up @@ -30,14 +30,16 @@ $Id$
<pattern> <pattern>
<token>будь</token> <token>будь</token>
<token>ласка</token> <token>ласка</token>
<token negate="yes" regexp="yes">[,\.!?:»]</token> <token negate="yes" regexp="yes">[,\.!?:»]|[.!?]{3}</token>
</pattern> </pattern>
<message>Відсутня права кома: <suggestion>\1 \2, \3</suggestion>.</message> <message>Відсутня права кома: <suggestion>\1 \2, \3</suggestion>.</message>
<example correction="будь ласка, у">Запитайте, <marker>будь ласка у</marker> водія.</example> <example correction="будь ласка, у">Запитайте, <marker>будь ласка у</marker> водія.</example>
</rule> </rule>
<rule> <rule>
<pattern> <pattern>
<token negate="yes" regexp="yes">[\p{Punct}–—\(«]<exception postag="SENT_START"></exception></token> <token negate="yes" regexp="yes">[\p{Punct}–—\(«]|[!?.]{3}
<exception postag="SENT_START"/>
</token>
<token>будь</token> <token>будь</token>
<token>ласка</token> <token>ласка</token>
</pattern> </pattern>
Expand All @@ -51,27 +53,28 @@ $Id$
<rule> <rule>
<pattern> <pattern>
<token regexp="yes">мабуть|по-перше|по-друге|по-третє|щоправда|о?крім того|а втім</token> <token regexp="yes">мабуть|по-перше|по-друге|по-третє|щоправда|о?крім того|а втім</token>
<token negate="yes" regexp="yes">[\p{Punct}–—»]</token> <token negate="yes" regexp="yes">[\p{Punct}–—»]|[!?.]{3}</token>
</pattern> </pattern>
<message>Відсутня права кома: <suggestion>\1, \2</suggestion>.</message> <message>Відсутня права кома: <suggestion>\1, \2</suggestion>.</message>
<example>Це, мабуть, його водій.</example> <example>Це, мабуть, його водій.</example>
<example correction="мабуть, його">Це, <marker>мабуть його</marker> водій.</example> <example correction="мабуть, його">Це, <marker>мабуть його</marker> водій.</example>
</rule> </rule>
<rule> <rule>
<pattern case_sensitive="yes"> <pattern case_sensitive="yes">
<token negate="yes" regexp="yes">[\p{Punct}–—\(«АаІій] <token negate="yes" regexp="yes">[\p{Punct}–—\(«АаІій]|[!?.]{3}
<exception postag="SENT_START"/> <exception postag="SENT_START"/>
</token> </token>
<token regexp="yes">мабуть|щоправда|о?крім того|втім</token> <token regexp="yes">мабуть|щоправда|о?крім того|втім</token>
</pattern> </pattern>
<message>Відсутня ліва кома: <suggestion>\1, \2</suggestion>.</message> <message>Відсутня ліва кома: <suggestion>\1, \2</suggestion>.</message>
<example>Мабуть, це водій.</example> <example>Мабуть, це водій.</example>
<example>Це, мабуть, водій.</example> <example>Це, мабуть, водій.</example>
<example>Але це... мабуть, водій.</example>
<example correction="Це, мабуть"><marker>Це мабуть</marker>, водій.</example> <example correction="Це, мабуть"><marker>Це мабуть</marker>, водій.</example>
</rule> </rule>
<rule> <rule>
<pattern case_sensitive="yes"> <pattern case_sensitive="yes">
<token negate="yes" regexp="yes">[\p{Punct}–—\(«АаІі]|[Цц]е <token negate="yes" regexp="yes">[\p{Punct}–—\(«АаІі]|[!?.]{3}|[Цц]е
<exception postag="SENT_START"/> <exception postag="SENT_START"/>
<exception>Ну</exception> <exception>Ну</exception>
</token> </token>
Expand All @@ -89,7 +92,7 @@ $Id$
<rule> <rule>
<pattern case_sensitive="yes"> <pattern case_sensitive="yes">
<marker> <marker>
<token regexp="yes" negate="yes">[\p{Punct}…«\(„–—]|[нН]у|[АаІіЙй]|але <token regexp="yes" negate="yes">[\p{Punct}…«\(„–—]|[!?.]{3}|[нН]у|[АаІіЙй]|але
<exception postag="SENT_START"></exception> <exception postag="SENT_START"></exception>
</token> </token>
<token regexp="yes">а|але|однак|проте|зате</token> <token regexp="yes">а|але|однак|проте|зате</token>
Expand Down

0 comments on commit 4257280

Please sign in to comment.