Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Armenian!

  • Loading branch information...
commit 7a97a5612ae184ab97368ebe875abb0a6ae476bb 1 parent 7d5dd37
@jdf authored
View
58 src/cue/lang/WordIterator.java
@@ -25,33 +25,33 @@
*
*/
public class WordIterator extends IterableText {
- private static final String LETTER = "[@+\\p{javaLetter}\\p{javaDigit}]";
- private static final String JOINER = "[-.:/'’\\p{M}\\u2032\\u00A0\\u200C\\u200D~]";
- private static final Pattern WORD = Pattern.compile(LETTER + "+(" + JOINER
- + "+" + LETTER + "+)*");
-
- private final Matcher m;
- private boolean hasNext;
-
- public WordIterator(final String text) {
- this.m = WORD.matcher(text == null ? "" : text);
- hasNext = m.find();
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
- public String next() {
- if (!hasNext) {
- throw new NoSuchElementException();
- }
- final String s = m.group();
- hasNext = m.find();
- return s;
- }
-
- public boolean hasNext() {
- return hasNext;
- }
+ private static final String LETTER = "[@+\\p{javaLetterOrDigit}]";
+ private static final String JOINER = "[-.:/'’\\p{M}\\u2032\\u00A0\\u200C\\u200D~]";
+ private static final Pattern WORD = Pattern.compile(LETTER + "+(" + JOINER + "+" + LETTER
+ + "+)*");
+
+ private final Matcher m;
+ private boolean hasNext;
+
+ public WordIterator(final String text) {
+ this.m = WORD.matcher(text == null ? "" : text);
+ hasNext = m.find();
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ public String next() {
+ if (!hasNext) {
+ throw new NoSuchElementException();
+ }
+ final String s = m.group();
+ hasNext = m.find();
+ return s;
+ }
+
+ public boolean hasNext() {
+ return hasNext;
+ }
}
View
168 src/cue/lang/stop/StopWords.java
@@ -35,89 +35,87 @@
*
*/
public enum StopWords {
- Arabic(), Catalan(true), Croatian(), Czech(), Dutch(), //
- Danish(), English(), Esperanto(), Farsi(), Finnish(), //
- French(true), German(), Greek(), Hindi(), Hungarian(), //
- Italian(), Latin(), Norwegian(), Polish(), Portuguese(), //
- Romanian(), Russian(), Slovenian(), Slovak(), Spanish(), //
- Swedish(), Hebrew(), Turkish(), Custom();
-
- public static StopWords guess(final String text) {
- return guess(new Counter<String>(new WordIterator(text)));
- }
-
- public static StopWords guess(final Counter<String> wordCounter) {
- return guess(wordCounter.getMostFrequent(50));
- }
-
- public static StopWords guess(final Collection<String> words) {
- StopWords currentWinner = null;
- int currentMax = 0;
- for (final StopWords stopWords : StopWords.values()) {
- int count = 0;
- for (final String word : words) {
- if (stopWords.isStopWord(word)) {
- count++;
- }
- }
- if (count > currentMax) {
- currentWinner = stopWords;
- currentMax = count;
- }
- }
- return currentWinner;
- }
-
- public final boolean stripApostrophes;
- private final Set<String> stopwords = new HashSet<String>();
-
- private StopWords() {
- this(false);
- }
-
- private StopWords(final boolean stripApostrophes) {
- this.stripApostrophes = stripApostrophes;
- loadLanguage();
- }
-
- public boolean isStopWord(final String s) {
- if (s.length() == 1) {
- return true;
- }
- // check rightquotes as apostrophes
- return stopwords.contains(s.replace('\u2019', '\'').toLowerCase(
- Locale.ENGLISH));
- }
-
- private void loadLanguage() {
- final String wordlistResource = name().toLowerCase(Locale.ENGLISH);
- if (!wordlistResource.equals("custom")) {
- readStopWords(getClass().getResourceAsStream(wordlistResource),
- Charset.forName("UTF-8"));
- }
- }
-
- public void readStopWords(final InputStream inputStream,
- final Charset encoding) {
- try {
- final BufferedReader in = new BufferedReader(new InputStreamReader(
- inputStream, encoding));
- try {
- String line;
- while ((line = in.readLine()) != null) {
- line = line.replaceAll("\\|.*", "").trim();
- if (line.length() == 0) {
- continue;
- }
- for (final String w : line.split("\\s+")) {
- stopwords.add(w.toLowerCase(Locale.ENGLISH));
- }
- }
- } finally {
- in.close();
- }
- } catch (final IOException e) {
- throw new RuntimeException(e);
- }
- }
+ Arabic(), Armenian(), Catalan(true), Croatian(), Czech(), Dutch(), //
+ Danish(), English(), Esperanto(), Farsi(), Finnish(), //
+ French(true), German(), Greek(), Hindi(), Hungarian(), //
+ Italian(), Latin(), Norwegian(), Polish(), Portuguese(), //
+ Romanian(), Russian(), Slovenian(), Slovak(), Spanish(), //
+ Swedish(), Hebrew(), Turkish(), Custom();
+
+ public static StopWords guess(final String text) {
+ return guess(new Counter<String>(new WordIterator(text)));
+ }
+
+ public static StopWords guess(final Counter<String> wordCounter) {
+ return guess(wordCounter.getMostFrequent(50));
+ }
+
+ public static StopWords guess(final Collection<String> words) {
+ StopWords currentWinner = null;
+ int currentMax = 0;
+ for (final StopWords stopWords : StopWords.values()) {
+ int count = 0;
+ for (final String word : words) {
+ if (stopWords.isStopWord(word)) {
+ count++;
+ }
+ }
+ if (count > currentMax) {
+ currentWinner = stopWords;
+ currentMax = count;
+ }
+ }
+ return currentWinner;
+ }
+
+ public final boolean stripApostrophes;
+ private final Set<String> stopwords = new HashSet<String>();
+
+ private StopWords() {
+ this(false);
+ }
+
+ private StopWords(final boolean stripApostrophes) {
+ this.stripApostrophes = stripApostrophes;
+ loadLanguage();
+ }
+
+ public boolean isStopWord(final String s) {
+ if (s.length() == 1) {
+ return true;
+ }
+ // check rightquotes as apostrophes
+ return stopwords.contains(s.replace('\u2019', '\'').toLowerCase(Locale.ENGLISH));
+ }
+
+ private void loadLanguage() {
+ final String wordlistResource = name().toLowerCase(Locale.ENGLISH);
+ if (!wordlistResource.equals("custom")) {
+ readStopWords(getClass().getResourceAsStream(wordlistResource),
+ Charset.forName("UTF-8"));
+ }
+ }
+
+ public void readStopWords(final InputStream inputStream, final Charset encoding) {
+ try {
+ final BufferedReader in = new BufferedReader(new InputStreamReader(inputStream,
+ encoding));
+ try {
+ String line;
+ while ((line = in.readLine()) != null) {
+ line = line.replaceAll("\\|.*", "").trim();
+ if (line.length() == 0) {
+ continue;
+ }
+ for (final String w : line.split("\\s+")) {
+ stopwords.add(w.toLowerCase(Locale.ENGLISH));
+ }
+ }
+ } finally {
+ in.close();
+ }
+ } catch (final IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
}
View
45 src/cue/lang/stop/armenian
@@ -0,0 +1,45 @@
+այդ
+այլ
+այն
+այս
+դու
+դուք
+եմ
+են
+ենք
+ես
+եք
+էի
+էին
+էինք
+էիր
+էիք
+էր
+ըստ
+ին
+իսկ
+իր
+կամ
+համար
+հետ
+հետո
+մենք
+մեջ
+մի
+նա
+նաև
+նրա
+նրանք
+որ
+որը
+որոնք
+որպես
+ու
+ում
+պիտի
+վրա
Please sign in to comment.
Something went wrong with that request. Please try again.