Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Armenian!

  • Loading branch information...
commit 7a97a5612ae184ab97368ebe875abb0a6ae476bb 1 parent 7d5dd37
Jonathan Feinberg authored July 04, 2011
58  src/cue/lang/WordIterator.java
@@ -25,33 +25,33 @@
25 25
  * 
26 26
  */
27 27
 public class WordIterator extends IterableText {
28  
-	private static final String LETTER = "[@+\\p{javaLetter}\\p{javaDigit}]";
29  
-	private static final String JOINER = "[-.:/'’\\p{M}\\u2032\\u00A0\\u200C\\u200D~]";
30  
-	private static final Pattern WORD = Pattern.compile(LETTER + "+(" + JOINER
31  
-			+ "+" + LETTER + "+)*");
32  
-
33  
-	private final Matcher m;
34  
-	private boolean hasNext;
35  
-
36  
-	public WordIterator(final String text) {
37  
-		this.m = WORD.matcher(text == null ? "" : text);
38  
-		hasNext = m.find();
39  
-	}
40  
-
41  
-	public void remove() {
42  
-		throw new UnsupportedOperationException();
43  
-	}
44  
-
45  
-	public String next() {
46  
-		if (!hasNext) {
47  
-			throw new NoSuchElementException();
48  
-		}
49  
-		final String s = m.group();
50  
-		hasNext = m.find();
51  
-		return s;
52  
-	}
53  
-
54  
-	public boolean hasNext() {
55  
-		return hasNext;
56  
-	}
  28
+    private static final String LETTER = "[@+\\p{javaLetterOrDigit}]";
  29
+    private static final String JOINER = "[-.:/'’\\p{M}\\u2032\\u00A0\\u200C\\u200D~]";
  30
+    private static final Pattern WORD = Pattern.compile(LETTER + "+(" + JOINER + "+" + LETTER
  31
+            + "+)*");
  32
+
  33
+    private final Matcher m;
  34
+    private boolean hasNext;
  35
+
  36
+    public WordIterator(final String text) {
  37
+        this.m = WORD.matcher(text == null ? "" : text);
  38
+        hasNext = m.find();
  39
+    }
  40
+
  41
+    public void remove() {
  42
+        throw new UnsupportedOperationException();
  43
+    }
  44
+
  45
+    public String next() {
  46
+        if (!hasNext) {
  47
+            throw new NoSuchElementException();
  48
+        }
  49
+        final String s = m.group();
  50
+        hasNext = m.find();
  51
+        return s;
  52
+    }
  53
+
  54
+    public boolean hasNext() {
  55
+        return hasNext;
  56
+    }
57 57
 }
168  src/cue/lang/stop/StopWords.java
@@ -35,89 +35,87 @@
35 35
  * 
36 36
  */
37 37
 public enum StopWords {
38  
-	Arabic(), Catalan(true), Croatian(), Czech(), Dutch(), //
39  
-	Danish(), English(), Esperanto(), Farsi(), Finnish(), //
40  
-	French(true), German(), Greek(), Hindi(), Hungarian(), //
41  
-	Italian(), Latin(), Norwegian(), Polish(), Portuguese(), //
42  
-	Romanian(), Russian(), Slovenian(), Slovak(), Spanish(), //
43  
-	Swedish(), Hebrew(), Turkish(), Custom();
44  
-
45  
-	public static StopWords guess(final String text) {
46  
-		return guess(new Counter<String>(new WordIterator(text)));
47  
-	}
48  
-
49  
-	public static StopWords guess(final Counter<String> wordCounter) {
50  
-		return guess(wordCounter.getMostFrequent(50));
51  
-	}
52  
-
53  
-	public static StopWords guess(final Collection<String> words) {
54  
-		StopWords currentWinner = null;
55  
-		int currentMax = 0;
56  
-		for (final StopWords stopWords : StopWords.values()) {
57  
-			int count = 0;
58  
-			for (final String word : words) {
59  
-				if (stopWords.isStopWord(word)) {
60  
-					count++;
61  
-				}
62  
-			}
63  
-			if (count > currentMax) {
64  
-				currentWinner = stopWords;
65  
-				currentMax = count;
66  
-			}
67  
-		}
68  
-		return currentWinner;
69  
-	}
70  
-
71  
-	public final boolean stripApostrophes;
72  
-	private final Set<String> stopwords = new HashSet<String>();
73  
-
74  
-	private StopWords() {
75  
-		this(false);
76  
-	}
77  
-
78  
-	private StopWords(final boolean stripApostrophes) {
79  
-		this.stripApostrophes = stripApostrophes;
80  
-		loadLanguage();
81  
-	}
82  
-
83  
-	public boolean isStopWord(final String s) {
84  
-		if (s.length() == 1) {
85  
-			return true;
86  
-		}
87  
-		// check rightquotes as apostrophes
88  
-		return stopwords.contains(s.replace('\u2019', '\'').toLowerCase(
89  
-				Locale.ENGLISH));
90  
-	}
91  
-
92  
-	private void loadLanguage() {
93  
-		final String wordlistResource = name().toLowerCase(Locale.ENGLISH);
94  
-		if (!wordlistResource.equals("custom")) {
95  
-			readStopWords(getClass().getResourceAsStream(wordlistResource),
96  
-					Charset.forName("UTF-8"));
97  
-		}
98  
-	}
99  
-
100  
-	public void readStopWords(final InputStream inputStream,
101  
-			final Charset encoding) {
102  
-		try {
103  
-			final BufferedReader in = new BufferedReader(new InputStreamReader(
104  
-					inputStream, encoding));
105  
-			try {
106  
-				String line;
107  
-				while ((line = in.readLine()) != null) {
108  
-					line = line.replaceAll("\\|.*", "").trim();
109  
-					if (line.length() == 0) {
110  
-						continue;
111  
-					}
112  
-					for (final String w : line.split("\\s+")) {
113  
-						stopwords.add(w.toLowerCase(Locale.ENGLISH));
114  
-					}
115  
-				}
116  
-			} finally {
117  
-				in.close();
118  
-			}
119  
-		} catch (final IOException e) {
120  
-			throw new RuntimeException(e);
121  
-		}
122  
-	}
  38
+    Arabic(), Armenian(), Catalan(true), Croatian(), Czech(), Dutch(), //
  39
+    Danish(), English(), Esperanto(), Farsi(), Finnish(), //
  40
+    French(true), German(), Greek(), Hindi(), Hungarian(), //
  41
+    Italian(), Latin(), Norwegian(), Polish(), Portuguese(), //
  42
+    Romanian(), Russian(), Slovenian(), Slovak(), Spanish(), //
  43
+    Swedish(), Hebrew(), Turkish(), Custom();
  44
+
  45
+    public static StopWords guess(final String text) {
  46
+        return guess(new Counter<String>(new WordIterator(text)));
  47
+    }
  48
+
  49
+    public static StopWords guess(final Counter<String> wordCounter) {
  50
+        return guess(wordCounter.getMostFrequent(50));
  51
+    }
  52
+
  53
+    public static StopWords guess(final Collection<String> words) {
  54
+        StopWords currentWinner = null;
  55
+        int currentMax = 0;
  56
+        for (final StopWords stopWords : StopWords.values()) {
  57
+            int count = 0;
  58
+            for (final String word : words) {
  59
+                if (stopWords.isStopWord(word)) {
  60
+                    count++;
  61
+                }
  62
+            }
  63
+            if (count > currentMax) {
  64
+                currentWinner = stopWords;
  65
+                currentMax = count;
  66
+            }
  67
+        }
  68
+        return currentWinner;
  69
+    }
  70
+
  71
+    public final boolean stripApostrophes;
  72
+    private final Set<String> stopwords = new HashSet<String>();
  73
+
  74
+    private StopWords() {
  75
+        this(false);
  76
+    }
  77
+
  78
+    private StopWords(final boolean stripApostrophes) {
  79
+        this.stripApostrophes = stripApostrophes;
  80
+        loadLanguage();
  81
+    }
  82
+
  83
+    public boolean isStopWord(final String s) {
  84
+        if (s.length() == 1) {
  85
+            return true;
  86
+        }
  87
+        // check rightquotes as apostrophes
  88
+        return stopwords.contains(s.replace('\u2019', '\'').toLowerCase(Locale.ENGLISH));
  89
+    }
  90
+
  91
+    private void loadLanguage() {
  92
+        final String wordlistResource = name().toLowerCase(Locale.ENGLISH);
  93
+        if (!wordlistResource.equals("custom")) {
  94
+            readStopWords(getClass().getResourceAsStream(wordlistResource),
  95
+                    Charset.forName("UTF-8"));
  96
+        }
  97
+    }
  98
+
  99
+    public void readStopWords(final InputStream inputStream, final Charset encoding) {
  100
+        try {
  101
+            final BufferedReader in = new BufferedReader(new InputStreamReader(inputStream,
  102
+                    encoding));
  103
+            try {
  104
+                String line;
  105
+                while ((line = in.readLine()) != null) {
  106
+                    line = line.replaceAll("\\|.*", "").trim();
  107
+                    if (line.length() == 0) {
  108
+                        continue;
  109
+                    }
  110
+                    for (final String w : line.split("\\s+")) {
  111
+                        stopwords.add(w.toLowerCase(Locale.ENGLISH));
  112
+                    }
  113
+                }
  114
+            } finally {
  115
+                in.close();
  116
+            }
  117
+        } catch (final IOException e) {
  118
+            throw new RuntimeException(e);
  119
+        }
  120
+    }
123 121
 }
45  src/cue/lang/stop/armenian
... ...
@@ -0,0 +1,45 @@
  1
+այդ
  2
+այլ
  3
+այն
  4
+այս
  5
+դու
  6
+դուք
  7
+եմ
  8
+են
  9
+ենք
  10
+ես
  11
+եք
  12
  13
+էի
  14
+էին
  15
+էինք
  16
+էիր
  17
+էիք
  18
+էր
  19
+ըստ
  20
  21
  22
+ին
  23
+իսկ
  24
+իր
  25
+կամ
  26
+համար
  27
+հետ
  28
+հետո
  29
+մենք
  30
+մեջ
  31
+մի
  32
  33
+նա
  34
+նաև
  35
+նրա
  36
+նրանք
  37
+որ
  38
+որը
  39
+որոնք
  40
+որպես
  41
+ու
  42
+ում
  43
+պիտի
  44
+վրա
  45

0 notes on commit 7a97a56

Please sign in to comment.
Something went wrong with that request. Please try again.