-
Notifications
You must be signed in to change notification settings - Fork 10
/
DocumentFrequencyCounter.java
197 lines (161 loc) · 6.67 KB
/
DocumentFrequencyCounter.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
package me.foldl.corenlp_summarizer;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.ReaderInputStream;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.trees.international.spanish.SpanishTreebankLanguagePack;
import edu.stanford.nlp.util.XMLUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;
public class DocumentFrequencyCounter {

  /**
   * POS tagger used to pick out nouns. Loaded once per JVM since model loading
   * is expensive. NOTE(review): this tagger is shared by all FileIDFBuilder
   * tasks concurrently -- confirm tagSentence is thread-safe for the CoreNLP
   * version in use.
   */
  private static final MaxentTagger tagger =
    new MaxentTagger("edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger");

  /** Sentences longer than this are skipped entirely (tagging cost grows with length). */
  private static final int MAX_SENTENCE_LENGTH = 100;

  // Gigaword-style heading rules ("---", "===", ...) carry no content and slow tokenization.
  private static final Pattern headingSeparator = Pattern.compile("[-=]{3,}");
  // <TEXT>/<P> paragraph markup left over after an element is serialized back to text.
  private static final Pattern paragraphMarker =
      Pattern.compile("</?(?:TEXT|P)>(\n|$)");

  private static final SpanishTreebankLanguagePack tlp = new SpanishTreebankLanguagePack();
  private static final TokenizerFactory<? extends HasWord> tokenizerFactory = tlp.getTokenizerFactory();

  /**
   * Count, for a single document, how many times each noun token appears.
   *
   * @param document raw document text (may contain Gigaword heading rules)
   * @return counter mapping each word whose POS tag starts with "n" to its
   *         occurrence count within this document
   */
  private static Counter<String> getIDFMapForDocument(String document) {
    // Clean up -- remove some Gigaword patterns that slow things down
    // / don't help anything
    document = headingSeparator.matcher(document).replaceAll("");

    DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
    preprocessor.setTokenizerFactory(tokenizerFactory);

    Counter<String> idfMap = new ClassicCounter<String>();
    for (List<HasWord> sentence : preprocessor) {
      if (sentence.size() > MAX_SENTENCE_LENGTH)
        continue;

      List<TaggedWord> tagged = tagger.tagSentence(sentence);

      for (TaggedWord w : tagged) {
        // Spanish tagset: noun tags begin with 'n'
        if (w.tag().startsWith("n"))
          idfMap.incrementCount(w.word());
      }
    }

    return idfMap;
  }

  private static final String TAG_DOCUMENT = "DOC";
  private static final String TAG_TEXT = "TEXT";

  /**
   * Serialize the given element back to a string (without an XML declaration)
   * and strip the paragraph markup the serialization re-introduces.
   *
   * @param e element whose full textual content is wanted
   * @return the element's serialized content with <TEXT>/<P> markers removed
   * @throws TransformerException if serialization fails
   */
  private static String getFullTextContent(Element e) throws TransformerException {
    TransformerFactory transFactory = TransformerFactory.newInstance();
    Transformer transformer = transFactory.newTransformer();
    StringWriter buffer = new StringWriter();
    transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
    transformer.transform(new DOMSource(e),
                          new StreamResult(buffer));
    String str = buffer.toString();

    // Remove paragraph markers
    str = paragraphMarker.matcher(str).replaceAll("");
    return str;
  }

  /**
   * Get an IDF map for all the documents in the given file.
   *
   * <p>Each DOC element is expected to contain exactly one TEXT element
   * (asserted, so only checked with -ea). The magic key "__all__" is
   * incremented once per document so the total document count survives
   * merging of per-file counters.
   *
   * @param file reader over a (wrapped) Gigaword-style XML stream
   * @return summed per-document noun counts plus the "__all__" document count
   */
  private static Counter<String> getIDFMapForFile(Reader file)
    throws SAXException, IOException, TransformerException {
    DocumentBuilder parser = XMLUtils.getXmlParser();
    Document xml = parser.parse(new ReaderInputStream(file));
    NodeList docNodes = xml.getDocumentElement().getElementsByTagName(TAG_DOCUMENT);

    Element doc;
    Counter<String> idfMap = new ClassicCounter<String>();
    for (int i = 0; i < docNodes.getLength(); i++) {
      doc = (Element) docNodes.item(i);

      NodeList texts = doc.getElementsByTagName(TAG_TEXT);
      assert texts.getLength() == 1;

      Element text = (Element) texts.item(0);
      String textContent = getFullTextContent(text);
      idfMap.addAll(getIDFMapForDocument(textContent));

      // Increment magic counter
      idfMap.incrementCount("__all__");
    }

    return idfMap;
  }

  /** Task that builds the noun-count counter for one Gigaword file. */
  private static final class FileIDFBuilder implements Callable<Counter<String>> {
    private final File file;

    public FileIDFBuilder(File file) {
      this.file = file;
    }

    /**
     * Computes a result, or throws an exception if unable to do so.
     *
     * @return computed result
     * @throws Exception if unable to compute a result
     */
    @Override
    public Counter<String> call() throws Exception {
      // We need to hallucinate some overarching document tag.. because the Gigaword files don't
      // have them :/
      String fileContents = IOUtils.slurpFile(file);
      fileContents = "<docs>" + fileContents + "</docs>";
      return getIDFMapForFile(new StringReader(fileContents));
    }
  }

  private static final String OUT_FILE = "df-counts.ser";

  // Only used from the main thread, so the non-thread-safe SimpleDateFormat is safe here.
  private static final DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");

  /**
   * Entry point: builds per-file counters in parallel (one task per argument
   * path), merges them in submission order, and serializes the merged counter
   * to df-counts.ser in the working directory.
   *
   * @param args paths of Gigaword files to process
   */
  public static void main(String[] args) throws InterruptedException, ExecutionException,
    IOException {
    ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
    List<Future<Counter<String>>> futures = new ArrayList<Future<Counter<String>>>();
    for (String filePath : args)
      futures.add(pool.submit(new FileIDFBuilder(new File(filePath))));

    int finished = 0;
    Counter<String> overall = new ClassicCounter<String>();

    try {
      for (Future<Counter<String>> future : futures) {
        System.err.printf("%s: Polling future #%d / %d%n",
                          dateFormat.format(new Date()), finished + 1, args.length);
        Counter<String> result = future.get();
        finished++;
        System.err.printf("%s: Finished future #%d / %d%n",
                          dateFormat.format(new Date()), finished, args.length);

        System.err.printf("\tMerging counter.. ");
        overall.addAll(result);
        System.err.printf("done.%n");
      }
    } finally {
      // Shut the pool down even when a future fails, so non-daemon worker
      // threads don't keep the JVM alive after an error.
      pool.shutdown();
    }

    System.err.printf("\n%s: Saving to '%s'.. ", dateFormat.format(new Date()),
                      OUT_FILE);

    // try-with-resources: the original never closed (or flushed) the
    // ObjectOutputStream, leaking the handle and risking a truncated file.
    try (ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(OUT_FILE))) {
      oos.writeObject(overall);
    }
    System.err.printf("done.%n");
  }
}