// CleanByBagConf.java
/*
* This file is part of JaTeCS.
*
* JaTeCS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* JaTeCS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JaTeCS. If not, see <http://www.gnu.org/licenses/>.
*
* The software has been mainly developed by (in alphabetical order):
* - Andrea Esuli (andrea.esuli@isti.cnr.it)
* - Tiziano Fagni (tiziano.fagni@isti.cnr.it)
* - Alejandro Moreo Fernández (alejandro.moreo@isti.cnr.it)
* Other past contributors were:
* - Giacomo Berardi (giacomo.berardi@isti.cnr.it)
*/
package apps.trainingDataCleaning;
import gnu.trove.TShortObjectHashMap;
import it.cnr.jatecs.classification.ClassificationResult;
import it.cnr.jatecs.classification.adaboost.AdaBoostDataManager;
import it.cnr.jatecs.classification.bagging.BaggingClassifier;
import it.cnr.jatecs.classification.bagging.BaggingDataManager;
import it.cnr.jatecs.indexes.DB.interfaces.IIndex;
import it.cnr.jatecs.indexes.DB.troveCompact.TroveClassificationDBType;
import it.cnr.jatecs.indexes.DB.troveCompact.TroveContentDBType;
import it.cnr.jatecs.indexes.DB.troveCompact.TroveReadWriteHelper;
import it.cnr.jatecs.io.FileSystemStorageManager;
import it.cnr.jatecs.utils.Os;
import it.cnr.jatecs.utils.iterators.interfaces.IIntIterator;
import it.cnr.jatecs.utils.iterators.interfaces.IShortIterator;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Collections;
import java.util.Vector;
/**
 * Command-line tool that scores every (document, category) pair of an index
 * with a stored bagging classifier and writes the pairs, sorted per category,
 * to a tab-separated text file.
 *
 * <p>
 * For each pair the written score is
 * {@code flag * classificationScore / (sqrt(variance) + Double.MIN_VALUE)},
 * where {@code flag} is {@code 1} when the index already assigns the category
 * to the document and {@code -1} otherwise.
 * </p>
 *
 * <p>
 * The output file is named {@code <indexName>_CONF-<classifierName>.txt} and is
 * created in the directory that contains the index.
 * </p>
 */
public class CleanByBagConf {

    /**
     * Runs the tool.
     *
     * @param args
     *            exactly two entries: the path of the index and the path of
     *            the bagging classifier. The last path component is used as
     *            the name of the index/classifier, the rest as the directory
     *            handed to the storage manager.
     * @throws IOException
     *             if the index or the classifier cannot be read, or if the
     *             output file cannot be created or written.
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.err
                    .println("Usage: CleanByBagConf <indexDirectory> <classifierDirectory>");
            return;
        }

        File indexFile = new File(args[0]);
        String indexPath = parentPathOf(indexFile);
        String indexName = indexFile.getName();

        File classifierFile = new File(args[1]);
        String classifierPath = parentPathOf(classifierFile);
        String classifierName = classifierFile.getName();

        // Load the index; the storage manager is released even if reading
        // fails.
        IIndex index;
        FileSystemStorageManager storageManager = new FileSystemStorageManager(
                indexPath, false);
        storageManager.open();
        try {
            index = TroveReadWriteHelper.readIndex(storageManager, indexName,
                    TroveContentDBType.Full, TroveClassificationDBType.Full);
        } finally {
            storageManager.close();
        }

        // Load the bagging classifier (bagging wrapped around AdaBoost data).
        AdaBoostDataManager internalDataManager = new AdaBoostDataManager();
        BaggingDataManager dataManager = new BaggingDataManager(
                internalDataManager);
        BaggingClassifier classifier;
        storageManager = new FileSystemStorageManager(classifierPath, false);
        storageManager.open();
        try {
            classifier = (BaggingClassifier) dataManager.read(storageManager,
                    classifierName);
        } finally {
            storageManager.close();
        }

        IIntIterator docs = index.getDocumentDB().getDocuments();
        IShortIterator cats = index.getCategoryDB().getCategories();

        // One list of (document, score) pairs per category of the index.
        TShortObjectHashMap<Vector<IdScorePair>> sets = new TShortObjectHashMap<Vector<IdScorePair>>();
        while (cats.hasNext()) {
            short cat = cats.next();
            sets.put(cat, new Vector<IdScorePair>());
        }

        while (docs.hasNext()) {
            int doc = docs.next();
            ClassificationResult crVariance = classifier.computeVariance(index,
                    doc);
            ClassificationResult crClassification = classifier.classify(index,
                    doc);
            // Assumes both results list the categories in the same order, so
            // that position i refers to the same category in each -- TODO
            // confirm against BaggingClassifier.
            for (int i = 0; i < crClassification.categoryID.size(); ++i) {
                short cat = crClassification.categoryID.getQuick(i);
                // NOTE(review): Double.MIN_VALUE is presumably meant to guard
                // against a zero variance, but 1.0 / Double.MIN_VALUE
                // overflows to Infinity, so such pairs still get an infinite
                // (or NaN, for a zero score) value. Left unchanged to keep
                // the produced numbers identical.
                double scoreCorrection = 1.0 / (Math.sqrt(crVariance.score
                        .getQuick(i)) + Double.MIN_VALUE);
                double score = crClassification.score.getQuick(i);
                // Sign the score by the label currently stored in the index.
                int flag = -1;
                if (index.getClassificationDB().hasDocumentCategory(doc, cat)) {
                    flag = 1;
                }
                Vector<IdScorePair> vector = sets.get(cat);
                vector.add(new IdScorePair(doc, flag * score * scoreCorrection));
            }
        }

        String outputName = indexPath + Os.pathSeparator() + indexName
                + "_CONF-" + classifierName + ".txt";
        PrintStream out = new PrintStream(new FileOutputStream(outputName));
        try {
            cats.begin();
            while (cats.hasNext()) {
                short cat = cats.next();
                Vector<IdScorePair> vector = sets.get(cat);
                Collections.sort(vector);
                for (int i = 0; i < vector.size(); ++i) {
                    IdScorePair p = vector.get(i);
                    out.println(p.Id() + "\t" + cat + "\t" + p.Score());
                }
            }
            // PrintStream never throws on write failures; surface them so a
            // truncated result file does not go unnoticed.
            if (out.checkError()) {
                throw new IOException("Error while writing " + outputName);
            }
        } finally {
            out.close();
        }
    }

    /**
     * Returns the directory part of the given file's path, or {@code "."} when
     * the path has no parent component (e.g. a bare file name), so that
     * callers never have to deal with a {@code null} parent.
     */
    private static String parentPathOf(File file) {
        String parent = file.getParent();
        return parent != null ? parent : ".";
    }
}