/
OCRExtractAction.java
271 lines (222 loc) · 11 KB
/
OCRExtractAction.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
package es.keensoft.alfresco.ocr;
import java.io.Serializable;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadPoolExecutor;
import org.alfresco.model.ContentModel;
import org.alfresco.repo.action.ParameterDefinitionImpl;
import org.alfresco.repo.action.executer.ActionExecuterAbstractBase;
import org.alfresco.repo.content.MimetypeMap;
import org.alfresco.repo.security.authentication.AuthenticationUtil;
import org.alfresco.repo.tenant.TenantUtil;
import org.alfresco.repo.transaction.RetryingTransactionHelper;
import org.alfresco.repo.transaction.RetryingTransactionHelper.RetryingTransactionCallback;
import org.alfresco.repo.version.VersionModel;
import org.alfresco.service.cmr.action.Action;
import org.alfresco.service.cmr.action.ParameterDefinition;
import org.alfresco.service.cmr.dictionary.DataTypeDefinition;
import org.alfresco.service.cmr.repository.ContentData;
import org.alfresco.service.cmr.repository.ContentReader;
import org.alfresco.service.cmr.repository.ContentService;
import org.alfresco.service.cmr.repository.ContentWriter;
import org.alfresco.service.cmr.repository.NodeRef;
import org.alfresco.service.cmr.repository.NodeService;
import org.alfresco.service.cmr.version.Version;
import org.alfresco.service.cmr.version.VersionService;
import org.alfresco.service.cmr.version.VersionType;
import org.alfresco.service.namespace.NamespaceService;
import org.alfresco.service.namespace.QName;
import org.alfresco.service.transaction.TransactionService;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import es.keensoft.alfresco.ocr.model.OCRdModel;
/**
 * Alfresco action executer that passes a node's content through an {@link OCRTransformWorker}
 * and stores the result as PDF content.
 *
 * <p>When the source content is already a PDF, the node's own content is overwritten with the
 * transformed output; otherwise a sibling node named {@code <original name>.pdf} is created to
 * hold it. In both cases a new minor version is created on the actioned-upon node and the
 * {@code OCRd} aspect is applied to it, recording the processing date and the version label, so
 * that a later invocation on the same version is skipped.
 *
 * <p>Collaborators are injected through the setters at the bottom of the class.
 */
public class OCRExtractAction extends ActionExecuterAbstractBase {

    private static final Log logger = LogFactory.getLog(OCRExtractAction.class);

    /** Description stored on every version created by this action. */
    private static final String VERSION_DESCRIPTION = "OCRd";

    private NodeService nodeService;
    private ContentService contentService;
    private VersionService versionService;
    private TransactionService transactionService;
    private OCRTransformWorker ocrTransformWorker;
    private ThreadPoolExecutor threadPoolExecutor;

    // Continue current operation in case of OCR error
    private static final String PARAM_CONTINUE_ON_ERROR = "continue-on-error";

    // Force asynchronous mode
    private static final String PARAM_ASYNCHRONOUS = "asynchronous";

    /** Delegates to the base class initialisation. */
    public void init() {
        super.init();
    }

    /**
     * Declares the two optional boolean parameters accepted by this action:
     * {@code continue-on-error} and {@code asynchronous}.
     *
     * @param paramList list the parameter definitions are appended to
     */
    @Override
    protected void addParameterDefinitions(List<ParameterDefinition> paramList) {
        paramList.add(
                new ParameterDefinitionImpl(
                        PARAM_CONTINUE_ON_ERROR,
                        DataTypeDefinition.BOOLEAN,
                        false,
                        getParamDisplayLabel(PARAM_CONTINUE_ON_ERROR)));
        paramList.add(
                new ParameterDefinitionImpl(
                        PARAM_ASYNCHRONOUS,
                        DataTypeDefinition.BOOLEAN,
                        false,
                        getParamDisplayLabel(PARAM_ASYNCHRONOUS)));
    }

    /**
     * Runs the OCR process for the given node unless it was already processed for its current
     * version or it has no content.
     *
     * <p>In asynchronous mode the work is handed to the thread pool and this method returns
     * immediately; {@code continue-on-error} is not consulted on that path. In synchronous mode
     * the work is first attempted in a new transaction and, if that fails, once more in the
     * current transaction. A failure of the second attempt is logged when
     * {@code continue-on-error} is {@code true} (the default when the parameter is absent) and
     * rethrown otherwise.
     *
     * @param action              the action instance carrying the parameter values
     * @param actionedUponNodeRef the node to be OCRd
     */
    @Override
    protected void executeImpl(Action action, NodeRef actionedUponNodeRef) {

        if (isAlreadyProcessed(actionedUponNodeRef)) {
            return;
        }

        ContentData contentData = (ContentData) nodeService.getProperty(actionedUponNodeRef, ContentModel.PROP_CONTENT);

        // Exclude folders and other nodes without content
        if (contentData == null) {
            return;
        }

        boolean continueOnError = booleanParameter(action, PARAM_CONTINUE_ON_ERROR, true);
        boolean forceAsync = booleanParameter(action, PARAM_ASYNCHRONOUS, false);

        // Share action set asynchronous as mandatory due to variations in response time for OCR processes when server is busy
        if (forceAsync) {
            Runnable runnable = new ExtractOCRTask(actionedUponNodeRef, contentData, AuthenticationUtil.getFullyAuthenticatedUser());
            threadPoolExecutor.execute(runnable);
            return;
        }

        // # 5 Problem writing OCRed file
        // As action.getExecuteAsychronously() returns always FALSE (it's an Alfresco issue):
        // 1 - Try first with new Transaction
        // 2 - In case of error, try then with the current Transaction
        try {
            executeInNewTransaction(actionedUponNodeRef, contentData);
        } catch (Throwable throwableNewTransaction) {
            logger.warn(actionedUponNodeRef + ": " + throwableNewTransaction.getMessage(), throwableNewTransaction);
            try {
                // Current transaction
                executeImplInternal(actionedUponNodeRef, contentData);
            } catch (Throwable throwableCurrentTransaction) {
                if (continueOnError) {
                    // Report the failure of THIS attempt, not the one already logged above
                    logger.warn(actionedUponNodeRef + ": " + throwableCurrentTransaction.getMessage(), throwableCurrentTransaction);
                } else {
                    throw throwableCurrentTransaction;
                }
            }
        }
    }

    /**
     * Tells whether the node carries the OCRd aspect and the version label recorded on it
     * matches the node's current version label.
     *
     * <p>A missing applied-version property or a missing current version is treated as
     * "not processed" instead of failing with a {@link NullPointerException}.
     */
    private boolean isAlreadyProcessed(NodeRef nodeRef) {
        if (!nodeService.hasAspect(nodeRef, OCRdModel.ASPECT_OCRD)) {
            return false;
        }
        Serializable appliedVersion = nodeService.getProperty(nodeRef, OCRdModel.PROP_APPLIED_VERSION);
        Version currentVersion = versionService.getCurrentVersion(nodeRef);
        if (appliedVersion == null || currentVersion == null) {
            return false;
        }
        return appliedVersion.toString().equals(currentVersion.getVersionLabel());
    }

    /** Reads a boolean action parameter, falling back to {@code defaultValue} when it is not set. */
    private static boolean booleanParameter(Action action, String name, boolean defaultValue) {
        Boolean value = (Boolean) action.getParameterValue(name);
        return value == null ? defaultValue : value.booleanValue();
    }

    /**
     * Thread pool task that performs the OCR process in a new transaction, running as the user
     * captured when the task was created.
     */
    private class ExtractOCRTask implements Runnable {

        private final NodeRef nodeToBeOCRd;
        private final ContentData contentData;
        private final String userId;

        private ExtractOCRTask(NodeRef nodeToBeOCRd, ContentData contentData, String userId) {
            this.nodeToBeOCRd = nodeToBeOCRd;
            this.contentData = contentData;
            this.userId = userId;
        }

        @Override
        public void run() {
            // Save whatever authentication the pool thread carries and restore it afterwards
            AuthenticationUtil.pushAuthentication();
            try {
                AuthenticationUtil.runAs(new AuthenticationUtil.RunAsWork<Object>() {
                    @Override
                    public Object doWork() throws Exception {
                        return TenantUtil.runAsTenant(new TenantUtil.TenantRunAsWork<Void>() {
                            @Override
                            public Void doWork() throws Exception {
                                executeInNewTransaction(nodeToBeOCRd, contentData);
                                return null;
                            }
                        }, TenantUtil.getCurrentDomain());
                    }
                }, userId);
            } catch (RuntimeException e) {
                // Nobody is waiting for this task: record which node failed before propagating
                logger.error(nodeToBeOCRd + ": " + e.getMessage(), e);
                throw e;
            } finally {
                AuthenticationUtil.popAuthentication();
            }
        }
    }

    /**
     * Runs {@link #executeImplInternal(NodeRef, ContentData)} through the
     * {@link RetryingTransactionHelper}.
     */
    // Avoid ConcurrencyFailureException by using RetryingTransactionHelper
    private void executeInNewTransaction(final NodeRef nodeRef, final ContentData contentData) {
        RetryingTransactionCallback<Void> callback = new RetryingTransactionCallback<Void>() {
            @Override
            public Void execute() throws Throwable {
                executeImplInternal(nodeRef, contentData);
                return null;
            }
        };
        RetryingTransactionHelper txnHelper = transactionService.getRetryingTransactionHelper();
        // readOnly = false, requiresNew = true
        txnHelper.doInTransaction(callback, false, true);
    }

    /**
     * Transforms the node content into a temporary writer, stores the result (on the node itself
     * for PDF sources, on a newly created sibling node otherwise), creates a new minor version of
     * the actioned-upon node and applies the OCRd aspect to it.
     *
     * @param actionedUponNodeRef the node whose content is processed
     * @param contentData         content descriptor of that node, used for its mimetype
     * @throws RuntimeException wrapping any exception raised by the transform worker
     */
    private void executeImplInternal(NodeRef actionedUponNodeRef, ContentData contentData) {

        String originalMimeType = contentData.getMimetype();

        ContentReader reader = contentService.getReader(actionedUponNodeRef, ContentModel.PROP_CONTENT);
        ContentWriter writer = contentService.getTempWriter();
        writer.setMimetype(originalMimeType);

        try {
            ocrTransformWorker.transform(reader, writer, null);
        } catch (Exception e) {
            throw new RuntimeException("OCR transformation failed for " + actionedUponNodeRef + ": " + e.getMessage(), e);
        }

        // Set initial version if it's a new one
        versionService.ensureVersioningEnabled(actionedUponNodeRef, null);
        if (!versionService.isVersioned(actionedUponNodeRef)) {
            createOcrVersion(actionedUponNodeRef);
        }

        ContentWriter writeOriginalContent;
        // Constant-first comparison: content without a mimetype is handled as "not a PDF"
        if (MimetypeMap.MIMETYPE_PDF.equals(originalMimeType)) {
            // Update original PDF file
            writeOriginalContent = contentService.getWriter(actionedUponNodeRef, ContentModel.PROP_CONTENT, true);
        } else {
            // Create new PDF file
            String fileName = nodeService.getProperty(actionedUponNodeRef, ContentModel.PROP_NAME) + ".pdf";
            Map<QName, Serializable> props = new HashMap<QName, Serializable>(1);
            props.put(ContentModel.PROP_NAME, fileName);
            NodeRef pdfNodeRef = createNode(nodeService.getPrimaryParent(actionedUponNodeRef).getParentRef(), fileName, props);
            writeOriginalContent = contentService.getWriter(pdfNodeRef, ContentModel.PROP_CONTENT, true);
            writeOriginalContent.setMimetype(MimetypeMap.MIMETYPE_PDF);
        }
        writeOriginalContent.putContent(writer.getReader());

        // Manual versioning because of Alfresco insane rules for first version content nodes
        versionService.ensureVersioningEnabled(actionedUponNodeRef, null);
        createOcrVersion(actionedUponNodeRef);

        // Set OCRd aspect to avoid future re-OCR process
        Map<QName, Serializable> aspectProperties = new HashMap<QName, Serializable>();
        aspectProperties.put(OCRdModel.PROP_PROCESSED_DATE, new Date());
        aspectProperties.put(OCRdModel.PROP_APPLIED_VERSION, versionService.getCurrentVersion(actionedUponNodeRef).getVersionLabel());
        nodeService.addAspect(actionedUponNodeRef, OCRdModel.ASPECT_OCRD, aspectProperties);
    }

    /** Creates a minor version of the node described as {@code "OCRd"}. */
    private void createOcrVersion(NodeRef nodeRef) {
        Map<String, Serializable> versionProperties = new HashMap<String, Serializable>();
        versionProperties.put(Version.PROP_DESCRIPTION, VERSION_DESCRIPTION);
        versionProperties.put(VersionModel.PROP_VERSION_TYPE, VersionType.MINOR);
        versionService.createVersion(nodeRef, versionProperties);
    }

    /**
     * Creates a {@code cm:content} child of {@code parentNodeRef} through a {@code cm:contains}
     * association whose local name is {@code name}.
     *
     * @return reference to the newly created child node
     */
    private NodeRef createNode(NodeRef parentNodeRef, String name, Map<QName, Serializable> props) {
        return nodeService.createNode(
                parentNodeRef,
                ContentModel.ASSOC_CONTAINS,
                QName.createQName(NamespaceService.CONTENT_MODEL_1_0_URI, name),
                ContentModel.TYPE_CONTENT,
                props)
                .getChildRef();
    }

    public void setNodeService(NodeService nodeService) {
        this.nodeService = nodeService;
    }

    public void setContentService(ContentService contentService) {
        this.contentService = contentService;
    }

    public void setOcrTransformWorker(OCRTransformWorker ocrTransformWorker) {
        this.ocrTransformWorker = ocrTransformWorker;
    }

    public void setVersionService(VersionService versionService) {
        this.versionService = versionService;
    }

    public void setTransactionService(TransactionService transactionService) {
        this.transactionService = transactionService;
    }

    public void setThreadPoolExecutor(ThreadPoolExecutor threadPoolExecutor) {
        this.threadPoolExecutor = threadPoolExecutor;
    }

}