Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix #913: FileSystemDocumentLoader: ignore empty/blank documents, improved error/warn messages #920

Merged
merged 10 commits into from
Apr 16, 2024
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package dev.langchain4j.data.document.parser.apache.pdfbox;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.IOException;
import java.io.InputStream;

import static dev.langchain4j.internal.Utils.isNullOrBlank;

/**
* Parses PDF file into a {@link Document} using Apache PDFBox library
*/
Expand All @@ -20,6 +23,11 @@ public Document parse(InputStream inputStream) {
PDFTextStripper stripper = new PDFTextStripper();
String text = stripper.getText(pdfDocument);
pdfDocument.close();

if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}

return Document.from(text);
} catch (IOException e) {
throw new RuntimeException(e);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package dev.langchain4j.data.document.parser.apache.pdfbox;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.junit.jupiter.api.Test;

import java.io.InputStream;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

class ApachePdfBoxDocumentParserTest {

Expand All @@ -21,4 +23,14 @@ void should_parse_pdf_file() {
assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
assertThat(document.metadata().asMap()).isEmpty();
}

@Test
void should_throw_BlankDocumentException() {

DocumentParser parser = new ApachePdfBoxDocumentParser();
InputStream inputStream = getClass().getClassLoader().getResourceAsStream("blank-file.pdf");

assertThatThrownBy(() -> parser.parse(inputStream))
.isExactlyInstanceOf(BlankDocumentException.class);
}
}
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
package dev.langchain4j.data.document.parser.apache.poi;

import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.poi.EmptyFileException;
import org.apache.poi.extractor.ExtractorFactory;
import org.apache.poi.extractor.POITextExtractor;

import java.io.IOException;
import java.io.InputStream;

import static dev.langchain4j.internal.Utils.isNullOrBlank;

/**
* Parses Microsoft Office file into a {@link Document} using Apache POI library.
* This parser supports various file formats, including doc, docx, ppt, pptx, xls, and xlsx.
Expand All @@ -20,7 +24,14 @@ public class ApachePoiDocumentParser implements DocumentParser {
public Document parse(InputStream inputStream) {
try (POITextExtractor extractor = ExtractorFactory.createExtractor(inputStream)) {
String text = extractor.getText();

if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}

return Document.from(text);
} catch (EmptyFileException e) {
throw new BlankDocumentException();
} catch (IOException e) {
throw new RuntimeException(e);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
package dev.langchain4j.data.document.parser.apache.poi;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.InputStream;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

public class ApachePoiDocumentParserTest {

Expand Down Expand Up @@ -45,4 +47,21 @@ void should_parse_xls_files(String fileName) {
.isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content");
assertThat(document.metadata().asMap()).isEmpty();
}

@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
"blank-file.txt",
"blank-file.docx",
"blank-file.pptx"
// "blank-file.xlsx" TODO
})
void should_throw_BlankDocumentException(String fileName) {

DocumentParser parser = new ApachePoiDocumentParser();
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);

assertThatThrownBy(() -> parser.parse(inputStream))
.isExactlyInstanceOf(BlankDocumentException.class);
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
Expand All @@ -12,6 +14,7 @@
import java.io.InputStream;

import static dev.langchain4j.internal.Utils.getOrDefault;
import static dev.langchain4j.internal.Utils.isNullOrBlank;

/**
* Parses files into {@link Document}s using Apache Tika library, automatically detecting the file format.
Expand Down Expand Up @@ -63,7 +66,16 @@ public Document parse(InputStream inputStream) {
try {
parser.parse(inputStream, contentHandler, metadata, parseContext);
String text = contentHandler.toString();

if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}

return Document.from(text);
} catch (BlankDocumentException e) {
throw e;
} catch (ZeroByteFileException e) {
throw new BlankDocumentException();
} catch (Exception e) {
throw new RuntimeException(e);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.parser.AutoDetectParser;
import org.junit.jupiter.params.ParameterizedTest;
Expand All @@ -9,6 +10,7 @@
import java.io.InputStream;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;

class ApacheTikaDocumentParserTest {

Expand Down Expand Up @@ -47,4 +49,21 @@ void should_parse_xls_files(String fileName) {
.isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content");
assertThat(document.metadata().asMap()).isEmpty();
}

@ParameterizedTest
@ValueSource(strings = {
"empty-file.txt",
"blank-file.txt",
"blank-file.docx",
"blank-file.pptx"
// "blank-file.xlsx" TODO
})
void should_throw_BlankDocumentException(String fileName) {

DocumentParser parser = new ApacheTikaDocumentParser();
InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);

assertThatThrownBy(() -> parser.parse(inputStream))
.isExactlyInstanceOf(BlankDocumentException.class);
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
package dev.langchain4j.data.document;

public class BlankDocumentException extends RuntimeException {
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
* Utility class for loading documents.
*/
public class DocumentLoader {
private DocumentLoader() {}

private DocumentLoader() {
}

/**
* Loads a document from the given source using the given parser.
Expand All @@ -16,12 +18,15 @@ private DocumentLoader() {}
* @param source The source from which the document will be loaded.
* @param parser The parser that will be used to parse the document.
* @return The loaded document.
* @throws BlankDocumentException when the parsed {@link Document} is blank/empty.
*/
public static Document load(DocumentSource source, DocumentParser parser) {
try (InputStream inputStream = source.inputStream()) {
Document document = parser.parse(inputStream);
source.metadata().asMap().forEach((key, value) -> document.metadata().add(key, value));
return document;
} catch (BlankDocumentException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException("Failed to load document", e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,18 @@
import java.io.InputStream;

/**
* Defines the interface for parsing an InputStream into a Document.
* Defines the interface for parsing an {@link InputStream} into a {@link Document}.
* Different document types require specialized parsing logic.
*/
public interface DocumentParser {

/**
* Parses an InputStream into a Document.
* Parses a given {@link InputStream} into a {@link Document}.
* The specific implementation of this method will depend on the type of the document being parsed.
*
* @param inputStream The InputStream that contains the content of the document.
* @return The parsed Document.
* @param inputStream The {@link InputStream} that contains the content of the {@link Document}.
* @return The parsed {@link Document}.
* @throws BlankDocumentException when the parsed {@link Document} is blank/empty.
*/
Document parse(InputStream inputStream);
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package dev.langchain4j.data.document.loader;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentLoader;
import dev.langchain4j.data.document.DocumentParser;
import dev.langchain4j.data.document.parser.TextDocumentParser;
Expand Down Expand Up @@ -48,7 +49,7 @@ private FileSystemDocumentLoader() {
*/
public static Document loadDocument(Path filePath, DocumentParser documentParser) {
if (!isRegularFile(filePath)) {
throw illegalArgument("%s is not a file", filePath);
throw illegalArgument("'%s' is not a file", filePath);
}

return DocumentLoader.load(from(filePath), documentParser);
Expand Down Expand Up @@ -118,7 +119,7 @@ public static Document loadDocument(String filePath) {
*/
public static List<Document> loadDocuments(Path directoryPath, DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}

try (Stream<Path> pathStream = Files.list(directoryPath)) {
Expand Down Expand Up @@ -200,7 +201,7 @@ public static List<Document> loadDocuments(Path directoryPath,
PathMatcher pathMatcher,
DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}

try (Stream<Path> pathStream = Files.list(directoryPath)) {
Expand Down Expand Up @@ -294,7 +295,7 @@ public static List<Document> loadDocuments(String directoryPath, PathMatcher pat
*/
public static List<Document> loadDocumentsRecursively(Path directoryPath, DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}

try (Stream<Path> pathStream = Files.walk(directoryPath)) {
Expand Down Expand Up @@ -379,7 +380,7 @@ public static List<Document> loadDocumentsRecursively(Path directoryPath,
PathMatcher pathMatcher,
DocumentParser documentParser) {
if (!isDirectory(directoryPath)) {
throw illegalArgument("%s is not a directory", directoryPath);
throw illegalArgument("'%s' is not a directory", directoryPath);
}

try (Stream<Path> pathStream = Files.walk(directoryPath)) {
Expand Down Expand Up @@ -486,8 +487,11 @@ private static List<Document> loadDocuments(Stream<Path> pathStream,
try {
Document document = loadDocument(file, documentParser);
documents.add(document);
} catch (BlankDocumentException ignored) {
// blank/empty documents are ignored
} catch (Exception e) {
log.warn("Failed to load document from " + file, e);
String message = e.getCause() != null ? e.getCause().getMessage() : e.getMessage();
log.warn("Failed to load '{}': {}", file, message);
}
});

Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package dev.langchain4j.data.document.parser;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.BlankDocumentException;
import dev.langchain4j.data.document.DocumentParser;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.Charset;

import static dev.langchain4j.internal.Utils.isNullOrBlank;
import static dev.langchain4j.internal.ValidationUtils.ensureNotNull;
import static java.nio.charset.StandardCharsets.UTF_8;

Expand Down Expand Up @@ -36,7 +38,13 @@ public Document parse(InputStream inputStream) {

String text = new String(buffer.toByteArray(), charset);

if (isNullOrBlank(text)) {
throw new BlankDocumentException();
}

return Document.from(text);
} catch (BlankDocumentException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException(e);
}
Expand Down