Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Easy RAG #686

Merged
merged 22 commits into from
Mar 21, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions document-parsers/langchain4j-document-parser-apache-tika/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor for the Apache Tika based document parser module. -->
<modelVersion>4.0.0</modelVersion>

<!-- Inherits from langchain4j-parent, located two directories up from this module. -->
<parent>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-parent</artifactId>
<version>0.28.0</version>
<relativePath>../../langchain4j-parent/pom.xml</relativePath>
</parent>

<artifactId>langchain4j-document-parser-apache-tika</artifactId>
<name>LangChain4j :: Document parser :: Apache Tika</name>
<packaging>jar</packaging>

<properties>
<!-- Single Tika version shared by both org.apache.tika dependencies below, so they cannot drift apart. -->
<apache.tika.version>2.9.1</apache.tika.version>
</properties>

<dependencies>

<!-- No version declared here; presumably managed by langchain4j-parent (confirm). -->
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-core</artifactId>
</dependency>

<!-- Tika core API, pinned to ${apache.tika.version}. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${apache.tika.version}</version>
</dependency>

<!-- Tika's standard parser bundle, pinned to the same ${apache.tika.version} as tika-core. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${apache.tika.version}</version>
</dependency>

<!-- Test-only dependencies: JUnit Jupiter (engine + parameterized tests) and AssertJ. -->
<!-- No versions declared here; presumably managed by langchain4j-parent (confirm). -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import java.io.InputStream;

import static dev.langchain4j.internal.Utils.getOrDefault;

/**
* Parses files into {@link Document}s using Apache Tika library, automatically detecting the file format.
* This parser supports various file formats, including PDF, DOC, PPT, XLS.
* For detailed information on supported formats,
* please refer to the <a href="https://tika.apache.org/2.9.1/formats.html">Apache Tika documentation</a>.
*/
public class ApacheTikaDocumentParser implements DocumentParser {

private static final int UNLIMITED = -1;

private final Parser parser;
private final ContentHandler contentHandler;
private final Metadata metadata;
private final ParseContext parseContext;

/**
* Creates an instance of {@code ApacheTikaDocumentParser} with the default Tika components.
* It uses {@link AutoDetectParser}, {@link BodyContentHandler} without write limit,
* empty {@link Metadata} and empty {@link ParseContext}.
*/
public ApacheTikaDocumentParser() {
this(null, null, null, null);
}

/**
* Creates an instance of {@code ApacheTikaDocumentParser} with the provided Tika components.
* If some of the components is not provided ({@code null}, the default will be used.
* It uses {@link AutoDetectParser}, {@link BodyContentHandler} without write limit,
* empty {@link Metadata} and empty {@link ParseContext}.
*
* @param parser Tika parser to use. Default: {@link AutoDetectParser}
* @param contentHandler Tika content handler. Default: {@link BodyContentHandler} without write limit
* @param metadata Tika metadata. Default: empty {@link Metadata}
* @param parseContext Tika parse context. Default: empty {@link ParseContext}
*/
public ApacheTikaDocumentParser(Parser parser,
ContentHandler contentHandler,
Metadata metadata,
ParseContext parseContext) {
this.parser = getOrDefault(parser, new AutoDetectParser());
this.contentHandler = getOrDefault(contentHandler, new BodyContentHandler(UNLIMITED));
this.metadata = getOrDefault(metadata, new Metadata());
this.parseContext = getOrDefault(parseContext, new ParseContext());
}

// TODO allow automatically extract metadata (e.g. creator, last-author, created/modified timestamp, etc)

@Override
public Document parse(InputStream inputStream) {
try {
parser.parse(inputStream, contentHandler, metadata, parseContext);
String text = contentHandler.toString();
langchain4j marked this conversation as resolved.
Show resolved Hide resolved
return Document.from(text);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
langchain4j marked this conversation as resolved.
Show resolved Hide resolved
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package dev.langchain4j.data.document.parser.apache.tika;

import dev.langchain4j.data.document.Document;
import dev.langchain4j.data.document.DocumentParser;
import org.apache.tika.parser.AutoDetectParser;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.InputStream;

import static org.assertj.core.api.Assertions.assertThat;

/**
 * Verifies that {@link ApacheTikaDocumentParser} extracts the expected text from
 * sample files loaded from the test classpath.
 */
class ApacheTikaDocumentParserTest {

    @ParameterizedTest
    @ValueSource(strings = {
            "test-file.doc",
            "test-file.docx",
            "test-file.ppt",
            "test-file.pptx",
            "test-file.pdf"
    })
    void should_parse_doc_ppt_and_pdf_files(String fileName) throws Exception {

        DocumentParser parser = new ApacheTikaDocumentParser();

        Document document = parseResource(parser, fileName);

        assertThat(document.text()).isEqualToIgnoringWhitespace("test content");
        assertThat(document.metadata().asMap()).isEmpty();
    }

    @ParameterizedTest
    @ValueSource(strings = {
            "test-file.xls",
            "test-file.xlsx"
    })
    void should_parse_xls_files(String fileName) throws Exception {

        DocumentParser parser = new ApacheTikaDocumentParser(new AutoDetectParser(), null, null, null);

        Document document = parseResource(parser, fileName);

        assertThat(document.text())
                .isEqualToIgnoringWhitespace("Sheet1\ntest content\nSheet2\ntest content");
        assertThat(document.metadata().asMap()).isEmpty();
    }

    /**
     * Loads {@code fileName} from the test classpath and parses it with {@code parser}.
     * The stream is opened in try-with-resources so it is closed even when parsing fails.
     */
    private Document parseResource(DocumentParser parser, String fileName) throws Exception {
        try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName)) {
            // getResourceAsStream returns null for a missing resource; fail with a clear message
            // instead of an opaque parser error.
            assertThat(inputStream).as("test resource '%s' on the classpath", fileName).isNotNull();
            return parser.parse(inputStream);
        }
    }
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
8 changes: 7 additions & 1 deletion langchain4j-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,12 @@

<!-- document parsers -->

<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-document-parser-apache-pdfbox</artifactId>
<version>${project.version}</version>
</dependency>

<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-document-parser-apache-poi</artifactId>
Expand All @@ -267,7 +273,7 @@

<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-document-parser-apache-pdfbox</artifactId>
<artifactId>langchain4j-document-parser-apache-tika</artifactId>
<version>${project.version}</version>
</dependency>

Expand Down
81 changes: 81 additions & 0 deletions langchain4j-easy-rag/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Build descriptor for the "Easy RAG" module. -->
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from langchain4j-parent, located one directory up from this module. -->
<parent>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-parent</artifactId>
<version>0.28.0</version>
<relativePath>../langchain4j-parent/pom.xml</relativePath>
</parent>

<artifactId>langchain4j-easy-rag</artifactId>
<packaging>jar</packaging>

<name>LangChain4j :: Easy RAG</name>

<dependencies>

<!-- No version declared here; presumably managed by langchain4j-parent (confirm). -->
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j</artifactId>
</dependency>

<!-- Apache Tika document parser module, at the same version as this module. -->
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-document-parser-apache-tika</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Embedding model module, at the same version as this module. -->
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-embeddings-all-minilm-l6-v2-q</artifactId>
<version>${project.version}</version>
<!-- TODO -->
<!-- NOTE(review): the TODO above has no description; state what remains to be done for this dependency. -->
</dependency>

<!-- Lombok in "provided" scope: on the compile classpath only, not propagated to consumers of this artifact. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<scope>provided</scope>
</dependency>

<!-- Everything below is test-scoped. -->
<!-- No versions declared here; presumably managed by langchain4j-parent (confirm). -->
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-open-ai</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<scope>test</scope>
</dependency>

<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>

<!-- tinylog implementation plus its SLF4J binding, used for log output during tests. -->
<dependency>
<groupId>org.tinylog</groupId>
<artifactId>tinylog-impl</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.tinylog</groupId>
<artifactId>slf4j-tinylog</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

</project>
Loading
Loading