In [87]:
import pandas as pd
import numpy as np
from langchain.document_loaders import DirectoryLoader, UnstructuredXMLLoader


In [88]:
# Path (relative to the working directory) of the XML file to load.
file = (
    "german-law/laws/Aufenthaltsverordnung/"
    "BJNR294510004.xml"
)

### Try: XML Loader

In [92]:
# Read the XML file through UnstructuredXMLLoader; `docs` receives whatever
# loader.load() returns and is inspected in the following cells.
loader = UnstructuredXMLLoader(file_path=file)
docs = loader.load()

In [93]:
# Number of items returned by loader.load()
len(docs)

1

In [100]:
# Preview the first 400 characters of the first loaded document's text
docs[0].page_content[:400]

'290 AufenthV Inhaltsübersicht Kapitel 1 Allgemeine Bestimmungen § 1 Begriffsbestimmungen Kapitel 2 Einreise und Aufenthalt im Bundesgebiet Abschnitt 1 Passpflicht für Ausländer § 2 Erfüllung der Passpflicht durch Eintragung in den Pass eines gesetzlichen Vertreters § 3 Zulassung nichtdeutscher amtlicher Ausweise als Passersatz § 4 Deutsche Passersatzpapiere für Ausländer § 5 Allgemeine Voraussetzu'

In [98]:
# Check the container type that loader.load() returned
type(docs)

list

**Recursive Character Text Splitter**

Use the recursive character text splitter to split the text into chunks of size 1000 with an overlap of 200.

In [120]:
# Split with the RecursiveCharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Target chunk size of 1000 with an overlap of 200 between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Only the text of the first loaded document is split.
r_texts = text_splitter.create_documents([docs[0].page_content])
print(len(r_texts))

254


In [121]:
# Split with the plain CharacterTextSplitter, using the same size settings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Target chunk size of 1000 with an overlap of 200 between chunks. The
# warnings in this cell's output show that some chunks still end up longer
# than the requested 1000.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Only the text of the first loaded document is split.
texts = text_splitter.create_documents([docs[0].page_content])
print(len(texts))

Created a chunk of size 1500, which is longer than the specified 1000
Created a chunk of size 1500, which is longer than the specified 1000
Created a chunk of size 1500, which is longer than the specified 1000
Created a chunk of size 1500, which is longer than the specified 1000
Created a chunk of size 1500, which is longer than the specified 1000
Created a chunk of size 7988, which is longer than the specified 1000
Created a chunk of size 1413, which is longer than the specified 1000
Created a chunk of size 1181, which is longer than the specified 1000
Created a chunk of size 1231, which is longer than the specified 1000
Created a chunk of size 1124, which is longer than the specified 1000
Created a chunk of size 1359, which is longer than the specified 1000
Created a chunk of size 1252, which is longer than the specified 1000
Created a chunk of size 1357, which is longer than the specified 1000
Created a chunk of size 1474, which is longer than the specified 1000
Created a chunk of s

130


**Embedding**

In [73]:
from langchain.embeddings import OllamaEmbeddings

In [74]:
# Embedding client built with no arguments, i.e. with the library defaults.
embeddings_model = OllamaEmbeddings()

In [148]:
# Show the configuration of the embedding client that is actually used in
# this notebook, rather than constructing a second, throwaway
# OllamaEmbeddings() instance just to print it.
print(embeddings_model)

base_url='http://localhost:11434' model='llama2' embed_instruction='passage: ' query_instruction='query: ' mirostat=None mirostat_eta=None mirostat_tau=None num_ctx=None num_gpu=None num_thread=None repeat_last_n=None repeat_penalty=None temperature=None stop=None tfs_z=None top_k=None top_p=None model_kwargs=None


**Apply the embedding model**

In [122]:
# Apply to the character split texts.
# embed_documents takes a list of plain strings, while `texts` is the result
# of create_documents and so holds Document objects. Pass each chunk's
# page_content so the chunk text itself is embedded, not the stringified
# Document object.
embeddings = embeddings_model.embed_documents(
    [doc.page_content for doc in texts]
)

In [149]:
# Length of the first embedding vector
len(embeddings[0])

4096

In [124]:
# Apply to the recursively character split texts.
# embed_documents takes a list of plain strings, while `r_texts` is the
# result of create_documents and so holds Document objects. Pass each
# chunk's page_content so the chunk text itself is embedded, not the
# stringified Document object.
embeddings_r_texts = embeddings_model.embed_documents(
    [doc.page_content for doc in r_texts]
)

**Vector Store: define the database to use**

In [150]:
from langchain.vectorstores import Qdrant

**Qdrant texts from non-recursive splitter**

In [152]:
# Index the CharacterTextSplitter chunks (`texts`) in a Qdrant collection.
# The documents and the embedding model are handed over directly; the
# `embeddings` list computed in an earlier cell is not passed in here.
qdrant_texts = Qdrant.from_documents(
    texts,
    embeddings_model,
    collection_name="texts",
    location=":memory:",  # Local mode with in-memory storage only
)

**Qdrant texts from recursive splitter**

In [None]:
# Index the RecursiveCharacterTextSplitter chunks (`r_texts`) in a Qdrant
# collection named "r_texts".
# NOTE(review): this rebinds `qdrant_texts`, so the store built from `texts`
# in the previous cell is no longer reachable under that name. Consider a
# distinct name (e.g. `qdrant_r_texts`) if both stores are needed; confirm
# against the cells that use `qdrant_texts` further down.
qdrant_texts = Qdrant.from_documents(
    r_texts,
    embeddings_model,
    collection_name="r_texts",
    location=":memory:",  # Local mode with in-memory storage only
)

**Create Vectorstore: with recursive character text splitter**

In [145]:
from langchain.vectorstores import Pinecone

In [146]:
import os

**LLM Setup**

In [61]:
from langchain.llms import Ollama

In [62]:
# LLM client for the 'llama2' model; temperature=0 is presumably chosen to
# keep generations as repeatable as possible (confirm intent).
llm = Ollama(model='llama2', temperature=0)

**Retrieval Prompt**