**Preprocessing text (in Python)**

- Load articles
- Split into chunks (pieces of text)
- Add metadata
- Prepare for indexing in ChromaDB

In [None]:
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document

# Collect every article file from the data folder.
# Path.glob yields entries in an arbitrary, filesystem-dependent order, so the
# listing is sorted to keep the document/chunk order identical on every run.
data_path = Path("data")
files = sorted(data_path.glob("*.txt"))

# Accumulates the documents loaded from all files
all_docs = []

topic = "German Elections 2025"  # General theme shared by every article
# Per-article metadata, keyed by the file name without its extension.
# Every entry carries the same six fields: keywords, date (YYYY-MM-DD), url,
# author, language and region.
metadata_lookup = {
    "article1_guardian": {
        "keywords": ["chaos", "Friedrich Merz", "Bundestag", "CDU", "coalition", "election", "whats next"],
        "date": "2025-05-06",
        "url": "https://www.theguardian.com/world/2025/may/06/chaos-in-the-german-bundestag-whats-next-for-friedrich-merz",
        "author": "The Guardian",
        "language": "en",
        "region": "Germany",
    },
    "article2_guardian": {
        "keywords": ["Friedrich Merz", "German", "chancellor", "Europe"],
        "date": "2025-05-06",
        "url": "https://www.theguardian.com/world/live/2025/may/06/friedrich-merz-german-chancellor-europe-live-latest-news?filterKeyEvents=false&page=with%3Ablock-681a19a08f08145062c515ef#block-681a19a08f08145062c515ef",
        "author": "The Guardian",
        "language": "en",
        "region": "Germany",
    },
    "article3_fin_times": {
        "keywords": ["Merz", "wins", "second vote", "german", "chancellor"],
        "date": "2025-05-06",
        "url": "https://www.ft.com/content/48665ff1-b741-44dc-903e-2f54322a7127",
        "author": "The Financial Times",  # was misspelled "The Financial time"
        "language": "en",
        "region": "Germany",
    },
    "article4_washpost": {
        "keywords": ["Merz", "government", "Germany", "coalition", "scholz"],
        "date": "2025-05-06",
        "url": "https://www.washingtonpost.com/world/2025/05/06/germany-government-merz-coalition/419e2d84-2a2f-11f0-a724-3bc879c9f843_story.html",
        "author": "The Washington Post",
        "language": "en",
        "region": "Germany",
    },
    "article5_apnews": {
        # "far right" previously carried stray underscores ("_far_right")
        "keywords": ["Merz", "scholz", "far right", "afd", "Germany", "election"],
        "date": "2025-02-24",
        "url": "https://apnews.com/article/germany-election-merz-scholz-far-right-afd-ebf16ed38e0beaff7fed9a6d29b32a24",
        "author": "AP News",
        "language": "en",
        "region": "Germany",
    },
    "article6_wiki": {
        "keywords": ["resignation", "German", "chancellor", "2024", "government", "crisis", "scholz"],
        "date": "2025-04-30",
        "url": "https://en.wikipedia.org/wiki/2024_German_government_crisis",
        "author": "Wiki",
        "language": "en",
        "region": "Germany",
    },
    "article7_deepnewz": {
        "keywords": ["German President", "Steinmeier", "Dismisses", "Chancellor", "government", "14 Minister", "scholz"],
        "date": "2025-03-25",
        "url": "https://deepnewz.com/germany/german-president-steinmeier-dismisses-chancellor-scholz-14-ministers-to-acting-bf3914a8",
        "author": "deepnewz.com",
        "language": "en",
        "region": "Germany",
    },
    "article8_timesnownews": {
        "keywords": ["German", "President", "Steinmeier", "dissolves", "Parliament", "Scholz", "loses majority"],
        "date": "2024-12-27",
        "url": "https://www.timesnownews.com/world/europe/german-president-dissolves-parliament-as-scholz-loses-majority-snap-polls-set-for-february-article-116714492",
        "author": "timesnownews.com",
        "language": "en",
        "region": "Germany",
    },
    "article9_dw": {
        # "loses confidence" previously had a trailing space
        "keywords": ["German election", "German", "loses confidence", "vote", "Scholz"],
        "date": "2024-12-17",
        "url": "https://www.dw.com/en/german-election-scholz-loses-confidence-vote/live-71063891",
        "author": "dw.com",
        "language": "en",
        "region": "Germany",
    },
}

# Load each article and attach its metadata to every document it yields
for file in files:
    filename = file.stem
    loader = TextLoader(str(file), encoding='utf-8')
    docs = loader.load()

    # The lookup depends only on the file, so resolve it once per file
    # instead of once per document.
    extra_meta = metadata_lookup.get(filename)
    if extra_meta is None:
        # A missing or misspelled key would otherwise index placeholder
        # values without any notice.
        print(f"Warning: no entry in metadata_lookup for '{filename}'; using default metadata")
        extra_meta = {}

    # Keywords are stored as one comma-separated string
    # (presumably because the vector store expects scalar metadata values - TODO confirm)
    keywords_list = extra_meta.get('keywords', [])
    keywords = ", ".join(keywords_list) if isinstance(keywords_list, list) else str(keywords_list)

    for doc in docs:
        doc.metadata['source'] = filename
        doc.metadata['topic'] = topic

        # Add the per-article fields from 'metadata_lookup', falling back to defaults
        doc.metadata['keywords'] = keywords
        doc.metadata['date'] = extra_meta.get('date', 'unknown')
        doc.metadata['url'] = extra_meta.get('url', 'unknown')
        doc.metadata['author'] = extra_meta.get('author', 'unknown')
        doc.metadata['language'] = extra_meta.get('language', 'en')
        doc.metadata['region'] = extra_meta.get('region', 'Germany')

    all_docs.extend(docs)

# Split into chunks
CHUNK_SIZE = 500     # size of one chunk in characters
CHUNK_OVERLAP = 100  # characters shared by neighbouring chunks to maintain context

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunked_docs = splitter.split_documents(all_docs)

print(f"Total amount of chunks: {len(chunked_docs)}")
if chunked_docs:
    # Preview the first chunk: its metadata and the start of its text
    print(chunked_docs[0].metadata)
    print(chunked_docs[0].page_content[:300])
else:
    # Indexing [0] on an empty list would fail with a bare IndexError
    print("No chunks were produced - check that the data folder contains non-empty .txt files")

Total amount of chunks: 121
{'source': 'article1_guardian', 'topic': 'German Elections 2025', 'keywords': 'chaos, Friedrich Merz, Bundestag, CDU, coalition, election, whats next', 'date': '2025-05-06', 'url': 'https://www.theguardian.com/world/2025/may/06/chaos-in-the-german-bundestag-whats-next-for-friedrich-merz', 'author': 'The Guardian', 'language': 'en', 'region': 'Germany'}
Title: What held up Friedrich Merz’s confirmation as chancellor and what’s next for Germany?
Source: The Guardian
Date: May 6, 2025
The CDU/CSU leader suffered an embarrassing surprise defeat in a first vote that was expected to be a formality.
Friedrich Merz has been confirmed as Germany’s new chan


**Indexing in ChromaDB**

- Embed the chunks and store them in ChromaDB with persistent storage (`./chroma_db`).

In [2]:
from langchain_community.vectorstores import Chroma
# Imported from langchain_community (already a dependency of this notebook, see
# the Chroma import) rather than through the `langchain.embeddings` path.
# NOTE(review): the recorded cell output shows a warning on the embedding line;
# it may also refer to the class itself - confirm against the installed version.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Choose an embedding
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Build a deterministic id per chunk: "<source>-<position within that source>".
# Without explicit ids every run of this cell generates fresh random ids, so
# re-running it appends a full duplicate set of chunks to the persistent
# collection. With stable ids a re-run overwrites the same entries instead.
# Entries written by earlier runs under random ids are not removed by this.
chunk_counts = {}
chunk_ids = []
for doc in chunked_docs:
    source = doc.metadata.get('source', 'unknown')
    position = chunk_counts.get(source, 0)
    chunk_counts[source] = position + 1
    chunk_ids.append(f"{source}-{position}")

# Create ChromaDB
db = Chroma.from_documents(
    documents=chunked_docs,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory="./chroma_db"
)

# Save
# NOTE(review): the recorded cell output shows a warning on this call; recent
# Chroma versions are reported to persist automatically - confirm and drop it
# once the minimum supported version is known.
db.persist()


  embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
  db.persist()
