In [1]:
# !pip install pandas pymongo torchinfo --quiet

# Solution prototype

In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import re
import nltk
import torch

from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymongo import MongoClient, errors
from sentence_transformers import SentenceTransformer
from torchinfo import summary
from tqdm import tqdm

load_dotenv("../../.env")

  from tqdm.autonotebook import tqdm, trange


True

## Data extraction

In [3]:
# Connection string comes from the environment (presumably supplied by the .env
# file loaded above); os.getenv returns None when MONGO_HOST is not set.
MONGO_HOST = os.getenv("MONGO_HOST")
# Database and collection that hold the article records queried below.
MONGO_DATABASE = 'insightfinder-dev'
DOCS_COLLECTION = 'documents'

In [4]:
def generate_data(query: dict, projection: dict | None = None, sort_order: list | None = None, limit: int | None = None):
    """Lazily yield the records of the documents collection that match ``query``.

    Args:
        query: MongoDB filter passed to ``collection.find``.
        projection: Optional field selection. ``None`` (the default) applies no
            projection, so complete records are yielded.
        sort_order: Optional sort specification handed to ``cursor.sort``;
            skipped when falsy.
        limit: Optional maximum number of records; skipped when falsy.

    Yields:
        One record per matching document.

    Note:
        Errors are printed rather than raised, so a failure ends the iteration
        early (possibly after some records were already yielded).
    """
    try:
        with MongoClient(MONGO_HOST) as mongo_client:
            collection = mongo_client[MONGO_DATABASE][DOCS_COLLECTION]
            # Pass the projection through unchanged. PyMongo rewrites an *empty*
            # projection to {"_id": 1}, so the former `projection or {}` returned
            # nothing but `_id` whenever the caller gave no projection.
            cursor = collection.find(query, projection)
            if sort_order:
                cursor = cursor.sort(sort_order)
            if limit:
                cursor = cursor.limit(limit)
            yield from cursor
    except errors.PyMongoError as e:
        print(f"MongoDB error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def preprocess_paragraphs(paragraphs):
    """Join a list of paragraph strings into one space-separated text.

    Args:
        paragraphs: Expected to be a list of strings.

    Returns:
        ``None`` when ``paragraphs`` is not a list; otherwise the stripped,
        non-empty paragraphs joined by single spaces (``""`` if none remain).
        Entries that are not strings (e.g. ``None``) are skipped instead of
        raising ``AttributeError``.
    """
    if not isinstance(paragraphs, list):
        return None
    stripped = (p.strip() for p in paragraphs if isinstance(p, str))
    return " ".join(s for s in stripped if s)

In [5]:
# Visited articles dated within May 2024, from every site except "faz".
query = dict(
    visited=True,
    parsed_date={"$exists": True, "$ne": None, "$gte": "2024-05-01", "$lte": "2024-05-31"},
    site_name={"$exists": True, "$nin": ["faz"]},
)
# Restrict the returned fields to the ones inspected or used below.
projection = dict.fromkeys(
    ("_id", "url", "parsed_date", "title", "description", "paragraphs", "site_name"), 1
)

# No server-side sorting and no cap on the number of records for this run
# (a sort_order of [("parsed_date", -1)] would return the newest first).
sort_order = limit = None

data_generator = generate_data(query, projection, sort_order, limit)
df = pd.DataFrame(data_generator)
# Collapse each record's paragraph list into one text (None when it is not a list).
df["paragraphs"] = df["paragraphs"].apply(preprocess_paragraphs)

In [6]:
# Inspect the extracted articles (one row per record returned by the query).
df

Unnamed: 0,_id,url,description,title,site_name,parsed_date,paragraphs
0,66407ce2c0e28ab642bf44f4,https://www.handelsblatt.com/finanzen/banken-v...,Die Fondsmanagerin Alexandra Annecke kritisier...,„IT-Probleme bei der Migration der Postbank si...,handelsblatt,2024-05-10,Frankfurt. Die Fondsgesellschaft Union Investm...
1,66407ce2c0e28ab642bf44f5,https://www.handelsblatt.com/finanzen/banken-v...,Bei kontaktlosen Zahlungen per EC-Karte muss m...,Darum müssen Sie plötzlich kaum noch Ihre PIN ...,handelsblatt,2024-05-10,"Frankfurt. Üblich ist, dass man beim Bezahlen ..."
2,66407ce2c0e28ab642bf44f6,https://www.handelsblatt.com/finanzen/banken-v...,Die Autoversicherung leidet unter stark gestie...,Allianz lässt bei der Autoreparatur nun auch g...,handelsblatt,2024-05-10,München. Die Kfz- Versicherung der Allianz \...
3,66407ce2c0e28ab642bf44f7,https://www.handelsblatt.com/finanzen/banken-v...,Die spanische Großbank will die kleinere Konku...,BBVA macht feindliches Übernahmeangebot für Sa...,handelsblatt,2024-05-09,"Madrid, Düsseldorf. Die spanische Großbank BB..."
4,66407ce2c0e28ab642bf44f8,https://www.handelsblatt.com/finanzen/banken-v...,Erstmals werden Details zu den Vorwürfen bekan...,Großaktionär Förtsch mit neuen Vorwürfe gegen ...,handelsblatt,2024-05-08,"Frankfurt. Bernd Förtsch, größte Einzelaktionä..."
...,...,...,...,...,...,...,...
10030,667cfec66d6290962b720db2,https://www.tagesspiegel.de/berlin/u-haft-unte...,13 Richter und 13 Staatsanwälte werden für den...,"U-Haft, Unterbindungsgewahrsam, Schnellverfahr...",tagesspiegel,2024-05-01,"Nicht nur die Polizei, die mit bis zu 6000 Bea..."
10031,667cfec86d6290962b720db3,https://www.tagesspiegel.de/wissen/ungehorige-...,Zwischen zwei Gewichtsklassen von Himmelskörpe...,„Ungehörige“ Himmelsobjekte : Zu schwer für e...,tagesspiegel,2024-05-01,Mit modernen Teleskopen und Detektoren werden ...
10032,667cfecb6d6290962b720db4,https://www.tagesspiegel.de/politik/die-tur-de...,Deutschland hätte die EU-Osterweiterung „stärk...,"„Ukraine, Georgien, Moldau, Staaten des Westba...",tagesspiegel,2024-05-01,CDU-Chef Friedrich Merz bekennt sich zur Aufna...
10033,667cfece6d6290962b720db5,https://www.tagesspiegel.de/politik/streit-ube...,SPD und Grüne fordern zum Tag der Arbeit eine ...,Streit über Erhöhung des Mindestlohns : „Natü...,tagesspiegel,2024-05-01,Unmittelbar vor dem Tag der Arbeit streitet si...


In [7]:
# The query filters parsed_date with string bounds; parse it to datetimes here
# to check the date range actually covered by the extract.
pd.to_datetime(df["parsed_date"]).describe()

count                            10035
mean     2024-05-16 09:42:27.443945984
min                2024-05-01 00:00:00
25%                2024-05-08 00:00:00
50%                2024-05-16 00:00:00
75%                2024-05-24 00:00:00
max                2024-05-31 00:00:00
Name: parsed_date, dtype: object

In [8]:
# Number of extracted articles per source site.
df["site_name"].value_counts()

site_name
tagesspiegel    3970
spiegel         3150
heise           1102
handelsblatt    1018
tagesschau       795
Name: count, dtype: int64

## Build article documents

In [9]:
# NLTK resources: 'stopwords' provides the German stop-word list used below;
# 'punkt' is presumably what word_tokenize relies on.
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): no code visible in this notebook uses wordnet — confirm it is needed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/malek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/malek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/malek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# German stop words, held in a set because preprocess_document tests every
# token for membership.
stop_words = set(stopwords.words('german'))
len(stop_words)

232

In [11]:
def preprocess_document(text):
    """Normalise ``text`` into a space-separated string of content tokens.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, the result is tokenised with NLTK's
    German tokenizer, and tokens found in ``stop_words`` are dropped.
    """
    cleaned = re.sub(r'[^\w\s]', ' ', text.lower())
    kept = []
    for token in word_tokenize(cleaned, language='german'):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

def build_documents(records):
    """Yield ``(position, text)`` for every record that produces non-empty text.

    Args:
        records: Iterable of dict-like records with optional ``title``,
            ``description`` and ``paragraphs`` entries.

    Yields:
        Tuples of the record's position in ``records`` and its preprocessed
        text (title, description and paragraphs joined, then passed through
        ``preprocess_document``). Records whose text ends up empty are skipped,
        so positions may have gaps.
    """
    for idx, record in enumerate(records):
        parts = (record.get(key) for key in ("title", "description", "paragraphs"))
        # isinstance rather than "is not None": records built from a DataFrame
        # carry NaN (a float) for missing fields, and NaN has no .strip().
        stripped = (p.strip() for p in parts if isinstance(p, str))
        res = preprocess_document(" ".join(s for s in stripped if s))
        if res.strip():
            yield idx, res

In [12]:
# Keep the (row position, text) pairs under their own name so that `documents`
# only ever refers to the list of texts.
indexed_documents = list(build_documents(df[["title", "description", "paragraphs"]].to_dict("records")))
# index[i] is the position in df of the row that produced documents[i].
index = [position for position, _text in indexed_documents]
documents = [text for _position, text in indexed_documents]
len(documents)

10021

In [13]:
# Spot check: raw first row, to compare with index[0] / documents[0] below.
df.iloc[0]

_id                                     66407ce2c0e28ab642bf44f4
url            https://www.handelsblatt.com/finanzen/banken-v...
description    Die Fondsmanagerin Alexandra Annecke kritisier...
title          „IT-Probleme bei der Migration der Postbank si...
site_name                                           handelsblatt
parsed_date                                           2024-05-10
paragraphs     Frankfurt. Die Fondsgesellschaft Union Investm...
Name: 0, dtype: object

In [14]:
# Position in df of the row behind the first document.
index[0]

0

In [15]:
# Preprocessed text built from that row (lower-cased, punctuation and stop words removed).
documents[0]

'it probleme migration postbank blamage fondsmanagerin alexandra annecke kritisiert service probleme postbank vorstand aufsichtsrat deutschen bank entlasten gründe frankfurt fondsgesellschaft union investment übt scharfe kritik eingeschränkten fragemöglichkeiten virtuellen hauptversammlung deutschen bank fondsmanagerin alexandra annecke kündigt interview handelsblatt vorstand aufsichtsrat bank entlasten vermögensverwalter gehören rund 0 6 prozent aktien dürfte rang 20 30 größten anteilseigner liegen praktische konsequenzen votum entlastung symbolcharakter investoren drücken führung aktiengesellschaft misstrauen öffentlicher dissens institutionellen investoren führungsriege bank unangenehm'

In [16]:
# Distribution of document lengths in whitespace-separated tokens, reported at every decile.
pd.Series(documents).apply(lambda t: len(t.split())).describe(np.linspace(0, 1, 11))

count    10021.000000
mean       203.585969
std        178.477336
min          2.000000
0%           2.000000
10%         41.000000
20%         62.000000
30%         91.000000
40%        130.000000
50%        177.000000
60%        209.000000
70%        247.000000
80%        306.000000
90%        401.000000
100%      2645.000000
max       2645.000000
dtype: float64

## Create embeddings

In [17]:
# Multilingual sentence-embedding model, loaded on CPU and switched to eval mode.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
model.eval()

# Layer and parameter overview (torchinfo).
summary(model)



Layer (type:depth-idx)                                            Param #
SentenceTransformer                                               --
├─Transformer: 1-1                                                --
│    └─BertModel: 2-1                                             --
│    │    └─BertEmbeddings: 3-1                                   96,212,352
│    │    └─BertEncoder: 3-2                                      21,293,568
│    │    └─BertPooler: 3-3                                       147,840
├─Pooling: 1-2                                                    --
Total params: 117,653,760
Trainable params: 117,653,760
Non-trainable params: 0

In [18]:
@torch.no_grad()
def embed_document(document_or_batch: str | list, model):
    return model.encode(document_or_batch)

In [19]:
batch_size = 16
num_records = len(documents)
# Ceiling division: a trailing partial batch still counts as one step.
num_steps = num_records // batch_size + int(num_records % batch_size > 0)
# One embedding per entry of `documents`, in the same order.
embeddings = []

# The context manager closes the progress bar even when the loop is interrupted.
with tqdm(total=num_steps, position=0, leave=False) as pbar:
    for start_idx in range(0, num_records, batch_size):
        # Slicing clamps at the end of the list, so no explicit min() is needed.
        batch = documents[start_idx:start_idx + batch_size]
        # Keep the vectors; previously they were computed and thrown away.
        embeddings.extend(embed_document(batch, model))
        pbar.update()

  3%|██▌                                                                             | 20/627 [00:15<07:53,  1.28it/s]
KeyboardInterrupt



# Implementing the entire process

In [1]:
# dependencies

import json
import os
import numpy as np

import re
import nltk
import torch

from bson.objectid import ObjectId
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymongo import MongoClient, errors
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# NLTK resources: 'stopwords' provides the German stop-word list used below;
# 'punkt' is presumably what word_tokenize relies on.
nltk.download('punkt')
nltk.download('stopwords')
# NOTE(review): no code visible in this notebook uses wordnet — confirm it is needed.
nltk.download('wordnet')

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to /Users/malek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/malek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/malek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# globals

# Load the .env file first so that MONGO_HOST can be read from the environment below.
load_dotenv("../../.env")
MONGO_DATABASE = 'insightfinder-dev'
# Source collection with the article records.
DOCS_COLLECTION = 'documents'
# Target collection; entries share the _id of the article they were computed from.
DOC_EMBEDDINGS_COLLECTION = 'embeddings'
# os.getenv returns None when MONGO_HOST is not set.
MONGO_HOST = os.getenv("MONGO_HOST")

In [3]:
# constants

# German stop words removed by preprocess_document.
stop_words = set(stopwords.words('german'))

# Sentence-embedding model, loaded on CPU and switched to eval mode; as the last
# expression of the cell, eval() also displays the module structure.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
model.eval()



SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [7]:
def get_records_in_range(start_date: str = None, end_date: str = None, limit: int = None):
    """Yield visited, dated, non-"faz" records that carry at least one text field.

    Args:
        start_date: Optional lower bound for ``parsed_date`` (``$gte``).
        end_date: Optional upper bound for ``parsed_date`` (``$lte``).
        limit: Optional maximum number of records; skipped when falsy.

    Yields:
        Complete records from the documents collection whose ``title``,
        ``description`` or ``paragraphs`` passes the non-empty filter.

    Note:
        Errors are printed rather than raised, so a failure ends the iteration early.
    """
    # parsed_date must exist and be non-null; bounds are added only when supplied.
    date_filters = {"$exists": True, "$ne": None}
    optional_bounds = {"$gte": start_date, "$lte": end_date}
    date_filters.update(
        {operator: bound for operator, bound in optional_bounds.items() if bound}
    )

    # One identical "present and not empty/None" clause per text field.
    has_text = [
        {field: {"$exists": True, "$not": {"$in": ["", None]}}}
        for field in ("title", "description", "paragraphs")
    ]
    base_filter = {
        "visited": True,
        "parsed_date": date_filters,
        "site_name": {"$exists": True, "$nin": ["faz"]},
    }
    query = {"$and": [base_filter, {"$or": has_text}]}

    try:
        with MongoClient(MONGO_HOST) as mongo_client:
            cursor = mongo_client[MONGO_DATABASE][DOCS_COLLECTION].find(query)
            if limit:
                cursor = cursor.limit(limit)
            yield from cursor
    except errors.PyMongoError as mongo_error:
        print(f"MongoDB error: {mongo_error}")
    except Exception as unexpected_error:
        print(f"An unexpected error occurred: {unexpected_error}")

def get_documents_without_embeddings(document_ids: list):
    """Return the ids from ``document_ids`` that have no entry in the embeddings collection.

    Args:
        document_ids: Ids to look up as ``_id`` in the embeddings collection.

    Returns:
        The ids without a stored embedding, in their original order. If the
        lookup fails, the error is printed and an empty list is returned, so the
        result is always a list and callers can safely take its length (the
        affected ids are then simply not reported as missing for this call).
    """
    try:
        with MongoClient(MONGO_HOST) as mongo_client:
            collection = mongo_client[MONGO_DATABASE][DOC_EMBEDDINGS_COLLECTION]
            # Only the _id is needed; projecting it avoids transferring the
            # stored embedding of every existing entry.
            existing_documents = collection.find({"_id": {"$in": document_ids}}, {"_id": 1})
            existing_document_ids = {doc["_id"] for doc in existing_documents}
            return [doc_id for doc_id in document_ids if doc_id not in existing_document_ids]
    except errors.PyMongoError as e:
        print(f"MongoDB error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    # Reached only after a failure: keep the return type consistent.
    return []

def join_list_of_strings(strings_list):
    """Join a list of strings into one space-separated string.

    Each entry is stripped and entries that are empty after stripping are
    dropped. Anything that is not a list (including ``None`` and plain
    strings) is returned unchanged.
    """
    if isinstance(strings_list, list):
        stripped_entries = (entry.strip() for entry in strings_list)
        return " ".join([entry for entry in stripped_entries if len(entry) > 0])
    return strings_list

def preprocess_document(text):
    """Normalise ``text`` into a space-separated string of content tokens.

    Steps: lower-case the text, replace every character that is neither a word
    character nor whitespace with a space, tokenise with NLTK's German
    tokenizer, and drop the tokens contained in ``stop_words``.
    """
    normalized = re.sub(r'[^\w\s]', ' ', text.lower())
    tokens = word_tokenize(normalized, language='german')
    return ' '.join(filter(lambda token: token not in stop_words, tokens))

def create_document_for_record(record):
    """Build the preprocessed text of one record.

    ``title``, ``description`` and ``paragraphs`` are each passed through
    ``join_list_of_strings``; the values that are not ``None`` are stripped,
    joined with spaces in that order and run through ``preprocess_document``.
    """
    field_values = (
        join_list_of_strings(record.get(field_name))
        for field_name in ("title", "description", "paragraphs")
    )
    document = " ".join(value.strip() for value in field_values if value is not None)
    return preprocess_document(document)

@torch.no_grad()
def batch_embed_documents(documents: list, model):
    """Encode ``documents`` with ``model`` and return the result as nested lists.

    Gradient tracking is disabled for the call; the object returned by
    ``model.encode`` is converted with its ``tolist()`` method.
    """
    encoded = model.encode(documents)
    return encoded.tolist()

def batch_add_embeddings(payload: list):
    """Insert ``payload`` into the embeddings collection with one ``insert_many`` call.

    Errors are printed rather than raised; the function returns ``None`` either way.
    """
    try:
        with MongoClient(MONGO_HOST) as mongo_client:
            mongo_client[MONGO_DATABASE][DOC_EMBEDDINGS_COLLECTION].insert_many(payload)
    except errors.PyMongoError as mongo_error:
        print(f"MongoDB error: {mongo_error}")
    except Exception as unexpected_error:
        print(f"An unexpected error occurred: {unexpected_error}")

def generate_batches(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    Raises:
        ValueError: if ``n`` is not positive. As this is a generator, the check
            runs when iteration starts, not when the function is called.
    """
    if n <= 0:
        raise ValueError("Batch size must be a positive integer.")
    total = len(lst)
    start = 0
    while start < total:
        yield lst[start:start + n]
        start += n

In [8]:
# variables

# Number of records embedded and inserted per batch.
batch_size = 16
# Bounds applied to parsed_date by get_records_in_range ($gte / $lte, both inclusive).
start_date = "2024-07-27"
end_date = "2024-08-04"

In [9]:
records = list(get_records_in_range(start_date=start_date, end_date=end_date))
num_records = len(records)
# Ceiling division: a trailing partial batch still counts as one step.
num_steps = num_records // batch_size + int(num_records % batch_size > 0)

# The context manager closes the progress bar even when the loop is interrupted.
with tqdm(total=num_steps, position=0, leave=True) as pbar:
    for batch in generate_batches(records, batch_size):
        batch_document_ids = [doc["_id"] for doc in batch]
        # Build the lookup set once per batch (not once per record). `or []`
        # covers a lookup that returned None after a database error, in which
        # case the batch is skipped instead of crashing on len(None).
        doc_ids_without_embeddings = set(get_documents_without_embeddings(batch_document_ids) or [])
        if doc_ids_without_embeddings:
            # Only records that have no stored embedding yet are processed.
            effective_batch = [doc for doc in batch if doc["_id"] in doc_ids_without_embeddings]
            documents = [create_document_for_record(record) for record in effective_batch]
            embeddings = batch_embed_documents(documents, model)
            # Embeddings are stored under the _id of the record they belong to.
            payload = [
                {"_id": record["_id"], "embedding": embedding}
                for record, embedding in zip(effective_batch, embeddings)
            ]
            batch_add_embeddings(payload)
        pbar.update()

100%|█████████████████████████████████████████| 167/167 [29:53<00:00, 10.15s/it]