In [None]:
# !pip install pandas pymongo torchinfo --quiet

In [64]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import re
import nltk
import torch

from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymongo import MongoClient, errors
from sentence_transformers import SentenceTransformer
from torchinfo import summary
from tqdm import tqdm

load_dotenv("../.env")

True

In [2]:
# MongoDB connection settings used by the data-access helpers in this notebook.
# Database and collection names are fixed; the host is read from the process
# environment and is None when the MONGO_HOST variable is not set.
MONGO_DATABASE = "insightfinder-dev"
MONGO_COLLECTION = "content"
MONGO_HOST = os.environ.get("MONGO_HOST")

# Data extraction

In [3]:
def generate_data(query: dict, projection: dict = None, sort_order: list = None, limit: int = None):
    """Lazily yield documents matching ``query`` from the configured collection.

    Args:
        query: Filter document passed to ``collection.find``.
        projection: Optional projection passed to ``collection.find``. ``None``
            (the default) is forwarded unchanged, which asks for every field.
        sort_order: Optional sort specification passed to ``cursor.sort``;
            skipped when falsy.
        limit: Optional cap passed to ``cursor.limit``; skipped when falsy.

    Yields:
        Each document produced by the cursor.

    Note:
        Errors are printed instead of raised, so after a failure the generator
        simply ends and the caller may receive fewer documents than exist
        (possibly none).
    """
    try:
        with MongoClient(MONGO_HOST) as mongo_client:
            collection = mongo_client[MONGO_DATABASE][MONGO_COLLECTION]
            # Forward the projection untouched. Substituting ``{}`` for ``None``
            # is not neutral: according to the PyMongo 4 migration notes, 3.x
            # rewrote an empty projection to "_id only", so a call without a
            # projection would silently lose every other field.
            res = collection.find(query, projection)
            if sort_order:
                res = res.sort(sort_order)
            if limit:
                res = res.limit(limit)
            yield from res
    except errors.PyMongoError as e:
        print(f"MongoDB error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def preprocess_paragraphs(paragraphs):
    """Join a list of paragraph strings into one space-separated string.

    Args:
        paragraphs: Expected to be a list of strings.

    Returns:
        ``None`` when ``paragraphs`` is not a list; otherwise the stripped,
        non-blank string entries joined by single spaces (``""`` when nothing
        remains).
    """
    if not isinstance(paragraphs, list):
        return None
    # Non-string entries (e.g. None) are skipped instead of raising on .strip().
    stripped = (p.strip() for p in paragraphs if isinstance(p, str))
    return " ".join(p for p in stripped if p)

In [4]:
# Select visited documents that have a non-null parsed_date on/after 2024-05-01
# and a site_name other than "faz".
# NOTE(review): "$gte" compares against a string, so this presumably relies on
# parsed_date being stored as an ISO "YYYY-MM-DD" string -- confirm.
parsed_date_filter = {"$exists": True, "$ne": None, "$gte": "2024-05-01"}
site_name_filter = {"$exists": True, "$nin": ["faz"]}
query = {
    "visited": True,
    "parsed_date": parsed_date_filter,
    "site_name": site_name_filter,
}

# Only these fields are requested from the collection.
projection = dict.fromkeys(
    ["_id", "url", "parsed_date", "title", "description", "paragraphs", "site_name"], 1
)

# Sorting and limiting are both disabled for the full extraction.
# sort_order = [("parsed_date", -1)]
sort_order = None
limit = None

data_generator = generate_data(query, projection, sort_order, limit)
df = pd.DataFrame(data_generator)

# Collapse each record's paragraph list into one string (None for non-lists).
joined_paragraphs = df["paragraphs"].apply(preprocess_paragraphs)
df["paragraphs"] = joined_paragraphs

In [5]:
df

Unnamed: 0,_id,url,description,title,site_name,parsed_date,paragraphs
0,66407ce2c0e28ab642bf44f4,https://www.handelsblatt.com/finanzen/banken-v...,Die Fondsmanagerin Alexandra Annecke kritisier...,„IT-Probleme bei der Migration der Postbank si...,handelsblatt,2024-05-10,Frankfurt. Die Fondsgesellschaft Union Investm...
1,66407ce2c0e28ab642bf44f5,https://www.handelsblatt.com/finanzen/banken-v...,Bei kontaktlosen Zahlungen per EC-Karte muss m...,Darum müssen Sie plötzlich kaum noch Ihre PIN ...,handelsblatt,2024-05-10,"Frankfurt. Üblich ist, dass man beim Bezahlen ..."
2,66407ce2c0e28ab642bf44f6,https://www.handelsblatt.com/finanzen/banken-v...,Die Autoversicherung leidet unter stark gestie...,Allianz lässt bei der Autoreparatur nun auch g...,handelsblatt,2024-05-10,München. Die Kfz- Versicherung der Allianz \...
3,66407ce2c0e28ab642bf44f7,https://www.handelsblatt.com/finanzen/banken-v...,Die spanische Großbank will die kleinere Konku...,BBVA macht feindliches Übernahmeangebot für Sa...,handelsblatt,2024-05-09,"Madrid, Düsseldorf. Die spanische Großbank BB..."
4,66407ce2c0e28ab642bf44f8,https://www.handelsblatt.com/finanzen/banken-v...,Erstmals werden Details zu den Vorwürfen bekan...,Großaktionär Förtsch mit neuen Vorwürfe gegen ...,handelsblatt,2024-05-08,"Frankfurt. Bernd Förtsch, größte Einzelaktionä..."
...,...,...,...,...,...,...,...
25390,6685a7ce6c1177bf3bc2ecad,https://www.tagesspiegel.de/potsdam/landeshaup...,Zwei Männer gerieten am Montagabend in Streit....,Mit Gegenständen beworfen : Betrunkene gehen ...,tagesspiegel,2024-07-02,Zwei Betrunkene haben sich am Montagabend in d...
25391,6685a7d46c1177bf3bc2ecae,https://www.tagesspiegel.de/potsdam/landeshaup...,Aus unbekannter Ursache geriet ein 53-Jähriger...,Autofahrer geriet in Gegenverkehr : Frontalzu...,tagesspiegel,2024-07-02,Im Potsdamer Ortsteil Nedlitz ist ein Autofahr...
25392,6685a7dd6c1177bf3bc2ecaf,https://www.tagesspiegel.de/internetrouter-fir...,Fritz!-Boxen sind in vielen Haushalten präsent...,Internetrouter-Firma : Fritz!-Box-Hersteller ...,tagesspiegel,2024-07-02,Das Bundeskartellamt hat eine hohe Geldbuße ge...
25393,6685a7e76c1177bf3bc2ecb0,https://www.tagesspiegel.de/regierungskonsulta...,Die deutsch-polnischen Regierungskonsultatione...,Regierungskonsultationen : Polen und Deutschl...,tagesspiegel,2024-07-02,Deutschland und Polen wollen mit einem Aktions...


In [6]:
pd.to_datetime(df["parsed_date"]).describe()

count                            25395
mean     2024-06-05 07:29:53.408151040
min                2024-05-01 00:00:00
25%                2024-05-21 00:00:00
50%                2024-06-08 00:00:00
75%                2024-06-21 00:00:00
max                2024-07-03 00:00:00
Name: parsed_date, dtype: object

In [7]:
df["site_name"].value_counts()

site_name
tagesspiegel    12049
spiegel          6797
handelsblatt     2575
heise            2350
tagesschau       1624
Name: count, dtype: int64

# Build article documents

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [11]:
stop_words = set(stopwords.words('german'))
len(stop_words)

232

In [23]:
def preprocess_document(text):
    """Normalise raw text into a space-separated string of kept tokens.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, the result is tokenised with
    ``word_tokenize`` (German), and tokens present in the module-level
    ``stop_words`` set are dropped.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    kept_tokens = []
    for token in word_tokenize(without_punctuation, language='german'):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def build_documents(records):
    """Yield ``(position, document)`` for every record that produces text.

    For each record the ``title``, ``description`` and ``paragraphs`` values
    are stripped, joined with spaces and passed through
    ``preprocess_document``. Records whose processed text is empty are
    skipped, so the yielded positions may have gaps.

    Args:
        records: Iterable of dict-like records exposing ``.get``.

    Yields:
        Tuples of the record's position in ``records`` and its processed text.
    """
    for idx, record in enumerate(records):
        fields = (record.get("title"), record.get("description"), record.get("paragraphs"))
        # Missing values can arrive as None or as float NaN (the placeholder
        # pandas uses in records built from a DataFrame). NaN is not None, so
        # only real strings are kept before calling .strip().
        stripped = (value.strip() for value in fields if isinstance(value, str))
        res = preprocess_document(" ".join(part for part in stripped if part))
        if res.strip():
            yield idx, res

In [24]:
# Build the processed documents. Each entry pairs a record's position in `df`
# with its text; records that produced no text are absent, so `index` keeps the
# mapping from `documents` back to `df` rows.
text_columns = df[["title", "description", "paragraphs"]]
indexed_documents = list(build_documents(text_columns.to_dict("records")))
index = [position for position, _ in indexed_documents]
documents = [text for _, text in indexed_documents]
len(documents)

25370

In [25]:
df.iloc[0]

_id                                     66407ce2c0e28ab642bf44f4
url            https://www.handelsblatt.com/finanzen/banken-v...
description    Die Fondsmanagerin Alexandra Annecke kritisier...
title          „IT-Probleme bei der Migration der Postbank si...
site_name                                           handelsblatt
parsed_date                                           2024-05-10
paragraphs     Frankfurt. Die Fondsgesellschaft Union Investm...
Name: 0, dtype: object

In [26]:
index[0]

0

In [27]:
documents[0]

'it probleme migration postbank blamage fondsmanagerin alexandra annecke kritisiert service probleme postbank vorstand aufsichtsrat deutschen bank entlasten gründe frankfurt fondsgesellschaft union investment übt scharfe kritik eingeschränkten fragemöglichkeiten virtuellen hauptversammlung deutschen bank fondsmanagerin alexandra annecke kündigt interview handelsblatt vorstand aufsichtsrat bank entlasten vermögensverwalter gehören rund 0 6 prozent aktien dürfte rang 20 30 größten anteilseigner liegen praktische konsequenzen votum entlastung symbolcharakter investoren drücken führung aktiengesellschaft misstrauen öffentlicher dissens institutionellen investoren führungsriege bank unangenehm'

In [44]:
pd.Series(documents).apply(lambda t: len(t.split())).describe(np.linspace(0, 1, 11))

count    25370.000000
mean       339.290737
std        292.713831
min          2.000000
0%           2.000000
10%         71.000000
20%        109.000000
30%        164.000000
40%        220.000000
50%        284.000000
60%        339.000000
70%        404.000000
80%        506.000000
90%        684.000000
100%      5364.000000
max       5364.000000
dtype: float64

# Create embeddings

In [58]:
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
model.eval()

summary(model)

Layer (type:depth-idx)                                            Param #
SentenceTransformer                                               --
├─Transformer: 1-1                                                --
│    └─BertModel: 2-1                                             --
│    │    └─BertEmbeddings: 3-1                                   96,212,352
│    │    └─BertEncoder: 3-2                                      21,293,568
│    │    └─BertPooler: 3-3                                       147,840
├─Pooling: 1-2                                                    --
Total params: 117,653,760
Trainable params: 117,653,760
Non-trainable params: 0

In [63]:
@torch.no_grad()
def embed_document(document_or_batch: str | list, model):
    return model.encode(document_or_batch)

In [71]:
batch_size = 16
num_records = len(documents)
num_steps = num_records // batch_size + int(num_records % batch_size > 0)

# Keep the results instead of discarding them.
# NOTE(review): assumes model.encode returns one vector per input document,
# so extending the list yields one entry per document -- confirm.
embeddings = []

# The context manager closes the bar even if the run is interrupted, so an
# aborted run does not leave a stale progress bar behind.
with tqdm(total=num_steps, position=0, leave=False) as pbar:
    for start_idx in range(0, num_records, batch_size):
        # Slicing past the end is safe, so the last batch may simply be shorter.
        batch = documents[start_idx:start_idx + batch_size]
        embeddings.extend(embed_document(batch, model))
        pbar.update()

  4%|█▍                                       | 57/1586 [00:45<20:49,  1.22it/s]
KeyboardInterrupt



# Implementing the entire process

In [None]:
# dependencies

import os
import numpy as np

import re
import nltk
import torch

from bson.objectid import ObjectId
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymongo import MongoClient, errors
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [10]:
# globals

load_dotenv("../.env")
MONGO_DATABASE = 'insightfinder-dev'
MONGO_COLLECTION = 'content'
MONGO_HOST = os.getenv("MONGO_HOST")

In [22]:
# constants

stop_words = set(stopwords.words('german'))

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
model.eval()



SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [25]:
# variables

batch_size = 16

In [48]:
def get_records_without_embedding(limit: int = None):
    """Yield records that still need an embedding.

    A record is selected when all of the following hold:
      * ``visited`` is true, ``parsed_date`` exists and is not null, and
        ``site_name`` exists and is not ``"faz"``;
      * ``embedding`` is missing or null;
      * at least one of ``title``, ``description``, ``paragraphs`` exists and
        is neither ``""`` nor null.

    Args:
        limit: Optional cap on the number of records; skipped when falsy.

    Yields:
        The matching documents as returned by the cursor.

    Note:
        Errors are printed instead of raised, so the generator may end early.
    """
    eligible_article = {
        "visited": True,
        "parsed_date": {"$exists": True, "$ne": None},
        "site_name": {"$exists": True, "$nin": ["faz"]},
    }
    embedding_missing = {
        "$or": [
            {"embedding": {"$exists": False}},
            {"embedding": {"$eq": None}},
        ],
    }
    has_some_text = {
        "$or": [
            {field: {"$exists": True, "$not": {"$in": ["", None]}}}
            for field in ("title", "description", "paragraphs")
        ]
    }
    query = {"$and": [eligible_article, embedding_missing, has_some_text]}

    try:
        with MongoClient(MONGO_HOST) as mongo_client:
            cursor = mongo_client[MONGO_DATABASE][MONGO_COLLECTION].find(query)
            if limit:
                cursor = cursor.limit(limit)
            yield from cursor
    except errors.PyMongoError as e:
        print(f"MongoDB error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def join_list_of_strings(strings_list):
    """Collapse a list of strings into one space-separated string.

    Lists are reduced to their stripped, non-blank entries joined by single
    spaces. Any other value (including ``None``) is returned unchanged.
    """
    if isinstance(strings_list, list):
        stripped_items = (item.strip() for item in strings_list)
        return " ".join(item for item in stripped_items if item)
    return strings_list

def preprocess_document(text):
    """Lower-case ``text``, blank out punctuation and drop stop words.

    Characters that are neither word characters nor whitespace become spaces;
    the cleaned text is tokenised with ``word_tokenize`` (German) and tokens
    found in the module-level ``stop_words`` set are removed. The surviving
    tokens are returned joined by single spaces.
    """
    cleaned = re.sub(r'[^\w\s]', ' ', text.lower())
    tokens = word_tokenize(cleaned, language='german')
    return ' '.join(filter(lambda token: token not in stop_words, tokens))

def create_document_for_record(record):
    """Build the processed text document for one database record.

    ``title``, ``description`` and ``paragraphs`` are each passed through
    ``join_list_of_strings``, the resulting strings are stripped and joined
    with spaces, and the combined text is returned after
    ``preprocess_document``.

    Args:
        record: Dict-like record exposing ``.get``.

    Returns:
        The processed document string (may be empty).
    """
    fields = (
        join_list_of_strings(record.get(key))
        for key in ("title", "description", "paragraphs")
    )
    # join_list_of_strings returns non-list values unchanged, so a field may
    # still be None or some other non-string scalar here; only strings can be
    # stripped, everything else is skipped rather than raising.
    document = " ".join(value.strip() for value in fields if isinstance(value, str))
    return preprocess_document(document)

def batch_embed_documents(documents: list, model):
    """Encode ``documents`` with ``model`` and return the result as plain lists.

    ``model.encode`` is called with gradient tracking disabled and ``.tolist()``
    is applied to whatever it returns.
    """
    with torch.no_grad():
        encoded = model.encode(documents)
    return encoded.tolist()

def add_embedding_for_record(record_id: ObjectId, embedding: list, collection=None):
    """Set the ``embedding`` field of the document whose ``_id`` is ``record_id``.

    Args:
        record_id: ``_id`` of the document to update.
        embedding: Value stored under the ``embedding`` key.
        collection: Optional object exposing ``update_one`` (e.g. an already
            opened collection). When given it is used directly, so callers
            can reuse one connection for many updates. When omitted, a
            ``MongoClient`` is opened for this single call and closed
            afterwards, exactly as before.

    Note:
        Errors are printed instead of raised.
    """
    record_filter = {"_id": record_id}
    update = {"$set": {"embedding": embedding}}
    try:
        # Compare against None explicitly: PyMongo collection objects do not
        # support truth-value testing.
        if collection is not None:
            collection.update_one(record_filter, update)
            return
        with MongoClient(MONGO_HOST) as mongo_client:
            db = mongo_client[MONGO_DATABASE]
            db[MONGO_COLLECTION].update_one(record_filter, update)
    except errors.PyMongoError as e:
        print(f"MongoDB error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def generate_batches(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The last slice may be shorter. Because this is a generator, the
    ``ValueError`` for a non-positive ``n`` is raised when iteration starts,
    not when the function is called.
    """
    if 0 >= n:
        raise ValueError("Batch size must be a positive integer.")
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

In [41]:
# generator = get_records_without_embedding(limit=1)
# record = next(generator)
# print(record["_id"])
# document = create_document_for_record(record)
# embedding = batch_embed_documents(document, model)
# add_embedding_for_record(record["_id"], embedding)

66407ce2c0e28ab642bf44f7


In [None]:
# Fetch every record that still lacks an embedding, then embed and store them
# batch by batch.
records = list(get_records_without_embedding())
num_records = len(records)
num_steps = num_records // batch_size + int(num_records % batch_size > 0)

# The context manager closes the bar even when the run is interrupted or
# fails, so a re-run does not stack a new bar on top of a stale one.
with tqdm(total=num_steps) as pbar:
    for batch in generate_batches(records, batch_size):
        documents = [create_document_for_record(record) for record in batch]
        embeddings = batch_embed_documents(documents, model)
        # Embeddings come back in input order, so pair them with their records.
        for record, embedding in zip(batch, embeddings):
            add_embedding_for_record(record["_id"], embedding)
        pbar.update()


  0%|▏                                      | 7/1627 [03:16<12:36:51, 28.03s/it][A

  0%|                                        | 1/1620 [00:10<4:42:19, 10.46s/it][A
  0%|                                        | 2/1620 [00:21<4:43:30, 10.51s/it][A
  0%|                                        | 3/1620 [00:31<4:43:42, 10.53s/it][A
  0%|                                        | 4/1620 [00:41<4:42:25, 10.49s/it][A
  0%|                                        | 5/1620 [00:52<4:41:21, 10.45s/it][A
  0%|▏                                       | 6/1620 [01:03<4:44:17, 10.57s/it][A
  0%|▏                                       | 7/1620 [01:13<4:43:01, 10.53s/it][A
  0%|▏                                       | 8/1620 [01:24<4:41:50, 10.49s/it][A
  1%|▏                                       | 9/1620 [01:34<4:45:27, 10.63s/it][A
  1%|▏                                      | 10/1620 [01:45<4:46:14, 10.67s/it][A
  1%|▎                                      | 11/1620 [01:56<4:46:56, 10.7