In [None]:
#%pip install pandas pymongo bertopic spacy --quiet

In [None]:
#%pip install langchain-openai

In [1]:
import spacy

  return torch._C._cuda_getDeviceCount() > 0


In [1]:
spacy_model_name = "de_core_news_md"
!python -m spacy download {spacy_model_name} --quiet

/home/mirza/miniconda3/bin/python: No module named spacy


In [16]:
# Install spaCy into the running kernel's environment (%pip targets the kernel,
# not the shell's default Python).
# NOTE(review): the version is unpinned — consider pinning it for reproducibility.
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
# sensorz notebook
import sys
import os

# Add the parent directory to the Python path. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [3]:
import os
from pathlib import Path

import numpy as np
import openai
import pandas as pd
import spacy

from datetime import datetime
from dotenv import load_dotenv
from pymongo import MongoClient, errors
from tqdm import tqdm

# bertopic components
from bertopic import BERTopic
from bertopic.representation import OpenAI
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Load environment variables from the .env file one directory above the working
# directory. Presumably this is where MONGO_HOST and OPENAI_API_KEY (read via
# os.getenv in later cells) are defined — TODO confirm.
load_dotenv("../.env")

  from .autonotebook import tqdm as notebook_tqdm


True

# Data extraction

In [4]:
# MongoDB connection settings. The host is read from the environment rather than
# written into the notebook; it is None when MONGO_HOST is not set.
MONGO_HOST = os.environ.get("MONGO_HOST")
MONGO_DATABASE = "insightfinder-dev"
MONGO_COLLECTION = "content"

In [5]:
def generate_data(query: dict, projection: dict = None, sort_order: list = None, limit: int = None):
    """Lazily yield documents from the configured MongoDB collection.

    Opens a client against ``MONGO_HOST`` and runs ``find`` on
    ``MONGO_DATABASE``/``MONGO_COLLECTION``. The client stays open while the
    generator is being consumed and is closed by the ``with`` block once the
    generator finishes or is closed.

    Args:
        query: MongoDB filter document passed to ``find``.
        projection: Fields to include/exclude; ``None`` requests full documents.
        sort_order: Optional sort specification passed to ``Cursor.sort``.
        limit: Optional maximum number of documents; a falsy value applies no limit.

    Yields:
        One document per match, as returned by the cursor.

    Note:
        Errors raised while querying are printed and end the iteration early
        instead of propagating, so callers may receive fewer (or zero) documents.
    """
    try:
        with MongoClient(MONGO_HOST) as mongo_client:
            collection = mongo_client[MONGO_DATABASE][MONGO_COLLECTION]
            # Pass ``projection`` through unchanged. Replacing ``None`` with ``{}``
            # is not equivalent: PyMongo releases before 4.0 rewrite an empty
            # projection to ``{"_id": 1}`` and would return only the ids.
            cursor = collection.find(query, projection)
            if sort_order:
                cursor = cursor.sort(sort_order)
            if limit:
                cursor = cursor.limit(limit)
            yield from cursor
    except errors.PyMongoError as e:
        print(f"MongoDB error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


def preprocess_paragraphs(paragraphs):
    """Collapse a list of paragraph strings into one space-separated string.

    Args:
        paragraphs: List of paragraph strings. Any value that is not a list is
            treated as missing.

    Returns:
        The stripped, non-empty paragraphs joined by single spaces, or ``None``
        when ``paragraphs`` is not a list.
    """
    if not isinstance(paragraphs, list):
        return None
    # Strip every entry once, and skip non-string items (e.g. ``None``) instead
    # of failing on them with AttributeError.
    stripped = (p.strip() for p in paragraphs if isinstance(p, str))
    return " ".join(text for text in stripped if text)

In [8]:
# Alternative filter without an upper date bound:
#query = {"visited": True, "parsed_date": {"$exists": True, "$ne": None, "$gte": "2024-05-20"}}
# Visited pages whose parsed_date lies in the closed range 2024-05-20 .. 2024-06-04
# (the bounds are compared as strings).
query = {"visited": True, "parsed_date": {"$exists": True, "$ne": None, "$gte": "2024-05-20", "$lte": "2024-06-04"}}
projection = {"_id": 1, "url": 1, "parsed_date": 1, "title": 1, "description": 1, "paragraphs": 1, "site_name": 1}
sort_order = None #[("parsed_date", -1)]
limit = None

data_generator = generate_data(query, projection, sort_order, limit)
df = pd.DataFrame(data_generator)
# generate_data() only prints database errors, so a failed or empty query ends up
# here as a frame without columns. Fail with a clear message instead of an opaque
# KeyError on the next line.
if "paragraphs" not in df.columns:
    raise ValueError(
        "The query returned no documents with a 'paragraphs' field; "
        "check the query filter and any MongoDB error printed above."
    )
df["paragraphs"] = df["paragraphs"].apply(preprocess_paragraphs)

NameError: name 'self' is not defined

In [None]:
df

In [None]:
df["site_name"].value_counts()

# Prepare data

In [9]:
def build_documents(record):
    """Concatenate a record's title, description and paragraphs into one text.

    Args:
        record: Object exposing ``.get`` (a dict or a DataFrame row) with
            optional ``title``, ``description`` and ``paragraphs`` entries.

    Returns:
        The non-empty text fields, stripped and joined by single spaces; an
        empty string when none of the fields holds text.
    """
    fields = (record.get(key) for key in ("title", "description", "paragraphs"))
    # Accept strings only. A field that is missing from some documents appears as
    # float NaN in a DataFrame row; it is not ``None``, and calling ``.strip()``
    # on it would raise AttributeError.
    stripped = (value.strip() for value in fields if isinstance(value, str))
    return " ".join(text for text in stripped if text)

In [10]:
# Build one text per row from title, description and paragraphs, then summarise
# the character-length distribution of those texts.
df["document"] = df.apply(build_documents, axis=1)
df["document"].apply(len).describe()

count     7075.000000
mean      2415.627138
std       2157.369548
min          0.000000
25%        764.000000
50%       2039.000000
75%       3299.500000
max      27871.000000
Name: document, dtype: float64

In [11]:
# Drop rows whose document is empty. Take an explicit copy: later cells add
# columns to `df`, and assigning into a filtered selection of another frame is
# what triggers pandas' SettingWithCopyWarning.
df = df.loc[df["document"].str.len() > 0].copy()
df.shape

(7065, 8)

In [14]:
# Load the spaCy pipeline named by `spacy_model_name`. The model package has to
# be installed in the kernel's environment, otherwise spacy.load raises E050.
nlp = spacy.load(spacy_model_name)

def preprocess_documents(document):
    """Return ``document`` reduced to a string of lowercased lemmas.

    The text is run through the module-level ``nlp`` pipeline. Stop words,
    punctuation, digit tokens and tokens whose lemma is blank are dropped; the
    lemmas of the remaining tokens are lowercased and joined by single spaces.
    """
    kept_lemmas = []
    for tok in nlp(document):
        if tok.is_stop or tok.is_punct or tok.is_digit:
            continue
        if not tok.lemma_.strip():
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return " ".join(kept_lemmas)


# Register tqdm's `progress_apply` on pandas objects so the per-document
# preprocessing below reports its progress.
tqdm.pandas()
df["processed_document"] = df["document"].progress_apply(preprocess_documents)
# Character-length distribution of the preprocessed texts.
df["processed_document"].apply(len).describe()

OSError: [E050] Can't find model 'de_core_news_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
df.document.iloc[1]

In [None]:
df.processed_document.iloc[1]

# Topic modelling

In [None]:
# UMAP is stochastic: without a fixed `random_state` every run produces a
# different embedding and therefore different topics. Fixing the seed makes the
# notebook reproducible (it may disable UMAP's parallelism, i.e. run slower).
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
# c-TF-IDF with BM25 weighting and damping of very frequent words.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
# Candidate topic terms: uni- to trigrams that pass the min_df=10 threshold.
vectorizer_model = CountVectorizer(min_df=10, ngram_range=(1, 3))

# Baseline topic model (keyword representations only) on multilingual embeddings.
topic_model = BERTopic(
    embedding_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    umap_model=umap_model,
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
)

In [None]:
# Fit the topic model on the preprocessed (lemmatised) documents.
documents = df.processed_document.tolist()

# `topics` and `probs` are the two values returned by BERTopic.fit_transform.
topics, probs = topic_model.fit_transform(documents)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_heatmap()

## Topics over time

In [None]:
# OpenAI-backed topic representation. The API key is read from the environment.
openai_client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
openai_model = OpenAI(
    openai_client,
    model="gpt-3.5-turbo",
    chat=True,
    tokenizer="char",
    doc_length=500,
    nr_docs=5,
)

# UMAP is stochastic: without a fixed `random_state` every run produces a
# different embedding and therefore different topics. Fixing the seed makes the
# notebook reproducible (it may disable UMAP's parallelism, i.e. run slower).
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
vectorizer_model = CountVectorizer(min_df=10, ngram_range=(1, 3))

# Same pipeline as the baseline model, plus the OpenAI representation model.
# This rebinds `topic_model`, so the cells below use this instance.
topic_model = BERTopic(
    embedding_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    umap_model=umap_model,
    ctfidf_model=ctfidf_model,
    vectorizer_model=vectorizer_model,
    representation_model=openai_model,
    verbose=True,
)

In [None]:
# Inputs for the temporal analysis: the preprocessed texts and, in the same row
# order, their parsed dates converted to timestamps.
documents = df.processed_document.tolist()
dates = pd.to_datetime(df.parsed_date).tolist()

# Refit with the OpenAI-backed model, then compute topic frequencies over time
# using 7 bins.
topics, probs = topic_model.fit_transform(documents)
topics_over_time = topic_model.topics_over_time(documents, dates, nr_bins=7)

In [None]:
# File-name stamp: earliest and latest parsed_date formatted as yymmdd.
# Parse with pandas, as is done for the `dates` passed to topics_over_time, so
# that values carrying a time component do not break a strict "%Y-%m-%d" parse.
parsed_dates = pd.to_datetime(df.parsed_date)
sd = parsed_dates.min().strftime("%y%m%d")
ed = parsed_dates.max().strftime("%y%m%d")

# Write the topics-over-time table, creating the output directory if needed.
output_path = Path(f"./data/topics/v2/topics_over_time_{sd}-{ed}.csv")
output_path.parent.mkdir(exist_ok=True, parents=True)
topics_over_time.to_csv(output_path, index=False)

In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [None]:
topics_over_time

In [None]:
# Attach the topic assigned to each document by the last fit_transform call.
# `topics` is aligned with `df` by position, so `df` must not have been filtered
# or reordered since `documents` was built from it.
df["predicted_topic"] = topics

# Uses the `sd`/`ed` date stamps computed in the topics-over-time export cell.
output_path = Path(f"./data/topics/v2/predicted_topics_{sd}-{ed}.csv")
output_path.parent.mkdir(exist_ok=True, parents=True)
# Columns written to the CSV.
outcols = ["_id", "url", "title", "description", "paragraphs", "predicted_topic"]
df[outcols].to_csv(output_path, index=False)

In [None]:
df.loc[df["predicted_topic"] == 6]