# Advanced AI Content Analysis
Using LDA, BERTopic, LLM-based topic labeling (section 7 runs a local Llama-2 chat model rather than GPT-4), and comparative visualizations

## 1. Setup & Imports

In [2]:
import os
import re
import sys
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from tqdm import tqdm

# NLP & Topic Modeling
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim import corpora, models
from gensim.models import CoherenceModel

from bertopic import BERTopic

# Generative AI
import openai

# Download NLTK data
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# look up 'punkt_tab' for word_tokenize (NOTE(review): confirm against the
# installed NLTK version). 'stopwords' stays last so the cell's displayed
# result is unchanged.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)


True

## 2. Configuration: Folders & Keywords

In [3]:
# 2. Configuration: Folders & Keywords
# Directory that contains the four parsed-content folders. Set the
# PARSED_CONTENT_ROOT environment variable to run the notebook on another
# machine; the fallback is the original hard-coded location.
PARSED_CONTENT_ROOT = os.environ.get(
    "PARSED_CONTENT_ROOT",
    r"C:\Users\91756\Downloads\Master_thesis\Parsed_Content\Parsed_Content",
)

# Display label ("Part 1" .. "Part 4") -> folder path for that part.
FOLDER_PATHS = {
    f"Part {part}": os.path.join(PARSED_CONTENT_ROOT, f"parsed_content_links_list_part_{part}")
    for part in range(1, 5)
}

# Comprehensive AI-related keywords
# Lower-case AI-related terms (single words and multi-word phrases) kept as a
# set for membership checks. Each term is listed exactly once.
AI_KEYWORDS = {
    "ai", "artificial intelligence", "machine learning", "deep learning",
    "neural network", "generative ai", "supervised learning",
    "unsupervised learning", "reinforcement learning", "transfer learning",
    "federated learning", "attention mechanism", "llm", "gpt", "bert",
    "diffusion model", "gan", "rnn", "cnn", "vlm", "gpt-4v", "llava", "nlp",
    "computer vision", "speech recognition", "autonomous systems",
    "recommender system", "robotic process automation", "tensorflow",
    "pytorch", "keras", "huggingface", "langchain", "openai", "anthropic",
    "mistral ai", "agi", "multimodal ai", "few-shot learning",
    "prompt engineering", "retrieval-augmented generation", "ai agents",
    "autonomous agents", "multi-agent systems", "embodied ai",
    "agent tool use", "llamaindex", "crewai", "autogen", "agentops",
    "semantic kernel", "haystack", "weaviate", "pinecone", "qdrant", "chroma",
    "transformers", "peft", "fastapi", "gradio", "streamlit", "guardrails",
    "rebuff", "guidance", "openai api", "gpt-4", "gpt-4-turbo", "embeddings",
    "moderation", "anthropic claude api", "claude 3", "google vertex ai",
    "palm 2", "gemini", "mistral ai api", "mistral 7b", "mixtral 8x7b",
    "cohere api", "command-r", "meta llama api", "llama 2", "llama 3",
    "perplexity api", "openai gpt-4v", "google gemini api",
    "anthropic claude 3 vision", "huggingface inference api", "blip-2",
    "openai whisper", "elevenlabs", "deepgram", "assemblyai",
    "google cloud vision", "aws rekognition", "azure computer vision",
    "roboflow", "openai embeddings", "cohere embed", "langchain api",
    "stability ai", "microsoft semantic kernel",
}

## 3. Preprocessing Function

In [4]:
def _english_stop_words() -> frozenset:
    """Return the NLTK English stop-word set minus 'ai', built on first use and then reused."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english')) - {'ai'}
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text: str) -> list[str]:
    """Turn raw text into a list of lower-case word tokens for topic modeling.

    Steps: lower-case the text, delete every character that is not an ASCII
    letter or whitespace, tokenize with NLTK's ``word_tokenize``, then drop
    English stop words and single-character tokens.

    Args:
        text: Raw document text. Any non-string value is treated as empty.

    Returns:
        The remaining tokens in document order; ``[]`` for non-string input.
    """
    if not isinstance(text, str):
        return []
    text = text.lower()
    # Characters are deleted, not replaced by a space, so "gpt-4" becomes "gpt"
    # and "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    # Reuse one stop-word set instead of re-reading the corpus list per document.
    stop_words = _english_stop_words()
    return [w for w in tokens if w not in stop_words and len(w) > 1]


## 4. Load Documents from Folders

In [5]:
# Load raw text per folder
# Folder label -> list of raw document strings read from that folder's .txt files.
documents_by_folder = {}
for label, path in FOLDER_PATHS.items():
    docs = []
    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order (and therefore any later docs[:k] sample) the same on every run.
    for fname in sorted(os.listdir(path)):
        if fname.lower().endswith('.txt'):
            # errors='ignore' silently drops bytes that are not valid UTF-8.
            with open(os.path.join(path, fname), 'r', encoding='utf-8', errors='ignore') as f:
                docs.append(f.read())
    documents_by_folder[label] = docs


## 5. LDA Topic Modeling

In [6]:
# Fixed seed so the LDA topics and coherence scores are the same on every run.
LDA_RANDOM_STATE = 42

# Folder label -> fitted LDA model, its dictionary/corpus, c_v coherence and printed topics.
lda_results = {}
for label, docs in documents_by_folder.items():
    # Tokenize each document, then build the gensim vocabulary and bag-of-words corpus.
    texts = [preprocess_text(doc) for doc in docs]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = models.LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=8,
        passes=10,
        random_state=LDA_RANDOM_STATE,
    )
    coherence = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
    lda_results[label] = {
        'model': lda,
        'dictionary': dictionary,
        'corpus': corpus,
        'coherence': coherence.get_coherence(),
        'topics': lda.print_topics()
    }
    print(f"{label} LDA coherence: {lda_results[label]['coherence']:.4f}")
    for topic in lda_results[label]['topics']:
        print(topic)
    print()


Part 1 LDA coherence: 0.5863
(0, '0.007*"file" + 0.006*"files" + 0.005*"annotations" + 0.005*"engl" + 0.004*"one" + 0.004*"see" + 0.004*"part" + 0.004*"code" + 0.003*"etc" + 0.003*"also"')
(1, '0.025*"page" + 0.016*"images" + 0.015*"hathitrust" + 0.009*"de" + 0.008*"company" + 0.008*"co" + 0.006*"catalogue" + 0.006*"library" + 0.006*"la" + 0.005*"collection"')
(2, '0.065*"images" + 0.065*"page" + 0.065*"hathitrust" + 0.022*"de" + 0.009*"history" + 0.008*"la" + 0.007*"et" + 0.006*"der" + 0.006*"des" + 0.005*"us"')
(3, '0.009*"pdf" + 0.007*"phd" + 0.006*"university" + 0.006*"health" + 0.006*"view" + 0.005*"professor" + 0.005*"learning" + 0.004*"article" + 0.004*"study" + 0.004*"california"')
(4, '0.018*"ai" + 0.008*"data" + 0.007*"use" + 0.007*"learning" + 0.005*"models" + 0.005*"using" + 0.004*"information" + 0.004*"tools" + 0.004*"generative" + 0.004*"language"')
(5, '0.010*"reply" + 0.005*"one" + 0.005*"pm" + 0.005*"like" + 0.004*"people" + 0.004*"time" + 0.004*"said" + 0.004*"would" 

## 6. BERTopic Modeling

In [7]:
# Folder label -> fitted BERTopic model, the first rows of its topic table, and its topic count.
bertopic_results = {}
for label, docs in documents_by_folder.items():
    topic_model = BERTopic(verbose=False)
    topics, probs = topic_model.fit_transform(docs)
    # One row per topic (Topic id, Count, Name, Representation, ...).
    topic_info = topic_model.get_topic_info()
    # Count real topics by excluding topic id -1 (BERTopic's outlier bucket)
    # explicitly, instead of assuming that row is always present.
    n_topics = int((topic_info["Topic"] != -1).sum())
    bertopic_results[label] = {
        'model': topic_model,
        'topics': topic_info.head(10),
        'n_topics': n_topics
    }
    print(f"{label} BERTopic extracted {n_topics} topics")
    display(topic_info.head(10))


Part 1 BERTopic extracted 2 topics


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1975,-1_the_and_of_to,"[the, and, of, to, by, at, page, images, hathi...",[URL: https://firstyear.tulane.edu/fall-2024-t...
1,0,2381,0_the_and_of_to,"[the, and, of, to, in, for, on, is, at, with]",[URL: https://oregonnews.uoregon.edu/lccn/sn83...
2,1,14,1_annotations_files_alignments_genome,"[annotations, files, alignments, genome, etc, ...",[URL: https://digital.wpi.edu/catalog?f%5Bmemb...


Part 2 BERTopic extracted 2 topics


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2550,-1_the_and_of_to,"[the, and, of, to, in, for, is, that, by, on]",[URL: https://history.northwestern.edu/courses...
1,0,1446,0_the_and_of_to,"[the, and, of, to, in, for, that, you, is, with]",[URL: https://teaching.uoregon.edu/starter-syl...
2,1,19,1_the_to_of_ai,"[the, to, of, ai, and, in, is, that, for, on]",[URL: https://insights.sei.cmu.edu/blog/contex...


Part 3 BERTopic extracted 2 topics


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2012,-1_the_and_of_to,"[the, and, of, to, in, for, by, at, page, images]",[URL: https://as.vanderbilt.edu/english/underg...
1,0,2330,0_the_and_of_to,"[the, and, of, to, in, for, that, is, you, with]",[URL: https://physics.as.miami.edu/research/ne...
2,1,17,1_privacy_agree_to_commitment,"[privacy, agree, to, commitment, vcus, commonw...",[URL: https://go.vcu.edu/ai\nTitle: Commitment...


Part 4 BERTopic extracted 2 topics


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1424,-1_the_and_of_to,"[the, and, of, to, in, for, is, on, that, ai]",[URL: http://lrieber.coe.uga.edu/mayer2005/\nT...
1,0,2944,0_the_and_of_to,"[the, and, of, to, in, for, is, with, that, on]",[URL: https://oregonnews.uoregon.edu/lccn/sn83...
2,1,28,1_privacy_agree_commitment_to,"[privacy, agree, commitment, to, vcus, cookies...",[URL: https://graduate.vcu.edu/\nTitle: Commit...


## 7. LLM Topic Labeling (local Llama-2 chat model)

In [8]:
pip install transformers accelerate torch

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [9]:
import os
import re
import json
from transformers import pipeline

# ─── 1) Initialize the local chat model ─────────────────────────────────────────
# Adjust `model_name` if you choose a different HF model
model_name = "meta-llama/Llama-2-7b-chat-hf"

# This checkpoint is gated on the Hugging Face Hub, so loading it needs an
# authenticated account that has been granted access. The token is read from
# the HF_TOKEN environment variable; when it is unset, None is passed and the
# library falls back to any locally stored login.
chat = pipeline(
    "text-generation",
    model=model_name,
    torch_dtype="auto",
    device_map="auto",
    token=os.environ.get("HF_TOKEN"),
    # Do not execute code shipped inside the model repo. Only switch this on
    # for a model that requires it and whose repo code has been reviewed.
    trust_remote_code=False,
)

# ─── 2) Define the topic‐labeling function ────────────────────────────────────────
def _first_json_object(text):
    """Return the first JSON object (dict) that can be decoded from `text`, or None."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            value, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            value = None
        if isinstance(value, dict):
            return value
        # Not valid JSON from this brace (or not an object): try the next one.
        start = text.find("{", start + 1)
    return None


def label_topics_local(docs, n_topics=5):
    """
    Uses a local LLM to identify the top `n_topics` topics in `docs`.

    Only the first 10 documents are sampled, each flattened to one line and
    truncated to 500 characters, and sent to the module-level `chat` pipeline.

    Args:
        docs: List of document strings.
        n_topics: Number of topics to ask the model for.

    Returns:
        The JSON object decoded from the model's reply, expected to look like
        {"topic_1": [...], "topic_2": [...], ...}.

    Raises:
        ValueError: If `docs` is empty or no JSON object is found in the reply.
    """
    if not docs:
        raise ValueError("No documents to label")

    # Prepare a short sample of up to 10 docs, 500 chars each
    sample_lines = []
    for d in docs[:10]:
        clean = d.replace("\n", " ")[:500]
        sample_lines.append(f"- {clean}")
    sample = "\n".join(sample_lines)

    # Construct the prompt (double‐brace JSON skeleton)
    prompt = f"""
You are an AI assistant. Given the following documents, identify the top {n_topics} topics,
each with 3–5 keywords. Return **only** valid JSON of the form {{"topic_1": ["kw1","kw2"], ...}}.

Documents:
{sample}
"""
    # Ask for the completion only. By default the text-generation pipeline
    # echoes the prompt, whose own {...} skeleton would be mistaken for the answer.
    out = chat(
        prompt,
        max_new_tokens=256,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]
    # Defensive: drop the prompt if it is echoed back anyway.
    if out.startswith(prompt):
        out = out[len(prompt):]

    # Take the first decodable JSON object, ignoring any chatter around it.
    labels = _first_json_object(out)
    if labels is None:
        raise ValueError(f"Could not parse JSON from model output:\n{out}")
    return labels

# ─── 3) Example usage over your foldered documents ────────────────────────────────
# Assume you already have: documents_by_folder: dict[str, list[str]]
# e.g. documents_by_folder = {"Part 1": [...], ...}

# Label each folder's documents with the local chat model. A failure on one
# folder is reported and stored as an empty mapping so the loop keeps going.
gpt4_labels_local = {}
for label, docs in documents_by_folder.items():
    print(f"🔍 Labeling {label} with local LLM…")
    labels = {}  # stays empty when labeling fails
    try:
        labels = label_topics_local(docs, n_topics=5)
    except Exception as e:
        print(f"  ⚠️ Error on {label}: {e}")
    gpt4_labels_local[label] = labels
    print(json.dumps(labels, indent=2))

# Now gpt4_labels_local holds your topic→keywords mappings per folder.


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.
401 Client Error. (Request ID: Root=1-682716db-7627e1d11bfc948821111341;87599f87-5c2a-427d-969f-59859a61572f)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-chat-hf is restricted. You must have access to it and be authenticated to access it. Please log in.

## 8. Comparison & Visualization

In [None]:
# Compare LDA coherence
lda_coherence = {lbl: res['coherence'] for lbl, res in lda_results.items()}
fig, ax = plt.subplots(figsize=(6, 4))
pd.Series(lda_coherence).plot.bar(ax=ax)
ax.set_title('LDA Coherence by Folder')
ax.set_ylabel('Coherence (c_v)')
plt.show()

# BERTopic topic counts
# Count from the model's full topic table: res['topics'] only holds the first
# 10 rows, which would cap the count. Topic id -1 (outliers) is excluded.
ber_counts = {}
for lbl, res in bertopic_results.items():
    full_topic_info = res['model'].get_topic_info()
    ber_counts[lbl] = int((full_topic_info['Topic'] != -1).sum())
fig, ax = plt.subplots(figsize=(6, 4))
pd.Series(ber_counts).plot.bar(ax=ax, color='green')
ax.set_title('BERTopic Number of Topics by Folder')
ax.set_ylabel('Number of Topics')
plt.show()

# Display the LLM topic labels. They are stored in gpt4_labels_local, the
# variable filled by the labeling loop.
for lbl, topics in gpt4_labels_local.items():
    print(f"### {lbl} GPT-4 Topics")
    # Each topic maps to a keyword list of varying length; join it into one
    # string so every topic fits the single 'keywords' column.
    keyword_rows = {
        name: ", ".join(map(str, kws)) if isinstance(kws, (list, tuple)) else str(kws)
        for name, kws in topics.items()
    }
    display(pd.Series(keyword_rows, name='keywords', dtype=object).to_frame())


## 9. Word Clouds from LDA & BERTopic

In [None]:
# LDA topic word clouds
for lbl, res in lda_results.items():
    for tid, t in res['model'].show_topics(formatted=False):
        words = dict(t)
        wc = WordCloud(width=400, height=300).generate_from_frequencies(words)
        plt.figure(figsize=(4,3))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"{lbl} LDA Topic {tid}")
        plt.show()

# BERTopic topic word clouds
for lbl, res in bertopic_results.items():
    topics = res['model'].get_topics()
    for tid in list(topics.keys())[:5]:
        wc = WordCloud(width=400, height=300).generate_from_frequencies(dict(topics[tid]))
        plt.figure(figsize=(4,3))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"{lbl} BERTopic {tid}")
        plt.show()
