In [1]:
import os
import re
import sys
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from collections import Counter
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

warnings.filterwarnings("ignore")

# Download NLTK data if not already present
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# ─── Configuration ───────────────────────────────────────────────────────────────

# Folders to analyze
FOLDER_PATHS = {
    "Part 1": r"C:\Users\91756\Downloads\Master_thesis\Parsed_Content\Parsed_Content\parsed_content_links_list_part_1",
    "Part 2": r"C:\Users\91756\Downloads\Master_thesis\Parsed_Content\Parsed_Content\parsed_content_links_list_part_2",
    "Part 3": r"C:\Users\91756\Downloads\Master_thesis\Parsed_Content\Parsed_Content\parsed_content_links_list_part_3",
    "Part 4": r"C:\Users\91756\Downloads\Master_thesis\Parsed_Content\Parsed_Content\parsed_content_links_list_part_4"
}

# Comprehensive AI-related keywords
AI_KEYWORDS = {
    # Core AI Terms
    "ai", "artificial intelligence", "machine learning", "deep learning", "neural network", "generative ai",
    # AI Techniques & Methods
    "supervised learning", "unsupervised learning", "reinforcement learning", "transfer learning",
    "federated learning", "attention mechanism",
    # AI Model Types
    "llm", "gpt", "bert", "diffusion model", "gan", "rnn", "cnn", "vlm", "gpt-4v", "llava",
    # AI Applications
    "nlp", "computer vision", "speech recognition", "autonomous systems", "recommender system", "robotic process automation",
    # AI Tools & Frameworks
    "tensorflow", "pytorch", "keras", "huggingface", "langchain", "openai", "anthropic", "mistral ai",
    # Emerging AI Concepts
    "agi", "multimodal ai", "few-shot learning", "prompt engineering", "retrieval-augmented generation",
    # Agent Ecosystem
    "ai agents", "autonomous agents", "multi-agent systems", "embodied ai", "agent tool use",
    # Libraries & Frameworks for AI Agents
    "llamaindex", "crewai", "autogen", "agentops", "semantic kernel", "haystack",
    "weaviate", "pinecone", "qdrant", "chroma", "transformers", "peft", "fastapi",
    "gradio", "streamlit", "guardrails", "rebuff", "guidance",
    # AI APIs
    "openai api", "gpt-4", "gpt-4-turbo", "embeddings", "moderation",
    "anthropic claude api", "claude 3", "google vertex ai", "palm 2", "gemini",
    "mistral ai api", "mistral 7b", "mixtral 8x7b", "cohere api", "command-r",
    "meta llama api", "llama 2", "llama 3", "perplexity api",
    "openai gpt-4v", "google gemini api", "anthropic claude 3 vision",
    "huggingface inference api", "blip-2", "llava",
    "openai whisper", "elevenlabs", "deepgram", "assemblyai",
    "google cloud vision", "aws rekognition", "azure computer vision",
    "roboflow", "openai embeddings", "cohere embed",
    "langchain api", "stability ai", "microsoft semantic kernel"
}

# ─── Helper Functions ────────────────────────────────────────────────────────────

def preprocess_text(text: str) -> list[str]:
    """Lowercase, strip non-alpha, tokenize and remove stopwords (except 'ai')."""
    if not isinstance(text, str):
        return []
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english')) - {'ai'}
    return [w for w in tokens if w not in stop_words and len(w) > 1]

def analyze_ai_content(file_path: str) -> tuple[float, list[str]]:
    """Return (AI_percentage, list_of_ai_words) for a single text file."""
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        if not content.strip():
            return 0.0, []
        words = preprocess_text(content)
        if len(words) < 10:
            return 0.0, []
        ai_words = [w for w in words if w in AI_KEYWORDS]
        ai_pct = len(ai_words) / len(words) * 100
        return ai_pct, ai_words
    except Exception as e:
        print(f"Error processing {os.path.basename(file_path)}: {e}")
        return 0.0, []

def analyze_files(folder_path: str, max_files: int = None) -> tuple[pd.DataFrame, list[str]]:
    """Analyze all .txt files in folder; return DataFrame and aggregated AI words."""
    results = []
    all_ai_words: list[str] = []
    files = sorted([f for f in os.listdir(folder_path) if f.lower().endswith('.txt')])
    if max_files:
        files = files[:max_files]
    for fname in tqdm(files, desc=f"Analyzing {os.path.basename(folder_path)}"):
        full = os.path.join(folder_path, fname)
        pct, words = analyze_ai_content(full)
        results.append({
            "Filename": fname,
            "AI_Percentage": pct,
            "AI_Word_Count": len(words)
        })
        all_ai_words.extend(words)
    return pd.DataFrame(results), all_ai_words

def create_visualizations(df: pd.DataFrame, ai_words: list[str], output_dir: str):
    """Save histogram, bar-chart, pie and word-cloud to output_dir."""
    os.makedirs(output_dir, exist_ok=True)
    plt.figure(figsize=(18, 12))

    # Histogram: AI %
    plt.subplot(2, 2, 1)
    df['AI_Percentage'].plot.hist(bins=20, edgecolor='black')
    plt.title('Distribution of AI Content %')
    plt.xlabel('AI %')
    plt.ylabel('File Count')
    plt.grid(axis='y', alpha=0.3)

    # Bar chart: Top AI terms
    plt.subplot(2, 2, 2)
    top = pd.DataFrame(Counter(ai_words).most_common(10), columns=['Word', 'Count'])
    top.plot.barh(x='Word', y='Count', legend=False, ax=plt.gca())
    plt.title('Top 10 AI Terms')
    plt.xlabel('Frequency')

    # Pie: files with AI
    plt.subplot(2, 2, 3)
    has_ai = len(df[df['AI_Word_Count'] > 0])
    plt.pie([has_ai, len(df)-has_ai],
            labels=['Contains AI', 'No AI'],
            autopct='%1.1f%%',
            startangle=90)
    plt.title('Files Containing AI')

    # Word Cloud
    plt.subplot(2, 2, 4)
    wc = WordCloud(width=800, height=500, background_color='white')
    wc.generate(' '.join(ai_words))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title('AI Terms Word Cloud')

    plt.tight_layout(pad=3)
    plt.savefig(os.path.join(output_dir, 'visualizations.png'), dpi=300)
    plt.close()

def generate_report(df: pd.DataFrame, ai_words: list[str], output_dir: str):
    """Write CSV, text report and print summary to console."""
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, 'ai_content_analysis.csv'), index=False)
    total = len(df)
    with_ai = len(df[df['AI_Word_Count'] > 0])
    avg_pct = df['AI_Percentage'].mean()
    report = (
        f"AI CONTENT ANALYSIS REPORT\n"
        f"{'='*40}\n"
        f"Total files analyzed: {total}\n"
        f"Files containing AI content: {with_ai} ({with_ai/total*100:.1f}%)\n"
        f"Average AI content %: {avg_pct:.2f}%\n\n"
        "Top 5 files by AI %:\n"
        f"{df.sort_values('AI_Percentage', ascending=False).head(5).to_string(index=False)}\n\n"
        "Top 10 AI terms:\n"
        f"{Counter(ai_words).most_common(10)}\n"
    )
    with open(os.path.join(output_dir, 'report.txt'), 'w') as f:
        f.write(report)
    print(report)
    print(f"Results saved in: {os.path.abspath(output_dir)}\n")

# ─── Main Execution ──────────────────────────────────────────────────────────────

def main():
    folder_stats = {}
    # Analyze each folder independently
    for label, path in FOLDER_PATHS.items():
        print(f"\n--- Processing: {label} ---")
        df, words = analyze_files(path)
        out_dir = os.path.join("results", label.replace(" ", "_"))
        create_visualizations(df, words, out_dir)
        generate_report(df, words, out_dir)
        folder_stats[label] = {
            "df": df,
            "words": words,
            "avg_pct": df['AI_Percentage'].mean(),
            "pct_with_ai": len(df[df['AI_Word_Count']>0]) / len(df) * 100,
            "total": len(df)
        }

    # Comparison summary
    summary = pd.DataFrame([
        {
            "Folder": lbl,
            "Total Files": stats["total"],
            "Avg AI %": stats["avg_pct"],
            "% Files w/ AI": stats["pct_with_ai"]
        }
        for lbl, stats in folder_stats.items()
    ]).set_index("Folder")

    # Save and display summary
    os.makedirs("results", exist_ok=True)
    summary.to_csv(os.path.join("results", "summary_comparison.csv"))
    print("\n=== Summary Comparison ===")
    print(summary)

    # Comparative plots
    plt.figure(figsize=(8,4))
    summary['Avg AI %'].plot.bar()
    plt.title("Average AI % by Folder")
    plt.ylabel("Avg AI %")
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig("results/avg_ai_comparison.png", dpi=300)
    plt.close()

    plt.figure(figsize=(8,4))
    summary['% Files w/ AI'].plot.bar(color='orange')
    plt.title("% Files with AI by Folder")
    plt.ylabel("% Files w/ AI")
    plt.ylim(0,100)
    plt.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    plt.savefig("results/pct_with_ai_comparison.png", dpi=300)
    plt.close()

    print("\nComparative plots saved in 'results/'.")

if __name__ == "__main__":
    main()



--- Processing: Part 1 ---


Analyzing parsed_content_links_list_part_1: 100%|██████████| 4370/4370 [00:24<00:00, 175.39it/s]


AI CONTENT ANALYSIS REPORT
Total files analyzed: 4370
Files containing AI content: 2564 (58.7%)
Average AI content %: 1.20%

Top 5 files by AI %:
Filename  AI_Percentage  AI_Word_Count
1555.txt      23.076923              3
1014.txt      16.666667              4
2875.txt      16.528926             20
4297.txt      15.032680             23
 587.txt      14.432990             28

Top 10 AI terms:
[('ai', 26755), ('llm', 1016), ('guidance', 1000), ('gpt', 734), ('nlp', 703), ('openai', 395), ('embeddings', 362), ('bert', 361), ('transformers', 261), ('gemini', 206)]

Results saved in: C:\Users\91756\Downloads\Master_thesis\results\Part_1


--- Processing: Part 2 ---


Analyzing parsed_content_links_list_part_2: 100%|██████████| 4015/4015 [00:22<00:00, 177.76it/s]


AI CONTENT ANALYSIS REPORT
Total files analyzed: 4015
Files containing AI content: 2461 (61.3%)
Average AI content %: 1.29%

Top 5 files by AI %:
 Filename  AI_Percentage  AI_Word_Count
 5693.txt      29.411765              5
10242.txt      20.000000              2
 7585.txt      17.266187             24
 7478.txt      16.393443             20
 6229.txt      13.953488             12

Top 10 AI terms:
[('ai', 27949), ('guidance', 1005), ('gpt', 715), ('llm', 518), ('openai', 463), ('nlp', 366), ('gemini', 182), ('transformers', 107), ('embeddings', 90), ('moderation', 87)]

Results saved in: C:\Users\91756\Downloads\Master_thesis\results\Part_2


--- Processing: Part 3 ---


Analyzing parsed_content_links_list_part_3: 100%|██████████| 4359/4359 [00:25<00:00, 167.72it/s]


AI CONTENT ANALYSIS REPORT
Total files analyzed: 4359
Files containing AI content: 2656 (60.9%)
Average AI content %: 1.24%

Top 5 files by AI %:
 Filename  AI_Percentage  AI_Word_Count
12612.txt      24.000000              6
11452.txt      19.230769             25
15163.txt      16.793893             22
14484.txt      16.406250             21
15211.txt      14.953271             16

Top 10 AI terms:
[('ai', 27515), ('guidance', 977), ('gpt', 716), ('llm', 681), ('openai', 538), ('nlp', 333), ('gemini', 271), ('transformers', 155), ('gan', 85), ('moderation', 84)]

Results saved in: C:\Users\91756\Downloads\Master_thesis\results\Part_3


--- Processing: Part 4 ---


Analyzing parsed_content_links_list_part_4: 100%|██████████| 4396/4396 [00:28<00:00, 154.77it/s]


AI CONTENT ANALYSIS REPORT
Total files analyzed: 4396
Files containing AI content: 2640 (60.1%)
Average AI content %: 1.26%

Top 5 files by AI %:
 Filename  AI_Percentage  AI_Word_Count
19535.txt      25.000000              3
16843.txt      23.076923              3
21104.txt      20.000000              2
18024.txt      20.000000              2
19069.txt      16.666667             28

Top 10 AI terms:
[('ai', 28649), ('guidance', 1273), ('llm', 1092), ('gpt', 657), ('openai', 432), ('nlp', 386), ('gemini', 201), ('transformers', 112), ('embeddings', 80), ('cnn', 74)]

Results saved in: C:\Users\91756\Downloads\Master_thesis\results\Part_4


=== Summary Comparison ===
        Total Files  Avg AI %  % Files w/ AI
Folder                                      
Part 1         4370  1.198520      58.672769
Part 2         4015  1.291634      61.295143
Part 3         4359  1.242018      60.931406
Part 4         4396  1.255026      60.054595

Comparative plots saved in 'results/'.
