In [1]:
import os
import json
import re
from collections import defaultdict
import networkx as nx
from services.extraction import ExtractionService
from utils.doc_utils import load_docx_documents
from dotenv import load_dotenv
from langchain_core.documents import Document
from concurrent.futures import ThreadPoolExecutor
import langextract as lx

# Load environment variables from a local .env file before any service is created.
# The return value is assigned to `_` because it is not needed.
_ = load_dotenv()

def process_document(doc: Document):
    """
    Run concept extraction for one document.

    The document is wrapped in a single-item list, handed to a fresh
    ``ExtractionService``, and whatever ``compress_and_extract_concepts()``
    produces is returned unchanged.
    """
    return ExtractionService(documents=[doc]).compress_and_extract_concepts()

# --- Main Loop ---
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

documents = load_docx_documents("unstructured_script")

# Process documents in parallel. executor.map returns results in the same order
# as `documents`, so results[i] always belongs to documents[i].
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_document, documents))

# Keep only results that expose `.extractions`; failed extractions come back as
# dicts. Report how many were dropped so failures are not silent.
successful_results = [res for res in results if hasattr(res, 'extractions')]
failed_count = len(results) - len(successful_results)
if failed_count:
    print(f"{failed_count} of {len(results)} documents failed extraction and were skipped")

# Save successful results to a JSONL file.
# save_annotated_documents() resolves output_name relative to output_dir, so the
# directory and the bare file name are passed separately; passing the joined
# path as output_name alone would not place the file at jsonl_path.
jsonl_name = "extraction_results.jsonl"
jsonl_path = os.path.join(output_dir, jsonl_name)

if successful_results:
    lx.io.save_annotated_documents(
        successful_results,
        output_name=jsonl_name,
        output_dir=output_dir,
    )
    print(f"Saved {len(successful_results)} successful extractions to '{jsonl_path}'")
else:
    # Nothing succeeded: skip the save instead of writing an empty results file.
    print(f"No successful extractions; nothing was written to '{jsonl_path}'")

# --- Next, build one knowledge-graph JSON file per successfully extracted document ---

def create_graph_from_extraction(result: lx.data.AnnotatedDocument):
    """
    Build a topic-centred knowledge graph from one annotated document.

    Extractions are split by ``extraction_class``: the first ``TOPIC`` becomes
    the hub node, all ``SUMMARY`` texts are joined into its ``summary``
    attribute, and ``CONCEPT`` extractions are merged under a normalized key
    before being linked to the topic.

    Args:
        result: Object exposing ``extractions``; each extraction provides
            ``extraction_class``, ``extraction_text`` and ``attributes``.

    Returns:
        A ``networkx.DiGraph`` holding one ``Topic`` node plus one ``Concept``
        node per merged concept, each connected to the topic by an
        ``is_concept_in`` edge. The graph is empty when there is no ``TOPIC``
        extraction.
    """
    # `extractions` may be None on a document without any extraction.
    extractions = result.extractions or []

    topics = [ext.extraction_text for ext in extractions if ext.extraction_class == 'TOPIC']
    summaries = [ext.extraction_text for ext in extractions if ext.extraction_class == 'SUMMARY']
    concepts = [ext for ext in extractions if ext.extraction_class == 'CONCEPT']

    # --- 1. Consolidate Topic and Summary ---
    final_summary = " ".join(summaries)

    # --- 2. Concept Deduplication and Merging ---
    def normalize_concept_text(text):
        """Return a case-, plural- and whitespace-insensitive grouping key."""
        lowered = text.lower()
        # Remove each parenthetical separately. A greedy `\(.*\)` would also
        # delete the words between the first "(" and the last ")".
        key = re.sub(r'\([^()]*\)', '', lowered).strip()
        if key.endswith('s'):
            key = key[:-1]
        key = re.sub(r'\s+', '', key)
        # Names that normalize to nothing (e.g. "(abc)" or "s") keep their own
        # key so unrelated concepts are not merged under the empty string.
        return key or re.sub(r'\s+', '', lowered)

    merged_concepts = defaultdict(lambda: {'definitions': set(), 'original_names': set()})

    for concept in concepts:
        name = concept.extraction_text
        if not name:
            continue  # a concept without text cannot be named in the graph
        entry = merged_concepts[normalize_concept_text(name)]
        entry['original_names'].add(name)

        # `attributes` may be None, and a definition may be a list of strings.
        definition = (concept.attributes or {}).get('definition')
        if isinstance(definition, (list, tuple)):
            entry['definitions'].update(str(item) for item in definition if item)
        elif definition:
            entry['definitions'].add(str(definition))

    # --- 3. Build Knowledge Graph ---
    G = nx.DiGraph()
    if not topics:
        return G

    final_topic = topics[0]
    G.add_node(final_topic, type='Topic', summary=final_summary)

    for data in merged_concepts.values():
        # Sorting first makes the choice between equal-length names deterministic
        # (set iteration order varies between runs).
        display_name = max(sorted(data['original_names']), key=len)
        full_definition = " | ".join(sorted(data['definitions']))

        if display_name == final_topic:
            # A concept named exactly like the topic must not overwrite the
            # Topic node or create a self-loop; keep its definition on the topic.
            G.nodes[final_topic]['definition'] = full_definition
            continue

        G.add_node(display_name, type='Concept', definition=full_definition)
        G.add_edge(display_name, final_topic, relation='is_concept_in')

    return G

def _source_filename(result, doc, fallback):
    """Return the base name of the ``source`` metadata entry, or ``fallback``.

    The result's own ``documents[0]`` is consulted first when that attribute
    exists, then the input document the result was produced from.
    """
    candidates = []
    result_docs = getattr(result, "documents", None)
    if isinstance(result_docs, (list, tuple)) and result_docs:
        candidates.append(result_docs[0])
    candidates.append(doc)

    for candidate in candidates:
        metadata = getattr(candidate, "metadata", None) or {}
        source = metadata.get("source")
        if source:
            return os.path.basename(source)
    return fallback


# `results` is index-aligned with `documents`, so each successful result is
# paired with the input document it was extracted from.
for index, (doc, result) in enumerate(zip(documents, results)):
    if not hasattr(result, 'extractions'):
        continue  # failed extraction: nothing to turn into a graph

    # Resolve the name before the try block so the error message below can never
    # raise itself; the per-index fallback stops unnamed documents from
    # overwriting each other's output file.
    original_filename = _source_filename(result, doc, f"unknown_{index}.docx")

    try:
        graph = create_graph_from_extraction(result)

        json_filename = os.path.splitext(original_filename)[0] + ".json"
        output_path = os.path.join(output_dir, json_filename)

        # Convert graph to a serializable format and save as JSON.
        # edges="links" pins the current key name explicitly; the default
        # changes to "edges" in NetworkX 3.6 and emits a FutureWarning until then.
        graph_data = nx.node_link_data(graph, edges="links")

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(graph_data, f, indent=4)

        print(f"Saved knowledge graph for '{original_filename}' to '{output_path}'")
    except Exception as e:
        print(f"Failed to create or save graph for document '{original_filename}': {e}")

print("All documents have been processed.")



Original text length: 5710
After preprocessing length: 5425
Preprocessing removed: 285 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss
Some idea ab...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro
Original text length: 13464
After preprocessing length: 13335
Preprocessing removed: 129 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, 
in this session, we discuss in memory comput...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro
Original text length: 14313
After preprocessing length: 14118
Preprocessing removed: 195 characters

First 200 chars of cleaned text:
Hello everyone, I am

The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 2-3 external data acquisition.docx' to 'outputs/transcript BD 2-3 external data acquisition.json'
Original text length: 5513
After preprocessing length: 5352
Preprocessing removed: 161 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss Data Cleanin...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'big data analysis 3-3.docx' to 'outputs/big data analysis 3-3.json'
Original text length: 2039
After preprocessing length: 1972
Preprocessing removed: 67 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, In this chapter we will learn some popular al...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 5-22.docx' to 'outputs/BDA 5-22.json'
Original text length: 10334
After preprocessing length: 10101
Preprocessing removed: 233 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering，
School of Computer Science, Beijing Institute of Technology, 
in this session, we will discuss about big dat...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 1-5 lifecycle.docx' to 'outputs/transcript BD 1-5 lifecycle.json'
Original text length: 5852
After preprocessing length: 5755
Preprocessing removed: 97 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering，
School of Computer Science, Beijing Institute of Technology, 
in this session, we will discuss about Big Dat...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 1-6 processing flow.docx' to 'outputs/transcript BD 1-6 processing flow.json'
Original text length: 2989
After preprocessing length: 2940
Preprocessing removed: 49 characters

First 200 chars of cleaned text:
types of NoSQL
Key Value Pair Based
Data is stored in key/value pairs. It is designed in such a way to handle lots of data and heavy load.
Key-value pair storage databases store data as a hash table w...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for '4 types of NoSQL.docx' to 'outputs/4 types of NoSQL.json'
Original text length: 7238
After preprocessing length: 7113
Preprocessing removed: 125 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss
Some idea ab...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 2-1 resources-attack  explaination.docx' to 'outputs/transcript BD 2-1 resources-attack  explaination.json'
Original text length: 5224
After preprocessing length: 5137
Preprocessing removed: 87 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from  Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology , from this session ,
we start to learn Data ...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 4-6.docx' to 'outputs/BDA 4-6.json'
Original text length: 5761
After preprocessing length: 5661
Preprocessing removed: 100 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss Data Quality...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'Big data 3-2.docx' to 'outputs/Big data 3-2.json'
Original text length: 3723
After preprocessing length: 3613
Preprocessing removed: 110 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss  Data transf...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'Big data analysis 3-4.docx' to 'outputs/Big data analysis 3-4.json'
Original text length: 9628
After preprocessing length: 9516
Preprocessing removed: 112 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss about The fo...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA5-6.docx' to 'outputs/BDA5-6.json'
Original text length: 7811
After preprocessing length: 7693
Preprocessing removed: 118 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, from this session,
we start to learn Data sto...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA5-7.docx' to 'outputs/BDA5-7.json'
Original text length: 8107
After preprocessing length: 7453
Preprocessing removed: 654 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss about big da...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 1-3 The fourth paradigm.docx' to 'outputs/transcript BD 1-3 The fourth paradigm.json'
Original text length: 14192
After preprocessing length: 13816
Preprocessing removed: 376 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, from this session on, we will discuss somethi...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 4-1.docx' to 'outputs/BDA 4-1.json'
Original text length: 5340
After preprocessing length: 5170
Preprocessing removed: 170 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in the scope of big data, there are mainly tw...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 1-4 Big Data Characters.docx' to 'outputs/transcript BD 1-4 Big Data Characters.json'
Original text length: 7429
After preprocessing length: 7135
Preprocessing removed: 294 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, from this session,
we start to learn Data pro...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 1-2 structured and unstructured data.docx' to 'outputs/transcript BD 1-2 structured and unstructured data.json'
Original text length: 6321
After preprocessing length: 6144
Preprocessing removed: 177 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from  Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology , in this session we discuss Data processing ...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA5-2.docx' to 'outputs/BDA5-2.json'
Original text length: 6167
After preprocessing length: 6057
Preprocessing removed: 110 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss
Some idea ab...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 2-1 Data resources.docx' to 'outputs/transcript BD 2-1 Data resources.json'
Original text length: 10256
After preprocessing length: 10107
Preprocessing removed: 149 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session we will talk about the powerf...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA5-1.docx' to 'outputs/BDA5-1.json'
Original text length: 5739
After preprocessing length: 5615
Preprocessing removed: 124 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss
Deep web dat...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 2-4 deep web.docx' to 'outputs/transcript BD 2-4 deep web.json'
Original text length: 5822
After preprocessing length: 5736
Preprocessing removed: 86 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, 
in this session, we discuss about what is No...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 4-3.docx' to 'outputs/BDA 4-3.json'
Original text length: 8714
After preprocessing length: 8605
Preprocessing removed: 109 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering，
School of Computer Science, Beijing Institute of Technology, 
in this session, we will discuss big data gene...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 6-1.docx' to 'outputs/BDA 6-1.json'
Original text length: 7930
After preprocessing length: 7835
Preprocessing removed: 95 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from  Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology , in this session we discuss about distribute...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 1-7 architecture.docx' to 'outputs/transcript BD 1-7 architecture.json'
Original text length: 10415
After preprocessing length: 10214
Preprocessing removed: 201 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology
In this chapter, we introduced some useful pla...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 4-2.docx' to 'outputs/BDA 4-2.json'
Original text length: 5392
After preprocessing length: 5179
Preprocessing removed: 213 characters

First 200 chars of cleaned text:
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss Data Preprocessing.
Data Preprocessing mainly includes data cleaning Data transformation and data reduc...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'Big data 3-1.docx' to 'outputs/Big data 3-1.json'
Original text length: 4109
After preprocessing length: 4039
Preprocessing removed: 70 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from  Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology , in this session we discuss batch processing...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA5-3.docx' to 'outputs/BDA5-3.json'
Original text length: 5100
After preprocessing length: 4899
Preprocessing removed: 201 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session, we will discuss Some idea ab...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 6-2.docx' to 'outputs/BDA 6-2.json'
Original text length: 16549
After preprocessing length: 16403
Preprocessing removed: 146 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology, in this session we discuss Distributed Graph ...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'transcript BD 1-1 Concepts.docx' to 'outputs/transcript BD 1-1 Concepts.json'
Original text length: 4576
After preprocessing length: 4465
Preprocessing removed: 111 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology
in last session we learned 3 main Recommendati...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




✅ Successfully extracted concepts with preprocessing + LLM compression
Saved knowledge graph for 'BDA 6-4.docx' to 'outputs/BDA 6-4.json'
Original text length: 6815
After preprocessing length: 6707
Preprocessing removed: 108 characters

First 200 chars of cleaned text:
Hello everyone, I am Haiying Che, from  Institute of Data Science and knowledge Engineering
School of Computer Science, in Beijing Institute of Technology , from this session ,
we start to learn Data ...
🔄 Using custom OpenAI endpoint: https://api.xiaocaseai.com/v1
📋 Model: gemini-2.5-pro




❌ Extraction failed: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}, 'quotaValue': '50'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '2s'}]}}
Failed to process document 'BDA 4-5.docx': 'dict' object has no attribute 'extractions'
Original text length: 6



❌ Extraction failed: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}, 'quotaValue': '50'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '0s'}]}}
Failed to process document 'Bigdata analysis 3-5.docx': 'dict' object has no attribute 'extractions'
Original t



❌ Extraction failed: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}, 'quotaValue': '50'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '58s'}]}}
Failed to process document 'BDA5-4.docx': 'dict' object has no attribute 'extractions'
Original text length: 7



❌ Extraction failed: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}, 'quotaValue': '50'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '56s'}]}}
Failed to process document 'transcript BD 2-2 internal data acquisition.docx': 'dict' object has no attribute 



❌ Extraction failed: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.5-pro', 'location': 'global'}, 'quotaValue': '50'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '54s'}]}}
Failed to process document 'BDA5-5.docx': 'dict' object has no attribute 'extractions'
Original text length: 5



❌ Extraction failed: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-pro'}, 'quotaValue': '50'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '52s'}]}}
Failed to process document 'BDA 4-4.docx': 'dict' object has no attribute 'extractions'
Original text length: 



❌ Extraction failed: Gemini API error: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'model': 'gemini-2.5-pro', 'location': 'global'}, 'quotaValue': '50'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '50s'}]}}
Failed to process document 'BDA 6-5.docx': 'dict' object has no attribute 'extractions'
✅ Successfully extract

In [2]:
# Inspect the successful extractions. This name is defined by the first cell, so
# that cell must have run to completion first; otherwise this raises NameError.
successful_results

NameError: name 'successful_results' is not defined

In [None]:
import langextract as lx
import os

def generate_visualization(jsonl_path, output_html_path):
    """
    Render the extractions stored in a JSONL file as an HTML page.

    Prints an error and returns without writing anything when ``jsonl_path``
    does not exist. Otherwise the result of ``lx.visualize`` is written to
    ``output_html_path`` as UTF-8 text.
    """
    if os.path.exists(jsonl_path):
        print(f"Generating visualization from '{jsonl_path}'...")

        # Build the visualization straight from the file on disk
        rendered = lx.visualize(jsonl_path)

        with open(output_html_path, "w", encoding="utf-8") as html_file:
            # Jupyter/Colab return a display object carrying the markup in
            # `.data`; elsewhere the markup is returned directly.
            markup = rendered.data if hasattr(rendered, 'data') else rendered
            html_file.write(markup)

        print(f"Successfully saved visualization to '{output_html_path}'")
    else:
        print(f"Error: JSONL file not found at '{jsonl_path}'")

# --- Generate Visualization ---
# Path of the JSONL file that the first (extraction) cell is expected to write;
# generate_visualization() only prints an error if the file is missing.
jsonl_file = os.path.join("outputs", "extraction_results.jsonl")
# The HTML page is written relative to the current working directory.
output_html_file = "visualization.html"

generate_visualization(jsonl_file, output_html_file)