In [9]:
import pdfplumber
import re
import spacy

# Load the English tokenizer, tagger, parser, NER, and word vectors
nlp = spacy.load("en_core_web_sm")

In [5]:
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        full_text = ""
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text + "\n"
    return full_text

pdf_path = 'C:/Users/hruth/GENAI/RAG/constitution.pdf'  # Make sure the path is correct
constitution_text = extract_text_from_pdf(pdf_path)

In [6]:
print(constitution_text[:500])  # Print the first 500 characters to verify content

CONSTITUTION OF THE UNITED STATES OF AMERICA—17871
WETHEPEOPLEof the United States, in Order to SECTION. 2. 1The House of Representatives
form a more perfect Union, establish Justice, shall be composed of Members chosen every sec-
insure domestic Tranquility, provide for the ond Year by the People of the several States,
common defence, promote the general Welfare, and the Electors in each State shall have the
and secure the Blessings of Liberty to our- Qualifications requisite for Electors of th


In [7]:
def clean_and_segment_text(text):
    # Clean text
    text = re.sub(r'\s+', ' ', text)
    # Assuming each article starts with "Article" followed by Roman numerals
    articles = re.split(r'(?=Article\s+[IVXLCDM]+)', text)
    return articles

cleaned_articles = clean_and_segment_text(constitution_text)

In [10]:
def extract_entities(articles):
    entities = []
    for article in articles:
        doc = nlp(article)
        article_entities = []
        for ent in doc.ents:
            article_entities.append((ent.text, ent.label_))
        entities.append(article_entities)
    return entities

constitution_entities = extract_entities(cleaned_articles)

In [11]:
# Print entities from the first article
for entity in constitution_entities[0]:
    print(entity)

('the United States', 'GPE')
('Order', 'PERSON')
('2', 'CARDINAL')
('1The', 'CARDINAL')
('House of Representatives', 'ORG')
('Justice', 'ORG')
('Tranquility', 'ORG')
('the ond Year', 'DATE')
('States', 'GPE')
('State', 'ORG')
('the Blessings of Liberty', 'ORG')
('Qualifications', 'NORP')
('Branch', 'PERSON')
('the State Legislature', 'ORG')
('Constitution', 'LAW')
('the United States', 'GPE')
('2No', 'CARDINAL')
('America', 'GPE')
('seven Years', 'DATE')
('ARTICLE', 'FAC')
('I. United States', 'GPE')
('State', 'ORG')
('Congress', 'ORG')
('Senate', 'ORG')
('3Representatives', 'CARDINAL')
('Taxes', 'ORG')
('House of Representatives', 'ORG')
('States', 'GPE')
('Numbers', 'ORG')
('Constitution', 'LAW')
('Washington', 'PERSON')
('12', 'CARDINAL')
('States', 'GPE')
('Indians', 'NORP')
('three fifths', 'CARDINAL')
('May 1785', 'DATE')
('Congress', 'ORG')
('three Years', 'DATE')
('the Articles of Confederation', 'ORG')
('first', 'ORDINAL')
('Congress', 'ORG')
('January 1786', 'DATE')
('the Leg

In [12]:
def extract_relationships(entities, articles):
    relationships = []
    for article_entities, text in zip(entities, articles):
        article_relationships = []
        doc = nlp(text)
        for ent1 in doc.ents:
            for ent2 in doc.ents:
                if ent1 != ent2:  # Ensure different entities
                    # Example condition: finding relationships based on proximity or specific keywords
                    if ent1.label_ == "ORG" and "passes" in text and ent2.label_ == "LAW":
                        article_relationships.append((ent1.text, "passes", ent2.text))
        relationships.append(article_relationships)
    return relationships

constitution_relationships = extract_relationships(constitution_entities, cleaned_articles)

In [13]:
# Assuming constitution_entities and constitution_relationships are already defined
node_set = set()
edge_list = []

for article_entities in constitution_entities:
    for entity in article_entities:
        node_set.add((entity[0], entity[1]))  # (entity_text, entity_type)

for article_relationships in constitution_relationships:
    for relation in article_relationships:
        edge_list.append(relation)  # (source_entity, relationship, target_entity)

# Convert sets to list for easier processing later
node_list = list(node_set)

In [14]:
node_dict = {node[0]: idx for idx, node in enumerate(node_list)}
normalized_edges = [(node_dict[rel[0]], rel[1], node_dict[rel[2]]) for rel in edge_list]

In [2]:
import chromadb

# Assuming you have a ChromaDB client instance
client = chromadb.Client()

for node in node_list:
    client.insert_node(node_id=node_dict[node[0]], name=node[0], type=node[1])

ModuleNotFoundError: No module named 'chromadb'