In [11]:
import spacy
nlp = spacy.load("en_core_web_sm")

def parse(text: str):
    doc = nlp(text)

    print("Entities:")
    for ent in doc.ents:
        print(ent.text, ent.label_)
    
    print("\n")
    print("Tokens:")
    for token in doc:
        print(token.text, token.pos_, token.dep_)


parse("Obama was born in that place in 1961.")
# parse("Bob thinks Tom is a good person.")
# parse("Tom is punched by Bob.")
# parse("Do you remember that guy who threw the ball out the window?")
# parse("My brother's name is Remmy.")
# parse("Remmy is the name of my brother")
# parse("Microsoft is a company.")
# parse("Bob the person thinks Tom is a good cat.")

Entities:
Obama PERSON
1961 DATE


Tokens:
Obama PROPN nsubjpass
was AUX auxpass
born VERB ROOT
in ADP prep
that DET det
place NOUN pobj
in ADP prep
1961 NUM pobj
. PUNCT punct


In [15]:
import spacy
from spacy import displacy

# Load English language model
nlp = spacy.load("en_core_web_sm")

def parse_sentence(sentence):
    """Parse sentence to extract subjects, objects, and predicates."""
    doc = nlp(sentence)
    
    subjects = []
    direct_objects = []
    indirect_objects = []
    predicates = []
    
    for token in doc:
        # Find subjects (nsubj dependency)
        if token.dep_ == "nsubj":
            subjects.append({
                'text': token.text,
                'pos': token.pos_,
                'lemma': token.lemma_
            })
        
        # Find objects (dobj dependency)
        elif token.dep_ == "dobj":
            direct_objects.append({
                'text': token.text,
                'pos': token.pos_,
                'lemma': token.lemma_
            })

        # Find indirect objects (iobj dependency)
        elif token.dep_ == "iobj":
            indirect_objects.append({
                'text': token.text,
                'pos': token.pos_,
                'lemma': token.lemma_
            })
        
        # Find predicates (verbs)
        elif token.pos_ == "VERB":
            predicates.append({
                'text': token.text,
                'pos': token.pos_,
                'lemma': token.lemma_,
                'tense': token.morph.get('Tense', [])
            })
    
    return {
        'subjects': subjects,
        'direct_objects': direct_objects,
        'indirect_objects': indirect_objects,
        'predicates': predicates,
        'full_sentence': sentence
    }

# Example usage
sentences = [
    "Bob the person thinks Tom is a good cat.",
    "The dog chased the ball quickly.",
    "Alice gave Bob a present yesterday."
]

for sentence in sentences:
    displacy.render(nlp(sentence), style="ent", jupyter=True)
    result = parse_sentence(sentence)
    print(f"\nSentence: {result['full_sentence']}")
    print(f"Subjects: {[s['text'] for s in result['subjects']]}")
    print(f"Direct Objects: {[o['text'] for o in result['direct_objects']]}")
    print(f"Indirect Objects: {[o['text'] for o in result['indirect_objects']]}")
    print(f"Predicates: {[p['text'] for p in result['predicates']]}")
    print("-" * 50)

# More detailed analysis
def detailed_parse(sentence):
    """More detailed parsing with dependency tree visualization."""
    doc = nlp(sentence)
    
    print(f"Detailed analysis of: '{sentence}'")
    print("\nToken\t\tPOS\tDependency\tHead")
    print("-" * 40)
    
    for token in doc:
        print(f"{token.text:<12}\t{token.pos_:<8}\t{token.dep_:<12}\t{token.head.text}")
    
    print(f"\nNoun chunks: {[chunk.text for chunk in doc.noun_chunks]}")
    print(f"Named entities: {[(ent.text, ent.label_) for ent in doc.ents]}")

# Run detailed analysis
detailed_parse("Bob the person thinks Tom is a good cat.")


Sentence: Bob the person thinks Tom is a good cat.
Subjects: ['person', 'Tom']
Direct Objects: []
Indirect Objects: []
Predicates: ['thinks']
--------------------------------------------------



Sentence: The dog chased the ball quickly.
Subjects: ['dog']
Direct Objects: ['ball']
Indirect Objects: []
Predicates: ['chased']
--------------------------------------------------



Sentence: Alice gave Bob a present yesterday.
Subjects: ['Alice']
Direct Objects: ['present']
Indirect Objects: []
Predicates: ['gave']
--------------------------------------------------
Detailed analysis of: 'Bob the person thinks Tom is a good cat.'

Token		POS	Dependency	Head
----------------------------------------
Bob         	PROPN   	npadvmod    	thinks
the         	DET     	det         	person
person      	NOUN    	nsubj       	thinks
thinks      	VERB    	ROOT        	thinks
Tom         	PROPN   	nsubj       	is
is          	AUX     	ccomp       	thinks
a           	DET     	det         	cat
good        	ADJ     	amod        	cat
cat         	NOUN    	attr        	is
.           	PUNCT   	punct       	thinks

Noun chunks: ['the person', 'Tom', 'a good cat']
Named entities: [('Bob', 'PERSON'), ('Tom', 'PERSON')]


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def decompose_complex_sentence(sentence):
    """Decompose complex sentences into component clauses."""
    doc = nlp(sentence)
    
    clauses = []
    
    # Find main clauses and subordinate clauses
    for token in doc:
        # Look for clause boundaries using dependency patterns
        if token.dep_ in ["ccomp", "xcomp", "advcl", "acl"]:
            # This is a subordinate clause
            clause_text = extract_clause(doc, token)
            clauses.append({
                'type': 'subordinate',
                'dependency': token.dep_,
                'text': clause_text,
                'head_verb': token.head.text
            })
    
    # Extract main clause
    main_clause = extract_main_clause(doc)
    clauses.insert(0, {
        'type': 'main',
        'text': main_clause,
        'dependency': 'ROOT'
    })
    
    return clauses

def extract_clause(doc, start_token):
    """Extract a complete clause starting from a token."""
    clause_tokens = []
    
    # Get the subtree of the token
    for token in start_token.subtree:
        clause_tokens.append(token)
    
    # Sort by position in sentence
    clause_tokens.sort(key=lambda x: x.i)
    
    return ' '.join([token.text for token in clause_tokens])

def extract_main_clause(doc):
    """Extract the main clause of the sentence."""
    main_tokens = []
    
    for token in doc:
        # Skip tokens that are part of subordinate clauses
        if not any(token in clause_token.subtree for clause_token in doc if clause_token.dep_ in ["ccomp", "xcomp", "advcl", "acl"]):
            main_tokens.append(token)
    
    return ' '.join([token.text for token in main_tokens])

def parse_with_clauses(sentence):
    """Parse sentence and show clause structure."""
    doc = nlp(sentence)
    
    print(f"Original: {sentence}")
    print("\nClause Decomposition:")
    
    clauses = decompose_complex_sentence(sentence)
    
    for i, clause in enumerate(clauses):
        print(f"\nClause {i+1} ({clause['type']}):")
        print(f"  Text: {clause['text']}")
        if clause['type'] == 'subordinate':
            print(f"  Dependency: {clause['dependency']}")
            print(f"  Head verb: {clause['head_verb']}")
    
    print("\nDependency Tree:")
    for token in doc:
        print(f"{token.text:<15} {token.dep_:<10} -> {token.head.text}")

# Test with your example
complex_sentences = [
    "Tom is worried that Bill will tell Tammy that Joe is cheating on Patricia.",
    "I think that she believes he knows the truth.",
    "The cat that the dog chased ran quickly."
]

for sentence in complex_sentences:
    parse_with_clauses(sentence)
    print("\n" + "="*60 + "\n")

# Alternative approach using sentence boundaries
def extract_embedded_clauses(sentence):
    """Extract clauses using different dependency patterns."""
    doc = nlp(sentence)
    
    clauses = []
    
    # Find different types of embedded clauses
    for token in doc:
        if token.dep_ == "ccomp":  # Complement clause
            clauses.append({
                'type': 'complement',
                'text': extract_clause(doc, token),
                'connector': token.head.text
            })
        elif token.dep_ == "advcl":  # Adverbial clause
            clauses.append({
                'type': 'adverbial',
                'text': extract_clause(doc, token),
                'connector': token.head.text
            })
        elif token.dep_ == "acl":  # Relative clause
            clauses.append({
                'type': 'relative',
                'text': extract_clause(doc, token),
                'connector': token.head.text
            })
    
    return clauses

# Test the alternative approach
print("Alternative Clause Extraction:")
for sentence in complex_sentences:
    print(f"\nSentence: {sentence}")
    clauses = extract_embedded_clauses(sentence)
    for clause in clauses:
        print(f"  {clause['type'].title()} clause: {clause['text']}")

Original: Tom is worried that Bill will tell Tammy that Joe is cheating on Patricia.

Clause Decomposition:

Clause 1 (main):
  Text: Tom is worried .

Clause 2 (subordinate):
  Text: that Bill will tell Tammy that Joe is cheating on Patricia
  Dependency: ccomp
  Head verb: worried

Clause 3 (subordinate):
  Text: that Joe is cheating on Patricia
  Dependency: ccomp
  Head verb: tell

Dependency Tree:
Tom             nsubj      -> is
is              ROOT       -> is
worried         acomp      -> is
that            mark       -> tell
Bill            nsubj      -> tell
will            aux        -> tell
tell            ccomp      -> worried
Tammy           dobj       -> tell
that            mark       -> cheating
Joe             nsubj      -> cheating
is              aux        -> cheating
cheating        ccomp      -> tell
on              prep       -> cheating
Patricia        pobj       -> on
.               punct      -> is


Original: I think that she believes he knows the truth.

C