In [1]:
import spacy
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Span
from spacy.language import Language

# Ensure we don't register the factory twice
if "citation_component" not in Language.factories:
    @Language.factory("citation_component")
    def create_citation_component(nlp, name):
        """Factory function to create the citation matcher component"""
        matcher = Matcher(nlp.vocab)

        # Define the pattern for APA-style citations (e.g., (American Psychiatric Association, 2013))
        pattern = [
            {"ORTH": "("},  # Opening parenthesis
            {"IS_TITLE": True, "OP": "+"},  # One or more capitalized words
            {"TEXT": ","},  # Comma
            {"TEXT": {"REGEX": r"\d{4}"}},  # Four-digit year
            {"ORTH": ")"},  # Closing parenthesis
        ]

        matcher.add("CITATION", [pattern])

        def citation_component(doc):
            """Custom pipeline component to detect APA citations."""
            matches = matcher(doc)
            new_spans = [Span(doc, start, end, label="CITATION") for _, start, end in matches]

            # Remove conflicting entities (ORG, DATE) that overlap with citations
            existing_spans = []
            for ent in doc.ents:
                if not any(token in ent for span in new_spans for token in span):
                    existing_spans.append(ent)

            # Merge new citations with filtered entities
            doc.set_ents(existing_spans + new_spans, default="unmodified")
            return doc

        return citation_component

# Ensure we don't register the factory twice
# if "dependency_matcher_component" not in Language.factories:
#     @Language.factory("dependency_matcher_component")
#     def create_dependency_matcher_component(nlp, name):
#         """Factory function to create the citation matcher component"""
#         dependency_matcher = DependencyMatcher(nlp.vocab)
# 
#         dependency_patterns = [
#             # {
#             #     "label": "DISORDER_SYMPTOM",
#             #     "pattern": [
#             #         {"dep": "nsubj", "ent_type": "DISORDER"},  # Disorder entity (subject)
#             #         {"dep": "ROOT", "pos": {"in": ["VERB"]}},  # Verb (root of the sentence)
#             #         {"dep": "dobj", "ent_type": "SYMPTOM"}  # Symptom entity (direct object)
#             #     ]
#             # },
#             # # Adding more patterns
#             # {
#             #     "label": "DISORDER_SYMPTOM",
#             #     "pattern": [
#             #         {"dep": "nsubj", "ent_type": "DISORDER"},  # Disorder entity (subject)
#             #         {"dep": "ROOT", "pos": {"in": ["VERB"]}},  # Verb (root of the sentence)
#             #         {"dep": "prep", "ent_type": "SYMPTOM"}  # Symptom entity (prepositional object)
#             #     ]
#             # }
#             # anchor token: founded
#             {
#                 "RIGHT_ID": "DISORDER",
#                 "RIGHT_ATTRS": {"ent_type": "DISORDER"}
#             },
#             # founded -> subject
#             {
#                 "RIGHT_ID": "DISORDER",
#                 "LEFT_ID": "DISORDER",
#                 "RIGHT_ATTRS": {"DEP": "ROOT", "pos": {"in": ["VERB"]}}
#             },
#             # "founded" follows "initially"
#             {
#                 "RIGHT_ID": "DISORDER",
#                 "LEFT_ID": "DISORDER",
#                 "RIGHT_ATTRS": {"ent_type": "SYMPTOM"}
#             }
#         ]
# 
#         for pattern in dependency_patterns:
#             dependency_matcher.add(pattern["label"], [pattern["pattern"]])
# 
#         def dependency_matcher_component(doc):
#             matches = dependency_matcher(doc)
# 
#             # Iterate through all matches found
#             for match_id, token_ids in matches:
#                 match_id_str = doc.vocab.strings[match_id]
#                 span = doc[token_ids[0]:token_ids[-1] + 1]
#                 print(f"Match '{match_id_str}': {span.text}")
# 
#             return doc
# 
#         return dependency_matcher_component

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Add the component only if not already in the pipeline
if "citation_component" not in nlp.pipe_names:
    nlp.add_pipe("citation_component", after="ner")

# if "dependency_matcher_component" not in nlp.pipe_names:
#     nlp.add_pipe("dependency_matcher_component", after="ner")


patterns = [
    {"label": "DISORDER", "pattern": [{"lemma": "panic"}, {"lemma": "attack"}]},
    # matches "panic attack" and "panic attacks"
    {"label": "DISORDER", "pattern": [{"lemma": "anxiety"}, {"lemma": "disorder"}]},  # matches "anxiety disorder"
    {"label": "DISORDER", "pattern": [{"lemma": "depression"}, {"lemma": "disorder"}]},  # matches "depression disorder"
    {"label": "DISORDER", "pattern": [{"lemma": "bipolar"}, {"lemma": "disorder"}]},  # matches "bipolar disorder"
    {"label": "DISORDER", "pattern": [{"lemma": "schizophrenia"}]},  # matches "schizophrenia"
    {"label": "DISORDER", "pattern": [{"lemma": "insomnia"}]},  # matches "insomnia"
    {"label": "DISORDER", "pattern": [{"lemma": "autism"}]},  # matches "autism"
    {"label": "DISORDER", "pattern": [{"lemma": "alcohol"}, {"lemma": "addiction"}]},  # matches "alcohol addiction"
    {"label": "DISORDER", "pattern": [{"lemma": "amphetamine"}, {"lemma": "addiction"}]},
    # matches "amphetamine addiction"

    {"label": "DISORDER", "pattern": [{"lower": "panic"}, {"lower": "attack"}]},
    # matches "panic attack" and "panic attacks"
    {"label": "DISORDER", "pattern": [{"lower": "panic"}, {"lower": "disorder"}]},  # matches "panic disorder"
    {"label": "DISORDER", "pattern": [{"lower": "acute"}, {"lower": "stress"}, {"lower": "disorder"}]},
    # matches "acute stress disorder"
]

dependency_patterns = [
    # anchor token: DISORDER entity
    {
        "RIGHT_ID": "DISORDER",
        "RIGHT_ATTRS": {"ent_type": "DISORDER"}
    },
    # DISORDER -> Verb
    {
        "LEFT_ID": "DISORDER",
        "REL_OP": ".",
        "RIGHT_ID": "DISORDER_VERB",
        "RIGHT_ATTRS": {"ORTH": "are"}

    },
    # Verb -> Symptoms/Description
    {
        "RIGHT_ID": "SYMPTOM",
        "LEFT_ID": "DISORDER_VERB",
        "RIGHT_ATTRS": {"pos": {"in": ["NOUN", "ADJ"]}},
        "REL_OP": "<"
    }
]

# Add the EntityRuler to the pipeline
ruler = nlp.add_pipe('entity_ruler', after="ner")
ruler.add_patterns(patterns)

with open("../data/panic_attacks.txt", "r") as f:
    text = f.read()

print(text)

doc = nlp(text)

matcher = DependencyMatcher(nlp.vocab)
matcher.add("DISORDER", [dependency_patterns])

matches = matcher(doc)

print(matches)

if matches.__len__() > 0:
    match_id, token_ids = matches[0]
    for i in range(len(token_ids)):
        print(dependency_patterns[i]["RIGHT_ID"] + ":", doc[token_ids[i]].text)

# Print detected entities
for ent in doc.ents:
    print(ent.text, ent.label_)

"Panic attacks" are discrete episodes of intense fear or discomfort, accompanied by physical and cognitive symptoms, as listed in the DSM-5 panic attack checklist (American Psychiatric Association, 2013). Panic attacks are discrete by virtue of their sudden or abrupt onset and brief duration, as opposed to gradually building anxious arousal. Panic attacks in panic disorder often have an unexpected quality, meaning that from the patient's perspective, they appear to happen without an obvious trigger or at unexpected times. Indeed, the diagnosis of panic disorder is defined by recurrent "unexpected" panic attacks, followed by at least 1 month of persistent concern about their recurrence and their consequences, or by a significant change in behavior consequent to the attacks (American Psychiatric Association, 2013).
[]
Panic attacks DISORDER
DSM-5 NORP
panic attack DISORDER
(American Psychiatric Association, 2013) CITATION
Panic attacks DISORDER
Panic attacks DISORDER
panic disorder DISOR

In [2]:
doc = nlp(text)

for token in doc:
    print(token.text, token.pos_, token.dep_)

" PUNCT punct
Panic NOUN compound
attacks NOUN nsubj
" PUNCT punct
are AUX ROOT
discrete ADJ amod
episodes NOUN attr
of ADP prep
intense ADJ amod
fear NOUN pobj
or CCONJ cc
discomfort ADJ conj
, PUNCT punct
accompanied VERB advcl
by ADP agent
physical ADJ amod
and CCONJ cc
cognitive ADJ conj
symptoms NOUN pobj
, PUNCT punct
as SCONJ mark
listed VERB advcl
in ADP prep
the DET det
DSM-5 PROPN compound
panic NOUN compound
attack NOUN compound
checklist NOUN pobj
( PUNCT punct
American PROPN compound
Psychiatric PROPN compound
Association PROPN appos
, PUNCT punct
2013 NUM npadvmod
) PUNCT punct
. PUNCT punct
Panic NOUN compound
attacks NOUN nsubj
are AUX ROOT
discrete ADJ acomp
by ADP prep
virtue NOUN pobj
of ADP prep
their PRON poss
sudden ADJ amod
or CCONJ cc
abrupt ADJ conj
onset NOUN pobj
and CCONJ cc
brief ADJ amod
duration NOUN attr
, PUNCT punct
as SCONJ mark
opposed VERB advcl
to AUX aux
gradually ADV advmod
building VERB xcomp
anxious ADJ amod
arousal NOUN dobj
. PUNCT punct
Pani

In [3]:
from spacy import displacy

for sent in doc.sents:
    displacy.render(sent, style="dep")

In [4]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Panic attacks DISORDER
DSM-5 NORP
panic attack DISORDER
(American Psychiatric Association, 2013) CITATION
Panic attacks DISORDER
Panic attacks DISORDER
panic disorder DISORDER
panic disorder DISORDER
panic attacks DISORDER
at least 1 month DATE
(American Psychiatric Association, 2013) CITATION


In [5]:
displacy.render(doc, style="ent")

In [6]:
import pandas as pd

# Get named entity list per sentence

sent_entity_df = []

for sent in doc.sents:
    entity_list = [ent.text for ent in sent.ents]
    sent_entity_df.append({"sentence": sent, "entities": entity_list})

sent_entity_df = pd.DataFrame(sent_entity_df)
sent_entity_df

Unnamed: 0,sentence,entities
0,"("", Panic, attacks, "", are, discrete, episodes...","[Panic attacks, DSM-5, panic attack, (American..."
1,"(Panic, attacks, are, discrete, by, virtue, of...",[Panic attacks]
2,"(Panic, attacks, in, panic, disorder, often, h...","[Panic attacks, panic disorder]"
3,"(Indeed, ,, the, diagnosis, of, panic, disorde...","[panic disorder, panic attacks, at least 1 mon..."
