In [2]:
## import relevant libraries
import spacy
from spacy import displacy
from pathlib import Path
from spacy.pipeline import EntityRuler
from spacy.pipeline import merge_entities
from spacy.tokens import Doc
from spacy.tokens import Span
from spacy.matcher import Matcher

## load the small English model used by every cell below
nlp = spacy.load("en_core_web_sm")

# keep only the tagger, parser and NER components; drop anything else the model ships with
for pipe in list(nlp.pipe_names):
    if pipe in ("tagger", "parser", "ner"):
        continue
    nlp.remove_pipe(pipe)

# Doc extension function for detecting feature phrases
def get_feature_phrases(doc):
    """Return the non-overlapping feature phrases found in ``doc``.

    A ``Matcher`` is built from the module-level ``feature_phrase_patterns``
    and run over ``doc``. Matches are considered in the order the matcher
    yields them; a match is kept only if it shares no token with a match that
    was already kept. Afterwards every entity in ``doc.ents`` that shares no
    token with a kept match is appended as well.

    Args:
        doc: the spaCy ``Doc`` to search.

    Returns:
        list: ``Span`` objects labelled with the id of the pattern that
        matched (a leading "the" is left out of the span), followed by the
        entities from ``doc.ents`` that no kept match covers.
    """
    feature_phrase_matcher = Matcher(nlp.vocab)

    # create feature phrases from pattern matcher
    for pattern in feature_phrase_patterns:
        feature_phrase_matcher.add(pattern["label"], None, pattern["pattern"])

    # resolve overlapping feature phrases
    seen_tokens = set()
    feature_phrase_list = []

    for match_id, start, end in feature_phrase_matcher(doc):
        # `end` is exclusive. Every token of the candidate is tested, not just
        # its first and last one, so a candidate that fully encloses an
        # already accepted span is rejected too.
        if not seen_tokens.isdisjoint(range(start, end)):
            continue

        # remove 'DET' (the) from feature phrases
        phrase_start = start + 1 if doc[start].lower_ == "the" else start
        feature_phrase_list.append(Span(doc, phrase_start, end, label=match_id))

        # the whole match (including a dropped "the") counts as consumed
        seen_tokens.update(range(start, end))

    # entities not covered by any accepted phrase are reported as they are
    for ent in doc.ents:
        if seen_tokens.isdisjoint(range(ent.start, ent.end)):
            feature_phrase_list.append(ent)

    return feature_phrase_list

## function for normalising the text based on annotation categories
def replace_feature_phrase(word):
    """Return the string that should stand in for the token ``word``.

    Reads two module-level names: ``phrase_list``, a list of
    ``(range, span)`` pairs, and ``ranges``, the list of phrase ranges that
    have already been emitted (this function appends to it).

    * Token inside a phrase: the phrase label is returned for the first token
      of that phrase that is seen, and '' for every later token of it.
    * Token tagged NORP, GPE, ORG or PERSON: its own text.
    * Token with any other entity type: that entity type.
    * Anything else: its own text.
    """
    containing = [entry for entry in phrase_list if word.i in entry[0]]

    if containing:
        # the phrase was already emitted through one of its earlier tokens
        if any(word.i in emitted for emitted in ranges):
            return ''

        phrase = containing[0][1]
        ranges.append(range(phrase.start, phrase.end))
        return phrase.label_

    kind = word.ent_type_
    if kind and kind not in ["NORP", "GPE", "ORG", "PERSON"]:
        return kind
    return word.text


# word lists that the concept patterns below match against
DIRECTVIOLENCE_list = ["war", "Jihad"]
ADVERSARY_list = ["enemy", "regime"]
TERRITORY_list = ["area"]
SOCIAL_GROUP_list = ["alliance"]

# EntityRuler patterns: tokens matching "pattern" are tagged with "label"
language_features = [
    # a DIRECTVIOLENCE lemma, then "of"/"on", then a noun
    {
        "label": "STRUCTURALVIOLENCE",
        "pattern": [
            {"LEMMA": {"IN": DIRECTVIOLENCE_list}},
            {"LOWER": {"IN": ["of", "on"]}},
            {"POS": "NOUN"},
        ],
    },
    # first-person possessives
    {
        "label": "INGROUP",
        "pattern": [
            {"LOWER": {"IN": ["our", "my"]}},
        ],
    },
    # an ADVERSARY lemma together with any adjectival modifiers directly before it
    {
        "label": "ADVERSARY",
        "pattern": [
            {"DEP": "amod", "OP": "*"},
            {"LEMMA": {"IN": ADVERSARY_list}},
        ],
    },
    {
        "label": "SOCIALGROUP",
        "pattern": [
            {"LEMMA": {"IN": SOCIAL_GROUP_list}},
        ],
    },
    {
        "label": "TERRITORY",
        "pattern": [
            {"LEMMA": {"IN": TERRITORY_list}},
        ],
    },
]

# build the concept ruler (overwrite_ents=False) and load the patterns into it
concept_ruler = EntityRuler(nlp, overwrite_ents=False)
concept_ruler.add_patterns(language_features)

## Matcher patterns used by get_feature_phrases to build feature phrases
feature_phrase_patterns = [
    # an INGROUP token followed by a (proper) noun whose lemma is not an adversary term
    {
        "label": "INGROUPASSET",
        "pattern": [
            {"ENT_TYPE": "INGROUP"},
            {"LEMMA": {"NOT_IN": ADVERSARY_list}, "POS": {"IN": ["PROPN", "NOUN"]}},
        ],
    },
    # a SOCIALGROUP token together with any adjectival modifiers directly before it
    {
        "label": "GPEGROUP",
        "pattern": [
            {"DEP": "amod", "OP": "*"},
            {"ENT_TYPE": "SOCIALGROUP"},
        ],
    },
    # a NORP token directly followed by an ADVERSARY token
    {
        "label": "OUTGROUP",
        "pattern": [
            {"ENT_TYPE": "NORP"},
            {"ENT_TYPE": "ADVERSARY"},
        ],
    },
]

# expose the phrases as doc._.feature_phrases; the getter runs on every access
Doc.set_extension("feature_phrases", getter=get_feature_phrases, force=True)

# the concept ruler runs directly after "ner"; its results are read from doc.ents
nlp.add_pipe(concept_ruler, after="ner")

# last step: merge every entity span into a single token
nlp.add_pipe(merge_entities, last=True)

##test sentences
# Alternative input that used to be assigned here and was immediately
# overwritten by the list below (so it never ran):
#   [u"Our war on terror begins with al-Qaeda"]
sentences = [u"the main enemy in the area - the American Zionist alliance",
            u"the enemy of Crusader-American forces",
            u"terrorist organisations known as al Qaeda"]

##visualise dependency parse
# False: render each parse inline. True: write each parse to ./images/ instead.
SAVE_SVG = False
# identical for every sentence, so built once and actually handed to render()
options = {"compact": False}

for sent in sentences:
    doc = nlp(sent)

    if SAVE_SVG:
        # jupyter=False asks displacy to return the markup instead of displaying it
        svg = displacy.render(doc, style="dep", options=options, jupyter=False)
    else:
        svg = displacy.render(doc, style="dep", options=options)

    # file name: the sentence's non-punctuation tokens joined by '-'
    file_name = '-'.join([w.text for w in doc if not w.is_punct]) + ".svg"
    output_path = Path("./images/" + file_name)

    if SAVE_SVG:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # write_text opens, writes and closes the file in one call
        output_path.write_text(svg, encoding="utf-8")

In [237]:
# one line per entity of the current doc: its text, then its label
for ent in doc.ents:
    print(ent.text, ent.label_)

main enemy ADVERSARY
area TERRITORY
American NORP
Zionist NORP
alliance SOCIALGROUP


In [238]:
# feature phrases of the current `doc`, computed on access by the get_feature_phrases getter
doc._.feature_phrases

[American Zionist alliance, main enemy, area]

In [239]:
##used to identify the tags ascribed to each token

for sent in sentences:
    doc = nlp(sent)

    # replace_feature_phrase reads `phrase_list` and `ranges` as module-level
    # names. Both are keyed by token position, which only means something
    # within one doc, so they are rebuilt for every sentence instead of being
    # computed once from whatever `doc` an earlier cell left behind.
    phrase_list = [(range(x.start, x.end), x) for x in doc._.feature_phrases]
    ranges = []

    # one output list per sentence rather than one list growing across sentences
    new_sent = [replace_feature_phrase(token) for token in doc]
    print(new_sent)

['the', 'ADVERSARY', 'in', 'the', 'TERRITORY', '-', 'the', 'GPEGROUP', '', '']
