In [162]:
%%time

### import libraries

import spacy
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher
from spacy.pipeline import merge_entities
from spacy.strings import StringStore
from spacy.tokens import Doc, Span, Token
from spacy import displacy
import os
import json
import requests
from datetime import date, datetime

import pandas as pd
pd.set_option('display.max_colwidth', -1)
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

from collections import Counter, OrderedDict, defaultdict
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
import platform;

# Print details of the installed spaCy package (version, location, platform, models).
spacy.info()
print('============================== Info about python ==============================')
# `platform.sys` is the `sys` module as imported by `platform`, so this prints sys.version.
print('python version: ', platform.sys.version)
print()
print('=========================== Loading Language Models ===========================')
# Name of the spaCy model package to load.
model = 'en_core_web_md'

print('loading', model)
# `nlp` is the shared pipeline object that the cells below customise and reference by name.
nlp = spacy.load(model)

print('loaded', model)
print('complete at: ', datetime.now().strftime("%d/%m/%Y - %H:%M:%S"))



spaCy version    2.1.9                         
Location         C:\Users\Steve\AppData\Roaming\Python\Python37\site-packages\spacy
Platform         Windows-10-10.0.17763-SP0     
Python version   3.7.1                         
Models                                         

python version:  3.7.1 | packaged by conda-forge | (default, Nov 13 2018, 19:01:41) [MSC v.1900 64 bit (AMD64)]

loading en_core_web_md
loaded en_core_web_md
complete at:  26/02/2020 - 15:53:09
Wall time: 23.8 s


In [163]:
%%time

#             ## include phrases such as 'war on terror' where terror is the concept
#             elif doc[start - 2].dep_ in ["nsubj", "amod"] and doc[start].dep_ in ["pobj"] and start != 0:
#                 concept = Span(doc, start - 2, end)
#                 #print('3 = >', concept, '=>', doc.vocab.strings[match_id])
                
#             ## include phrases such as 'weapon of mass destruction' where weapon is the concept
# #             elif doc[start].dep_ in ["nsubj", "csubj", "pobj"] and end < len(doc):
# #                  if doc[start + 1].dep_ in ["prep"]:
# #                         if list(doc[start + 1].rights):
# #                             concept = Span(doc, start, list(doc[start + 1].rights)[-1].i + 1)
                        
# #                         print(doc[start:end])
# #                         print(str(concept.sent).strip())
# #                         print('4 = >', concept, '=>', doc.vocab.strings[match_id])
# #                         print(concept)
# #                         print('-----')        
            
            ## expand the span if label is a title followed by a person
            
            ## get labels for the concept
            
## define pipelines extensions

# Pipeline component for capturing Concepts
class ConceptMatcher(object):
    """Pipeline component that detects typology concepts in a text.

    The group typology JSON (path taken from the default of the "gptypology"
    Doc extension) is read as ``{ideology: {feature: {subcat: [terms]}}}``.
    Every ``subcat`` becomes a Matcher rule that fires on any token whose lemma
    is one of its ``terms``. Matches are stored as spans in
    ``doc._.named_concepts`` and each span carries three custom attributes:

    * ``_.CONCEPT``   - the matched ``subcat`` label
    * ``_.IDEOLOGY``  - the ideology whose typology contains that label
    * ``_.ATTRIBUTE`` - the feature (group attribute) that contains that label
    """

    name = "Concept Matcher"  # component name, will show up in the pipeline

    def __init__(self, nlp):
        """Initialise the pipeline component.

        The shared nlp instance supplies the vocab for the Matcher. Also
        registers the Doc/Span/Token extensions this component writes to.
        """
        # Imported here: a class-body import is not visible from inside methods,
        # so the class previously depended on a notebook-level import.
        from collections import defaultdict

        # Set up the Matcher
        self.matcher = Matcher(nlp.vocab)

        with open(Doc.get_extension("gptypology")[0], 'r') as fp:
            self.group_typology = json.load(fp)

        # ideology -> list of concept labels (subcats) belonging to it
        self.concept_typology = dict()
        # feature (group attribute) -> list of concept labels belonging to it
        self.feature_typology = defaultdict(list)

        for ideology, features in self.group_typology.items():
            labels = []
            for feature, subcats in features.items():
                for subcat, terms in subcats.items():
                    self.matcher.add(subcat, None, [{"LEMMA": {"IN": terms}}])
                labels += list(subcats.keys())
                self.feature_typology[feature] += list(subcats.keys())
            self.concept_typology[ideology] = labels

        Doc.set_extension("named_concepts", default = [], force = True)

        Span.set_extension("CONCEPT", default = '', force = True)
        Token.set_extension("CONCEPT", default = '', force = True)

        Span.set_extension("ATTRIBUTE", default = '', force = True)
        Token.set_extension("ATTRIBUTE", default = '', force = True)

        Span.set_extension("IDEOLOGY", default = '', force = True)
        Token.set_extension("IDEOLOGY", default = '', force = True)

    def __call__(self, doc):
        """Tag concept matches on the Doc and return it for the next component.

        Overlapping matches are resolved with ``spacy.util.filter_spans``.

        merge entities code: https://support.prodi.gy/t/merge-entities-error/389
        filter code: https://github.com/explosion/spaCy/issues/4056
        """
        concepts = []
        for match_id, start, end in self.matcher(doc):
            label = doc.vocab.strings[match_id]
            concept = Span(doc, start, end)
            concept._.CONCEPT = label
            concept._.IDEOLOGY = self.get_ideology(label)
            concept._.ATTRIBUTE = self.get_attribute(label)
            concepts.append(concept)

        # Filter once for the whole doc instead of once per match. Every rule
        # is a single-token pattern, so the surviving spans are the same.
        if concepts:
            doc._.named_concepts = spacy.util.filter_spans(list(doc._.named_concepts) + concepts)

        return doc

    @staticmethod
    def _find_category(label, typology):
        """Return the first key of ``typology`` whose label list contains ``label``
        (case-insensitive), or '' when no list contains it."""
        wanted = label.lower()
        for category, labels in typology.items():
            if any(wanted == candidate.lower() for candidate in labels):
                return category
        return ''

    def get_ideology(self, span):
        """Return the ideology whose concept labels include ``span``.

        ``span`` is the concept label as a string (not a spaCy Span); the
        comparison is case-insensitive. Returns '' when no ideology matches.
        """
        return self._find_category(span, self.concept_typology)

    def get_attribute(self, span):
        """Return the group feature (attribute) whose concept labels include ``span``.

        ``span`` is the concept label as a string (not a spaCy Span); the
        comparison is case-insensitive. Returns '' when no feature matches.
        """
        return self._find_category(span, self.feature_typology)
    
class ConceptResolution(object):
    """Pipeline component that widens each named concept by one neighbouring token.

    A concept span is extended to the left when the preceding token is an
    ``amod``/``compound`` modifier or a noun/proper noun; otherwise it is
    extended to the right when the following token is a noun/proper noun.
    If the absorbed neighbour has an entity type, the new span is labelled with it.
    """

    name = "Concept Resolution"

    def __init__(self, nlp):
        # Concepts resolved for the most recently processed doc. Rebuilt on
        # every call so spans from one doc never leak into another.
        self.new_concepts = []

    @staticmethod
    def _expand(doc, concept, start, end, neighbour):
        """Build the widened span and carry the concept's custom attributes over."""
        if neighbour.ent_type_:
            expanded = Span(doc, start, end, label = neighbour.ent_type_)
        else:
            expanded = Span(doc, start, end)

        # Span extension values belong to one specific span, so a new span
        # starts from the defaults; copy them or the widened concept loses them.
        expanded._.CONCEPT = concept._.CONCEPT
        expanded._.IDEOLOGY = concept._.IDEOLOGY
        expanded._.ATTRIBUTE = concept._.ATTRIBUTE
        return expanded

    def __call__(self, doc):
        """Iterate over the named concepts and expand each span if it has a relevant neighbour.

        Replaces ``doc._.named_concepts`` with the filtered (non-overlapping)
        result and returns the Doc.
        """
        resolved = []

        for concept in doc._.named_concepts:
            start = concept.start
            end = concept.end

            # The left neighbour is preferred: a concept modifier (amod/compound)
            # or an associated noun/proper noun is absorbed into the span.
            if start != 0 and (doc[start - 1].dep_ in ["amod", "compound"]
                               or doc[start - 1].pos_ in ["NOUN", "PROPN"]):
                concept = self._expand(doc, concept, start - 1, end, doc[start - 1])

            # otherwise absorb a noun/proper noun that directly follows
            elif end < len(doc) and doc[end].pos_ in ["NOUN", "PROPN"]:
                concept = self._expand(doc, concept, start, end + 1, doc[end])

            resolved.append(concept)

        self.new_concepts = resolved
        doc._.named_concepts = spacy.util.filter_spans(resolved)

        return doc

    
#Pipeline component for capturing Named Entities
class EntityMatcher(object):
    """Pipeline component that supplements the model's named entities.

    Phrases listed in ``named_entity_corrections.json`` (a mapping of entity
    label to a list of phrases, read from the "kpath" directory) are matched
    case-insensitively, merged into single tokens and added to ``doc.ents``.
    """

    name = "Named Entity Matcher"  # component name, will show up in the pipeline

    def __init__(self, nlp):
        """Build the PhraseMatcher from the corrections file.

        The shared nlp instance supplies the vocab and tokenises each phrase
        into a Doc pattern.
        """
        # attr="LOWER" makes the phrase comparison case-insensitive
        self.matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

        corrections_path = os.path.join(Doc.get_extension("kpath")[0], "named_entity_corrections.json")
        with open(corrections_path, 'r') as handle:
            self.named_entities = json.load(handle)

        for label, terms in self.named_entities.items():
            if len(terms) == 0:
                continue  # nothing to register for this label
            self.matcher.add(label, None, *[nlp.make_doc(text) for text in terms])

    def __call__(self, doc):
        """Merge matched phrases and add them to the Doc's entities.

        Returns the Doc so the next pipeline component can process it.
        """
        # one labelled span per phrase match
        spans = [Span(doc, start, end, label=match_id)
                 for match_id, start, end in self.matcher(doc)]

        # merge each non-overlapping match into a single token
        with doc.retokenize() as retokenizer:
            for span in spacy.util.filter_spans(spans):
                retokenizer.merge(doc[span.start:span.end])

        # NOTE(review): `spans` were created before the merge above - confirm their
        # token indices are still what doc.ents expects after retokenization.
        doc.ents = spacy.util.filter_spans(list(doc.ents) + spans)

        return doc

class EntityResolution(object):
    """spaCy v2.0 pipeline component that creates unique identifiers for
    named entities in a document.

    Tokens typed GPE / NORP are resolved against the country / nationality
    look-up tables; tokens typed PERSON / ORG get a per-document running ID.
    The resolved ID is written to ``token._.entID`` and the per-document
    tables are kept in ``doc._.countryIDs``, ``doc._.nationalityIDs``,
    ``doc._.peopleIDs`` and ``doc._.orgIDs``.
    """

    name = "Entity Resolution"  # component name, will show up in the pipeline

    def __init__(self, nlp):
        """Initialise the pipeline component. Place after merge entities
        in the pipeline.
        """
        # Load the country and nationalities look up lists
        with open(os.path.join(Doc.get_extension("kpath")[0], "country_codes_dict.json"), 'r') as fp:
            self.country_codes_dict = json.load(fp)

        with open(os.path.join(Doc.get_extension("kpath")[0], "nationality_codes_dict.json"), 'r') as fp:
            self.nationality_codes_dict = json.load(fp)

        Doc.set_extension("countryIDs", default = dict(), force = True)
        Doc.set_extension("nationalityIDs", default = dict(), force = True)
        Doc.set_extension("peopleIDs", default = dict(), force = True)
        Doc.set_extension("orgIDs", default = dict(), force = True)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension("entID", default='', force = True)
        Span.set_extension("entID", default='', force = True)

    @staticmethod
    def _resolve_id(token, doc_ids, lookup = None, id_type = ''):
        """Return the ID for ``token`` and record it in ``doc_ids``.

        With a non-empty ``lookup`` table ({ID: [names]}): return the ID whose
        names contain the token's lemma (case-insensitive), store the first
        name under that ID and retype the token to ``id_type``. Returns ''
        when the lemma is not in the table.

        Without a lookup table: reuse the ID of an existing entry whose name
        contains the token's lemma; otherwise create a new entry keyed by the
        first two letters of ``id_type`` plus a zero-padded running number.
        """
        lemma = token.lemma_.lower()

        if lookup:  # a look up table has been passed
            for key, names in lookup.items():
                if lemma in [entry.lower() for entry in names]:
                    doc_ids[key] = names[0]  # reference table keeps the first (canonical) name
                    token.ent_type_ = id_type
                    return key
            return ''  # unknown to the look up table: same value as the entID default

        for key, name in doc_ids.items():
            if lemma in name.lower():  # substring match, e.g. a surname inside a full name
                return key

        # Build the key once so the returned ID is exactly the one stored.
        new_key = id_type[0:2] + str("%03d" % len(doc_ids))
        doc_ids[new_key] = token.orth_.title()
        token.ent_type_ = id_type
        return new_key

    def __call__(self, doc):
        """Assign ``token._.entID`` for every GPE, NORP, PERSON and ORG token.

        Returns the Doc, so it can be processed by the next component in the
        pipeline, if available.
        """
        # entity type -> (Doc extension with the ID table, look up table, new entity type)
        id_sources = {
            "GPE": ("countryIDs", self.country_codes_dict, "COUNTRY"),
            "NORP": ("nationalityIDs", self.nationality_codes_dict, "NATIONALITY"),
            "PERSON": ("peopleIDs", None, "PERSON"),
            "ORG": ("orgIDs", None, "ORG"),
        }

        # Work on per-document copies and assign them back, so the dict given
        # as the extension default is never mutated in place.
        id_tables = {ext: dict(getattr(doc._, ext)) for ext, _lookup, _type in id_sources.values()}

        for token in doc:
            source = id_sources.get(token.ent_type_)
            if source is None:
                continue
            ext, lookup, id_type = source
            token._.entID = self._resolve_id(token, id_tables[ext], lookup, id_type)

        for ext, table in id_tables.items():
            setattr(doc._, ext, table)

        return doc  # don't forget to return the Doc

print('complete at: ', datetime.now().strftime("%d/%m/%Y - %H:%M:%S"))

complete at:  26/02/2020 - 15:53:09
Wall time: 2.99 ms


In [164]:
%%time

### this cell is for the document extensions

def get_feature_phrases(doc):
    """Document extension getter: extract the feature phrases related to the violent patterns.

    Rules are read from ``noun_phrase_patterns_v2.json`` in the "kpath"
    directory; each rule supplies a ``label`` and a Matcher ``pattern``.
    Returns a list of non-overlapping spans: the matched feature phrases (in
    match order, first match wins) followed by every entity in ``doc.ents``
    that does not overlap a feature phrase.
    """
    # Use the doc's own vocab so the getter works for any pipeline,
    # not only the notebook-global `nlp`.
    feature_phrase_matcher = Matcher(doc.vocab)

    with open(os.path.join(Doc.get_extension("kpath")[0], "noun_phrase_patterns_v2.json"), 'r') as fp:
        noun_phrase_patterns = json.load(fp)

    # create feature phrase matcher from the pattern rules
    for patterns in noun_phrase_patterns.values():
        for rule in patterns:
            feature_phrase_matcher.add(rule["label"], None, rule["pattern"])

    # resolve overlapping feature phrases
    seen_tokens = set()
    new_entities = []

    for match_id, start, end in feature_phrase_matcher(doc):
        # Check every token of the match: testing only the two end points lets
        # a longer match that contains an already claimed phrase slip through.
        if not seen_tokens.isdisjoint(range(start, end)):
            continue

        # remove a leading determiner ('the' / 'a') from the feature phrase,
        # unless the determiner is the whole match (that would leave an empty span)
        if doc[start].lower_ in ["the", "a"] and end - start > 1:
            feature_phrase = Span(doc, start + 1, end, label=match_id)
        else:
            feature_phrase = Span(doc, start, end, label=match_id)

        new_entities.append(feature_phrase)
        seen_tokens.update(range(start, end))

    # keep the named entities that do not overlap any feature phrase
    for ent in doc.ents:
        if seen_tokens.isdisjoint(range(ent.start, ent.end)):
            new_entities.append(ent)

    return new_entities

def get_doc_ideologies(doc):
    """Return, for every ideology in the typology, its share of the document's ideology mentions.

    Only named concepts with a non-empty ``_.IDEOLOGY`` are counted. Each
    count is divided by the total number of such concepts; ideologies that
    are never mentioned keep the value 0.
    """
    mentioned = []
    for concept in doc._.named_concepts:
        if concept._.IDEOLOGY:
            mentioned.append(concept._.IDEOLOGY)

    # start every ideology named in the typology file at zero
    with open(Doc.get_extension("gptypology")[0], 'r') as typology_file:
        shares = dict.fromkeys(json.load(typology_file).keys(), 0)

    total = len(mentioned)
    for name, count in Counter(mentioned).items():
        shares[name] = count / total

    return shares

def get_doc_ingroupassets(doc):
    """Count each ingroup asset phrase mentioned within the document.

    Takes the feature phrases labelled "INGROUPASSET", skips those whose root
    lemma is the pronoun placeholder "-pron-", and returns
    ``{lower-cased phrase: count}``.
    """
    tally = Counter()
    for phrase in doc._.feature_phrases:
        if phrase.label_ not in ["INGROUPASSET"]:
            continue
        if str(phrase.root.lemma_).lower() == "-pron-":
            continue
        tally[phrase.lower_] += 1
    return dict(tally)

def get_doc_outgroupassets(doc):
    """Count each outgroup asset phrase mentioned within the document.

    Takes the feature phrases labelled "OUTGROUPASSET", skips those whose root
    lemma is the pronoun placeholder "-pron-", and returns
    ``{lower-cased phrase: count}``.
    """
    tally = Counter()
    for phrase in doc._.feature_phrases:
        if phrase.label_ not in ["OUTGROUPASSET"]:
            continue
        if str(phrase.root.lemma_).lower() == "-pron-":
            continue
        tally[phrase.lower_] += 1
    return dict(tally)

def get_doc_ingroup(doc):
    """Count the ingroup terms mentioned within the document.

    Returns ``{title-cased term: count}`` for the named concepts whose
    ``_.ATTRIBUTE`` is "ingroup", ordered from most to least frequent.
    """
    terms = []
    for concept in doc._.named_concepts:
        if concept._.ATTRIBUTE in ["ingroup"]:
            terms.append(concept.lower_.title())

    # most_common() orders by count, highest first, keeping first-seen order on ties
    return dict(Counter(terms).most_common())

def get_doc_outgroup(doc):
    """Count the outgroup terms mentioned within the document.

    Returns ``{title-cased term: count}`` for the named concepts whose
    ``_.ATTRIBUTE`` is "outgroup", ordered from most to least frequent.
    """
    terms = []
    for concept in doc._.named_concepts:
        if concept._.ATTRIBUTE in ["outgroup"]:
            terms.append(concept.lower_.title())

    # most_common() orders by count, highest first, keeping first-seen order on ties
    return dict(Counter(terms).most_common())

def get_doc_ingroup_ideologies(doc):
    """Count the ideologies attached to the ingroup terms mentioned within the document.

    Every ideology named in the typology file starts at 0; the ``_.IDEOLOGY``
    of each named concept whose ``_.ATTRIBUTE`` is "ingroup" is then counted.
    Returns the counts ordered from highest to lowest.
    """
    ingroup_ideologies = []
    for concept in doc._.named_concepts:
        if concept._.ATTRIBUTE in ["ingroup"]:
            ingroup_ideologies.append(concept._.IDEOLOGY)

    with open(Doc.get_extension("gptypology")[0], 'r') as typology_file:
        counts = dict.fromkeys(json.load(typology_file).keys(), 0)

    # overwrite the zero baseline with the observed counts
    counts.update(Counter(ingroup_ideologies))

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse = True)
    return dict(ranked)

def get_doc_outgroup_ideologies(doc):
    """Count the ideologies attached to the outgroup terms mentioned within the document.

    Every ideology named in the typology file starts at 0; the ``_.IDEOLOGY``
    of each named concept whose ``_.ATTRIBUTE`` is "outgroup" is then counted.
    Returns the counts ordered from highest to lowest.
    """
    outgroup_ideologies = []
    for concept in doc._.named_concepts:
        if concept._.ATTRIBUTE in ["outgroup"]:
            outgroup_ideologies.append(concept._.IDEOLOGY)

    with open(Doc.get_extension("gptypology")[0], 'r') as typology_file:
        counts = dict.fromkeys(json.load(typology_file).keys(), 0)

    # overwrite the zero baseline with the observed counts
    counts.update(Counter(outgroup_ideologies))

    ranked = sorted(counts.items(), key=lambda item: item[1], reverse = True)
    return dict(ranked)

def get_doc_leaders(doc):
    """Count the leaders (PERSON entities) mentioned within the document.

    Each PERSON entity is widened to the left edge of its first token's
    subtree, so preceding modifiers attached to the name are included.
    Returns ``{title-cased name: count}`` ordered from most to least frequent.
    """
    leaders = Counter()

    for person in doc.ents:
        if person.label_ in ["PERSON"]:
            left = doc[person.start].left_edge.i
            # Use person.end directly: it is already the exclusive end index, and
            # looking up doc[person.end] fails when the entity ends the document.
            leaders[Span(doc, left, person.end).lower_.title()] += 1

    return dict(leaders.most_common())

def get_doc_directviolence(doc):
    """Count the direct violence terms mentioned within the document.

    Returns ``{title-cased term: count}`` for the named concepts whose
    ``_.CONCEPT`` is one of the direct violence labels, ordered from most to
    least frequent.
    """
    # NOTE(review): get_doc_directviolence_ideologies also counts "WEAPON" -
    # confirm whether leaving it out here is intended.
    direct_labels = ["BARBARY", "MILACTION", "WARFARE", "PHYSICALVIOLENCE", "POISON"]

    tally = Counter(
        concept.lower_.title()
        for concept in doc._.named_concepts
        if concept._.CONCEPT in direct_labels
    )
    return dict(tally.most_common())

def get_doc_structuralviolence(doc):
    """Count the structural violence terms mentioned within the document.

    Returns ``{title-cased term: count}`` for the named concepts whose
    ``_.CONCEPT`` is one of the structural violence labels, ordered from most
    to least frequent.
    """
    # NOTE(review): get_doc_structuralviolence_ideologies also counts "VICTIM" -
    # confirm whether leaving it out here is intended.
    structural_labels = ["UNEQUITABLE", "SOCIALINJUSTICE"]

    tally = Counter(
        concept.lower_.title()
        for concept in doc._.named_concepts
        if concept._.CONCEPT in structural_labels
    )
    return dict(tally.most_common())

def get_doc_directviolence_ideologies(doc):
    """Count the ideologies of the direct violence concepts within the document.

    Returns ``{ideology: count}`` built from the ``_.IDEOLOGY`` of every named
    concept whose ``_.CONCEPT`` is one of the direct violence labels, ordered
    from most to least frequent.
    """
    direct_labels = ["WEAPON", "BARBARY", "MILACTION", "WARFARE", "PHYSICALVIOLENCE", "POISON"]

    tally = Counter(
        concept._.IDEOLOGY
        for concept in doc._.named_concepts
        if concept._.CONCEPT in direct_labels
    )
    return dict(tally.most_common())

def get_doc_structuralviolence_ideologies(doc):
    """Count the ideologies of the structural violence concepts within the document.

    Returns ``{ideology: count}`` built from the ``_.IDEOLOGY`` of every named
    concept whose ``_.CONCEPT`` is one of the structural violence labels,
    ordered from most to least frequent.
    """
    structural_labels = ["UNEQUITABLE", "SOCIALINJUSTICE", "VICTIM"]

    tally = Counter(
        concept._.IDEOLOGY
        for concept in doc._.named_concepts
        if concept._.CONCEPT in structural_labels
    )
    return dict(tally.most_common())
    
print('complete at: ', datetime.now().strftime("%d/%m/%Y - %H:%M:%S"))

complete at:  26/02/2020 - 15:53:10
Wall time: 6.98 ms


In [165]:
%%time
### setup pipeline

# Directory holding the JSON knowledge-base files read by the pipeline components.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/data/'

# The paths are stored as Doc extension defaults; components and getters read
# them back with Doc.get_extension(<name>)[0].
Doc.set_extension("gptypology", default = os.path.join(filepath, "group_typology.json"), force = True )
Doc.set_extension("kpath", default = filepath, force = True)

class MergeConcepts:
    """Pipeline component that merges each named concept of a doc into a single token.

    The merged token takes the span's custom CONCEPT, IDEOLOGY and ATTRIBUTE
    values, the span's label as entity type, the lower-cased span text as
    lemma, and is tagged as a NOUN.
    """

    name='Merge Named Concepts'

    def __init__(self, nlp):
        pass

    def __call__(self, doc):
        """Merge every span in ``doc._.named_concepts`` and return the Doc."""
        with doc.retokenize() as retokenizer:
            for span in doc._.named_concepts:
                attrs = {
                    # custom attributes copied from the span onto the merged token;
                    # ATTRIBUTE is included so all three concept extensions stay in step
                    "_" : {"CONCEPT" : span._.CONCEPT,
                           "IDEOLOGY" : span._.IDEOLOGY,
                           "ATTRIBUTE" : span._.ATTRIBUTE},
                    "ENT_TYPE" : span.label_,
                    "LEMMA" : span.text.lower(),
                    "POS" : "NOUN",
                }
                retokenizer.merge(doc[span.start:span.end], attrs = attrs)
        return doc

from spacy.language import Language

# Strip every custom component so that re-running this cell starts again from
# the model's core pipeline (tagger, parser, ner).
for pipe in nlp.pipe_names:
    if pipe in ['tagger', "parser", "ner"]:
        continue
    nlp.remove_pipe(pipe)
        
def append_stringstore(doc):
    """Pipeline component that registers the feature phrase labels in the vocab's string store.

    Adds "OUTGROUPASSET" plus the ``label`` of every rule found in
    ``noun_phrase_patterns_v2.json`` (read from the "kpath" directory) to
    ``nlp.vocab.strings``, then returns the Doc unchanged.
    """
    patterns_path = os.path.join(Doc.get_extension("kpath")[0], "noun_phrase_patterns_v2.json")
    with open(patterns_path, 'r') as handle:
        pattern_groups = json.load(handle)

    labels = ["OUTGROUPASSET"]
    for group in pattern_groups.values():
        for rule in group:
            labels.append(rule["label"])

    for label in labels:
        nlp.vocab.strings.add(label)

    return doc

print("complete")

# pipeline before the custom components are added
print(' | '.join(nlp.pipe_names))

# register the feature phrase labels in the string store
nlp.add_pipe(append_stringstore, after = "parser")

# clean up named entities
nlp.add_pipe(EntityMatcher(nlp), before = "ner") # top up on named entities
nlp.add_pipe(merge_entities, after = "ner")

# clean up named concepts
nlp.add_pipe(ConceptMatcher(nlp), after = "merge_entities") # add concepts
#nlp.add_pipe(ConceptResolution(nlp), after = "Concept Matcher")
nlp.add_pipe(MergeConcepts(nlp), after = "Concept Matcher")

#nlp.add_pipe(EntityResolution(nlp))
# pipeline after the custom components are added
print(' | '.join(nlp.pipe_names))

# document-level summaries, computed on access through getters
Doc.set_extension("feature_phrases", getter=get_feature_phrases, force=True)
Doc.set_extension("doc_ideologies", getter=get_doc_ideologies, force=True)
Doc.set_extension("doc_ingroupassets", getter=get_doc_ingroupassets, force=True)
Doc.set_extension("doc_outgroupassets", getter=get_doc_outgroupassets, force=True)
Doc.set_extension("doc_ingroup", getter=get_doc_ingroup, force=True)
Doc.set_extension("doc_outgroup", getter=get_doc_outgroup, force=True)
Doc.set_extension("doc_outgroup_ideologies", getter=get_doc_outgroup_ideologies, force=True)
Doc.set_extension("doc_ingroup_ideologies", getter=get_doc_ingroup_ideologies, force=True)
Doc.set_extension("doc_leaders", getter=get_doc_leaders, force=True)
Doc.set_extension("doc_directviolence", getter=get_doc_directviolence, force=True)
Doc.set_extension("doc_structuralviolence", getter=get_doc_structuralviolence, force=True)
Doc.set_extension("doc_directviolence_ideologies", getter=get_doc_directviolence_ideologies, force=True)
Doc.set_extension("doc_structuralviolence_ideologies", getter=get_doc_structuralviolence_ideologies, force=True)

Language.factories["Entity Matcher"] = lambda nlp, **cfg: EntityMatcher(nlp, **cfg)
Language.factories["Concept Matcher"] = lambda nlp, **cfg: ConceptMatcher(nlp, **cfg)
Language.factories["Entity Resolution"] = lambda nlp, **cfg: EntityResolution(nlp, **cfg)

# Factories are looked up by pipe name. The entity matcher is added to the
# pipeline as "Named Entity Matcher" (its `name`), so register it under that
# name too, and give the remaining custom pipes a factory as well. The original
# "Entity Matcher" key is kept for backward compatibility.
Language.factories[EntityMatcher.name] = lambda nlp, **cfg: EntityMatcher(nlp, **cfg)
Language.factories[MergeConcepts.name] = lambda nlp, **cfg: MergeConcepts(nlp, **cfg)
Language.factories[append_stringstore.__name__] = lambda nlp, **cfg: append_stringstore

print('complete at: ', datetime.now().strftime("%d/%m/%Y - %H:%M:%S"))

complete
tagger | parser | ner
tagger | parser | append_stringstore | Named Entity Matcher | ner | merge_entities | Concept Matcher | Merge Named Concepts
complete at:  26/02/2020 - 15:53:10
Wall time: 187 ms


In [166]:
%%time

# Load the per-orator file lists from the knowledge-base directory ("kpath").
with open(os.path.join(Doc.get_extension("kpath")[0], "bush_filelist.json"), 'r') as fp:
    bush_filelist = json.load(fp)
    
with open(os.path.join(Doc.get_extension("kpath")[0], "binladen_filelist.json"), 'r') as fp:
    binladen_filelist = json.load(fp)
    
with open(os.path.join(Doc.get_extension("kpath")[0], "lutherking_filelist.json"), 'r') as fp:
    lutherking_filelist = json.load(fp)
    
# Root directory of the speeches. This rebinds the notebook-level `filepath`;
# the "kpath" extension default was set from the earlier value and is unaffected.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
filepath = 'C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/KnowledgeBases/Speeches/'

# one sub-directory per orator
binladenpath = os.path.join(filepath, 'Osama bin Laden/')
bushpath = os.path.join(filepath, 'George Bush/')
lutherkingpath = os.path.join(filepath, 'Martin Luther King/')


class Orator:

    """An orator and the collection of their processed speeches.

    Speeches are added with ``add_text`` and kept in two parallel dictionaries
    that share the same integer index, in date order:

    - ``speeches_list[i]`` is a ``(date, filename, title)`` tuple
    - ``speeches_nlp[i]`` is the matching ``Text`` object; its spaCy doc is
      ``speeches_nlp[i].text_nlp`` and custom extensions are read through
      ``speeches_nlp[i].text_nlp._.<extension>``

    Iterating an Orator yields its ``Text`` objects in index order, ``len``
    gives the number of speeches, and calling the instance displays every
    ``orator_*`` summary table in turn.
    """

    def __init__(self, firstname = '', lastname = '', groupcategory = '', filepath = '',
                 ideology = None, named_concept_rules = None, entities = None):
        """Create an orator with no speeches.

        Parameters
        ----------
        firstname, lastname : str
            Joined with a space to form ``fullname``.
        groupcategory : str
            Group label copied on to each ``Text`` as ``oratorgroup``.
        filepath : str
            Folder prefix that ``add_text`` prepends to each filename.
        ideology, named_concept_rules, entities :
            Accepted for interface compatibility but not used; the concept
            rules and entity names are always loaded from disk.
        """

        self.firstname = firstname
        self.lastname = lastname
        self.fullname = firstname + ' ' + lastname
        self.groupcategory = groupcategory
        self.filepath = filepath
        # [0] of the extension tuple is the default value registered for it
        self.kpath = Doc.get_extension("kpath")[0]

        ## look up table from the typology
        with open(Doc.get_extension("gptypology")[0], 'r') as fp:
            self.named_concept_rules = json.load(fp)

        # one zeroed counter per top-level key of the typology
        self.ideology_dict = {key : 0 for key in self.named_concept_rules.keys()}

        ## named entity categories are the keys of the corrections file
        with open(os.path.join(self.kpath, "named_entity_corrections.json"), 'r') as fp:
            self.entities = list(json.load(fp).keys())

        self.entity_dict = {entity : set() for entity in self.entities}

        ## per-speech result tables, keyed by str(datestamp); these are
        ## (re)filled each time the matching orator_* method is called
        self.outgroup_ideology_dict = dict()
        self.outgroups_dict = dict()
        self.leaders_dict = dict()
        self.orator_ideology_dict = dict()
        self.orator_directviolence_dict = dict()
        self.orator_structuralviolence_dict = dict()
        self.orator_directviolence_ideology_dict = dict()
        self.orator_structuralviolence_ideology_dict = dict()
        # holds references to the two dicts above, so it always reflects them
        self.violence_ideology_dict = {"Direct Violence" : self.orator_directviolence_ideology_dict,
                                       "Structural Violence" : self.orator_structuralviolence_ideology_dict}

        # {index : (date, filename, title)}
        self.speeches_list = {}

        # {index : Text}
        self.speeches_nlp = {}

    def __str__(self):
        return f'{self.fullname}'

    def __len__(self):
        return len(self.speeches_nlp)

    def __iter__(self):
        # iterate over a snapshot so callers may add texts while looping
        return iter(list(self.speeches_nlp.values()))

    def __contains__(self, other):
        """Return True when ``other`` matches an already stored text.

        Two texts are treated as the same when spaCy's document similarity
        is exactly 1.0.
        """
        return any(other.text_nlp.similarity(known.text_nlp) == 1.0 for known in self)

    def _collect_by_date(self, target, extension):
        """Store ``doc._.<extension>`` of every speech in ``target`` under str(datestamp)."""
        for document in self:
            target[str(document.datestamp)] = getattr(document.text_nlp._, extension)

    def display_table(self, table, transpose = None, heatmap = False):

        """Display a dictionary of rows as a DataFrame.

        ``table`` maps a row label to a dict of column values; missing cells
        are shown as "0". ``transpose`` swaps rows and columns. ``heatmap``
        shades the numeric cells with the "Reds" colour map.
        """

        frame = pd.DataFrame.from_dict(table, orient = 'index').fillna("0")

        if transpose:
            frame = frame.T

        if heatmap:
            # the gradient has to be applied to this frame's own Styler
            display(frame.style.background_gradient(cmap = "Reds"))
        else:
            display(frame)

    def orator_violence_ideologies(self):

        """Display the direct and structural violence ideologies of every speech,
        first as two separate tables and then combined in one."""

        print(f'the following table are summaries of the direct and structural violence ideologies for {self.fullname}')

        self._collect_by_date(self.orator_directviolence_ideology_dict, "doc_directviolence_ideologies")
        self.display_table(self.orator_directviolence_ideology_dict)

        self._collect_by_date(self.orator_structuralviolence_ideology_dict, "doc_structuralviolence_ideologies")
        self.display_table(self.orator_structuralviolence_ideology_dict)

        self.display_table(self.violence_ideology_dict)

    def orator_directviolence(self):

        """Display the direct violence mentions of every speech, one column per speech."""

        print(f'the following table is a summary of references made by {self.fullname} to direct violence')

        self._collect_by_date(self.orator_directviolence_dict, "doc_directviolence")
        self.display_table(self.orator_directviolence_dict, transpose = True)

    def orator_structuralviolence(self):

        """Display the structural violence mentions of every speech, one column per speech."""

        print(f'the following table is a summary of references made by {self.fullname} to structural violence')

        self._collect_by_date(self.orator_structuralviolence_dict, "doc_structuralviolence")
        self.display_table(self.orator_structuralviolence_dict, transpose = True)

    def orator_ideologies(self):

        """Display the ideology summary of every speech, one row per speech."""

        print(f'the following table summarises ideologies used by {self.fullname}')

        self._collect_by_date(self.orator_ideology_dict, "doc_ideologies")
        self.display_table(self.orator_ideology_dict)

    def orator_outgroup_ideologies(self):

        """Display the outgroup ideology summary of every speech, one row per speech."""

        print(f'the following table summarises ideologies used by {self.fullname} in relation to their outgroup')

        self._collect_by_date(self.outgroup_ideology_dict, "doc_outgroup_ideologies")
        self.display_table(self.outgroup_ideology_dict)

    def orator_outgroups(self):

        """Display the outgroup terms of every speech, one column per speech."""

        print(f'the following table summarises {self.fullname}\'s mentions of an outgroup')

        self._collect_by_date(self.outgroups_dict, "doc_outgroup")
        self.display_table(self.outgroups_dict, transpose = True)

    def orator_leaders(self):

        """Display the leaders referred to in every speech, one column per speech."""

        print(f'the following table summarises the leaders referred to by {self.fullname}')

        self._collect_by_date(self.leaders_dict, "doc_leaders")
        self.display_table(self.leaders_dict, transpose = True)

    def display_entities(self):

        """Display ``entity_dict`` as a table with one column per entity category."""

        display(pd.DataFrame.from_dict(self.entity_dict, orient='index').T.replace(np.nan, '', regex=True))

    def add_text(self, file):

        """Process one speech and add it to the collection unless it is already present.

        ``file`` is a sequence ``(datestamp, filename, title)`` where
        ``datestamp`` is a ``YYYYMMDD`` string and ``filename`` is relative to
        ``self.filepath``. After the call both ``speeches_list`` and
        ``speeches_nlp`` are re-indexed 0..n-1 in date order.
        """
        new_text = Text(orator = self, filepath = self.filepath + file[1], title = file[2], datestamp = file[0])

        ## the new text goes at the end; the re-index below puts it in date order
        index = len(self.speeches_nlp)

        if new_text not in self:
            self.speeches_list[index] = (date(int(file[0][0:4]), int(file[0][4:6]), int(file[0][6:9])), file[1], file[2])
            self.speeches_nlp[index] = new_text
            print('adding: ', f'{self.speeches_nlp[index]}'.ljust(100), 'to ', self.lastname)
        else:
            print(f'text already added: {new_text}')

        ## sort both dictionaries in to date order and re-index.
        ## Both sorts use the date alone and are stable, so speeches that share
        ## a date keep their insertion order in both dictionaries and index i
        ## always refers to the same speech in each.
        indexes = range(len(self))

        texts_by_date = sorted(self.speeches_nlp.values(), key = lambda text: text.datestamp)
        self.speeches_nlp = dict(zip(indexes, texts_by_date))

        entries_by_date = sorted(self.speeches_list.values(), key = lambda entry: entry[0])
        self.speeches_list = dict(zip(indexes, entries_by_date))

    def orator_index(self):

        """Display the reference string of every speech in index (date) order."""

        references = {k : v.reference for (k, v) in self.speeches_nlp.items()}

        display(pd.DataFrame.from_dict(references, orient = 'index', columns = ['reference'])
                .style.set_properties(**{'text-align': 'left'})
                .set_table_styles([dict(selector='th', props=[('text-align', 'left')])]))

    def create_json_file(self):

        """Write ``<lastname>_jsonlist.json`` to the ``kpath`` folder.

        The file holds one record per speech with the orator's name, the date
        (``YYYY/MM/DD``), the title and the token count of the spaCy doc.
        """
        json_list = [
            {
                "orator" : self.fullname,
                "date" : v.datestamp.strftime("%Y/%m/%d"),
                "title" : v.title,
                "wordcount" : len(v.text_nlp)
            }
            for v in self.speeches_nlp.values()
        ]

        filepath = os.path.join(Doc.get_extension("kpath")[0], str(self.lastname + "_jsonlist.json"))

        with open(filepath, "wb") as f:
            f.write(json.dumps(json_list).encode("utf-8"))

    def __call__(self):
        """Display every summary table for this orator."""

        self.orator_violence_ideologies()

        self.orator_directviolence()

        self.orator_structuralviolence()

        self.orator_ideologies()

        self.orator_outgroup_ideologies()

        self.orator_outgroups()

        self.orator_leaders()

        
class Text:

    """A single speech: its raw text, metadata and processed spaCy doc."""

    def __init__(self, orator = None, filepath = '', title = '', datestamp = '', type = ''):
        """Read the file at ``filepath`` and run it through the ``nlp`` pipeline.

        Parameters
        ----------
        orator : Orator, optional
            Supplies ``groupcategory`` and ``lastname``. When omitted a blank
            ``Orator()`` is created at call time, so defining this class does
            not itself trigger the file reads done by ``Orator.__init__``.
        filepath : str
            Path of the text file to read.
        title : str
            Title of the speech.
        datestamp : str
            Date as ``YYYYMMDD``; stored as a ``datetime.date``.
        type : str
            Free-form label stored as ``self.type``.
        """

        if orator is None:
            orator = Orator()

        # NOTE(review): no explicit encoding, so the platform default is used
        with open(filepath, 'r') as document:
            self.text = document.read()

        self.type = type
        self.datestamp = date(int(datestamp[0:4]), int(datestamp[4:6]), int(datestamp[6:9]))
        self.oratorgroup = orator.groupcategory
        self.title = title
        self.filepath = filepath
        self.text_nlp = nlp(self.text)
        self.reference = f'{orator.lastname} ({self.datestamp}) {self.title}'

    def __call__(self):
        """Print a heading and display a table for each document-level extension."""

        # (heading, Doc extension name) in display order
        sections = [
            ('ideologies', 'doc_ideologies'),
            ('ingroup assets', 'doc_ingroupassets'),
            ('outgroup assets', 'doc_outgroupassets'),
            ('ingroup', 'doc_ingroup'),
            ('outgroup', 'doc_outgroup'),
            ('outgroup ideologies', 'doc_outgroup_ideologies'),
            ('ingroup ideologies', 'doc_ingroup_ideologies'),
            ('leaders', 'doc_leaders'),
            ('direct violence mentions', 'doc_directviolence'),
            ('structural violence mentions', 'doc_structuralviolence'),
            ('direct violences ideologies', 'doc_directviolence_ideologies'),
            ('structural violence ideologies', 'doc_structuralviolence_ideologies'),
        ]

        for heading, extension in sections:
            print(heading)
            display(pd.DataFrame.from_dict(getattr(self.text_nlp._, extension), orient = 'index').T)

    def __str__(self):
        return f'{self.reference}'

    def __len__(self):
        return len(self.text_nlp)

#     def __contains__(self, other):
#         if other.text_nlp.similarity(self.text_nlp) == 1.0:
#             return True
#         return False

    def display_IDs(self, ID):

        """Display the table stored in the Doc extension named ``ID``; options are
        - countryIDs
        - nationalityIDs
        - peopleIDs
        - orgIDs

        Prints a notice instead when the extension is missing or cannot be
        tabulated."""

        try:
            # look the extension up by name; passing the name as a string to
            # from_dict could never succeed
            display(pd.DataFrame.from_dict(getattr(self.text_nlp._, ID), orient="index", columns = ['entity']))
        except Exception:
            print(ID, 'is not available')

        
# Build each orator and process their speeches; every entry of a file list is
# handed to Orator.add_text, which reports the text it adds.
bush = Orator(
    firstname = 'George',
    lastname = 'Bush',
    groupcategory = 'American',
    filepath = bushpath,
)
#bush.add_text(bush_filelist[3])
for speech_entry in bush_filelist:
    bush.add_text(speech_entry)

binladen = Orator(
    firstname = 'Osama',
    lastname = 'bin Laden',
    groupcategory = 'Muslim',
    filepath = binladenpath,
)
for speech_entry in binladen_filelist:
    binladen.add_text(speech_entry)

lutherking = Orator(
    firstname = 'Martin',
    lastname = 'Luther King',
    groupcategory = 'American',
    filepath = lutherkingpath,
)
for speech_entry in lutherking_filelist:
    lutherking.add_text(speech_entry)

finished_at = datetime.now().strftime("%d/%m/%Y - %H:%M:%S")
print('complete at: ', finished_at)

adding:  Bush (2001-09-14) Remarks at the National Day of Prayer & Remembrance Service                        to  Bush
adding:  Bush (2001-09-15) First Radio Address following 911                                                  to  Bush
adding:  Bush (2001-09-17) Address at Islamic Center of Washington, D.C.                                      to  Bush
adding:  Bush (2001-09-20) Address to Joint Session of Congress Following 911 Attacks                         to  Bush
adding:  Bush (2001-09-11) Address to the Nation                                                              to  Bush
adding:  Bush (2001-10-07) Operation Enduring Freedom in Afghanistan Address to the Nation                    to  Bush
adding:  Bush (2001-10-11) 911 Pentagon Remembrance Address                                                   to  Bush
adding:  Bush (2001-10-11) Prime Time News Conference on War on Terror                                        to  Bush
adding:  Bush (2001-10-26) Address on Signing th

In [193]:
import pandas as pd

def get_ideologies(doc):
    """Return the share of each ideology among the document's named concepts.

    The result has one key per top-level key of the typology JSON file (path
    taken from the `gptypology` Doc extension), initialised to 0. Every
    ideology found on a named concept is then set to its count divided by the
    number of named concepts that carry an ideology. A document with no such
    concepts returns all zeros.
    """

    ## ideology labels of the named concepts, skipping concepts without one
    group = [feature._.IDEOLOGY for feature in doc._.named_concepts if feature._.IDEOLOGY]

    ## start from the typology so ideologies absent from the doc still appear
    with open(Doc.get_extension("gptypology")[0], 'r') as fp:
        ideology = {key : 0 for key in json.load(fp).keys()}

    ## turn raw counts in to proportions; the loop is skipped when group is
    ## empty, so there is no division by zero
    total = len(group)
    for label, count in Counter(group).items():
        ideology[label] = count / total

    return ideology

entity = bush

titles = {"Titles" : []}

# Ideology proportions per speech, keyed by date; titles collected alongside.
for speech in entity:
    date_key = str(speech.datestamp)
    entity.orator_ideology_dict[date_key] = get_ideologies(speech.text_nlp)
    titles["Titles"].append(speech.title)

filepath = r"C:/Users/Steve/OneDrive - University of Southampton/CulturalViolence/data/"

# Save the proportions as UTF-8 encoded JSON.
with open(os.path.join(filepath, "bushideologiesfile.json"), "wb") as f:
    f.write(json.dumps(entity.orator_ideology_dict).encode("utf-8"))

cmp = "Reds"

# One frame (ideologies as rows, dates as columns) feeds both displays.
ideology_frame = (
    pd.DataFrame
    .from_dict(entity.orator_ideology_dict, orient = 'index')
    .fillna("0")
    .T
)
df1 = ideology_frame.style.background_gradient(cmap=cmp).format("{:.0%}")
df2 = pd.DataFrame.from_dict(titles)

# Shaded percentages first, then the raw proportions.
display(df1)

display(ideology_frame)

#Orator().display_table(bush.orator_ideology_dict)

Unnamed: 0,2001-09-11,2001-09-14,2001-09-15,2001-09-17,2001-09-20,2001-10-07,2001-10-11,2001-10-26,2001-11-10,2001-12-11,2002-01-29
social,43%,44%,36%,56%,37%,38%,40%,15%,35%,51%,32%
academia,0%,0%,0%,1%,0%,0%,0%,0%,0%,0%,1%
medical,5%,9%,3%,1%,3%,2%,2%,2%,4%,3%,5%
geopolitics,12%,12%,14%,14%,21%,12%,16%,22%,20%,13%,17%
religion,7%,18%,5%,11%,5%,6%,6%,2%,5%,7%,3%
economic,9%,4%,3%,6%,4%,4%,7%,8%,4%,1%,17%
justice,14%,7%,12%,6%,16%,17%,13%,42%,21%,10%,13%
military,11%,5%,27%,6%,13%,21%,15%,9%,10%,14%,12%


Unnamed: 0,2001-09-11,2001-09-14,2001-09-15,2001-09-17,2001-09-20,2001-10-07,2001-10-11,2001-10-26,2001-11-10,2001-12-11,2002-01-29
social,0.425743,0.444444,0.363636,0.555556,0.372685,0.38125,0.401575,0.147239,0.353086,0.507246,0.322239
academia,0.0,0.0,0.0,0.013889,0.00463,0.0,0.0,0.0,0.0,0.0,0.013616
medical,0.049505,0.091503,0.025974,0.013889,0.025463,0.01875,0.023622,0.02454,0.044444,0.028986,0.045386
geopolitics,0.118812,0.124183,0.142857,0.138889,0.208333,0.11875,0.15748,0.220859,0.2,0.130435,0.170953
religion,0.069307,0.183007,0.051948,0.111111,0.050926,0.0625,0.062992,0.02454,0.049383,0.072464,0.025719
economic,0.089109,0.039216,0.025974,0.055556,0.041667,0.04375,0.070866,0.079755,0.041975,0.014493,0.16944
justice,0.138614,0.065359,0.116883,0.055556,0.162037,0.16875,0.133858,0.417178,0.207407,0.101449,0.133132
military,0.108911,0.052288,0.272727,0.055556,0.134259,0.20625,0.149606,0.08589,0.103704,0.144928,0.119516


In [40]:
orator = "bush"

# One record per speech, in index order.
json_list = [
    {
        "orator" : bush.fullname,
        "date" : speech.datestamp.strftime("%Y %m %d"),
        "title" : speech.title,
        "wordcount" : len(speech.text_nlp),
    }
    for speech in bush.speeches_nlp.values()
]

print(json_list)

# Written as UTF-8 encoded JSON next to the knowledge bases.
filepath = r"C:/Users/Steve/Documents/CulturalViolence/KnowledgeBases/" + orator + "jsonfile.json"
with open(filepath, "wb") as f:
    f.write(json.dumps(json_list).encode("utf-8"))

[{'orator': 'George Bush', 'date': '2001 09 11', 'title': 'Address to the Nation', 'wordcount': 677}, {'orator': 'George Bush', 'date': '2001 09 14', 'title': 'Remarks at the National Day of Prayer & Remembrance Service', 'wordcount': 1066}, {'orator': 'George Bush', 'date': '2001 09 15', 'title': 'First Radio Address following 911', 'wordcount': 519}, {'orator': 'George Bush', 'date': '2001 09 17', 'title': 'Address at Islamic Center of Washington, D.C.', 'wordcount': 561}, {'orator': 'George Bush', 'date': '2001 09 20', 'title': 'Address to Joint Session of Congress Following 911 Attacks', 'wordcount': 3415}, {'orator': 'George Bush', 'date': '2001 10 07', 'title': 'Operation Enduring Freedom in Afghanistan Address to the Nation', 'wordcount': 1095}, {'orator': 'George Bush', 'date': '2001 10 11', 'title': '911 Pentagon Remembrance Address', 'wordcount': 1435}, {'orator': 'George Bush', 'date': '2001 10 11', 'title': 'Prime Time News Conference on War on Terror', 'wordcount': 827}, {

In [140]:
test_doc = bush.speeches_nlp[4].text_nlp

# Print entity type, concept, ideology and POS for the first 101 tokens.
for position, tok in enumerate(test_doc):
    if position > 100:
        break
    print(tok.i, ':', str(tok).strip(),
          '=>', tok.ent_type_,
          '=>', tok._.CONCEPT,
          '=>', tok._.IDEOLOGY,
          '=>', tok.pos_)

In [58]:
# Calling an Orator runs each of its orator_* summaries in turn and displays
# the resulting tables (see Orator.__call__).
binladen()

the following table are summaries of the direct and structural violence ideologies for Osama bin Laden


Unnamed: 0,military,justice,medical
1996-08-23,48,3,2
2001-10-07,6,3,0
2001-11-09,3,4,0
2002-11-24,61,5,1
2004-11-01,24,7,4


Unnamed: 0,justice
1996-08-23,3
2001-10-07,3
2001-11-09,3
2002-11-24,3
2004-11-01,1


Unnamed: 0,1996-08-23,2001-10-07,2001-11-09,2002-11-24,2004-11-01
Direct Violence,"{'military': 48, 'justice': 3, 'medical': 2}","{'military': 6, 'justice': 3}","{'justice': 4, 'military': 3}","{'military': 61, 'justice': 5, 'medical': 1}","{'military': 24, 'justice': 7, 'medical': 4}"
Structural Violence,{'justice': 3},{'justice': 3},{'justice': 3},{'justice': 3},{'justice': 1}


the following table is a summary of references made by Osama bin Laden to direct violence


Unnamed: 0,1996-08-23,2001-11-09,2002-11-24,2004-11-01,2001-10-07
Fighting,9,1,3,1,0
War,6,0,4,8,0
Fight,6,0,11,3,2
Aggression,5,0,4,0,0
Expel,4,0,0,0,0
Massacres,3,0,0,0,0
Aggressions,2,0,0,0,0
Killing,2,0,1,1,0
Explosion,2,0,0,0,0
Fought,2,0,1,0,0


the following table is a summary of references made by Osama bin Laden to structural violence


the following table summarises modes of ideology used by Osama bin Laden


Unnamed: 0,social,academia,medical,geopolitics,religion,economic,justice,military
1996-08-23,266,13,21,159,125,79,85,168
2001-10-07,31,0,4,11,24,1,14,10
2001-11-09,29,0,3,10,17,1,10,8
2002-11-24,128,0,21,87,57,45,50,85
2004-11-01,80,0,11,41,11,27,25,36


the following table summarises modes of ideology used by Osama bin Laden in relation to their outgroup


Unnamed: 0,military,geopolitics,religion,social,economic,justice,academia,medical
1996-08-23,42,19,13,1,1,1,0,0
2001-10-07,1,0,3,2,0,1,0,0
2001-11-09,1,0,6,2,0,1,0,0
2002-11-24,1,0,1,1,0,1,0,0
2004-11-01,4,2,0,0,0,1,0,0


the following table summarises Osama bin Laden's mentions of an outgroup


Unnamed: 0,1996-08-23,2002-11-24,2001-10-07,2001-11-09,2004-11-01
Enemy,23,1,0,0,0
Regime,19,0,0,0,0
Kufr,8,0,0,0,0
Crusaders,7,0,0,0,0
Crusader,3,0,0,0,0
Enemies,2,0,0,0,0
Infidels,2,0,2,5,0
Invaders,2,0,0,0,0
Aggressors,2,0,0,0,0
Oppressor,1,0,1,1,2


the following table summarises the leaders referred to by Osama bin Laden


Unnamed: 0,1996-08-23,2001-11-09,2002-11-24,2001-10-07,2004-11-01
Muhammad,1,1,3,0,0
Mohammed,0,0,0,3,0
Bush,0,1,1,1,12
Sharon,0,0,2,0,0
Satan,0,0,1,0,0
All The Prophets Of,0,0,1,0,0
Benjamin Franklin,0,0,1,0,0
Your President Clinton,0,0,1,0,0
The Nation Of Monotheism,0,0,1,0,0
This Crusade,0,0,1,0,0


In [None]:
from spacy import displacy

test_doc = bush.speeches_nlp[4].text_nlp

# For every in-/out-group concept: print it, print its sentence and render
# that sentence's dependency parse.
group_features = ("ingroup", "outgroup")

for concept in test_doc._.named_concepts:
    if concept._.FEATURE not in group_features:
        continue

    sentence = concept.sent
    print(concept, '=>', concept._.CONCEPT, '=>', concept._.FEATURE)
    print(concept, '=>', str(sentence).strip())
    displacy.render(sentence, style = "dep")
    print('-----')
        

In [93]:
# List the noun chunks of the test document, one per line.
for chunk in test_doc.noun_chunks:
    print(chunk)

Mr. Speaker
Mr. President Pro Tempore
members
Congress
fellow Americans
the normal course
events
Presidents
this chamber
the state
the Union
no such report
It
the American people
We
it
the courage
passengers
who
terrorists
others
the ground
passengers
an exceptional man
you
me
his wife
Lisa Beamer
We
the state
our Union
the endurance
rescuers
exhaustion
We
the unfurling
flags
the lighting
candles
the giving
blood
the saying
prayers
English
Hebrew
Arabic
We
the decency
a loving
people
who
the grief
strangers
My fellow citizens
the last nine days
the entire world
itself
the state
our Union
it
we
a country
danger
freedom
Our grief
anger
anger
resolution
we
our enemies
justice
justice
our enemies
justice
I
the Congress
its leadership
such an important time
America
the evening
the tragedy
Republicans
Democrats
the steps
this Capitol
"God Bless America
you
you
40 billion dollars
our communities
the needs
our military
Minority Leader Gephardt
Majority Leader Daschle
Senator Lott
I
you
your fr

In [None]:
# Scratch sentences for inspecting dependency parses. Only one is active at a
# time; the alternatives stay commented out so the pipeline is not run on a
# sentence whose result would be overwritten straight away.
#w = nlp("enemies of human freedom have commmited an act of war using weapons of mass dectruction")
#w = nlp("The evidence we have gathered all points to a collection of loosely affiliated terrorist organizations known as al Qaeda")
w = nlp("President and Senator Clinton, thank you all for being here")
displacy.render(w, style = 'dep')

# token text alongside its dependency label
for x in w:
    print(x.text.ljust(15), '=>', x.dep_.ljust(15))

In [131]:
# One line per named concept: character offset, text, feature, concept, ideology.
for concept_span in bush.speeches_nlp[4].text_nlp._.named_concepts:
    labels = concept_span._
    print(concept_span.start_char, ':', concept_span.text.ljust(30),
          '=>', labels.FEATURE.ljust(15),
          '=>', labels.CONCEPT.ljust(15),
          '=>', labels.IDEOLOGY.ljust(15))

0 : Mr. Speaker                    => hierarchy       => GPETITLE        => geopolitics    
13 : Mr. President                  => hierarchy       => GPETITLE        => geopolitics    
51 : Congress                       => identity        => GPEGROUP        => geopolitics    
65 : fellow Americans               => identity        => INGROUP         => social         
115 : Presidents                     => hierarchy       => GPETITLE        => geopolitics    
177 : Union                          => identity        => GPEGROUP        => geopolitics    
256 : American people                => identity        => SOCGROUP        => social         
296 : courage                        => trade           => BENEVOLANCE     => social         
307 : passengers                     => identity        => ECONGROUP       => economic       
330 : terrorists                     => identity        => CRIMEGROUP      => justice        
363 : ground                         => entity          => LOCATI

In [None]:
#def survey(results, category_names, plottitle):
#    """
#    Parameters
#    ----------
#    results : dict
#        A mapping from question labels to a list of answers per category.
#        It is assumed all lists contain the same number of entries and that
#        it matches the length of *category_names*.
#    category_names : list of str
#        The category labels.
#    """
#    labels = [*results.keys()]
#    data = np.array([*results.values()])
#    data_cum = data.cumsum(axis=1)
#    category_colors = plt.get_cmap('RdYlGn')(
#        np.linspace(0.15, 0.85, data.shape[1]))

#    fig, ax = plt.subplots(figsize=(15, len(labels)/2))
#    ax.invert_yaxis()
#    ax.xaxis.set_visible(False)
#    ax.set_xlim(0, np.sum(data, axis=1).max())

#    for i, (colname, color) in enumerate(zip(category_names, category_colors)):
#        widths = data[:, i]
#        starts = data_cum[:, i] - widths
#        ax.barh(labels, widths, left=starts, height=0.8,
#                label=colname, color=color)
#        xcenters = starts + widths / 2

#        r, g, b, _ = color
#        text_color = 'white' if r * g * b < 0.5 else 'darkgrey'
#        for y, (x, c) in enumerate(zip(xcenters, widths)):
#            ax.text(x, y, str(int(c)), ha='center', va='center',
#                    color=text_color)
#    ax.legend(ncol=len(category_names), bbox_to_anchor=(0, 1),
#              loc='lower left', fontsize='large')
    
#    if plottitle == "NORP":
#        plt.title("Nationalities or Religious or Political Groups (NORP)", loc = 'right')
#    if plottitle == "ORG":
#        plt.title("Organisation (ORG)", loc = 'right')
#    if plottitle == "PERSON":
#        plt.title("Person (PERSON))", loc = 'right')
        
#    plt.savefig(f'C:/Users/Steve/Documents/Cultural Violence/George Bush/{plottitle}.png', bbox_inches='tight')

#    return fig, ax
    
#def get_results(docs = None, keyvalue = ''):
    
#    if docs is not None:
#        dic1 = docs[0]._.namedgroups[keyvalue]
#        dic2 = docs[1]._.namedgroups[keyvalue]    
#        dic3 = dict(dic2)

#        for k, v in dic1.items():
#            dic3[k] = [v, dic3[k]] if k in dic3 else [v, 0]

#        for k,v in dic2.items():
#            if k not in dic1:
#                dic3[k]  = [0, v] 
                
#    return dic3

#category_names = [bush.fullname, binladen.fullname]

#for group in ["NORP", "ORG", "PERSON"]:
#    survey(get_results(docs = [bush.speeches_nlp[4].text_nlp, binladen.speeches_nlp[0].text_nlp], keyvalue = group), category_names, group)

In [None]:
import wikipediaapi
import pandas as pd

# NOTE(review): looks like a leftover debug marker - confirm it can be removed
print("argh")


def get_wikisummary(token):
    """Look up ``token`` on English Wikipedia.

    Returns a ``(title, first_sentence)`` tuple, where ``first_sentence`` is
    the first sentence of the page summary with runs of whitespace collapsed
    to single spaces. When the page exists but its summary yields no
    sentences the second element is ``''``. When no page exists both elements
    are ``'no wiki reference'``.
    """

    wiki_wiki = wikipediaapi.Wikipedia('en')
    page_py = wiki_wiki.page(token)

    if not page_py.exists():
        return ('no wiki reference', 'no wiki reference')

    summary_doc = nlp(page_py.summary, disable = ['tokenizer', 'ner'])

    # default to '' so an empty summary does not raise StopIteration
    first_sentence = next(iter(summary_doc.sents), '')

    return (page_py.title, " ".join(str(first_sentence).split()))


# Interactive review of the named entities spaCy found in bin Laden's speeches.
# Each not-yet-seen entity token is shown with its sentence and a Wikipedia
# summary; the reviewer confirms the entity type or types in a correction.
# Progress is written to disk as the loop runs so a session can be resumed.

# Answering 'n' resumes: corrections come from a file the user names and the
# set of already reviewed tokens from seen_tokens.json. Any other answer
# starts with both empty.
if input("Restart from fresh (y/n): ").lower() == 'n':
    filename = input('existing filename: ')

    with open("".join(["C:/Users/Steve/Documents/Cultural Violence/Knowledge Bases/", filename]), 'r') as fp:
        corrections_dict = json.load(fp)
        
    with open(r"C:\Users\Steve\Documents\Cultural Violence\Knowledge Bases\seen_tokens.json", 'r') as fp:
        seen_tokens = {key for key in json.load(fp)}

else:
    corrections_dict = dict()
    seen_tokens = set()

# `doc` is not used in the body; the speech is looked up again by index i.
for i, doc in enumerate(binladen):

    for token in binladen.speeches_nlp[i].text_nlp:
        entries_dict = dict()

        # Review only tokens that carry an entity type, excluding the listed
        # types, and whose text has not been reviewed before.
        if token.ent_type_ and \
        token.ent_type_ not in ['ORATOR', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'] and \
        token.text not in seen_tokens:

            seen_tokens.add(token.text)
            
            # The seen-token file is rewritten in full after every new token;
            # the set is stored as the keys of a JSON object.
            with open(r"C:\Users\Steve\Documents\Cultural Violence\Knowledge Bases\seen_tokens.json", "wb") as f:
                    f.write(json.dumps(dict.fromkeys(seen_tokens)).encode("utf-8"))

            # Show the token's entity type, wiki title and summary, plus the
            # sentence it came from (rendered with entity highlighting).
            wikientry = get_wikisummary(token.text)
            entries_dict[token.text] = [token.ent_type_, wikientry[0], wikientry[1]]
            entries_dict['sentence'] = ['', '', token.sent]
            displacy.render(token.sent, style = 'ent')
            pd.set_option('display.max_colwidth', -1)
            display(pd.DataFrame.from_dict(entries_dict, orient='index', columns = ['ent_type_', 'wiki_title', 'summary'])
                .style.set_properties(**{'text-align': 'left'})
                .set_table_styles([dict(selector='th', props=[('text-align', 'left')])]))

            # 'n' means the entity type is wrong: record the original type,
            # the wiki lookup and the type the reviewer enters.
            if input('correct y/n ').lower() == 'n':
                corrections_dict[token.text] = {
                    'original ent_type_' : token.ent_type_, 
                    'wiki_title': wikientry[0], 
                    'wiki_summary' : wikientry[1],
                    'correction' : input('correct type')
                }

                ### check wiki entry and correct with manual entry if required
                
                # Keep showing the record until the reviewer accepts it; each
                # 'n' replaces title, summary and type with typed-in values
                # (the prompts appear in the order written in the dict).
                answer = 'n'
                while answer == 'n':
                    display(pd.DataFrame.from_dict(corrections_dict[token.text], orient = "index"))
                    
                    answer = input('correct wiki entry? (y/n)').lower()
                    
                    if answer != 'n':
                        break
                                
                    corrections_dict[token.text] = {
                        'original ent_type_' : token.ent_type_, 
                        'wiki_title': input("wiki_title: "), 
                        'wiki_summary' : input("wiki_summary: "),
                        'correction' : input("correct type: ")
                    }
                    
                # Corrections are always saved to binladen_entitycorrections.json,
                # regardless of which file was loaded when resuming.
                with open(r"C:\Users\Steve\Documents\Cultural Violence\Knowledge Bases\binladen_entitycorrections.json", "wb") as f:
                    f.write(json.dumps(corrections_dict).encode("utf-8"))

print('complete')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cbook import get_sample_data


# Locate the CSV in matplotlib's sample data; asfileobj=False asks for a
# path rather than an open file object.
fname = get_sample_data('percent_bachelors_degrees_women_usa.csv',
                        asfileobj=False)

# Load the CSV as a structured array keyed by its header names. The plotting
# code further down looks columns up by name ('Year' plus one field per
# major), so the named-field form produced by names=True is required.
# The statement that used to be here was a half-finished edit (an unclosed
# np.array(...) call followed by a dangling "= np.genfromtxt(...)") and made
# the whole cell a SyntaxError.
gender_degree_data = np.genfromtxt(fname, delimiter=',', names=True)

# A plot is usually ~1.33x wider than tall -- (10, 7.5) or (12, 9) are common
# sizes. This one is deliberately taller because of the number of lines
# drawn on it.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 14))

# Colour cycle used for the lines, in the order they are plotted.
ax.set_prop_cycle(color=[
    '#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78',
    '#2ca02c', '#98df8a', '#d62728', '#ff9896',
    '#9467bd', '#c5b0d5', '#8c564b', '#c49c94',
    '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7',
    '#bcbd22', '#dbdb8d', '#17becf', '#9edae5',
])

# Hide all four frame lines; they add nothing to this plot.
for side in ('top', 'bottom', 'right', 'left'):
    ax.spines[side].set_visible(False)

# Keep ticks on the bottom and left only.
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()

# Leave room on the right-hand side for the per-line text labels.
fig.subplots_adjust(left=0.06, right=0.75, bottom=0.02, top=0.94)

# Clip the view to the extent of the data to avoid wasted whitespace.
ax.set_xlim(1969.5, 2011.1)
ax.set_ylim(-0.25, 90)

# Fixed tick positions: every ten years on x, every ten percent on y.
ax.set_xticks(range(1970, 2011, 10))
ax.set_yticks(range(0, 91, 10))

# Whole-number tick labels; the y axis additionally gets a percent sign.
for axis, template in ((ax.xaxis, '{:.0f}'), (ax.yaxis, '{:.0f}%')):
    axis.set_major_formatter(plt.FuncFormatter(template.format))

# Faint dashed horizontal guide lines so values can be traced across the
# plot without obscuring the data lines.
ax.grid(True, which='major', axis='y',
        linestyle='--', linewidth=0.5, color='k', alpha=0.3)

# The guide lines make tick marks redundant, so switch the marks off but
# keep the bottom/left labels, at a size that is easy to read.
ax.tick_params(
    axis='both', which='both', labelsize=14,
    bottom=False, top=False, left=False, right=False,
    labelbottom=True, labelleft=True,
)

# With the axes prepared, draw the data. The majors are listed in order of
# the highest percentage in the final year.
majors = [
    'Health Professions', 'Public Administration', 'Education',
    'Psychology', 'Foreign Languages', 'English',
    'Communications\nand Journalism', 'Art and Performance', 'Biology',
    'Agriculture', 'Social Sciences and History', 'Business',
    'Math and Statistics', 'Architecture', 'Physical Sciences',
    'Computer Science', 'Engineering',
]

# Hand-tuned vertical nudges for labels that would otherwise overlap.
y_offsets = {
    'Foreign Languages': 0.5,
    'English': -0.5,
    'Communications\nand Journalism': 0.75,
    'Art and Performance': -0.25,
    'Agriculture': 1.25,
    'Social Sciences and History': 0.25,
    'Business': -0.75,
    'Math and Statistics': 0.75,
    'Architecture': -0.75,
    'Computer Science': 0.75,
    'Engineering': -0.25,
}

# Data fields are looked up with spaces and line breaks replaced by
# underscores.
to_field_name = str.maketrans({'\n': '_', ' ': '_'})

for major in majors:
    field_name = major.translate(to_field_name)

    # One line per major, each taking the next colour from the cycle.
    [plotted] = ax.plot('Year', field_name, data=gender_degree_data,
                        lw=2.5)

    # Put a text label at the right-hand end of the line: start just below
    # the last data value, then apply the per-major nudge if there is one.
    label_y = gender_degree_data[field_name][-1] - 0.5 + y_offsets.get(major, 0)

    # Labels share the line's colour and are large enough to read easily.
    ax.text(2011.5, label_y, major, fontsize=14, color=plotted.get_color())

# The title should span the plot without wrapping onto a second line. It is
# descriptive enough that axis labels are unnecessary here.
fig.suptitle('Percentage of Bachelor\'s degrees conferred to women in '
             'the U.S.A. by major (1970-2011)\n', fontsize=18, ha='center')

# To save the figure, uncomment the call below; change the file extension
# to write a PDF, JPEG, etc. instead of a PNG.
# fig.savefig('percent-bachelors-degrees-women-usa.png', bbox_inches='tight')
plt.show()

In [None]:
import tqdm

import pandas as pd
# Per-sentence token tables, filled in by the sentence loop further down.
sent_dic = {}

i = 0

# Bind the document BEFORE anything reads it. phrase_list below is built from
# new_doc, and the sentence loop later in this cell iterates the same
# document; previously new_doc was only assigned after phrase_list had
# already used it, which fails on a fresh kernel (NameError) or silently
# takes the phrases from whatever document an earlier cell left behind.
new_doc = bush.speeches_nlp[0].text_nlp

print("creating feature phrase list")

# Token-index ranges of feature phrases that replace_feature_phrase has
# already emitted a label for.
ranges = []

# One (token-index range, span) pair per feature phrase on the document.
phrase_list = [(range(span.start, span.end), span)
               for span in new_doc._.feature_phrases]

def replace_feature_phrase(word):
    """Return the replacement string for one token of a rebuilt sentence.

    Relies on two module-level names: ``phrase_list`` (pairs of token-index
    range and phrase span) and ``ranges`` (index ranges of phrases whose
    label has already been emitted; this function appends to it).

    Args:
        word: token-like object exposing ``i``, ``text`` and ``ent_type_``.

    Returns:
        str: one of
            * the phrase's ``label_`` -- first token encountered from a
              feature phrase (the phrase's range is then recorded in
              ``ranges``);
            * ``''`` -- token covered by a phrase range already recorded;
            * ``word.ent_type_`` -- token outside any phrase that has an
              entity type other than NORP, GPE, ORG or PERSON;
            * ``word.text`` -- every other token.
    """
    # First phrase whose token range covers this token, if there is one.
    phrase = next(
        (span for span_range, span in phrase_list if word.i in span_range),
        None,
    )

    if phrase is not None:
        # Only the first token seen from a phrase carries the label; the
        # remaining tokens collapse to empty strings.
        if any(word.i in seen for seen in ranges):
            return ''
        ranges.append(range(phrase.start, phrase.end))
        return phrase.label_

    # Outside a phrase: NORP/GPE/ORG/PERSON entities keep their surface
    # text, other entity types are replaced by the type name.
    if word.ent_type_ and word.ent_type_ not in ("NORP", "GPE", "ORG", "PERSON"):
        return word.ent_type_
    return word.text
    
# Document whose sentences are tabulated below.
new_doc = bush.speeches_nlp[0].text_nlp
sentence_total = len(list(new_doc.sents))

i = 0
for n, sentence in tqdm.tqdm(enumerate(new_doc.sents), total=sentence_total):

    tokens = list(sentence)

    # One entry per sentence: parallel per-token lists of the token
    # attributes, plus NEW_SENT, the sentence with feature phrases and
    # entities substituted by replace_feature_phrase (called once per token,
    # in sentence order, because it tracks which phrases it has emitted).
    sent_dic[i] = {
        'index': [tok.i for tok in tokens],
        'sentence': [tok.text for tok in tokens],
        'POS': [tok.pos_ for tok in tokens],
        'LEMMA': [tok.lemma_ for tok in tokens],
        'TAG': [tok.tag_ for tok in tokens],
        'DEP': [tok.dep_ for tok in tokens],
        'ENT_TYPE': [tok.ent_type_ for tok in tokens],
        'NEW_SENT': [replace_feature_phrase(tok) for tok in tokens],
    }
    i += 1

# Show frames unwrapped and with (effectively) all columns visible.
pd.set_option("expand_frame_repr", False)
pd.set_option("display.max_columns", 999)

dic_len = len(sent_dic)
print(f'sent_dic length: {dic_len}')
print()

# Render each sentence's table transposed: one row per attribute, one column
# per token, preceded by its position in the dictionary.
for position, sentence_table in tqdm.tqdm(enumerate(sent_dic.values()), total=dic_len):
    print(f"{position}/{dic_len}")
    display(pd.DataFrame.from_dict(sentence_table).T)
    print()