In [1]:
import pandas as pd
from spacy.lang.yo import Yoruba

In [2]:
ROOT_DIR = '../input/'

In [3]:
df = pd.read_csv(ROOT_DIR + "news.csv")

In [4]:
df.shape

(193279, 5)

In [15]:
df.head()

Unnamed: 0,id,timestamp,source,title,description
0,1,2020-03-21 18:28:18,CNBC,New York City-area airports halt air traffic a...,The FAA said air traffic was halted at New Yor...
1,2,2020-03-21 17:59:00,Yahoo.com,Airline CEOs promise to eliminate dividends an...,CEOs from America’s largest publicly traded ai...
2,3,2020-03-21 17:13:04,Wcvb.com,Market Basket joins list of grocery stores mod...,The modified schedule begins Monday for all Ma...
3,4,2020-03-21 17:00:00,Oilprice.com,What Happens If Oil Prices Go Negative - OilPr...,The combination of the Saudi Arabia vs Russia ...
4,5,2020-03-21 16:49:56,The Verge,Amazon to start paying US warehouse workers do...,Amazon will raise overtime pay for workers in ...


In [16]:
sources = df["source"].unique()
print(sources)

['CNBC' 'Yahoo.com' 'Wcvb.com' 'Oilprice.com' 'The Verge'
 'Mercurynews.com' 'Google News' 'CNN' 'Fool.com' 'CBS News' 'Nytimes.com'
 'Richmond.com' 'Newser.com' 'Wgntv.com' 'Chicagotribune.com' 'Forbes.com'
 'Vox.com' 'Thepointsguy.com' 'Bloomberg' 'Reuters']


In [17]:
condition = df["source"].isin(["CNBC"])

content_df = df.loc[condition, :]["title"][:100]
content_df.shape

(100,)

In [18]:
content_df.head()

0     New York City-area airports halt air traffic a...
21    Why long-term investors should never sell stoc...
44    Planning in the time of coronavirus means thin...
48    Financing programs for businesses hit by the c...
51    Breaking down this sell-off, among the most ex...
Name: title, dtype: object

In [19]:
for article in content_df[:2]:
    print(article)

New York City-area airports halt air traffic as coronavirus causes staffing issues - CNBC
Why long-term investors should never sell stocks in a panic


In [20]:
import spacy

nlp = spacy.load('en_core_web_sm')

text = "New York City-area airports halt air traffic as coronavirus causes staffing issues"

doc = nlp(text)

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)


New New PROPN NNP compound Xxx True False
York York PROPN NNP compound Xxxx True False
City City PROPN NNP compound Xxxx True False
- - PUNCT HYPH punct - False False
area area NOUN NN compound xxxx True False
airports airport NOUN NNS nsubj xxxx True False
halt halt VERB VBP ROOT xxxx True False
air air NOUN NN compound xxx True False
traffic traffic NOUN NN dobj xxxx True False
as as SCONJ IN mark xx True True
coronavirus coronavirus NOUN NN nsubj xxxx True False
causes cause VERB VBZ advcl xxxx True False
staffing staffing NOUN NN compound xxxx True False
issues issue NOUN NNS dobj xxxx True False


In [11]:
def return_entities_and_processed_docs(data_frame):
    named_entities = {}
    processed_docs = []

    for item in data_frame:
        doc = nlp(item)
        processed_docs.append(doc)

        for ent in doc.ents:
            entity_text = ent.text
            entity_type = str(ent.label_)
            current_ents = {}

            if entity_type in named_entities.keys():
                current_ents = named_entities.get(entity_type)

            current_ents[entity_text] = current_ents.get(entity_text, 0) + 1

            named_entities[entity_type] = current_ents

    return named_entities, processed_docs

named_entities, processed_docs = return_entities_and_processed_docs(content_df)

In [12]:
named_entities

{'GPE': {'New York City-area': 1,
  'New York': 3,
  'France': 1,
  'South Korea': 1,
  'US': 5,
  'California': 1,
  'Washington': 2,
  'Australia': 3,
  'Italy': 1,
  'Cisco': 1,
  'China': 2,
  'Japan': 1,
  'Tokyo': 3,
  'East Africa': 1,
  'Germany': 1,
  'Netflix': 2,
  'Canada': 1,
  'Spain': 1,
  'Poland': 1,
  'Citi': 1,
  'Massachusetts': 1},
 'ORG': {'CNBC': 3,
  'Mnuchin': 2,
  'Emirates Airline': 1,
  'Fed': 4,
  'Congress': 4,
  'NY': 1,
  'Trump': 3,
  'National Guard': 1,
  'Dow': 1,
  'Securities and Exchange Commission': 1,
  'NYSE': 1,
  'IBM': 1,
  'White House': 1,
  'Goldman': 1,
  'Boeing': 2,
  'Airbus': 1,
  "WeWork board's": 1,
  'SoftBank': 2,
  'JetBlue': 1,
  'Fauci': 1,
  'Treasury': 2,
  'Shell': 1,
  'YouTube': 1,
  'Deere': 1,
  'Amazon': 1,
  'Apple & more': 1,
  'Hasbro': 2,
  'Nike': 1,
  'OECD': 1,
  'GOP': 1,
  'H&M': 1,
  "El-Erian: 'Pockets": 1,
  'GE Aviation': 1,
  'CVS Health': 1,
  'N95': 1,
  'Bank of America': 1,
  'CFO': 1,
  'COVID-19': 1

In [13]:
len(processed_docs)

100

In [14]:
def print_top_10(named_entities):
    for key in named_entities.keys():
        print(key)
        entities = named_entities.get(key)

        #Sort the entries by their frequency in descending
        #Order and print out the most frequent n ones
        sorted_keys = sorted(entities, key=entities.get, reverse=True)
        for item in sorted_keys[:10]:
            if (entities.get(item) > 1):
                print(" " + item + ": " + str(entities.get(item)))

print_top_10(named_entities)

GPE
 US: 5
 New York: 3
 Australia: 3
 Tokyo: 3
 Washington: 2
 China: 2
 Netflix: 2
ORG
 Fed: 4
 Congress: 4
 CNBC: 3
 Trump: 3
 Mnuchin: 2
 Boeing: 2
 SoftBank: 2
 Treasury: 2
 Hasbro: 2
MONEY
DATE
 2020: 2
 today: 2
CARDINAL
 3: 2
PERSON
PERCENT
LOC
 Europe: 3
EVENT
 Olympics: 2
NORP
 European: 2
TIME


In [15]:
named_entities.keys()

dict_keys(['GPE', 'ORG', 'MONEY', 'DATE', 'CARDINAL', 'PERSON', 'PERCENT', 'LOC', 'EVENT', 'NORP', 'TIME'])

In [16]:
def calculate_entity_span(document, entity):
    indexes = []
    for ent in document.ents:
        if ent.text == entity:
            for i in range(int(ent.start), int(ent.end)):
                indexes.append(i)
    return indexes

In [17]:
entity = "The New York City-area"

sentences = "New York City-area airports halt air traffic as coronavirus causes staffing issues"

doc = nlp(sentences)

calculate_entity_span(doc, entity)

[]

In [18]:
for sentences in sentences:
    doc = nlp(sentences)
    for token in doc:
        print(token.dep_)

ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT
dep
ROOT
ROOT
ROOT
ROOT
ROOT
ROOT


In [19]:
def calc_entity_subject_object(document, entity, indexes):
    actions = []
    action = ''
    participant1 = ''
    participant2 = ''

    for token in document:
        #Next, you identify the main verb expressing the main action in the sentence
        #To extract the relation, we have to find the ROOT of the sentence (which is also the verb of the sentence)
        if token.pos == "VERB" and token.dep_ == 'ROOT':
            #initialize the indexes for thesubject and the object related to the main verb
            subj_ind = -1
            obj_ind = -1
            #store the main verb itself (token.text in the action variable)
            action = token.text
            children = [child for child in token.children]
            for child1 in children:
                #find the subject via the nsubj relation and store it as a participant1
                #and its index as subj_ind
                if child1.dep_ == 'nsubj':
                    participant1 = child1.text
                    sub_ind = int(child1.i)
                #If there is a preposition attached to the verb (e.g.. "write about"), then
                #you need to search for the indirect object as the second participant.
                if child1.dep_ == 'prep':
                    participant2 = ''
                    child1_children = [child for child in child1.children]
                    for child2 in child1_children:
                        #If such an object is a noun or a proper noun
                        #You store it as participant and its index as obj_ind
                        if child2.pos_ == 'NOUN' or child2.pos_ == 'PROPN':
                            participant2 = child2.text
                            obj_ind = int(child2.i)

                    #If at this point both participants of the main action have been identified and
                    #their indexes are included in the indexes of the words covered by the entity,
                    #you add the action with two participants to the list of actions
                    if not participant2 == '':
                        if subj_ind in indexes:
                            actions.append(entity + " " + action + " ", child1.text + " " + participant2)
                        elif obj_ind in indexes:
                            actions.append(participant1 + " " + actions + ' ' + child1.text + " " + entity)
                #Otherwise, if there is no preposition attached to the verb,
                #participant2 is a direct object of the main verb,
                # which can be identified via the dobj relation
                if child1.dep_ == 'dobj' == (child1.pos_ == 'NOUN' or child1.pos_ == 'PROPN'):
                    participant2 = child1.text
                    obj_ind = int(child1.i)
                    # In this case you apply the same strategy as above,
                    #adding the action with two participants to the list of actions.
                    if subj_ind in indexes:
                            actions.append(entity + " " + action + " ", child1.text + " " + participant2)
                    elif obj_ind in indexes:
                            actions.append(participant1 + " " + action + " " + entity)
    # FInally if the final list of actions is not empty,
    # Print out the sentence and all actions together with the participants
    if not len(actions) == 0:
         print(f"\nSentence = {document}")
         for item in actions:
              print(item)

In [27]:
#Now let's check that it's working
for sentence in sentences:
    doc = nlp(sentence)
    indexes = calculate_entity_span(doc, entity)
    calc_entity_subject_object(doc, entity, indexes)

In [20]:
def return_docs_of_given_ent_type(processed_docs, entity, ent_type):
    output_sentences = []
    for doc in processed_docs:
        for sentence in doc.sents:
            # Only consider sentences that contain the input entity
            # of the specified type among it's named entities
            if entity in [ent.text for ent in sentence.ents if ent.label_ == ent_type]:
               output_sentences.append(sentence)
    return output_sentences
entity = "Trump"

ent_sentences = return_docs_of_given_ent_type(processed_docs, entity, 'ORG')
print(ent_sentences)

[Trump activates National Guard in California, New York and Washington state to fight coronavirus outbreak, In coronavirus tweet storm, Trump touts suspect 'cure' and potential easing of guidelines to boost economy, Uber CEO asks Trump to support gig workers impacted by the coronavirus]


In [21]:
for sentence in ent_sentences:
    indexes = calculate_entity_span(sentence, entity)
    calc_entity_subject_object(sentence, entity, indexes)

In [22]:
entity = "Congress"

ent_sentences = return_docs_of_given_ent_type(processed_docs, entity, 'ORG')
print(ent_sentences)

for sentence in ent_sentences:
    indexes = calculate_entity_span(sentence, entity)
    calc_entity_subject_object(sentence, entity, indexes)

[Airlines tell Congress they need cash coronavirus aid or thousands will be furloughed, Congress struggles to reach a deal on its massive coronavirus stimulus bill, Stock market live updates: Dow futures down 400, waiting on Congress, 'limit down' again, Mnuchin says Congress is 'very close' to a stimulus agreement and must get it done 'today']


# APPLY

In [24]:
#Performing named entity recognition on a random text
from spacy import displacy

text = 'Mr Kelvin is a very Great man with Absolute personality alongside his dedication for the Apple Firm, and he currently resides in Canada .'

doc = nlp(text)

displacy.render(doc, style="ent")

# Visualize entity types in sentences containing the specified entity:

In [36]:
def visualize_given_ent_type(processed_docs, entity, ent_type):
    for doc in processed_docs:
        for sentence in doc.sents:
            if entity in [ent.text for ent in sentence.ents if ent.label_ == ent_type]:
                displacy.render(sentence, style='ent')

visualize_given_ent_type(processed_docs, 'Trump', 'ORG')

# Now let's find sentences where the particular named entity is used alongside other entities of the same type - i.e a specific
# entity type is used in a particular number of times

In [45]:
def return_count_of_ent_type(sentence, ent_type):
    return len([ent.text for ent in sentence.ents if ent.label_ == ent_type])

text = 'Last week, Democratic lawmakers from both parties said they had the Senate votes needed to pass legislation that would prevent tech platforms, including Apple, GM and Facebook, from favouring their own businesses.'

doc = nlp(text)

return_count_of_ent_type(doc, 'ORG')

4

In [47]:
def return_docs_of_given_ent_type_custom(processed_docs, entity, ent_type):
    output_sentences = []
    for doc in processed_docs:
        for sentence in doc.sents:
            if entity in [ent.text for ent in sentence.ents if ent.label_ == ent_type and
                          return_count_of_ent_type(sentence, ent_type) > 1 ]:
                output_sentences.append(sentence)
    return output_sentences

output_sentences = return_docs_of_given_ent_type_custom(processed_docs, "Trump", "ORG")

print(len(output_sentences))


1


In [50]:
def visualize_conditional_sentences(sentences):
    colors = {"ORG": "linear-gradient(90deg, #64B5F6, #E0F7FA)"}
    options = {"ents": ["ORG"], "colors": colors}

    for sentence in sentences:
        displacy.render(sentence, style='ent', options=options)

visualize_conditional_sentences(output_sentences)