In [4]:
#uncomment and run to install spacy
#import sys
#!{sys.executable} -m pip install spacy
#!{sys.executable} -m spacy download en_core_web_sm

In [5]:
import pandas as pd
import spacy

In [6]:
# Load the small English spaCy pipeline once; ne_text() calls this shared
# `nlp` object on every document.
nlp = spacy.load('en_core_web_sm')

[List of all named entities](https://spacy.io/api/annotation#named-entities)

### Extract `MONEY` and `PRODUCT` entities from the earnings calls

### Extract all named entities from the earnings calls and rank them by frequency

In [7]:
# Read the earnings-call data set (path is relative to the working directory);
# later cells use its 'text' column.
data = pd.read_csv('data/EC10.csv')

In [9]:
#extract all named entities from text
def ne_text(text):
    """Extract every named entity that the spaCy pipeline finds in ``text``.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example ``None`` or the
        float ``NaN`` pandas produces for an empty CSV cell) is treated as
        "no text" and yields an empty list.

    Returns
    -------
    list of (str, str)
        One ``(entity_text, entity_label)`` tuple per entity, in the order
        of ``doc.ents``.
    """
    # Missing cells arrive here as NaN/None when this function is used with
    # Series.apply; skip the pipeline for them instead of failing the whole run.
    if not isinstance(text, str):
        return []
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

In [10]:
# Run ne_text on each row's text; the result is a Series holding one list of
# (entity_text, label) tuples per row.
named_entities = data['text'].apply(ne_text)

In [14]:
def filter_ne(list_entities, entities=('MONEY', 'PRODUCT')):
    """Keep only the named entities whose label is one of ``entities``.

    Parameters
    ----------
    list_entities : iterable of (str, str)
        ``(entity_text, entity_label)`` pairs.
    entities : str or iterable of str, optional
        Label or labels to keep. A single label may be given as a plain
        string. Defaults to ``('MONEY', 'PRODUCT')``.

    Returns
    -------
    list of (str, str)
        The pairs whose label is wanted, in their original order.
    """
    # A bare string would otherwise be tested with substring semantics
    # ('ON' in 'MONEY' is True), so treat it as a single label.
    if isinstance(entities, str):
        entities = (entities,)
    wanted = frozenset(entities)
    return [(word, label) for word, label in list_entities if label in wanted]

In [16]:
# Apply filter_ne to each row's entity list using its default labels
# (MONEY and PRODUCT), then display the result.
filtered_entities = named_entities.apply(filter_ne)
filtered_entities

0    [(two marks, MONEY), ($54.5 million, MONEY), (...
1    [(A360 Collaboration for Revit, PRODUCT), (Q3,...
2    [($147.8 million, MONEY), (Q1, PRODUCT), ($13....
3    [(1.15, MONEY), (0.94, MONEY), (more than $400...
4    [(Splunk, PRODUCT), (Q2, PRODUCT), (Splunk, PR...
5    [(0.36, MONEY), ($60.7 million, MONEY), ($1 bi...
6    [(Q1, PRODUCT), ($330 million, MONEY), ($28.6 ...
7    [(Management, PRODUCT), (1.20, MONEY), ($239 m...
8    [($106.8 million, MONEY), ($107.6 million, MON...
9    [($98 million, MONEY), ($75 million, MONEY), (...
Name: text, dtype: object

In [20]:
# Rank entities by frequency.
# Prototype on a single document first: take the entity list of the row
# labelled 0 and preview its first ten (text, label) pairs.
entities = named_entities[0]
entities[:10]

[("Zoe's Kitchen, Inc.", 'ORG'),
 ('NYSE', 'ORG'),
 ('2015', 'DATE'),
 ('August 27, 2015 5:00', 'DATE'),
 ('James Besch - CFO', 'PERSON'),
 ('Kevin Miles', 'PERSON'),
 ('Chief Executive Officer & Director\n', 'ORG'),
 ('Karen F. Short -', 'PERSON'),
 ('Andrew Marc', 'PERSON'),
 ('M. Miller', 'PERSON')]

In [24]:
# Put the entities in a pandas Series (not a DataFrame) and count how often
# each (text, label) pair occurs; value_counts() sorts the counts from most
# to least frequent.
s = pd.Series(entities).value_counts()
s.head()

(Kevin Miles, PERSON)          34
(James Besch - CFO, PERSON)    14
(2016, DATE)                   12
(Kevin, PERSON)                10
(Controller\n, PERSON)         10
dtype: int64

In [28]:
#isolate the steps in a function
def most_frequent_entities(list_of_entities, n=5):
    """Return the ``n`` most frequent entities together with their counts.

    Parameters
    ----------
    list_of_entities : list of (str, str)
        ``(entity_text, entity_label)`` pairs, possibly with repeats.
    n : int, optional
        How many of the most frequent entities to keep (default 5, which
        matches the previous hard-coded ``.head()``).

    Returns
    -------
    dict
        Maps each kept ``(entity_text, entity_label)`` pair to its number of
        occurrences, ordered from most to least frequent. Empty input gives
        an empty dict.
    """
    # dtype=object keeps the tuples as single elements and makes the empty-list
    # case independent of pandas' dtype inference for empty Series.
    counts = pd.Series(list_of_entities, dtype=object).value_counts()
    return counts.head(n).to_dict()

In [29]:
# Compute the most frequent entities (with counts) for every row: one dict per document.
named_entities.apply(most_frequent_entities)

0    {('Kevin Miles', 'PERSON'): 34, ('James Besch ...
1    {('Carl Bass - President', 'PERSON'): 48, ('on...
2    {('James Debney - President', 'PERSON'): 32, (...
3    {('Mary N. Dillon - Chief', 'PERSON'): 16, ('o...
4    {('Splunk', 'PERSON'): 27, ('Splunk', 'PRODUCT...
5    {('Paul Raines', 'PERSON'): 46, ('Tony D. Bart...
6    {('China', 'GPE'): 24, ('the prior quarter', '...
7    {('U.S.', 'GPE'): 35, ('Canadian', 'NORP'): 14...
8    {('the second quarter', 'DATE'): 7, ('last yea...
9    {('Vault', 'ORG'): 32, ('CRM', 'PRODUCT'): 19,...
Name: text, dtype: object

In [30]:
# Store the per-document top-entity dicts as a new column; note that this adds
# the column to `data` itself rather than creating a new frame.
data['frequent entities'] = named_entities.apply(most_frequent_entities)
data.head()

Unnamed: 0,id,text,date,company,sector,frequent entities
0,32934,"Zoe's Kitchen, Inc. (NYSE:ZOES)\nQ2 2015 Earni...",2015-08-28 00:11:00,"Zoe's Kitchen, Inc.",Consumer Services,"{('Kevin Miles', 'PERSON'): 34, ('James Besch ..."
1,32905,"Autodesk, Inc. (NASDAQ:ADSK)\nQ2 2016 Earnings...",2015-08-28 00:07:00,"Autodesk, Inc.",Technology,"{('Carl Bass - President', 'PERSON'): 48, ('on..."
2,32926,Smith & Wesson Holding Corp. (NASDAQ:SWHC)\nQ1...,2015-08-27 23:25:00,Smith & Wesson Holding Corporation,Capital Goods,"{('James Debney - President', 'PERSON'): 32, (..."
3,32930,"Ulta Salon, Cosmetics & Fragrance, Inc. (NASDA...",2015-08-27 23:09:00,"Ulta Salon, Cosmetics & Fragrance, Inc.",Consumer Services,"{('Mary N. Dillon - Chief', 'PERSON'): 16, ('o..."
4,32907,Splunk Inc. (NASDAQ:SPLK)\nQ2 2016 Earnings Co...,2015-08-27 22:56:00,Splunk Inc.,Technology,"{('Splunk', 'PERSON'): 27, ('Splunk', 'PRODUCT..."
