In [2]:
import csv

In [5]:
data = []
with open('data/EC10.csv') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',')
    for row in csvreader:
        data.append(row[1])
data = data[1:] #exclude header

In [6]:
len(data)

10

In [7]:
print(data[0][:1000])

Zoe's Kitchen, Inc. (NYSE:ZOES)
Q2 2015 Earnings Call
August 27, 2015 5:00 pm ET
Executives
James Besch - CFO, Vice President-Accounting & Controller
Kevin Miles - President, Chief Executive Officer & Director
Analysts
Karen F. Short - Deutsche Bank Securities, Inc.
Andrew Marc Barish - Jefferies LLC
Nicole M. Miller Regan - Piper Jaffray & Co (Broker)
Will Slabaugh - Stephens, Inc.
Jason West - Credit Suisse Securities (NYSE:USA) LLC (Broker)
Sharon M. Zackfia - William Blair & Co. LLC
David E. Tarantino - Robert W. Baird & Co., Inc. (Broker)
Paul Westra - Stifel, Nicolaus & Co., Inc.
Stephen Anderson - Maxim Group
Operator
Good day, ladies and gentlemen, and thank you for standing by. Welcome to the Zoe's Kitchen Second Quarter 2015 Earnings Conference Call. At this time, all participants have been placed in a listen-only mode, and the lines will be open for your questions following the presentation. Please note that this conference is being recorded today, August 27, 2015.
On the ca

The list `data` contains 10 entries. Each entry is the full-text of  a company's [earning call](https://en.wikipedia.org/wiki/Earnings_call)

The company's ticker and stock exchange are listed in the first few lines in each earning call. Using regular expressions, extract the tickers and stock exchange symbols for all the companies. 

Note that an earning call may contain tickers of other companies. We want to exclude them. In other words, we want to first ticker / stock exchange listed in each earning call

In [34]:
import re
stock_ticker = re.compile('[(]([A-Z]+):([A-Z]+)[)]')
stock_ticker.findall(data[0]) # just to test

[('NYSE', 'ZOES'), ('NYSE', 'USA')]

In [32]:
def extract_stock_ticker(earning_call, extract='both'):
    stock_ticker = re.compile('[(]([A-Z]+):([A-Z]+)[)]')
    stock_ticker.search(data[0])
    res = stock_ticker.findall(earning_call) 
    if res:
        hit = res[0] #first hit
        if extract == 'both':
            return hit
        elif extract == 'ticker':
            return hit[1]
        elif extract == 'exchange':
            return hit[0]
    return None # no need here, but to add an emphasis

In [35]:
tickers_exchanges = [extract_stock_ticker(x) for x in data]
tickers_exchanges

[('NYSE', 'ZOES'),
 ('NASDAQ', 'ADSK'),
 ('NASDAQ', 'SWHC'),
 ('NASDAQ', 'ULTA'),
 ('NASDAQ', 'SPLK'),
 ('NYSE', 'GME'),
 ('NASDAQ', 'OVTI'),
 ('NYSE', 'TD'),
 ('NASDAQ', 'BAMM'),
 ('NYSE', 'VEEV')]

In [36]:
tickers = [extract_stock_ticker(x, extract='ticker') for x in data]
tickers

['ZOES', 'ADSK', 'SWHC', 'ULTA', 'SPLK', 'GME', 'OVTI', 'TD', 'BAMM', 'VEEV']

In [37]:
exchanges = [extract_stock_ticker(x, extract='exchange') for x in data]
exchanges

['NYSE',
 'NASDAQ',
 'NASDAQ',
 'NASDAQ',
 'NASDAQ',
 'NYSE',
 'NASDAQ',
 'NYSE',
 'NASDAQ',
 'NYSE']

How about extracting all numbers that start with `$`

In [63]:
money = re.compile(r'\$\d+[.,\d]+')
money.findall(data[0]) # just to test

['$54.5',
 '$41.9',
 '$54.4',
 '$41.8',
 '$230,000',
 '$11.6',
 '$8.8',
 '$1.8',
 '$6.3',
 '$4.5',
 '$550,000',
 '$180,000',
 '$100,000',
 '$142,000',
 '$700',
 '$500,000',
 '$637,000',
 '$442,000',
 '$2.7',
 '$800,000',
 '$710,000',
 '$620,000',
 '$750,000',
 '$871,000',
 '$124,000',
 '$120,000',
 '$0.01',
 '$1.1',
 '$0.06',
 '$19.6',
 '$19.5',
 '$955,000',
 '$0.05',
 '$705,000',
 '$0.04',
 '$220',
 '$224',
 '$200,000.',
 '$25.3',
 '$25.5',
 '$550,000',
 '$25.3',
 '$25.5',
 '$1.3']

The numbers are meaningless so let us put some context

In [66]:
money = re.compile(r'(\w+\s)(\$\d+[.,\d]+)(\s\w+)')
money.findall(data[0]) # just to test

[('to ', '$54.5', ' million'),
 ('from ', '$41.9', ' million'),
 ('to ', '$54.4', ' million'),
 ('to ', '$41.8', ' million'),
 ('approximately ', '$230,000', ' of'),
 ('to ', '$11.6', ' million'),
 ('from ', '$8.8', ' million'),
 ('by ', '$1.8', ' million'),
 ('to ', '$6.3', ' million'),
 ('from ', '$4.5', ' million'),
 ('approximately ', '$550,000', ' related'),
 ('approximately ', '$180,000', ' related'),
 ('approximately ', '$100,000', ' related'),
 ('to ', '$142,000', ' of'),
 ('exceeded ', '$700', ' million'),
 ('approximately ', '$500,000', ' of'),
 ('to ', '$637,000', ' from'),
 ('approximately ', '$2.7', ' million'),
 ('approximately ', '$800,000', ' in'),
 ('approximately ', '$710,000', ' compared'),
 ('to ', '$620,000', ' in'),
 ('approximately ', '$750,000', ' per'),
 ('was ', '$871,000', ' in'),
 ('of ', '$124,000', ' in'),
 ('was ', '$120,000', ' or'),
 ('a ', '$1.1', ' million'),
 ('or ', '$0.06', ' per'),
 ('were ', '$19.6', ' million'),
 ('with ', '$19.5', ' million'),


Apply to all earning calls

In [67]:
money_extract = [ money.findall(x) for x in data ]

In [68]:
money_extract

[[('to ', '$54.5', ' million'),
  ('from ', '$41.9', ' million'),
  ('to ', '$54.4', ' million'),
  ('to ', '$41.8', ' million'),
  ('approximately ', '$230,000', ' of'),
  ('to ', '$11.6', ' million'),
  ('from ', '$8.8', ' million'),
  ('by ', '$1.8', ' million'),
  ('to ', '$6.3', ' million'),
  ('from ', '$4.5', ' million'),
  ('approximately ', '$550,000', ' related'),
  ('approximately ', '$180,000', ' related'),
  ('approximately ', '$100,000', ' related'),
  ('to ', '$142,000', ' of'),
  ('exceeded ', '$700', ' million'),
  ('approximately ', '$500,000', ' of'),
  ('to ', '$637,000', ' from'),
  ('approximately ', '$2.7', ' million'),
  ('approximately ', '$800,000', ' in'),
  ('approximately ', '$710,000', ' compared'),
  ('to ', '$620,000', ' in'),
  ('approximately ', '$750,000', ' per'),
  ('was ', '$871,000', ' in'),
  ('of ', '$124,000', ' in'),
  ('was ', '$120,000', ' or'),
  ('a ', '$1.1', ' million'),
  ('or ', '$0.06', ' per'),
  ('were ', '$19.6', ' million'),
  ('w