In [None]:
from IPython.core.display import HTML
# Read the stylesheet that sits one directory above the notebook and wrap
# its text in IPython's HTML object.  The HTML object is left as the final
# expression of the cell so that the notebook displays it.
with open("../style.css", mode="r") as file:
    css = file.read()
HTML(css)

# A Ply Scanner to Count Numbers

Below we implement a minimal scanner using `ply` whose purpose is to find all *numbers* that are present in a text.  It counts
these numbers and adds them up so that their sum can be displayed at the end.  Besides numbers, the scanner also recognizes *fake numbers*, which are strings
that consist solely of digits but are not numbers because they have leading zeros.  For example, `007` is a fake number.

In [None]:
import ply.lex as lex

We count both the numbers and the fake numbers that we encounter.  Furthermore, we keep track of the sum of all numbers encountered.

In [None]:
# Running totals that the token rules update while the text is scanned:
#   count_fake   -- number of fake numbers seen (digit strings with a leading 0)
#   count_number -- number of proper numbers seen
#   sum_numbers  -- sum of all proper numbers seen
count_fake = count_number = sum_numbers = 0

With `ply` every token needs to have a name.  The token `IGNORE` is used to recognize letters and punctuation symbols.

In [None]:
# Names of all token types known to the scanner.  Each name corresponds to
# a rule function called t_<NAME>.
tokens = ('NUMBER', 'FAKE_NUM', 'IGNORE')

In [None]:
def t_FAKE_NUM(t):
    r'0[0-9]+'
    # The raw string above is the rule's regular expression (ply takes it
    # from the docstring), so it has to remain the first statement.
    # It matches a 0 followed by at least one more digit, e.g. 007.
    #
    # Side effects: increments the global counter `count_fake` and prints
    # the matched text.  The token is returned with its value cleared.
    global count_fake
    count_fake = count_fake + 1
    matched_text = t.value
    print(f'Found fake {matched_text}')
    t.value = None
    return t

In [None]:
def t_NUMBER(t):
    r'0|[1-9][0-9]*'
    # The raw string above is the rule's regular expression (ply takes it
    # from the docstring), so it has to remain the first statement.
    # It matches either a single 0 or a digit string without a leading 0.
    #
    # Side effects: increments the global counter `count_number`, prints the
    # number and adds it to the global `sum_numbers`.  The token is returned
    # with its value converted from text to int.
    global count_number, sum_numbers
    count_number = count_number + 1
    number = int(t.value)
    t.value = number
    print(f'Found {number}')
    sum_numbers = sum_numbers + number
    return t

In [None]:
def t_IGNORE(t):
    r'[a-zA-Z.,;:!?]+'
    # The raw string above is the rule's regular expression (ply takes it
    # from the docstring), so it has to remain the first statement.
    # It matches a run of letters and punctuation symbols.  The trailing `+`
    # lets one match consume a whole word instead of invoking this rule
    # once per character.
    #
    # Returning None tells ply to discard the match, so no IGNORE token is
    # ever handed to the caller.
    return None

White space is also ignored.

In [None]:
# Characters that the lexer skips without producing a token:
# space, tab and newline.
t_ignore  = ' \t\n'

Any other characters are reported as lexical errors.

In [None]:
def t_error(t):
    # Error rule: called for input that none of the token rules match.
    # `t.value` holds the unmatched input; only its first character is
    # reported, and exactly that one character is skipped so that scanning
    # resumes right behind it.
    offending_char = t.value[0]
    print(f"Illegal character '{offending_char}'")
    t.lexer.skip(1)

For `ply` to work in a Jupyter notebook we have to set the variable `__file__`.

In [None]:
# `ply` needs the variable `__file__` to be set when it is used from a
# notebook, so it is assigned a value here before the lexer is built.
__file__ = 'main'

We generate the lexer with the next cell.

In [None]:
# Generate the lexer from the `tokens` declaration and the `t_*` rules
# defined in this namespace.
lexer = lex.lex()

The variable `data` contains our input text. 

In [None]:
# Sample input for the scanner.  It contains the numbers 33, 5, 8 and 3
# (sum 49) and the fake number 007.
data = \
'''
    Here are 33 apples and 5 pears.  Furthermore, here are
    8 bananas and 3 peaches.  James Bond is agent 007.  
    How many pieces of fruit does this text contain?
'''

The next command feeds the input data to the generated scanner.

In [None]:
# Hand the input text to the lexer; nothing is scanned until tokens are
# requested from it.
lexer.input(data)

Finally, we run the scanner.  The tokens produced by the scanner are discarded.

In [None]:
# Iterate over the lexer until the input is exhausted.  The tokens are not
# used; the loop only runs so that the rule functions update the counters
# and print what they find.
for token in lexer:
    pass

In [None]:
# Show how many numbers were found.
count_number

In [None]:
# Show how many fake numbers were found.
count_fake

In [None]:
# Show the sum of all numbers that were found.
sum_numbers