In [None]:
import ipywidgets as wi
import nltk
import re
from IPython.display import HTML
try:
    import ipython_blocking
except:
    !pip install ipython_blocking
    import ipython_blocking
# Render a "Show/Hide Code" button. When the document is ready the script
# calls code_toggle() once, which hides every `div.input` element; each
# press of the button then flips the visibility again.
# NOTE(review): relies on jQuery (`$`) and on the `div.input` class being
# present in the rendering front end — confirm for the target environment.
# Keep this expression last in the cell so its rich HTML output is displayed.
HTML('''
<script>
code_show = true;
function code_toggle()
{
if(code_show)
    {
    $('div.input').hide();
    }
else 
    {
    $('div.input').show();
    }
code_show = !code_show
}
$(document).ready(code_toggle);
</script>
<form action = "javascript:code_toggle()"><input type="submit" value ="Show/Hide Code"></form>
''')

In [None]:
def parse_person(line):
    """Extract a person's name from a single line of text.

    The line is tokenized, part-of-speech tagged and run through NLTK's
    named-entity chunker; every word inside a chunk labelled ``PERSON`` is
    collected.

    Fallback: when exactly one name word was found, the token that directly
    follows the PERSON chunk is appended as the presumed surname. The
    original author added this because the chunker did not label some
    surnames as PERSON.

    Known limitations (carried over from the original notes):
      * a company such as "Kellys Carpets" may be returned as a name;
      * middle names are not accounted for by the fallback.

    Args:
        line: one line of text.

    Returns:
        The name words joined by single spaces, or ``''`` when no PERSON
        chunk was found.
    """
    tokens = nltk.word_tokenize(line)
    names = []
    # Index (into ``tokens``) of the token right after the latest PERSON chunk.
    next_index = None
    # Running offset into ``tokens``; a chunk subtree may span several tokens,
    # so the chunk index alone cannot be used to address words in the line.
    position = 0
    for chunk in nltk.ne_chunk(nltk.pos_tag(tokens)):
        if isinstance(chunk, nltk.tree.Tree):
            leaves = chunk.leaves()
            if chunk.label() == 'PERSON':
                # Keep every word of the chunk, not just the first one.
                names.extend(word for word, _tag in leaves)
                next_index = position + len(leaves)
            position += len(leaves)
        else:
            position += 1

    if len(names) == 1 and next_index is not None and next_index < len(tokens):
        candidate = tokens[next_index]
        # Skip punctuation/number tokens such as ',' that cannot start a surname.
        if candidate[:1].isalpha():
            names.append(candidate)
    return ' '.join(names)
        
def getName(text):
    """Return ``'Name: <person>'`` for the first line that contains a person.

    ``text`` is split on newlines and each line is handed to
    ``parse_person`` in order. The first non-empty result wins; lines after
    it are not parsed. When no line yields a person the function returns
    ``None``.
    """
    # Generator expressions keep the evaluation lazy: parsing stops at the
    # first line for which parse_person returns a non-empty string.
    parsed_lines = (parse_person(row) for row in text.split('\n'))
    first_hit = next((found for found in parsed_lines if found), None)
    if first_hit is None:
        return None
    return 'Name: ' + first_hit
        
def getPhoneNumber(text):
    """Return the first non-fax phone number in ``text`` as ``'Phone: <digits>'``.

    Recognised shapes are an optional ``+1 `` prefix followed by
    ``(ddd) ddd-dddd``, ``ddd-ddd-dddd`` or ``ddd-ddd dddd``. Up to five
    characters in front of each number are captured as well; a number whose
    prefix contains "fax" (any letter case) is skipped.

    Args:
        text: raw text, with or without newline characters.

    Returns:
        ``'Phone: '`` followed by the digits of the number (a leading ``+1``
        contributes the digit ``1``).

    Raises:
        IndexError: if no non-fax phone number is found. The exception type
            matches what the previous implementation raised in this case.
    """
    # Newlines are removed so the function behaves the same for GUI input
    # (multi-line) and for text that arrives as a single line.
    text = text.replace('\n', '')
    # Group 1: up to 5 characters of context used to spot a fax label.
    #          ``{0,5}`` (rather than exactly 5) lets a number at the very
    #          start of the text match too.
    # Group 2: the phone number itself.
    phone_pattern = re.compile(
        r'(.{0,5})((?:\+1\s)?\(?\d{3}[\)-]\s?\d{3}[-\s]\d{4})'
    )
    for prefix, candidate in phone_pattern.findall(text):
        if 'fax' in prefix.lower():
            continue
        # Strip every non-digit to normalise the number.
        return 'Phone: ' + re.sub(r'\D', '', candidate)
    raise IndexError('no phone number found in text')


def getEmail(text):
    """Return the first email address in ``text`` as ``'Email: <address>'``.

    The local part may contain word characters, ``.``, ``+`` and ``-``; the
    domain may contain hyphens and any number of dot-separated labels. An
    address is accepted at the start of the text or after any character that
    cannot itself be part of a local part.

    Args:
        text: raw text, with or without newline characters.

    Returns:
        ``'Email: '`` followed by the matched address.

    Raises:
        IndexError: if no email address is found. The exception type matches
            what the previous implementation raised in this case.
    """
    # Newlines become spaces so addresses on separate lines stay separated
    # and the function also works on single-line input.
    text = text.replace('\n', ' ')
    # Each component is capped at 255 characters to bound the matching work.
    # The lookbehind stops the match from starting in the middle of a longer
    # local part without requiring a whitespace character before the address.
    emailPattern = re.compile(
        r'(?<![\w.+-])'
        r'[\w.+-]{1,255}@[\w-]{1,255}(?:\.[\w-]{1,255})+'
    )
    match = emailPattern.search(text)
    if match is None:
        raise IndexError('no email address found in text')
    return 'Email: ' + match.group(0)

def getContactInfo(b):
    """Button-click callback: extract and show the contact details.

    Reads the text from the module-level ``ocr`` widget, runs ``getName``,
    ``getPhoneNumber`` and ``getEmail`` on it and, if all three succeed,
    displays the module-level ``header`` widget followed by one printed line
    per result.

    Args:
        b: the object passed by ``button.on_click``; it is not used.
    """
    text = ocr.value
    if not text or not text.strip():
        print('Please enter text in the box')
        return

    # Run every extractor before showing anything, so a failure cannot leave
    # a header and a partial result on screen ahead of the error message.
    try:
        results = [getName(text), getPhoneNumber(text), getEmail(text)]
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate; the message says what actually failed.
        print('Could not extract contact info ({}: {})'.format(type(err).__name__, err))
        return

    display(header)
    for result in results:
        print(result)




In [None]:
# Input form: a heading, a text area for the OCR output and a submit button.
# `header` is the banner that getContactInfo displays above its results.
title = wi.HTML('''<h2 align='center'> Enter OCR Results Below:</h2>''')
header = wi.HTML('''<h3><font color='green'>Your contact info is:</font></h3>''')

style = {'description_width': 'initial'}
ocr = wi.Textarea(
    placeholder='Kelly Simmons\nLead Data Scientist\nBooz Allen Hamilton\n303-919-9606\nkellylsimmons@gmail.com',
    layout=wi.Layout(width='40%', height='200px'),
    style=style,
)

# Clicking the button runs getContactInfo on the current text-area contents.
button = wi.Button(description='Get Contact Info', button_style='success')
button.on_click(getContactInfo)

# Stack the widgets vertically, centre them and render the form.
gui = wi.VBox([title, ocr, button])
gui.layout.align_items = 'center'
display(gui)