# Topic today:
## findall vs. finditer
## how to use regex 101

**Findall**: returns a list of all non-overlapping substrings that match the pattern

**Finditer**: returns an iterator that yields a `re.Match` object for each match of the pattern

In [3]:
import re

# Sample sentence: it contains the lowercase "-ly" words "carefully" and
# "immediately", plus the capitalized name "Lily".
text = "I carefully entered the room but my cat Lily noticed me immediately."

In [8]:
# Goal: find the adverbs in the text, i.e. lowercase words ending in "-ly".
# \b anchors the match to whole words, so the lowercase tail of a capitalized
# word (e.g. "amily" inside "Family") is not reported as a match.
# {2,} keeps the requirement of at least two letters before the "ly" suffix.
pattern = r"\b[a-z]{2,}ly\b"

# findall returns every non-overlapping match as a plain string.
findall_result = re.findall(pattern, text)
print(findall_result)

['carefully', 'immediately']


In [9]:
# The value returned by findall is an ordinary Python list.
print(type(findall_result))

<class 'list'>


In [10]:
# finditer returns an iterator, so printing it shows only the iterator object
# and its type, not the matches themselves.
finditer_result = re.finditer(pattern, text)
print(finditer_result, type(finditer_result), sep="\n")

<callable_iterator object at 0x7fc8b92dc640>
<class 'callable_iterator'>


In [12]:
# Create the iterator again for this cell, then walk through it.
finditer_result = re.finditer(pattern, text)

for item in finditer_result:
    # Show the Match object, its type, and the attributes/methods it exposes.
    print(item, type(item), dir(item), sep="\n")

<re.Match object; span=(2, 11), match='carefully'>
<class 're.Match'>
['__class__', '__copy__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'end', 'endpos', 'expand', 'group', 'groupdict', 'groups', 'lastgroup', 'lastindex', 'pos', 're', 'regs', 'span', 'start', 'string']
<re.Match object; span=(56, 67), match='immediately'>
<class 're.Match'>
['__class__', '__copy__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'end', 'endpos', 'expand', 'gr

In [15]:
# Create the iterator again for this cell, then read the useful Match attributes.
finditer_result = re.finditer(pattern, text)

for item in finditer_result:
    # .string  -> the full text that was searched
    # .group() -> the substring that matched
    # .span()  -> (start, end) indices of the match within .string
    print(item.string, item.group(), item.span(), type(item), sep="\n")
    

I carefully entered the room but my cat Lily noticed me immediately.
carefully
(2, 11)
<class 're.Match'>
I carefully entered the room but my cat Lily noticed me immediately.
immediately
(56, 67)
<class 're.Match'>


# Extracting names

In [39]:
text = """Ronald Mayr: A 505-345-5816 rmay@gmail.com 1989-04-20
Bell Kassulke: B 714-325-5816 bkassulke@umich.edu 1990/03/20
Jacqueline Rupp: A 734-851-2281 jrupp@fancy.org 1978-05-11
Alexander Zeller: C 853-395-1492 azller@samsung.co.kr 1985-10-30
Valentina Denk: C 492.333.1932 vdenk@weird-email.com 1993-01-02
Lola-Rose Coates: C 593*492*3212 lr-coates@something.net 1992-08-15"""

print(text)

Ronald Mayr: A 505-345-5816 rmay@gmail.com 1989-04-20
Bell Kassulke: B 714-325-5816 bkassulke@umich.edu 1990/03/20
Jacqueline Rupp: A 734-851-2281 jrupp@fancy.org 1978-05-11
Alexander Zeller: C 853-395-1492 azller@samsung.co.kr 1985-10-30
Valentina Denk: C 492.333.1932 vdenk@weird-email.com 1993-01-02
Lola-Rose Coates: C 593*492*3212 lr-coates@something.net 1992-08-15


In [21]:
# First attempt: two runs of letters separated by one whitespace character.
# The hyphen is not in the first character class, so "Lola-Rose Coates" is
# only picked up as "Rose Coates".
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
pattern = r"[A-Za-z]+\s[A-Za-z]+"

match_names = re.findall(pattern, text)
print(match_names)

['Ronald Mayr', 'Bell Kassulke', 'Jacqueline Rupp', 'Alexander Zeller', 'Valentina Denk', 'Rose Coates']


In [22]:
# Second attempt: "-" is now allowed in the first name, but the surname class
# has no "+" quantifier, so only the first letter of each surname is captured.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
pattern = r"[A-Za-z-]+\s[A-Za-z]"

match_names = re.findall(pattern, text)
print(match_names)

['Ronald M', 'Bell K', 'Jacqueline R', 'Alexander Z', 'Valentina D', 'Lola-Rose C']


In [23]:
# Final version: letters or hyphens for the first name, one whitespace
# character, then one or more letters for the surname.
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
pattern = r"[A-Za-z-]+\s[A-Za-z]+"

match_names = re.findall(pattern, text)
print(match_names)

['Ronald Mayr', 'Bell Kassulke', 'Jacqueline Rupp', 'Alexander Zeller', 'Valentina Denk', 'Lola-Rose Coates']


## Extracting phone numbers 

In [24]:
# Phone numbers: 3 digits, separator, 3 digits, separator, 4 digits.
# The separator class accepts ".", "*" or "-" ("-" is placed last so it is
# read as a literal hyphen rather than a range).
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pattern = r"\d{3}[.*-]\d{3}[.*-]\d{4}"
match_phnumbers = re.findall(pattern, text)
print(match_phnumbers)

['505-345-5816', '714-325-5816', '734-851-2281', '853-395-1492', '492.333.1932', '593*492*3212']


## Extracting Grades

In [27]:
# Grades: a single letter A-F that is preceded by ": " (lookbehind) and
# followed by whitespace (lookahead); neither neighbour is part of the match.
# Raw string avoids the invalid "\s" escape; the redundant {1} is dropped.
pattern = r"(?<=:\s)[A-F](?=\s)"
match_grades = re.findall(pattern, text)
print(match_grades)

['A', 'B', 'A', 'C', 'C', 'C']


## Extracting Emails

In [35]:
# Emails: local part, "@", then a domain made of two or more dot-separated labels.
#   [\w.-]+          local part: word characters, dots or hyphens
#   [\w-]+           first domain label (hyphens allowed, e.g. "weird-email")
#   (?:\.[\w-]+)+    one or more ".label" parts (covers ".com" and ".co.kr")
# The dots are escaped so they match a literal "." only; an unescaped "."
# matches any character and would accept strings containing no dot at all.
pattern = r"[\w.-]+@[\w-]+(?:\.[\w-]+)+"
match_emails = re.findall(pattern, text)
print(match_emails)

['rmay@gmail.com', 'bkassulke@umich.edu', 'jrupp@fancy.org', 'azller@samsung.co.kr', 'vdenk@weird-email.com', 'lr-coates@something.net']


## Extract Birthday

In [41]:
# Birthdays: 4-digit year, 2-digit month, 2-digit day, separated by "-" or "/".
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
pattern = r"\d{4}[-/]\d{2}[-/]\d{2}"
match_bdays = re.findall(pattern, text)
print(match_bdays)

['1989-04-20', '1990/03/20', '1978-05-11', '1985-10-30', '1993-01-02', '1992-08-15']


## Group referencing

* (?!) negative lookahead
* (?=) positive lookahead
* (?<=) positive lookbehind
* (?<!) negative lookbehind

string = **foobarbarfoo**

* **bar(?=bar)** finds the 1st bar ("bar" which has "bar" after it)
* **bar(?!bar)** finds the 2nd bar ("bar" which doesn't have "bar" after it)
* **(?<=foo)bar** finds the 1st bar ("bar" which has "foo" before it)
* **(?<!foo)bar** finds the 2nd bar ("bar" which doesn't have "foo" before it)

## Extract names using lookahead

In [42]:
# Display the roster again for reference before extracting from it.
print(text)

Ronald Mayr: A 505-345-5816 rmay@gmail.com 1989-04-20
Bell Kassulke: B 714-325-5816 bkassulke@umich.edu 1990/03/20
Jacqueline Rupp: A 734-851-2281 jrupp@fancy.org 1978-05-11
Alexander Zeller: C 853-395-1492 azller@samsung.co.kr 1985-10-30
Valentina Denk: C 492.333.1932 vdenk@weird-email.com 1993-01-02
Lola-Rose Coates: C 593*492*3212 lr-coates@something.net 1992-08-15


In [43]:
# Positive lookahead: take whatever precedes a colon, without making the
# colon itself part of the match.
pattern =   """.+   # match everything
            (?=:) # before the colon
            """

# With re.VERBOSE the regex engine skips the layout whitespace and the "#"
# comments embedded in the pattern, so it can be annotated line by line.
match_names = re.compile(pattern, flags=re.VERBOSE).findall(text)
match_names

['Ronald Mayr',
 'Bell Kassulke',
 'Jacqueline Rupp',
 'Alexander Zeller',
 'Valentina Denk',
 'Lola-Rose Coates']

In [44]:

# Positive lookbehind: the grade is the single word character that directly
# follows ": ".
# Raw string so that "\s" and "\w" reach the regex engine unchanged instead of
# being invalid escape sequences in a normal string literal.
pattern =   r"""(?<=:\s)   # after the colon and space
            \w # match a single character
            """

# re.VERBOSE lets the pattern carry layout whitespace and "#" comments.
match_grades = re.findall(pattern, text, re.VERBOSE)
match_grades

['A', 'B', 'A', 'C', 'C', 'C']

## Extracting usernames using lookahead

In [45]:

# Usernames: word characters with an optional hyphen in between, taken only
# when an "@" follows immediately (lookahead, so "@" is not part of the match).
# Raw string so that "\w" is not an invalid escape sequence in the literal.
pattern =   r"""\w+-?\w+   # match all characters with or without hypen
            (?=@) # before the @ colon
            """

# re.VERBOSE lets the pattern carry layout whitespace and "#" comments.
match_usernames = re.findall(pattern, text, re.VERBOSE)
match_usernames

['rmay', 'bkassulke', 'jrupp', 'azller', 'vdenk', 'lr-coates']

## Backreference with capture groups

* () capturing group
* (?:) non-capturing group

In [47]:
# One pattern for a whole record: five capturing groups (name, grade, phone,
# email, birthday) separated by non-capturing groups for the delimiters.
# Raw string so that "\s" and "\w" are not invalid escape sequences.
pattern = r""" (.+) # Group 1
              (?::\s) # Non-capture group
              (\w)    # Group2
              (?:\s)   #Non-captured group
              (.+?)   # Group 3
              (?:\s)   # Non-captured group
              (.+?)   # Group 4
              (?:\s) # Non-captured group
              (.+) #Group 5  
    
        """
# Reference by group number: group() with no argument returns the whole match.
[match.group() for match in re.finditer(pattern, text, re.VERBOSE)]

['Ronald Mayr: A 505-345-5816 rmay@gmail.com 1989-04-20',
 'Bell Kassulke: B 714-325-5816 bkassulke@umich.edu 1990/03/20',
 'Jacqueline Rupp: A 734-851-2281 jrupp@fancy.org 1978-05-11',
 'Alexander Zeller: C 853-395-1492 azller@samsung.co.kr 1985-10-30',
 'Valentina Denk: C 492.333.1932 vdenk@weird-email.com 1993-01-02',
 'Lola-Rose Coates: C 593*492*3212 lr-coates@something.net 1992-08-15']

In [48]:
# Group 1: the full name (everything before ": ").
[m.group(1) for m in re.finditer(pattern, text, flags=re.VERBOSE)]

['Ronald Mayr',
 'Bell Kassulke',
 'Jacqueline Rupp',
 'Alexander Zeller',
 'Valentina Denk',
 'Lola-Rose Coates']

In [49]:
# Group 2: the single-character grade.
[m.group(2) for m in re.finditer(pattern, text, flags=re.VERBOSE)]

['A', 'B', 'A', 'C', 'C', 'C']

In [50]:
# Group 3: the phone number.
[m.group(3) for m in re.finditer(pattern, text, flags=re.VERBOSE)]

['505-345-5816',
 '714-325-5816',
 '734-851-2281',
 '853-395-1492',
 '492.333.1932',
 '593*492*3212']

In [51]:
# Group 4: the email address.
[m.group(4) for m in re.finditer(pattern, text, flags=re.VERBOSE)]

['rmay@gmail.com',
 'bkassulke@umich.edu',
 'jrupp@fancy.org',
 'azller@samsung.co.kr',
 'vdenk@weird-email.com',
 'lr-coates@something.net']

In [52]:
# Group 5: the birthday (the rest of the line).
[m.group(5) for m in re.finditer(pattern, text, flags=re.VERBOSE)]

['1989-04-20',
 '1990/03/20',
 '1978-05-11',
 '1985-10-30',
 '1993-01-02',
 '1992-08-15']

In [53]:
# Same record pattern, but each capturing group is named with (?P<name>...)
# so it can be looked up by name instead of by position.
# Raw string so that "\s" and "\w" are not invalid escape sequences.
pattern = r""" (?P<fullname>.+) # Group 1 Fullnames
              (?::\s) # Non-capture group
              (?P<grades>\w)    # Group 2 Grades
              (?:\s)   #Non-captured group
              (?P<phone>.+?)   # Group 3 Phone numbers
              (?:\s)   # Non-captured group
              (?P<email>.+?)   # Group 4 Emails
              (?:\s) # Non-captured group
              (?P<birthday>.+) #Group 5   Birthdays
    
        """
# Reference by group name.
[match.group('fullname') for match in re.finditer(pattern, text, re.VERBOSE)]

['Ronald Mayr',
 'Bell Kassulke',
 'Jacqueline Rupp',
 'Alexander Zeller',
 'Valentina Denk',
 'Lola-Rose Coates']

In [54]:
# Look up the "email" group by name.
[m.group('email') for m in re.finditer(pattern, text, flags=re.VERBOSE)]

['rmay@gmail.com',
 'bkassulke@umich.edu',
 'jrupp@fancy.org',
 'azller@samsung.co.kr',
 'vdenk@weird-email.com',
 'lr-coates@something.net']

## Extract info of all students who got an A grade

In [56]:
# Display the roster again so the grade column is visible before filtering.
print(text)

Ronald Mayr: A 505-345-5816 rmay@gmail.com 1989-04-20
Bell Kassulke: B 714-325-5816 bkassulke@umich.edu 1990/03/20
Jacqueline Rupp: A 734-851-2281 jrupp@fancy.org 1978-05-11
Alexander Zeller: C 853-395-1492 azller@samsung.co.kr 1985-10-30
Valentina Denk: C 492.333.1932 vdenk@weird-email.com 1993-01-02
Lola-Rose Coates: C 593*492*3212 lr-coates@something.net 1992-08-15


In [55]:
# Record pattern restricted to grade "A": the grades group is the character
# class [A], so only records whose grade is exactly "A" can match.
# Raw string so that "\s" is not an invalid escape sequence.
pattern = r""" (?P<fullname>.+) # Group 1 Fullnames
              (?::\s) # Non-capture group
              (?P<grades>[A])    # Group 2 Grades
              (?:\s)   #Non-captured group
              (?P<phone>.+?)   # Group 3 Phone numbers
              (?:\s)   # Non-captured group
              (?P<email>.+?)   # Group 4 Emails
              (?:\s) # Non-captured group
              (?P<birthday>.+) #Group 5   Birthdays
    
        """
# Reference by group name.
[match.group('fullname') for match in re.finditer(pattern, text, re.VERBOSE)]

['Ronald Mayr', 'Jacqueline Rupp']

In [57]:
# Group number 1 is the first capturing group, so this mirrors a lookup by
# its name.
[m.group(1) for m in re.finditer(pattern, text, flags=re.VERBOSE)]

['Ronald Mayr', 'Jacqueline Rupp']

## Not A grades

In [58]:
# Record pattern for everyone without an "A": the negated class [^A] matches
# any single character other than "A" in the grade position.
# Raw string so that "\s" is not an invalid escape sequence.
pattern = r""" (?P<fullname>.+) # Group 1 Fullnames
              (?::\s) # Non-capture group
              (?P<grades>[^A])    # Group 2 Grades
              (?:\s)   #Non-captured group
              (?P<phone>.+?)   # Group 3 Phone numbers
              (?:\s)   # Non-captured group
              (?P<email>.+?)   # Group 4 Emails
              (?:\s) # Non-captured group
              (?P<birthday>.+) #Group 5   Birthdays
    
        """
# Reference by group name.
[match.group('fullname') for match in re.finditer(pattern, text, re.VERBOSE)]

['Bell Kassulke', 'Alexander Zeller', 'Valentina Denk', 'Lola-Rose Coates']

## Replace string with regex

re.sub(regex_search_term, regex_replacement, text_before)

In [59]:
# Minimal example: every "a" in "banana" is replaced by "i".
re.sub(pattern='a', repl='i', string='banana')

'binini'

In [60]:
# Display the roster before the substitution so the change can be compared.
print(text)

Ronald Mayr: A 505-345-5816 rmay@gmail.com 1989-04-20
Bell Kassulke: B 714-325-5816 bkassulke@umich.edu 1990/03/20
Jacqueline Rupp: A 734-851-2281 jrupp@fancy.org 1978-05-11
Alexander Zeller: C 853-395-1492 azller@samsung.co.kr 1985-10-30
Valentina Denk: C 492.333.1932 vdenk@weird-email.com 1993-01-02
Lola-Rose Coates: C 593*492*3212 lr-coates@something.net 1992-08-15


In [61]:
# Replace the "." or "*" separators inside phone numbers with "-".
# The lookbehind requires three digits before the separator and the lookahead
# requires three digits after it, so a "." or "*" that merely follows digits
# elsewhere in a line (e.g. inside an email address) is left alone.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
print(re.sub(r"(?<=\d{3})[.*](?=\d{3})", "-", text))

Ronald Mayr: A 505-345-5816 rmay@gmail.com 1989-04-20
Bell Kassulke: B 714-325-5816 bkassulke@umich.edu 1990/03/20
Jacqueline Rupp: A 734-851-2281 jrupp@fancy.org 1978-05-11
Alexander Zeller: C 853-395-1492 azller@samsung.co.kr 1985-10-30
Valentina Denk: C 492-333-1932 vdenk@weird-email.com 1993-01-02
Lola-Rose Coates: C 593-492-3212 lr-coates@something.net 1992-08-15
