# Lesson

## Basic Regexes

In [1]:
import pandas as pd

In [2]:
import re

re.findall(r'b', 'abcd')
## this code will find all instances of the character 'b' in the following string

['b']

In [3]:
def show_all_matches(regexes, subject, re_length=6):
    '''
    This function will take regex's, and a subject, and return all instance
    '''
    print('Sentence:')
    print()
    print('    {}'.format(subject))
    print()
    print(' regexp{} | matches'.format(' ' * (re_length - 6)))
    print(' ------{} | -------'.format(' ' * (re_length - 6)))
    for regexp in regexes:
        fmt = ' {:<%d} | {!r}' % re_length
        matches = re.findall(regexp, subject)
        if len(matches) > 8:
            matches = matches[:8] + ['...']
        print(fmt.format(regexp, matches))

In [4]:
sentence = 'Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.'

show_all_matches([
    r'a',
    r'm',
    r'M',
    r'Mary',
    r'little',
    r'1',
    r'10',
    r'22'
], sentence)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 a      | ['a', 'a', 'a', 'a', 'a']
 m      | ['m', 'm']
 M      | ['M']
 Mary   | ['Mary']
 little | ['little', 'little']
 1      | ['1', '1', '1']
 10     | ['10']
 22     | ['22']


## Metacharacters and Character Classes

In [5]:
res = [
    r'\w', # finds all alphanumeric characters 
    r'\d', # finds all numeric characters
    r'\s', # finds all blank whitespace
    r'.', # matches every character
    r'\.', # a literal period
]
show_all_matches(res, sentence)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \w     | ['M', 'a', 'r', 'y', 'h', 'a', 'd', 'a', '...']
 \d     | ['1', '1', '0', '1', '2', '2', '2']
 \s     | [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '...']
 .      | ['M', 'a', 'r', 'y', ' ', 'h', 'a', 'd', '...']
 \.     | ['.', '.', '.']


In [6]:
show_all_matches([r'l\w\w\w\W', r'\d\d'], sentence, re_length=9)
## metacharacters can be combined

## \w\w\w\w will find all instances fo 4 alphanumeric characters in a row

## \d\d will find all instaces of 2 digit numbers 

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp    | matches
 ------    | -------
 l\w\w\w\W | ['lamb.', 'lamb.']
 \d\d      | ['10', '12', '22']


## Repeating

In [7]:
show_all_matches([
    r'\d+' # + = 1 or more repeating of the same character type
], sentence)

print('\n---\n')

show_all_matches([
    r'a{2,}', # 2 or more repititions of the same character
    r'a{2}', # 2 exactly 2 repititions of the same character
    r'a{3,4}' # either 3 or 4 repititions of the same character
], 'aabbaaaa')

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \d+    | ['1', '10', '12', '22']

---

Sentence:

    aabbaaaa

 regexp | matches
 ------ | -------
 a{2,}  | ['aa', 'aaaa']
 a{2}   | ['aa', 'aa', 'aa']
 a{3,4} | ['aaaa']


## Any of or None of

In [8]:
show_all_matches([
    r'[lt]', # will locate any instance of either letter inside brackets occuring
    r'[lt]+', # will locate any instance of the letters in the brackets repeating or following each other
    r'[^aeiou\s\.]', # any letter that's not a vowel
    r'[a-d]' # will locate any instance of the letters between a and d alphabetically occuring
], sentence, re_length=12)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp       | matches
 ------       | -------
 [lt]         | ['l', 't', 't', 'l', 'l', 'l', 't', 't', '...']
 [lt]+        | ['l', 'ttl', 'l', 'l', 'ttl', 'l', 't', 't', '...']
 [^aeiou\s\.] | ['M', 'r', 'y', 'h', 'd', 'l', 't', 't', '...']
 [a-d]        | ['a', 'a', 'd', 'a', 'a', 'b', 'a', 'b']


## Anchors

In [9]:
show_all_matches([
    r'\bo\w+', # any word that starts with an 'o'
    r'^\s', # starts with a space
    r'^M', # starts with 'M'
    r'\.$', # ends with a period
], sentence)

Sentence:

    Mary had a little lamb. 1 little lamb. Not 10, not 12, not 22, just one.

 regexp | matches
 ------ | -------
 \bo\w+ | ['one']
 ^\s    | []
 ^M     | ['M']
 \.$    | ['.']


## Capture Groups

In [10]:
sentence = '''
You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).
'''.strip()

In [11]:
sentence

'You can find us on the web at https://codeup.com. Our ip address is 123.123.123.123 (maybe).'

In [12]:
ip_re = r'\d+(\.\d+){3}' # will find any occurance of a number, followed by more numbers and a period, 
                         # that repeats 3 times
                         ## like an ip address 

match = re.search(ip_re, sentence)
match[0]

'123.123.123.123'

In [13]:
# simplified for demonstration, a real url to parse urls would be much more
# complex
url_re = r'(https?)://(\w+)\.(\w+)' # will locate instances in the string that begin with 'http'
                                    # maybe followed by an 's'
                                    # followed by '://' then proceeded by any amount of letters
                                    # while will then be proceeded by a period, then proceeded by more letters
                                    ## like a simple website

protocol, domain, tld = re.search(url_re, sentence).groups()

print(f'''
protocol: {protocol}
domain:   {domain}
tld:      {tld}
''')


protocol: https
domain:   codeup
tld:      com



In [14]:
url_re = r'(?P<protocol>https?)://(?:\w+)\.(?P<tld>\w+)' # shy groups?

match = re.search(url_re, sentence)

print(f'''
groups: {match.groups()}
referencing a group by name: {match.group('tld')}
group dictionary: {match.groupdict()}
''')


groups: ('https', 'com')
referencing a group by name: com
group dictionary: {'protocol': 'https', 'tld': 'com'}



## Substitution

In [15]:
# remove anything that's not a digit
re.sub(r'\D', '', 'abc 123')

'123'

In [16]:
# remove anything that's not a letter
re.sub(r'[^a-z]', '', 'abc 123')

'abc'

In [17]:
re.sub(r'.(.).', r'\1', 'abc')
# remove everything but the 2nd character in a 3 character string

'b'

In [18]:
re.sub(r'(.)(.)(.)', r'\3\2\1', 'abc')
# rearrange the order of characters to backwards in a 3 character string

'cba'

In [19]:
re.sub(r'.{2}$', 'X', 'abc')
# replaced the las 2 characters of any string with the character 'X'

'aX'

## Regex Flags

In [20]:
regexp = r'''
[aeiou] (?# any vowel)
[^aeiou] (?# followed by a non-vowel)
'''

In [21]:
# similar to above code
regexp = r'[aeiou][^aeiou]'

# Exercises

## 1. Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [22]:
def is_vowel(subject):
    '''
    This function will take a string and return a boolean vlaue indicating whether or not the 
    string is made up entirely of vowels
    '''
    regexp = r'^[aeiou]$'
    # looks for anything not a vowel
    if len(re.findall(regexp, subject.lower())) == 1: # if there is more than 1 non vowel
        return True
    else: 
        return False

In [23]:
is_vowel('a')

True

In [24]:
is_vowel('b')

False

In [25]:
is_vowel('ae')

False

In [26]:
is_vowel('abc')

False

## 2. Write a function named is_valid_username that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the _ character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [27]:
def is_valid_username(subject):
    '''
    This function will take a string and return a boolean value indicating whether or not a string
    is an appropriate username given the criteria specified
        - must start with a lowecase letter and be comprised of only lowercase letters
        - can only consister of letters, numbers, or an underscore characters
        - can not be longer than 32 characters
    '''
    
    regexp = r'^[a-z]\w{1,31}$'
    if len(re.findall(regexp, subject)) == 1:
        return True
    else:
        return False

In [28]:
is_valid_username('john')

True

In [29]:
is_valid_username('j0hn')

True

In [30]:
is_valid_username('joHn') # should be false, no capital letters 

True

In [31]:
is_valid_username('John')

False

In [32]:
is_valid_username('john27_asdf')

True

In [33]:
is_valid_username('27john')

False

In [34]:
is_valid_username('i_just_want_to_see_if_this_will_return_false_if_there_are_more_than_32_characters')
# should be false, far too long

False

In [35]:
is_valid_username('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa')
# should be false, far too long

False

In [36]:
is_valid_username('codeup')

True

In [37]:
is_valid_username('Codeup')

False

In [38]:
is_valid_username('codeup123')

True

In [39]:
is_valid_username('1codeup')

False

## 3. Write a regular expression to capture phone numbers. It should match all of the following:

- (210) 867 5309

- +1 210.867.5309

- 867-5309

- 210-867-5309

In [40]:
df = pd.DataFrame()
df['number'] = [
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
    '2108675309',
]

In [41]:
phone_regex = re.compile(
'''^
(?P<country_code>\+\d+)?
\D*?
(?P<area_code>\d{3})?
\D*?
(?P<exchange_code>\d{3})
\D*?
(?P<line_number>\d{4})
$''', re.VERBOSE)

In [42]:
df['number'].str.extract(phone_regex)

Unnamed: 0,country_code,area_code,exchange_code,line_number
0,,210.0,867,5309
1,1.0,210.0,867,5309
2,,,867,5309
3,,210.0,867,5309
4,,210.0,867,5309


In [43]:
pd.concat([df, df['number'].str.extract(phone_regex)], axis=1)

Unnamed: 0,number,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309
4,2108675309,,210.0,867,5309


## 4. Use regular expressions to convert the dates below to the standardized year-month-day format.

In [44]:
date_list = [
    '02/04/19',
    '02/05/19',
    '02/06/19',
    '02/07/19',
    '02/08/19',
    '02/09/19',
    '02/10/19']

In [45]:
date_reg = r'(\d+)/(\d+)/(\d+)'

In [46]:
new_list = []
for date in date_list:
    new_list.append(re.sub(date_reg, r'20\3-\1-\2', date))

In [47]:
new_list

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

In [48]:
[re.sub(date_reg, r'20\3-\1-\2', date) for date in date_list]

['2019-02-04',
 '2019-02-05',
 '2019-02-06',
 '2019-02-07',
 '2019-02-08',
 '2019-02-09',
 '2019-02-10']

## 5. Write a regex to extract the various parts of these logfile lines:

In [49]:
lines = """
GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58
POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58
GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58
"""

In [50]:
regexp = r'''
^
(?P<method>GET|POST)
\s
(?P<path>[/\w\-\?=]+)
\s
\[(?P<timestamp>.+)\]
\s
(?P<http_version>HTTP/\d+\.\d+)
\s
\{(?P<status_code>\d+)\}
\s
(?P<bytes_out>\d+)
\s
"(?P<user_agent>.+)"
\s
(?P<ip>\d+\.\d+\.\d+\.\d+)
$'''

In [51]:
[re.search(regexp, line, re.VERBOSE).groupdict() for line in lines.strip().split('\n')]

[{'method': 'GET',
  'path': '/api/v1/sales?page=86',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '200',
  'bytes_out': '510348',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'},
 {'method': 'POST',
  'path': '/users_accounts/file-upload',
  'timestamp': '16/Apr/2019:193452+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '201',
  'bytes_out': '42',
  'user_agent': 'User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
  'ip': '97.105.19.58'},
 {'method': 'GET',
  'path': '/api/v1/items?page=3',
  'timestamp': '16/Apr/2019:193453+0000',
  'http_version': 'HTTP/1.1',
  'status_code': '429',
  'bytes_out': '3561',
  'user_agent': 'python-requests/2.21.0',
  'ip': '97.105.19.58'}]

In [52]:
regex = re.compile(regexp, re.VERBOSE)

df = pd.DataFrame()
df['line'] = lines.strip().split('\n')
df = pd.concat([df, df.line.str.extract(regex)], axis=1)
df

Unnamed: 0,line,method,path,timestamp,http_version,status_code,bytes_out,user_agent,ip
0,GET /api/v1/sales?page=86 [16/Apr/2019:193452+...,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST /users_accounts/file-upload [16/Apr/2019:...,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET /api/v1/items?page=3 [16/Apr/2019:193453+0...,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58
