In [1]:
import pandas as pd
import re
import doctest

#### 1.) Write a function named is_vowel. It should accept a string as input and use a regular expression to determine if the passed string is a vowel. While not explicity mentioned in the lesson, you can treat the result of re.search as a boolean value that indicates whether or not the regular expression matches the given string.

In [2]:
def is_vowel(string):
    return bool(re.search(r'^[aeiouAEIOU]$', string))


assert is_vowel("a") == True
assert is_vowel("E") == True
assert is_vowel("aaa") == False
assert is_vowel("aeiou") == False
print('Exercise 1 is aight.')

Exercise 1 is aight.


#### 2.) Write a function named is_validusername that accepts a string as input. A valid username starts with a lowercase letter, and only consists of lowercase letters, numbers, or the character. It should also be no longer than 32 characters. The function should return either True or False depending on whether the passed string is a valid username.

In [3]:
# Why do we need the $ in the above pattern?
# because the regex below matches up to the capital letter but we're not saying everything needs to be lowercase
re.search(r"^[a-z][a-z0-9_]{,31}", "aaaCODEUPCODEUPaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")



<re.Match object; span=(0, 3), match='aaa'>

In [4]:
# starts with a lowercase letter
# is only lowercase letters, numbers, or _
# should be no longer than 32 characters 
# (already have starting character so{31})
# return a boolean

def is_valid_username(string):
    username = r'^[a-z][a-z0-9_]{,31}$'
    return bool(re.search(username, string))

assert is_valid_username("codeup") == True
assert is_valid_username("codeup123") == True
assert is_valid_username("123Codeup") == False
assert is_valid_username("CodeupCodeup!") == False
print('Exercise 2 is aight.')

Exercise 2 is aight.


#### 3.) Write a regular expression to capture phone numbers. It should match all of the following:

- (210) 867 5309
- +1 210.867.5309
- 867-5309
- 210-867-5309

#### Problem solving process:
- Put the subject strings in order of increasing complexity
- Solve them one at a time and build an iterative solution
- Add optionality as the pattern increases in parts

In [5]:
# The \D*? means zero or more of anything that's not a digit (including parentheses)
# This is another way for specifying optional characters like literal "()" or "+"
phone_regex = r'''
(?P<country_code>\+\d+)?
\D*
(?P<area_code>\d{3})?
\D*
(?P<exchange_code>\d{3})
\D*
(?P<line_number>\d{4})$
'''

numbers = pd.Series([
    '(210) 867 5309',
    '+1 210.867.5309',
    '867-5309',
    '210-867-5309',
], name='original')

pd.concat([numbers, numbers.str.extract(phone_regex, re.VERBOSE)], axis=1)


Unnamed: 0,original,country_code,area_code,exchange_code,line_number
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309


#### 4.) Use regular expressions to convert the dates below to the standardized year-month-day format.

- 02/04/19
- 02/05/19
- 02/06/19
- 02/07/19
- 02/08/19
- 02/09/19
- 02/10/19

In [6]:
dates = [
    "02/04/19",
    "02/05/19",
    "02/06/19",
    "02/07/19",
    "02/08/19",
    "02/09/19",
    "02/10/19"
]

df = pd.DataFrame({"original": dates})
df

Unnamed: 0,original
0,02/04/19
1,02/05/19
2,02/06/19
3,02/07/19
4,02/08/19
5,02/09/19
6,02/10/19


In [7]:
pattern = re.compile(r'''
(?P<month>\d{2})/
(?P<day>\d{2})/
(?P<year>\d{2})
''', re.VERBOSE)

In [8]:
df = pd.concat([df, df.original.str.extract(pattern)], axis=1)
df

Unnamed: 0,original,month,day,year
0,02/04/19,2,4,19
1,02/05/19,2,5,19
2,02/06/19,2,6,19
3,02/07/19,2,7,19
4,02/08/19,2,8,19
5,02/09/19,2,9,19
6,02/10/19,2,10,19


In [9]:
df["new_format"] = df.year + "/" + df.month + "/" + df.day 
df

Unnamed: 0,original,month,day,year,new_format
0,02/04/19,2,4,19,19/02/04
1,02/05/19,2,5,19,19/02/05
2,02/06/19,2,6,19,19/02/06
3,02/07/19,2,7,19,19/02/07
4,02/08/19,2,8,19,19/02/08
5,02/09/19,2,9,19,19/02/09
6,02/10/19,2,10,19,19/02/10


#### 5.) Write a regex to extract the various parts of these logfile lines:

GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58

POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58

GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58

In [10]:
logfile_regex = r'''
^(?P<method>GET|POST)
\s+
(?P<path>.*?)
\s+
\[(?P<timestamp>.*?)\]
\s+
(?P<http_version>.*?)
\s+
\{(?P<status>\d+)\}
\s+
(?P<bytes_sent>\d+)
\s+
"(?P<user_agent>.*)$
'''

lines = pd.Series([
    'GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
    'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
    'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58',
])
lines.str.extract(logfile_regex, re.VERBOSE)

Unnamed: 0,method,path,timestamp,http_version,status,bytes_sent,user_agent
0,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,"python-requests/2.21.0"" 97.105.19.58"
1,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...
2,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,"python-requests/2.21.0"" 97.105.19.58"


In [11]:
pd.read_csv('/usr/share/dict/words', header=None)
df.columns = ['word']
df[df.word.str.contains(r'[aeiou]\1', regex=True)]

ValueError: Length mismatch: Expected axis has 5 elements, new values have 1 elements