In [1]:
import re
import pandas as pd
import numpy as np

# 1. Is it a vowel?

In [2]:
def is_vowel(x:str):
    """ Returns match object if inputted string x is a vowel """
    return re.search(r'^[aeiouAEIOU]$',x)

    

# 2. user names

In [3]:
def is_valid_username(x:str):
    """ Checks whether inputted string is a valid username:
    -Starts with lowercase letter
    -Only lowercase letters, numbers or _
    -No longer than 32 characters"""
    if re.search(r'^[a-z][_a-z0-9]{,31}$',x):
        return True
    else:
        return False
    

In [4]:
is_valid_username('c_2odeup')


True

# 3 Phone numbers

In [5]:
subjects = ['(210) 867 5309',
            '+1 210.867.5309', 
            '867-5309',
            '210-867-5309']
regexp = r'\d{0,3}\D?\d{3}\D\d{4}'
for subject in subjects:
    if re.search(regexp, subject):
        print(subject, "matches")
    else:
        print(subject, "does not match")


(210) 867 5309 matches
+1 210.867.5309 matches
867-5309 matches
210-867-5309 matches


In [6]:
regexp = r'(?P<country>\+\d+)?\D*(?P<area_code>\d{3})?\D*(?P<first_three>\d{3})\D*(?P<last_four>\d{4})$'
pd.concat([pd.DataFrame({'num':subjects}),pd.DataFrame({'num':subjects}).num.str.extract(regexp)],axis=1)

Unnamed: 0,num,country,area_code,first_three,last_four
0,(210) 867 5309,,210.0,867,5309
1,+1 210.867.5309,1.0,210.0,867,5309
2,867-5309,,,867,5309
3,210-867-5309,,210.0,867,5309


# 4 convert to year - month - day

In [7]:
df = pd.DataFrame({'original_dates':['02/04/19','02/05/19','02/06/19','02/07/19','02/08/19','02/09/19','02/10/19']})
regexp=r'(\d{2})/(\d{2})/(\d{2})'

# re.sub(regexp,r'20\3-\1-\2', date)

In [8]:
df['converted'] = df.original_dates.str.replace(pat = regexp,repl = r'20\3-\1-\2', regex=True)

In [9]:
df

Unnamed: 0,original_dates,converted
0,02/04/19,2019-02-04
1,02/05/19,2019-02-05
2,02/06/19,2019-02-06
3,02/07/19,2019-02-07
4,02/08/19,2019-02-08
5,02/09/19,2019-02-09
6,02/10/19,2019-02-10


# 5 - logs

In [10]:
df = pd.DataFrame({'original_logs':['GET /api/v1/sales?page=86 [16/Apr/2019:193452+0000] HTTP/1.1 {200} 510348 "python-requests/2.21.0" 97.105.19.58',
        'POST /users_accounts/file-upload [16/Apr/2019:193452+0000] HTTP/1.1 {201} 42 "User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 97.105.19.58',
        'GET /api/v1/items?page=3 [16/Apr/2019:193453+0000] HTTP/1.1 {429} 3561 "python-requests/2.21.0" 97.105.19.58']})


df = pd.concat([df, 
                df.original_logs.str.extract(r'(?P<request_method>^[A-Z]{3,})\s*(?P<request_path>/\S*\b)\s*\[(?P<timestamp>[^]]*).*(?P<http_version>[H]\S*).\{(?P<status_code>\d{3,})\}\s*(?P<size>\s\d{1,}\s)\s*"(?P<user_agent>.*)"\s(?P<ip_address>\S*$)')],axis=1)


In [11]:
df

Unnamed: 0,original_logs,request_method,request_path,timestamp,http_version,status_code,size,user_agent,ip_address
0,GET /api/v1/sales?page=86 [16/Apr/2019:193452+...,GET,/api/v1/sales?page=86,16/Apr/2019:193452+0000,HTTP/1.1,200,510348,python-requests/2.21.0,97.105.19.58
1,POST /users_accounts/file-upload [16/Apr/2019:...,POST,/users_accounts/file-upload,16/Apr/2019:193452+0000,HTTP/1.1,201,42,User-Agent: Mozilla/5.0 (X11; Fedora; Fedora; ...,97.105.19.58
2,GET /api/v1/items?page=3 [16/Apr/2019:193453+0...,GET,/api/v1/items?page=3,16/Apr/2019:193453+0000,HTTP/1.1,429,3561,python-requests/2.21.0,97.105.19.58


# BONUS

In [12]:
words = pd.read_table('/usr/share/dict/words')
words = words.rename(columns = {'A':'word'}).dropna()

In [13]:
words

Unnamed: 0,word
0,a
1,aa
2,aal
3,aalii
4,aam
...,...
235880,zythem
235881,Zythia
235882,zythum
235883,Zyzomys


In [14]:
def count_vowels(x:str):
    num_vowels = 0
    for letter in str(x):
        if re.search(r'[aeiouAEIOU]',letter):
            num_vowels += 1
        else:
            continue
    return num_vowels

In [15]:
words["num_vowels"] = words.apply(lambda row: count_vowels(row.word), axis=1)

Words with at least 3 vowels

In [16]:
(words.num_vowels>=3).sum()

191365

In [17]:
def count_consec_vowels(x:str):
    if re.search(r'[aeiouAEIOU]{3,}',x):
        return True
    else:
        return False

In [18]:
words["consec_vowels_3"] = words.apply(lambda row: count_consec_vowels(row.word), axis=1)

In [19]:
print(f"{words.consec_vowels_3.sum()} words with at least 3 vowels in a row")

6182 words with at least 3 vowels in a row


In [20]:
words["consec_cons_4"] = words.word.str.contains(r'[^aeiouAEIOU]{4,}')

In [21]:
print(f"{words.consec_cons_4.sum()} words contain at least 4 consonants in a row")

19241 words contain at least 4 consonants in a row


In [22]:
'a'[-1]

'a'

In [23]:
words["start_end_same"] = words.word.apply(lambda row: row[0]==row[-1])

print(f"{words.start_end_same.sum()} words start and end with the same letter")

9967 words start and end with the same letter


In [24]:
def start_and_ends_with_vowel(x:str):
    if is_vowel(x[0]):
        if is_vowel(x[-1]):
            return True
        else:
            return False
    else:
        return False

In [25]:
words["start_end_vowel"] = words.word.apply(lambda row: start_and_ends_with_vowel(row))

print(f"{words.start_end_vowel.sum()} words start and end with a vowel")

14666 words start and end with a vowel


In [50]:
words["same_letter_3"] = words.word.str.contains(r'([a-z])\1{3}') # doesn't work yet

  words["same_letter_3"] = words.word.str.contains(r'([a-z])\1{3}') # doesn't work yet
