# Simple string searching
- methods on string class
- useful but limited functionality

In [1]:
# index() reports the offset at which the substring starts
s = "foozapbar"
s.index('zap')

3

In [2]:
# substring test: the `in` operator gives a yes/no answer, not a position

'zap' in s

True

In [3]:
# prefix test
s.startswith('foo')

True

In [4]:
# suffix test
s.endswith('bar')

True

# Regular Expressions
- very powerful, widely used
- syntax a tad cryptic at first glance
- Python has a fairly standard implementation, similar to what other languages provide
- module is 're'
- [doc](https://docs.python.org/3.5/library/re.html)
- [more readable doc](https://docs.python.org/3.5/howto/regex.html#regex-howto)

In [6]:
# Pattern anatomy:
#   'x' and 'y'  literal characters, matched as-is
#   '[0-9]'      a character set: any single digit
#   '+'          one or more repetitions of the preceding item
# Taken together: an 'x', then at least one digit, then a 'y'.

import re

pat = r'x[0-9]+y'
s = 'zxcvx9784843845ysdfx234yzX333Ycv234'

In [7]:
# find all substrings that match the pattern
# note: matching is case sensitive, so 'X333Y' is not reported

re.findall(pat, s)

['x9784843845y', 'x234y']

In [8]:
# same search, ignoring case; the flag goes straight to findall,
# so no compiled pattern object is needed

re.findall(pat, s, flags=re.IGNORECASE)

['x9784843845y', 'x234y', 'X333Y']

In [9]:
# the searches above did not modify the original string
s

'zxcvx9784843845ysdfx234yzX333Ycv234'

In [10]:
# replace each match of the pattern with a string
# ('X333Y' is left as-is: no IGNORECASE flag is passed here)

re.sub(pat, 'FOOBAR', s)

'zxcvFOOBARsdfFOOBARzX333Ycv234'

In [11]:
# split on the pattern: the matched text acts as the
# separator and does not appear in the result

re.split(pat, s)

['zxcv', 'sdf', 'zX333Ycv234']

# decrypt with RE

In [None]:
# scrambled text: the cells below pull out each digit plus the run of
# non-digits after it, then keep as many chars as the digit says
e = '{SVIu6Python-)dKct@\\JK)2is:y:=;;~6reallyMZ-&Bk`*6great!NB!|Krj##'

In [None]:
# '[^0-9]' means any char EXCEPT the digits, so each match
# is one digit followed by a run of non-digit chars

words = re.findall('[0-9][^0-9]+', e)
words

In [None]:
# the leading digit of each chunk is the length of the word hidden right after it
for chunk in words:
    size, tail = int(chunk[0]), chunk[1:]
    print(tail[:size])

# RE groups
- groups are enclosed by ()
- great for fishing out what matched

In [12]:
# Three lines of HTML file-listing markup; each one carries an icon <img>,
# a link, a timestamp and a size.
s='''
<img src="/icons/unknown.gif" alt="[   ]"> <a href="Problems_chap2.nb">Problems_chap2.nb</a>       2009-04-22 15:16  171K  
<img src="/icons/layout.gif" alt="[   ]"> <a href="Problems_chap2.pdf">Problems_chap2.pdf</a>      2009-10-12 13:15  252K  
<img src="/icons/unknown.gif" alt="[   ]"> <a href="Style07.nb">Style07.nb</a>              2009-04-22 15:16   12K  
'''
# One entry per line. A plain newline split needs no regex; trimming the
# newlines that open and close the literal first means no empty entries have
# to be sliced away afterwards. Trailing spaces on each line are kept.
urls = s.strip('\n').splitlines()
urls

['<img src="/icons/unknown.gif" alt="[   ]"> <a href="Problems_chap2.nb">Problems_chap2.nb</a>       2009-04-22 15:16  171K  ',
 '<img src="/icons/layout.gif" alt="[   ]"> <a href="Problems_chap2.pdf">Problems_chap2.pdf</a>      2009-10-12 13:15  252K  ',
 '<img src="/icons/unknown.gif" alt="[   ]"> <a href="Style07.nb">Style07.nb</a>              2009-04-22 15:16   12K  ']

In [None]:
# Fish three fields out of each listing line with capture groups:
#   1. the icon path inside src="..."
#   2. the link target inside href="..."
#   3. the file length: digits plus a B/K/M/G (bytes/kilo/mega/giga) suffix
# [^"]+ stops each quoted capture at its own closing quote; a greedy .+
# could run on to a later quote in the same line.
listing_pat = r'.+src="([^"]+)" .+href="([^"]+)".+ ([0-9]+[BKMG])'

for u in urls:
    m = re.match(listing_pat, u)
    if m is None:
        # re.match returns None for a line that does not fit the pattern;
        # report it instead of failing on m.groups()
        print('no match:', u)
        continue
    print(m.groups())