### Strings...

In [6]:
# A Python 3 str is a sequence of Unicode characters, so len() and indexing
# both count characters rather than encoded bytes.
s = '深入 Python'
print(len(s), s[0], sep='\n')

9
深


### String methods

In [10]:
# Positional replacement fields: {0} and {1} are filled, in order, from the
# arguments passed to format(). The values below are throwaway demo strings.
user, pw = "js", "somtam"
"{0}'s password is {1}".format(user, pw)

"js's password is somtam"

In [12]:
# Size suffixes in increasing order; the list is reused by the format-string
# examples that follow.
si_suffixes = ['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']
si_suffixes

['KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB']

In [13]:
si_suffixes[0]  # lists are zero-indexed, so this is the first suffix

'KB'

In [17]:
# Inside a replacement field, {0[2]} indexes into positional argument 0, so passing
# the whole list lets the template pick elements 2 ('GB') and 3 ('TB').
# (To produce "1000KB = 1MB" instead, use {0[0]} and {0[1]}.)
"1000{0[2]} = 1{0[3]}".format(si_suffixes) # argument 0 is the list, so 0[num] is one of its elements

'1000GB = 1TB'

In [28]:
# NOTE(review): humansize is not a standard-library module; presumably it lives in the
# project directory named in the commented-out chdir below — confirm it is importable.
import sys, os, glob, humansize
# os.chdir("/Users/Calvin/PycharmProjects/Python-playground/")
# Replacement fields can chain attribute access and lookups. Dictionary keys inside a
# field are written without quotes, hence modules[humansize] rather than modules['humansize'].
'1MB = 1000{0.modules[humansize].SUFFIXES[1000][0]}'.format(sys) # sys.modules is the dict of all imported modules

'1MB = 1000KB'

### Fancy formatting (with format specifiers)

In [38]:
# Everything after the colon is the format specifier: ".2e" means exponent
# notation with two digits after the decimal point.
"{0:.2e} {1}".format(628.123456, "GB")

'6.28e+02 GB'

### Some common string methods

In [48]:
# A triple-quoted literal keeps its line breaks, including the newline right
# after the opening quotes and the one before the closing quotes.
s = """
abc
def
ghi
kjl
abc
abc
"""

s

'\nabc\ndef\nghi\nkjl\nabc\nabc\n'

In [49]:
s.split() # with no argument, splits on any run of whitespace (here the newlines) and drops empty strings

['abc', 'def', 'ghi', 'kjl', 'abc', 'abc']

In [50]:
# Keep the list of tokens so the following cell can iterate over it.
s_list = s.split()
s_list

['abc', 'def', 'ghi', 'kjl', 'abc', 'abc']

In [51]:
# Print every token upper-cased, one per line; upper() returns a new string.
for i in s_list:
    print(i.upper())

ABC
DEF
GHI
KJL
ABC
ABC


In [53]:
s.count("A")  # count() is case-sensitive: s only contains lowercase letters, so the result is 0

0

In [55]:
# Sample query string of key=value pairs joined by "&", parsed step by step below.
query = "user=js&database=foo&password=bar"
query

'user=js&database=foo&password=bar'

In [57]:
bar = query.split("&")  # one "key=value" string per list element

In [61]:
# Turn each "key=value" string into a [key, value] pair; entries without "=" are skipped.
list_of_list = [v.split("=", 1) for v in bar if "=" in v] # maxsplit=1: split only at the first "="
list_of_list

[['user', 'js'], ['database', 'foo'], ['password', 'bar']]

In [63]:
# dict() accepts an iterable of two-item sequences and uses them as key/value pairs.
a_dict = dict(list_of_list)
a_dict

{'database': 'foo', 'password': 'bar', 'user': 'js'}

### Bytes!

In [65]:
16 ** 2 # two hexadecimal digits give 16 * 16 = 256 distinct values (0x00-0xff), i.e. one byte

256

In [6]:
# A bytes literal is prefixed with b; \x65 is a hex escape for the byte 0x65,
# which is displayed as its ASCII character 'e'.
by = b"abcd\x65"
by

b'abcde'

In [2]:
type(by)  # a b"..." literal creates a bytes object, not a str

bytes

In [11]:
# Concatenating two bytes objects yields a new bytes object; the appended
# 0xff byte brings the length to six.
by = b"abcd\x65" + b"\xff"
len(by)

6

In [13]:
by[5] # indexing a bytes object yields an int; 0xff == 255 is the largest value a single byte can hold

255

In [14]:
# bytes objects are immutable, so item assignment raises TypeError. The error is
# caught and printed so the demonstration stays visible without aborting a
# top-to-bottom run of the notebook.
try:
    by[0] = 101
except TypeError as err:
    print("TypeError:", err)

TypeError: 'bytes' object does not support item assignment

In [16]:
# try changing byte to bytearrays (mutable)
barr = bytearray(by)
barr

bytearray(b'abcde\xff')

In [18]:
# Item assignment works on a bytearray; 101 is the byte value of ASCII 'e'.
barr[0] = 101
barr

bytearray(b'ebcde\xff')

In [25]:
# Rebind by to a single-byte value for the bytes/str examples.
by = b"d"
by

b'd'

In [20]:
s = "abcde"  # a str (text), in contrast to the bytes object by

In [22]:
# bytes and str cannot be concatenated; Python never converts between them
# implicitly. The TypeError is caught and printed so the demonstration stays
# visible without aborting a top-to-bottom run of the notebook.
try:
    by + s
except TypeError as err:
    print("TypeError:", err)

TypeError: can't concat bytes to str

In [26]:
# decode() converts bytes -> str. The result of this first line is discarded
# because only the cell's last expression is displayed.
by.decode("ascii")
# Once decoded, the value can be used with str methods such as count().
s.count(by.decode("ascii"))

1

In [28]:
# len() of a str counts characters: 2 Chinese characters + 1 space + 6 letters = 9.
a_string = '深入 Python'
len(a_string)

9

In [33]:
# encode() converts str -> bytes using the named encoding.
by = a_string.encode("utf-8")
print(by) # each of the two Chinese characters encodes to 3 bytes in UTF-8
len(by) # 2 characters * 3 bytes + 7 ASCII bytes (" Python") = 13 bytes

b'\xe6\xb7\xb1\xe5\x85\xa5 Python'


13

In [35]:
by.decode("utf-8")  # decoding with the same encoding round-trips back to the original str

'深入 Python'

### REGEX

In [37]:
# Street address used to show why plain str.replace is too blunt here.
s = "NORTH BROAD ROAD"
s


'NORTH BROAD ROAD'

In [39]:
s.replace("ROAD", "RD.") # replaces every "ROAD" substring, including the one inside "BROAD" -- not what we want

'NORTH BRD. RD.'

In [42]:
import re
# "$" anchors the match to the end of the string, so only the trailing "ROAD" is replaced.
re.sub("ROAD$", "RD.", s)

'NORTH BROAD RD.'

In [45]:
# Suppose the address does not end with the separate word "ROAD".
s2 = "NORTH BROAD"
re.sub("ROAD$", "RD.", s2) # "ROAD$" still matches the tail of "BROAD", so the street name is mangled
# What we really want is to replace "ROAD" at the end only when it is a whole word.

'NORTH BRD.'

In [47]:
re.sub("\\bROAD$", "RD.", s2) # \b is a word boundary; in a normal string literal the backslash must itself be escaped (as in R)

'NORTH BROAD'

In [48]:
re.sub(r"\bROAD$", "RD.", s2) # an r"" raw string leaves backslashes alone, so \b needs no extra escaping

'NORTH BROAD'

In [51]:
# What we really want is to replace the whole word "ROAD" anywhere in the string with "RD.".
# \b on both sides requires a word boundary before and after the match.
re.sub(r"\bROAD\b", "RD.", s2)  # result not displayed: only the last expression of a cell is shown
s3 = "NORTH BROAD ROAD APT. 3"
re.sub(r"\bROAD\b", "RD.", s3)

'NORTH BROAD RD. APT. 3'

In [53]:
s3  # no visible effect: only the last expression of a cell is displayed
# Without the leading \b, the "ROAD" inside "BROAD" matches as well.
re.sub(r"ROAD\b", "HAHA", s3)

'NORTH BHAHA HAHA APT. 3'

### REGEX Case Study: Roman Numerals

#### The thousands (M)

In [56]:
import re
# ^ and $ anchor the whole string; each "M?" is an optional M, so the pattern
# accepts zero to three Ms and nothing else.
pattern = "^M?M?M?$"
re.search(pattern, "M")

<_sre.SRE_Match object; span=(0, 1), match='M'>

In [57]:
re.search(pattern, "MM")  # two of the three optional Ms are used

<_sre.SRE_Match object; span=(0, 2), match='MM'>

In [58]:
re.search(pattern, "MMM")  # all three optional Ms are used

<_sre.SRE_Match object; span=(0, 3), match='MMM'>

In [59]:
re.search(pattern, "MMMM") # returns None: the three optional Ms match, but "$" cannot match before the fourth M

In [60]:
re.search(pattern, "") # matches the empty string, since every M is optional

<_sre.SRE_Match object; span=(0, 0), match=''>

In [62]:
# Hundreds place: 900 (CM), 400 (CD), or an optional D followed by up to three Cs.
# Equivalent compact form of the group: (CM|CD|D{0,1}C{0,3})
pattern = "^M?M?M?(CM|CD|D?C?C?C?)$"

In [63]:
re.search(pattern, "MCM")  # M, then the CM alternative

<_sre.SRE_Match object; span=(0, 3), match='MCM'>

In [64]:
re.search(pattern, "MD")  # M, then D with no Cs

<_sre.SRE_Match object; span=(0, 2), match='MD'>

In [65]:
re.search(pattern, "MMMCCC")  # three Ms, then three Cs with no D

<_sre.SRE_Match object; span=(0, 6), match='MMMCCC'>

In [66]:
re.search(pattern, "MCMC") # no match: after "MCM" the trailing C is left over, so "$" fails

In [67]:
re.search(pattern, "")  # still matches the empty string: every component is optional

<_sre.SRE_Match object; span=(0, 0), match=''>

In [70]:
# Same thousands check as before, rewritten with the {n,m} quantifier:
# M{0,3} means "between zero and three M characters".
pattern = "^M{0,3}$"
for numeral in ("", "M", "MM", "MMM", "MMMM"):
    print(re.search(pattern, numeral))


<_sre.SRE_Match object; span=(0, 0), match=''>
<_sre.SRE_Match object; span=(0, 1), match='M'>
<_sre.SRE_Match object; span=(0, 2), match='MM'>
<_sre.SRE_Match object; span=(0, 3), match='MMM'>
None


In [75]:
# Add the tens place: 90 (XC), 40 (XL), or an optional L followed by up to three Xs.
pattern = "^M{0,3}(CM|CD|D?C?C?C?)(XC|XL|L{0,1}X{0,3})$"
for numeral in (
    "MCMXL",    # 1940
    "MCMLX",    # 1960
    "MCLXXX",   # 1180
    "MCLXXXX",  # four Xs in a row: not a valid Roman numeral, so no match
):
    print(re.search(pattern, numeral))

<_sre.SRE_Match object; span=(0, 5), match='MCMXL'>
<_sre.SRE_Match object; span=(0, 5), match='MCMLX'>
<_sre.SRE_Match object; span=(0, 6), match='MCLXXX'>
None


In [82]:
# The ones place follows the same shape (IV, IX, or an optional V plus up to
# three Is); combined with the earlier groups this covers a full numeral.
pattern = "^M{0,3}(CM|CD|D?C?C?C?)(XC|XL|L{0,1}X{0,3})(IV|IX|V?I{0,3})$"
for numeral in (
    "MDLV",             # 1555
    "MMDCLXVI",         # 2666
    "MMMDCCCLXXXVIII",  # 3888
    "I",                # 1
):
    print(re.search(pattern, numeral))

<_sre.SRE_Match object; span=(0, 4), match='MDLV'>
<_sre.SRE_Match object; span=(0, 8), match='MMDCLXVI'>
<_sre.SRE_Match object; span=(0, 15), match='MMMDCCCLXXXVIII'>
<_sre.SRE_Match object; span=(0, 1), match='I'>


### Verbose regular expression matching for better understanding / documentation

In [88]:
# The pattern below is equivalent to "^M{0,3}(CM|CD|D?C?C?C?)(XC|XL|L{0,1}X{0,3})(IV|IX|V?I{0,3})$"
# but is laid out with whitespace and inline comments. It must be used with the
# re.VERBOSE flag; without the flag, the whitespace and "#" text are treated as
# literal characters to match.

pattern = '''
    ^                   # beginning of string
    M{0,3}              # thousands - 0 to 3 Ms
    (CM|CD|D?C{0,3})    # hundreds - 900 (CM), 400 (CD), 0-300 (0 to 3 Cs),
                        #            or 500-800 (D, followed by 0 to 3 Cs)
    (XC|XL|L?X{0,3})    # tens - 90 (XC), 40 (XL), 0-30 (0 to 3 Xs),
                        #        or 50-80 (L, followed by 0 to 3 Xs)
    (IX|IV|V?I{0,3})    # ones - 9 (IX), 4 (IV), 0-3 (0 to 3 Is),
                        #        or 5-8 (V, followed by 0 to 3 Is)
    $                   # end of string
    '''

print(re.search(pattern, "M", re.VERBOSE))
print(re.search(pattern, "MMMDCCCLXXXVIII", re.VERBOSE))
print(re.search(pattern, "MMMDCCCLXXXVIII")) # returns None without re.VERBOSE: the pattern is then read as a compact expression

<_sre.SRE_Match object; span=(0, 1), match='M'>
<_sre.SRE_Match object; span=(0, 15), match='MMMDCCCLXXXVIII'>
None


In [89]:
help(re.compile)  # interactive lookup of re.compile's signature; exploratory, not needed by any other cell

Help on function compile in module re:

compile(pattern, flags=0)
    Compile a regular expression pattern, returning a pattern object.

