In [1]:
# "Unicode provides a unique number for every character, no matter what the platform, no matter what the program,
# no matter what the language." -The Unicode Consortium

# Python 3 strings are Unicode strings.

In [2]:
# Create a function that:
# -Takes a Python Unicode character
# -Looks up its name
# -Looks up the character again from the name (confirmation step)

def unicode_test(value):
    """Print a character, its Unicode name, and the character found again from that name.

    value: a single-character string. Its official Unicode name is looked up,
    then that name is looked up again as a round-trip confirmation step.

    Returns None; the three pieces are written to standard output on one line.
    unicodedata.name() raises ValueError for a character that has no name.
    """
    import unicodedata  # imported locally, so the name is bound only inside this function

    char_name = unicodedata.name(value)
    round_trip = unicodedata.lookup(char_name)
    fields = (value, char_name, round_trip)
    print('value="%s", name="%s", value2="%s"' % fields)

In [3]:
unicode_test('A')

value="A", name="LATIN CAPITAL LETTER A", value2="A"


In [4]:
unicode_test('$')

value="$", name="DOLLAR SIGN", value2="$"


In [5]:
unicode_test('\u00a2') # Specify by Unicode ID

value="¢", name="CENT SIGN", value2="¢"


In [6]:
unicode_test('\u20ac')

value="€", name="EURO SIGN", value2="€"


In [7]:
unicode_test('\u2603')

value="☃", name="SNOWMAN", value2="☃"


In [9]:
import unicodedata # Need to import because outside of function.

unicodedata.name('\u00e9')

'LATIN SMALL LETTER E WITH ACUTE'

In [10]:
unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')

'é'

In [11]:
# Use Latin small letter e with acute in a string by calling Unicode ID.

place = 'caf\u00e9'
place

'café'

In [13]:
# Use Latin small letter e with acute in a string by calling name.

other_place = 'caf\N{LATIN SMALL LETTER E WITH ACUTE}'
other_place

'café'

In [15]:
# Build a string with a Unicode character by appending.

u_umlaut = '\N{LATIN SMALL LETTER U WITH DIAERESIS}'
u_umlaut

'ü'

In [17]:
drink = 'Gew' + u_umlaut + 'rztraminer'

print('Now I can finally have my', drink, 'in a ', place)

Now I can finally have my Gewürztraminer in a  café


In [18]:
# "The string len function counts Unicode characters, not bytes."

len('$')

1

In [19]:
len('\U0001f47b')

1

In [20]:
unicode_test('\U0001f47b')

value="👻", name="GHOST", value2="👻"


In [21]:
# "UTF-8 is the standard text encoding in Python, Linux, and HTML...If you use UTF-8 encoding throughout
# your code, life will be much easier than trying to hop in and out of various encodings." Python provides:
# - A way to encode character strings to bytes
# - A way to decode bytes to character strings

In [None]:
# encode()
# - First argument is encoding name
# - Second argument helps to avoid exceptions; default is 'strict'
#   but can use 'ignore' (throw away anything that won't encode)
#   'replace' (replace anything that won't encode with a '?')
#   'backslashreplace' (replace anything that won't encode with a Python backslash escape, e.g. \u2603)
#   'xmlcharrefreplace' (produce character entity strings for use in web pages)

In [22]:
snowman = '\u2603'

In [23]:
len(snowman)

1

In [24]:
ds = snowman.encode('utf-8') # Encode Unicode snowman character to byte sequence

In [25]:
len(ds)

3

In [26]:
ds

b'\xe2\x98\x83'

In [27]:
snowman.encode('ascii', 'ignore')

b''

In [28]:
snowman.encode('ascii', 'replace')

b'?'

In [29]:
snowman.encode('ascii', 'backslashreplace')

b'\\u2603'

In [30]:
snowman.encode('ascii', 'xmlcharrefreplace')

b'&#9731;'

In [31]:
# decode()
# Text from external sources is encoded as byte strings,
# so we need to know which encoding was used in the first place, to eventually get Unicode strings.

In [33]:
place = 'caf\u00e9'
place

'café'

In [34]:
type(place)

str

In [35]:
place_bytes = place.encode('utf-8') # Encode in a bytes variable
place_bytes

b'caf\xc3\xa9'

In [36]:
type(place_bytes)

bytes

In [37]:
place2 = place_bytes.decode('utf-8') # Decode byte string back to a Unicode string
place2

'café'

In [38]:
# Formatting strings: Old style with %s
# - "The %s inside the string means to interpolate a string."
# - "The number of % appearances in the string needs to match the number of data items after the %."
# - Single data item goes right after the %; group multiple items into a tuple with format (item1, item2)
# - "You can add other values between the % and the type specifier 
#   to designate minimum and maximum widths, alignment, and character filling."

In [39]:
'%s' % 42 # Format as string

'42'

In [40]:
'%d' % 42 # Format as decimal integer

'42'

In [41]:
'%x' % 42 # Format as hex integer

'2a'

In [42]:
'%o' % 42 # Format as octal integer

'52'

In [43]:
'%s' % 7.03 # Format float as string

'7.03'

In [44]:
'%f' % 7.03 # Format float as decimal float

'7.030000'

In [45]:
'%e' % 7.03 # Format float as exponential float

'7.030000e+00'

In [46]:
'%g' % 7.03 # Format float as decimal OR exponential float

'7.03'

In [47]:
'%d%%' % 100 # An integer and a literal '%'

'100%'

In [48]:
actor = 'Richard Gere'
cat = 'Chester'
weight = 28

In [49]:
"My wife's favorite actor is %s." % actor

"My wife's favorite actor is Richard Gere."

In [50]:
"My cat, %s, weighs %s pounds." % (cat, weight)

'My cat, Chester, weighs 28 pounds.'

In [51]:
n = 42
f = 7.03
s = 'string cheese'

In [52]:
'%d %f %s' % (n,f,s) # Format as decimal integer, float, and string with default widths

'42 7.030000 string cheese'

In [55]:
# Min field width 10 chars per variable, right align and fill unused chars with spaces
'%10d %10f %10s' % (n,f,s)

'        42   7.030000 string cheese'

In [56]:
# Same field width, left align
'%-10d %-10f %-10s' % (n,f,s)

'42         7.030000   string cheese'

In [54]:
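# Field width 10 chars, precision 4, right align. Precision means: minimum digits for the integer (zero-padded),
# digits after the decimal point for the float, and maximum characters for the string (truncated).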
'%10.4d %10.4f %10.4s' % (n,f,s) 

'      0042     7.0300       stri'

In [57]:
# Default field width, precision 4 (integer zero-padded to 4 digits, 4 decimal places, string truncated to 4 chars)
'%.4d %.4f %.4s' % (n,f,s)

'0042 7.0300 stri'

In [59]:
# Get field widths from arguments instead of hard-coding.
'%*.*d %*.*f %*.*s' % (10,4,n,10,4,f,10,4,s)

'      0042     7.0300       stri'

In [61]:
# New-style formatting with {} and format, as recommended in Python 3.

# Simplest usage includes spacing as entered, default formatting
'{} {} {}'.format(n,f,s)

'42 7.03 string cheese'

In [62]:
# Specify the order in which arguments appear in formatted string (default formatting)
# Here {2} places the third argument (n) first, then {0} the first (f), then {1} the second (s)

'{2} {0} {1}'.format(f,s,n)

'42 7.03 string cheese'

In [64]:
# Named arguments with specifiers included, default formatting
'{n} {f} {s}'.format(n=42, f=7.03, s='string cheese')

'42 7.03 string cheese'

In [66]:
# Arguments combined into a dictionary
# {0} is entire dictionary, {1} is the string 'other'
# Default formatting
d = {'n':42, 'f':7.03, 's':'string cheese'}
'{0[n]} {0[f]} {0[s]} {1}'.format(d, 'other')

'42 7.03 string cheese other'

In [67]:
# New-style formatting specifications and positional arguments
'{0:d} {1:f} {2:s}'.format(n,f,s)

'42 7.030000 string cheese'

In [68]:
# Same values as named arguments, new-style formatting
'{n:d} {f:f} {s:s}'.format(n=42, f=7.03, s='string cheese')

'42 7.030000 string cheese'

In [69]:
# Min field width 10; numbers right-align by default, strings left-align by default
'{0:10d} {1:10f} {2:10s}'.format(n,f,s)

'        42   7.030000 string cheese'

In [70]:
# Min field width 10, explicit right-align with >
# Each replacement field names its own positional argument: 0 -> n, 1 -> f, 2 -> s.
# Repeating {0} in the second field would format n again (as a float) and never show f.
'{0:>10d} {1:>10f} {2:>10s}'.format(n,f,s)

'        42   7.030000 string cheese'

In [71]:
# Min field width 10, left-align
'{0:<10d} {1:<10f} {2:<10s}'.format(n,f,s)

'42         7.030000   string cheese'

In [72]:
# Min field width 10, centered
'{0:^10d} {1:^10f} {2:^10s}'.format(n,f,s)

'    42      7.030000  string cheese'

In [73]:
# Cannot use precision value (after decimal point) for integers with new-style formatting,
# but can still use it to specify number of digits after the decimal for floats, 
# and max characters for strings.

# Will return error
'{0:>10.4d} {1:>10.4f} {2:>10.4s}'.format(n,f,s)

ValueError: Precision not allowed in integer format specifier

In [74]:
'{0:>10d} {1:>10.4f} {2:>10.4s}'.format(n,f,s)

'        42     7.0300       stri'

In [76]:
# Fill character lets you add any character to pad output fields
# Enter it right after the colon, before any alignment or width specifiers

'{0:*^20s}'.format('BIG SALE')

'******BIG SALE******'

In [77]:
# Matching with regex
# Methods used with re
# - match(): Match the pattern at the beginning of source only
# - search(): Return the first match, if any
# - findall(): Return a list of all non-overlapping matches, if any
# - split(): Split source at matches with pattern and return list of string pieces
# - sub(): Takes a replacement arg, changes all parts of source that match pattern to replacement

In [78]:
import re

source = 'Young Frankenstein'
m = re.match('You', source)

In [79]:
if m:
    print(m.group()) # If match returns an object, see what matched

You


In [80]:
m = re.match('^You', source) # Start anchor same as default match

if m:
    print(m.group())

You


In [81]:
# Default behavior won't find pattern that doesn't occur at beginning of source

m = re.match('Frank', source)

if m:
    print(m.group())

In [82]:
m = re.search('Frank', source)

if m:
    print(m.group())

Frank


In [84]:
m = re.match('.*Frank', source) # Add .* to match anything before pattern

if m:
    print(m.group())

Young Frank
