## Strings and Text

In [1]:
# Splitting strings on multiple delimiters.
# Most of the time the str.split() method is too simple to solve this
# kind of problem, so using a regular expression (re.split) is recommended:

import re

line = 'asdf fjdk; afed, fjek,asdf,     foo'
# Split on a comma, a semicolon or a whitespace character, followed by any
# amount of additional whitespace.
re.split(r'[,;\s]\s*', line)


['asdf', 'fjdk', 'afed', 'fjek', 'asdf', 'foo']

In [2]:
# If we also need the separators in the resulting list, we can wrap the
# delimiter pattern in a capture group '()'. Example:

re.split(r'(;|,|\s)\s*', line)

['asdf', ' ', 'fjdk', ';', 'afed', ',', 'fjek', ',', 'asdf', ',', 'foo']

In [3]:
# In case we only need to recover the delimiters: with a capture group the
# separators sit at the odd indices of the result, hence the [1::2] slice.
delimiters = re.split(r'(;|,|\s)\s*', line)[1::2]

In [4]:
delimiters

[' ', ';', ',', ',', ',']

In [5]:
# To check the start/end of a string, use str.startswith() / str.endswith():

text = 'example text ends with: hello'
text.endswith('hello')

True

In [6]:
# We can also check against several endings at once by passing a tuple
# as the argument:
filenames= [ 'Makefile', 'foo.c', 'bar.py', 'spam.c', 'spam.h' ]

In [12]:
# Keep only the C and Python sources. Both suffixes need their leading dot:
# a bare 'c' would also accept any name that merely ends in the letter c.
c_and_py_filenames = [name for name in filenames if name.endswith(('.py', '.c'))]

In [13]:
c_and_py_filenames

['foo.c', 'bar.py', 'spam.c']

In [14]:
# Matching strings using shell wildcard patterns (e.g. *.py):
# the fnmatch module is used for this purpose:

import fnmatch

In [16]:
# '*' matches any sequence of characters.
fnmatch.fnmatch('foo.txt', '*.txt')

True

In [17]:
# '?' matches exactly one character.
fnmatch.fnmatch('foo.txt', '?oo.txt')

True

In [18]:
# '[0-9]' matches one character from the given range (here: a digit).
fnmatch.fnmatch('Dat45.csv', 'Dat[0-9]*')

True

In [27]:
# Pattern matching can of course also be done with regular expressions;
# here the pattern checks for three groups of digits separated by slashes:
re.match(r'\d+/\d+/\d+', '23/10/21') is not None

True

In [28]:
# In case we need to use the same pattern many times, it is useful to
# compile it once at the beginning and then call the methods of the
# compiled pattern object instead of the module-level functions:

my_re = re.compile(r'\d+/\d+/\d+')
my_re.match('23/10/21') is not None

True

In [30]:
# In regular expressions it is very common to use capture groups. They
# allow us to retrieve each captured part independently from the match
# object, through its group() / groups() methods:

my_re_with_groups = re.compile(r'(\d+)/(\d+)/(\d+)')
my_match = my_re_with_groups.match('23/23/2021')

In [32]:
# groups() returns every captured group as a tuple; group(0) is the whole
# match and group(1)..group(3) are the individual captured groups.
my_match.groups(), my_match.group(0), my_match.group(1), my_match.group(2), my_match.group(3)

(('23', '23', '2021'), '23/23/2021', '23', '23', '2021')

In [36]:
# Useful example: findall() yields one (month, day, year) tuple of captured
# groups per date found in the text; each one is printed as year-month-day.
text = 'Today is 11/27/2012. PyCon starts 3/13/2013.'
for month, day, year in my_re_with_groups.findall(text):
    print('{}-{}-{}'.format(year, month, day))

2012-11-27
2013-3-13


In [38]:
# When replacing text there are two options: first, str.replace();
# second, the re.sub() function:
#   First param: the pattern to look for.
#   Second param: the replacement text. When using capture groups, \n
#   refers to group number n.
#   Third param: the whole string to process.

text = 'Today is 11/26/2021. PyCon starts 2/13/2022.'
re.sub(r'(\d+)/(\d+)/(\d+)', r'\3-\1-\2', text)

'Today is 2021-11-26. PyCon starts 2022-2-13.'

In [None]:
# It is also possible to pass a callback function instead of a replacement string
# when the substitutions are not as simple as in the examples shown before.

In [49]:
# Stripping text: the easiest way is to use the strip() method
s = '  ------hello     world   \n'


In [50]:
# You can list, in the string argument, every character that should be
# stripped from both ends of the string
s.strip('- \n')

'hello     world'

In [59]:
# The problem is that strip() does not touch the text in the middle; for
# that particular case a regular expression is needed.
# Note: \s already matches newlines, so the \n inside the class is redundant.

re.sub(r'[\s\n-]+', ' ', s).strip()

'hello world'

In [63]:
# Formatting text with justification: > (right), < (left), ^ (centered).
# The number after the alignment character is the total field width.

my_original_string = 'Hello!!'
justified_right = f"{my_original_string:>20}"
justified_left = f"{my_original_string:<20}"
justified_middle = f"{my_original_string:^20}"

justified_right, justified_left, justified_middle

('             Hello!!', 'Hello!!             ', '      Hello!!       ')

In [65]:
# Concatenating text with join: the string the method is called on is
# placed between the parts.

parts = ['Is', 'Chicago', 'Not', 'Chicago?']
' '.join(parts)

'Is Chicago Not Chicago?'

In [68]:
# Create a string in which embedded placeholders are substituted with the
# string representation of a value: positional '{}' fields are filled in
# order, named '{name}' fields are filled from the keyword arguments.

first_usage = '{} has {} messages.'.format('Guido', 37)
second_usage = '{name} has {n} messages'.format(name = 'Guido', n = 37)
print(first_usage, second_usage, sep = '\n')


Guido has 37 messages.
Guido has 37 messages


In [71]:
# Using dictionaries and format_map(): each named placeholder is looked up
# by key in the mapping that is passed in.
name, n = 'Guido', 37
my_dict_of_arguments = {'name': name, 'n': n}
'{name} has {n} messages'.format_map(my_dict_of_arguments)

'Guido has 37 messages'

In [78]:
# The problem arises when the dictionary does not hold one of the keys.
# In that scenario we can create a dict subclass with a __missing__() method.
class MissingDict(dict):
    """dict that renders unknown keys as '{key}' instead of raising KeyError."""

    def __init__(self, *args, **kwargs):
        # Build the dict as usual, then add one extra demo entry.
        super().__init__(*args, **kwargs)
        self['clave_añadida'] = 1

    def __missing__(self, key):
        # Invoked by item lookup for absent keys: hand the placeholder back.
        return "".join(("{", key, "}"))

In [83]:
# Wrap the existing arguments; looking up a key that was never added goes
# through __missing__ and comes back as the key wrapped in braces.
my_dict_of_arguments_w_missing_method = MissingDict(my_dict_of_arguments)
my_dict_of_arguments_w_missing_method['clave_no_añadida']

'{clave_no_añadida}'

In [84]:
# 'role' is not a key of the mapping, so its placeholder is left as-is.
'{name} has {n} messages and his role is: {role}'.format_map(my_dict_of_arguments_w_missing_method)

'Guido has 37 messages and his role is: {role}'

In [85]:
# Reformatting text to a fixed number of columns:
# use textwrap.fill(text, n_columns)

# Adjacent string literals inside parentheses are joined into one string.
s = (
    "Look into my eyes, look into my eyes, the eyes, the eyes, "
    "the eyes, not around the eyes, don't look around the eyes, "
    "look into my eyes, you're under."
)



In [89]:
print(s)

Look into my eyes, look into my eyes, the eyes, the eyes, the eyes, not around the eyes, don't look around the eyes, look into my eyes, you're under.


In [90]:
import textwrap
# Wrap the text so that every output line is at most 20 characters wide.
print(textwrap.fill(s, 20))

Look into my eyes,
look into my eyes,
the eyes, the eyes,
the eyes, not around
the eyes, don't look
around the eyes,
look into my eyes,
you're under.


In [91]:
# With indentation: initial_indent is prepended to the first line and
# subsequent_indent to every following line.
print(textwrap.fill(s, 20, initial_indent = '   ', subsequent_indent = '      '))

   Look into my
      eyes, look
      into my eyes,
      the eyes, the
      eyes, the
      eyes, not
      around the
      eyes, don't
      look around
      the eyes, look
      into my eyes,
      you're under.


In [94]:
# Tokenizing text, i.e. parsing it left to right into a stream of tokens.

text = 'foo = 23 + 42 * 10'

In [95]:
# When working with byte strings instead of text strings, most of the
# features and methods keep working, with a few exceptions; in those cases
# an encoding/decoding step is needed.

my_string = 'Hello World!'
my_b_string = b'Hello World!'

In [96]:
my_string, my_b_string

('Hello World!', b'Hello World!')

In [97]:
# split() works on both types; splitting bytes yields bytes pieces.
my_string.split(), my_b_string.split()

(['Hello', 'World!'], [b'Hello', b'World!'])

In [99]:
# find() works on both types; the bytes version is given a bytes argument.
my_string.find('W'), my_b_string.find(b'W')

(6, 6)

In [100]:
# Byte strings do not provide a .format() method (they only support
# printf-style % formatting). The failing call is wrapped in try/except so
# the notebook still runs from top to bottom while showing the error.
try:
    my_second_b_string = b'My favourate colour is {}'.format('blue')
except AttributeError as exc:
    bytes_format_error = f"{type(exc).__name__}: {exc}"
    print(bytes_format_error)

AttributeError: 'bytes' object has no attribute 'format'

In [102]:
# Solution: decode the bytes to str first, then use str.format().
# Note that, despite its name, my_second_b_string now holds a str.
my_second_b_string = b'My favourate colour is {}'.decode('ascii').format('blue')
my_second_b_string

'My favourate colour is blue'

In [105]:
# Encode the formatted text back to bytes (encode() defaults to UTF-8).
my_second_b_string_encoded = my_second_b_string.encode()
my_second_b_string_encoded

b'My favourate colour is blue'