# Il modulo `re` per le espressioni regolari

Sulla falsariga di [Regular Expression HOWTO](https://docs.python.org/3/howto/regex.html) e [Regular expression operations](https://docs.python.org/3/library/re.html) dalla documentazione ufficiale.

In [None]:
import re

## Uso di base

In [None]:
# raw string (https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals)

print(r'a\nb')

a\nb


In [None]:
# uso diretto

re.match(r'a|b', 'b')

<re.Match object; span=(0, 1), match='b'>

In [None]:
# o pre-compilando il pattern

p = re.compile(r'a|b')

p.match('b')

<re.Match object; span=(0, 1), match='b'>

In [None]:
# differenza fullmatch/match/search

p.fullmatch('xa'), p.fullmatch('ay')

(None, None)

In [None]:
p.match('xa'), p.match('ay')

(None, <re.Match object; span=(0, 1), match='a'>)

In [None]:
p.search('xay')

<re.Match object; span=(1, 2), match='a'>

In [None]:
# tutti?

list(p.finditer('mamma bella'))

[<re.Match object; span=(1, 2), match='a'>,
 <re.Match object; span=(4, 5), match='a'>,
 <re.Match object; span=(6, 7), match='b'>,
 <re.Match object; span=(10, 11), match='a'>]

In [None]:
# ma anche più semplicemente

p.findall('banana')

['b', 'a', 'a', 'a']

## Accedere alle sottostringhe

In [None]:
prefix = '02'
number = '342573'
telephone = prefix + '/' + number

In [None]:
# gruppi "semplici"

p = re.compile(r'([0-9]+)/([0-9]*)')

In [None]:
m = p.match(telephone)
m.groups()

('02', '342573')

In [None]:
# gruppi "annidati"

p = re.compile(r'(([0-9]+)/)?([0-9]*)')

In [None]:
m0, m1 = p.match(telephone), p.match(number)
m0.groups(), m1.groups()

(('02/', '02', '342573'), (None, None, '342573'))

In [None]:
# gruppi "denominati"

p = re.compile(r'((?P<prefix>[0-9]+)/)?(?P<number>[0-9]+)')

In [None]:
m0, m1 = p.match(telephone), p.match(number)
m0.groupdict(), m1.groupdict()

({'prefix': '02', 'number': '342573'}, {'prefix': None, 'number': '342573'})

In [None]:
# gruppi senza cattura

p = re.compile(r'(?:([0-9]+)/)?([0-9]*)')

In [None]:
m0, m1 = p.match(telephone), p.match(number)
m0.groups(), m1.groups()

(('02', '342573'), (None, '342573'))

## Rimpiazzare e spezzare

In [None]:
text = 'this number 12312515 will be censored, this 44 also'

In [None]:
# con una stringa fissa

p = re.compile(r'[0-9]')

p.sub('..', text)

'this number ................ will be censored, this .... also'

In [None]:
# con una stringa contenente il gruppo

p = re.compile(r'([a-z]+)')

p.sub(r'<\1>', text)

'<this> <number> 12312515 <will> <be> <censored>, <this> 44 <also>'

In [None]:
p = re.compile(r'((?P<prefix>[0-9]+)/)?(?P<number>[0-9]+)')

p.sub(r'Il preffiso è \g<prefix>, il numero è \g<number>.', '02/342573')

'Il preffiso è 02, il numero è 342573.'

In [None]:
# rot 13 

import codecs

def rot13(match):
    """Return the ROT13 encoding of the text matched by *match*.

    The signature (one match object in, one string out) is the one expected
    of a replacement callable by ``re.sub`` / ``Pattern.sub``.
    """
    return codecs.encode(match.group(), 'rot_13')

In [None]:
# con una funzione 

p = re.compile(r'[a-z]+')

p.sub(rot13, text)

'guvf ahzore 12312515 jvyy or prafberq, guvf 44 nyfb'

In [None]:
# spezzare

re.split(r'(?:,|;) ', 'Split, a long sentence, according to what; is needed')

['Split', 'a long sentence', 'according to what', 'is needed']

## Greediness e numero di match

In [None]:
text = '<html><head><title>Title</title>'

tags = re.findall(r'<(.*)>', text)
tags

['html><head><title>Title</title']

In [None]:
tags = re.findall(r'<(.*?)>', text)
tags

['html', 'head', 'title', '/title']

In [None]:
list(re.finditer(r'm{1,2}', 'mamma che buono mmmm'))

[<re.Match object; span=(0, 1), match='m'>,
 <re.Match object; span=(2, 4), match='mm'>,
 <re.Match object; span=(16, 18), match='mm'>,
 <re.Match object; span=(18, 20), match='mm'>]

## Metacaratteri e flags

In [None]:
# . ^ $ * + ? { } [ ] \ | ( )

In [None]:
# set negati

re.findall(r'[^aeiou]', 'just consonants')

['j', 's', 't', ' ', 'c', 'n', 's', 'n', 'n', 't', 's']

In [None]:
# commenti

re.match(r"""
a # una a
| # oppure
b # una b
""", 'ba', re.VERBOSE)

<re.Match object; span=(0, 1), match='b'>

In [None]:
# end of line

p = re.compile(r'a$')

p.search('xa'), p.search('ay')

(<re.Match object; span=(1, 2), match='a'>, None)

In [None]:
# beginning of line 

p = re.compile(r'^a')

p.search('xa'), p.search('ay')

(None, <re.Match object; span=(0, 1), match='a'>)

In [None]:
# multiline

text = """I've seen things you people wouldn't believe.
Attack ships on fire off the shoulder of Orion.
I watched C-beams glitter in the dark near the Tannhäuser Gate.
All those moments will be lost in time, like tears in rain.
Time to die."""

In [None]:
# no flags

( re.findall(r'^\S+', text), 
 re.findall(r'\S+$', text) )

(["I've"], ['die.'])

In [None]:
# multiline

( re.findall(r'^\S+', text, re.MULTILINE), 
  re.findall(r'\S+$', text, re.MULTILINE) )

(["I've", 'Attack', 'I', 'All', 'Time'],
 ['believe.', 'Orion.', 'Gate.', 'rain.', 'die.'])

In [None]:
# dotall

re.findall(r'\w+.+?\w+', text, re.DOTALL)

["I've",
 'seen things',
 'you people',
 "wouldn't",
 'believe.\nAttack',
 'ships on',
 'fire off',
 'the shoulder',
 'of Orion',
 'I watched',
 'C-beams',
 'glitter in',
 'the dark',
 'near the',
 'Tannhäuser Gate',
 'All those',
 'moments will',
 'be lost',
 'in time',
 'like tears',
 'in rain',
 'Time to',
 'die']

### Bordi

In [None]:
# boundary

re.findall(r'\w*s\b', 'this is a set of words ending with s')

['this', 'is', 'words', 's']

In [None]:
re.findall(r'\w*\Br\B\w*', 'this banner sports rare words having an r inside')

['sports', 'rare', 'words']

### Classi

In [None]:
text = 'only 123 number, 456 pass! or else?'

In [None]:
# numbers

re.findall(r'\d+', text), re.findall(r'\D+', text)

(['123', '456'], ['only ', ' number, ', ' pass! or else?'])

In [None]:
# alphanumeric

re.findall(r'\w+', text), re.findall(r'\W+', text)

(['only', '123', 'number', '456', 'pass', 'or', 'else'],
 [' ', ' ', ', ', ' ', '! ', ' ', '?'])

In [None]:
# whitespace

re.findall(r'\s+', text), re.findall(r'\S+', text)

([' ', ' ', ' ', ' ', ' ', ' '],
 ['only', '123', 'number,', '456', 'pass!', 'or', 'else?'])

## Lookahead

In [None]:
# positivo

re.findall(r'(?=pino)\S+', 'pinocchio ranocchio pinolo' )

['pinocchio', 'pinolo']

In [None]:
# negativo

re.findall(r'\b(?!pino)\S+', 'pinocchio ranocchio pinolo' )

['ranocchio']

In [None]:
p = re.compile(r'.*\.(?!bat$|exe$)[^.]*$')

for f in 'a.txt', 'b.exe', 'c.bat', 'd.md':
    print(p.match(f))

<re.Match object; span=(0, 5), match='a.txt'>
None
None
<re.Match object; span=(0, 4), match='d.md'>


## Backreference

In [None]:
# start and end with the same char

re.match(r'(.)\d+(\1)', '1001'), re.match(r'(.)\d+(\1)', '1002')

(<re.Match object; span=(0, 4), match='1001'>, None)

In [None]:
# oh my god, not even context-free!

re.match(r'(\w+)(\1)', 'abbaabba')

<re.Match object; span=(0, 8), match='abbaabba'>

## Efficienza

In [None]:
%%time 
# veloce se appartiene 

re.match(r'(a+)+c', 'a' * 25 + 'c')

CPU times: user 103 µs, sys: 4 µs, total: 107 µs
Wall time: 110 µs


<re.Match object; span=(0, 26), match='aaaaaaaaaaaaaaaaaaaaaaaaac'>

In [None]:
%%time 
# mortale se non appartiene 

re.match(r'(a+)+c', 'a' * 25 + 'b')

CPU times: user 1.81 s, sys: 7.61 ms, total: 1.82 s
Wall time: 1.82 s


# Un Tokenizer

In [None]:
# Token kinds, each paired with the compiled pattern that recognises it.
# The order is significant when the entries are tried in sequence: NUMBER
# precedes IDENTIFIER because \w+ would also accept a run of digits.
KIND_PATTERN = tuple(
    (name, re.compile(regex))
    for name, regex in (
        ('NUMBER',       r'\d+'),
        ('IDENTIFIER',   r'\w+'),
        ('OP',           r'[+*/-]'),
        ('OPEN_PAREN',   r'\('),
        ('CLOSED_PAREN', r'\)'),
        ('WS',           r'\s+'),
    )
)

In [None]:
# In modo "diretto"

INPUT = '24 + pippo * (123)'

rest = INPUT
while rest:
    for kind, pattern in KIND_PATTERN:
        m = pattern.match(INPUT)
        if m:
            l = m.span()[1]
            value, rest = rest[:l], rest[l:]
            if kind != 'WS': print(kind, value)
            continue

NUMBER 24
IDENTIFIER  +
NUMBER  p
IDENTIFIER ip
NUMBER po
IDENTIFIER  *
NUMBER  (
IDENTIFIER 12
NUMBER 3)
IDENTIFIER 


In [None]:
# più pitonicamente

# Fold every kind into a single alternation of named groups, keeping the
# KIND_PATTERN order so earlier kinds win over later ones.
named_groups = [
    '(?P<{}>{})'.format(kind, pattern.pattern)
    for kind, pattern in KIND_PATTERN
]
p = re.compile('|'.join(named_groups))

INPUT = '24 + pippo * (123)'

# Exactly one alternative takes part in each match, so `lastgroup` is the
# token kind and the whole match is the token text.
for m in p.finditer(INPUT):
    if m.lastgroup != 'WS':
        print(m.lastgroup, m.group())

NUMBER 24
OP +
IDENTIFIER pippo
OP *
OPEN_PAREN (
NUMBER 123
CLOSED_PAREN )
