In [4]:
from regular_expression_visualization.visualize_reg import search_pattern

## Simple pattern

In [53]:
# Plain literal patterns: each one matches only that exact character sequence.
patterns = ['ee', 'ea', 'ai', 'aa']
strings = ['tee', 'tea', 'bail']

search_pattern(patterns, strings)

Unnamed: 0,ee,ea,ai,aa
,,,,
,,,,
tee,tee,tee,tee,tee


## One of several patterns

Use ```|``` to separate the alternative patterns

In [54]:
# A single pattern offering three alternatives: ee, ea or ai.
patterns = ['ee|ea|ai']
strings = ['tee', 'tea', 'bail']

search_pattern(patterns, strings)

Unnamed: 0,ee|ea|ai
,
,
tee,tee


Pattern order matters

In [101]:
 patterns =  [
    'oo|ooo', # oo is tried first
    'ooo|oo', # ooo is tried first
]
strings = ['loong', 'looong', 'long']

search_pattern(patterns, strings)

Unnamed: 0,oo|ooo,ooo|oo
,,
,,
loong,loong,loong


When a "one of several patterns" expression is preceded or followed by other parts of the regular expression, wrap it in ```()``` to separate it from them

In [103]:
patterns = [
    # b followed by either ea or ee
    'b(ea|ee)',
    # without the parentheses, | splits the whole pattern: bea, or ee
    'bea|ee',
]
strings = ['bead', 'bee']

search_pattern(patterns, strings)

Unnamed: 0,b(ea|ee),bea|ee
,,
bead,bead,bead


## Quantifiers

### appear m to n times

Use ```{m,n}```

In [107]:
patterns = [
    # three literal o characters
    'ooo',
    # o repeated exactly three times (same as above)
    'o{3}',
    # o repeated two to three times
    'o{2,3}',
    # the blank inside the braces stops this from acting as the {2,3} quantifier
    'o{2, 3}',
    # o repeated at least two times
    'o{2,}',
    # l followed by zero to three o
    'lo{,3}',
    # zero to three o: zero repetitions are allowed, so an empty match is
    # possible at any position that is not followed by an o
    'o{,3}',
]
strings = ['looong', 'long', 'loong']

search_pattern(patterns, strings)

Unnamed: 0,ooo,o{3},"o{2,3}","o{2, 3}","o{2,}","lo{,3}","o{,3}"
,,,,,,,
,,,,,,,
looong,looong,looong,looong,looong,looong,looong,looong


### appear at least once

In [108]:
patterns = [
    # one or more o, then n
    'o+n',
    # + is shorthand for {1,}
    'o{1,}n',
]
strings = ['looong', 'long', 'bug']

search_pattern(patterns, strings)

Unnamed: 0,o+n,"o{1,}n"
,,
,,
looong,looong,looong


### appear zero or more times

In [109]:
patterns = [
    # l, then zero or more o, then ng
    'lo*ng',
    # * is shorthand for {0,}
    'lo{0,}ng',
]
strings = ['long', 'lng', 'loong', 'leong']

search_pattern(patterns, strings)

Unnamed: 0,lo*ng,"lo{0,}ng"
,,
,,
,,
long,long,long


### appear zero or one time

In [110]:
patterns = [
    # apple, optionally followed by a single s
    'apples?',
    # ? is shorthand for {0,1}
    'apples{0,1}',
]
strings = ['apple', 'apples']

search_pattern(patterns, strings)

Unnamed: 0,apples?,"apples{0,1}"
,,
apple,apple,apple


### non-greedy mode

By default, regular expressions try to match the longest possible string (greedy mode). If this is not desired, add ```?``` after a quantifier to match the shortest possible string instead (non-greedy mode).

In [7]:
patterns = [
    # greedy: .+ takes as many characters as it can before the closing #
    '#.+#',
    # non-greedy: .+? takes as few characters as it can before the closing #
    '#.+?#',
]
strings = ['#Chapter 2#  #Ongoing#']

search_pattern(patterns, strings)

Unnamed: 0,#.+#,#.+?#
#Chapter 2# #Ongoing#,#Chapter 2# #Ongoing#,#Chapter 2# #Ongoing#


## Sub expression

use ```()```

In [115]:
patterns = [
    # ba, then the group na repeated twice
    'ba(na){2}',
    # the same thing written out
    'banana',
    # without a group, {2} applies only to the last a: ban + aa
    'bana{2}',
    # the same thing written out
    'banaa',
]
strings = ['banana', 'banaa']

search_pattern(patterns, strings)

Unnamed: 0,ba(na){2},banana,bana{2},banaa
,,,,
banana,banana,banana,banana,banana


In [116]:
patterns = [
    # two consecutive pieces that each match a+_+; the two pieces need not be the same text
    '(a+_+){2}',
    # the same thing written out
    'a+_+a+_+',
    # a single piece, for comparison
    'a+_+',
]
strings = ['aa_a__', 'a_', 'a__a_a_']

search_pattern(patterns, strings)

Unnamed: 0,(a+_+){2},a+_+a+_+,a+_+
,,,
,,,
aa_a__,aa_a__,aa_a__,aa_a__


## Character Set

### Any character
```.``` stands for any character

In [63]:
patterns = [
    # b, any one character, d
    'b.d',
    # be followed by any two characters
    'be..',
]
strings = ['bed', 'bid', 'bee', 'benign', 'beed']

search_pattern(patterns, strings)

Unnamed: 0,b.d,be..
,,
,,
,,
,,
bed,bed,bed


### Any character in a set
Use ```[...]```

In [64]:
patterns = [
    # b, then e or i, then d
    'b[ei]d',
    # the same choice spelled out with alternation
    'bed|bid',
]
strings = ['bed', 'bid', 'bee', 'bud']

search_pattern(patterns, strings)

Unnamed: 0,b[ei]d,bed|bid
,,
,,
,,
bed,bed,bed


Use ```-``` for character range

In [65]:
patterns = [
    # id_ followed by one digit in the range 0 to 5
    'id_[0-5]',
    # the same set with every member listed
    'id_[012345]',
]
strings = ['id_1', 'id_6']

search_pattern(patterns, strings)

Unnamed: 0,id_[0-5],id_[012345]
,,
id_1,id_1,id_1


In [117]:
patterns = [
    # type_ followed by one character from the range a-e, or x
    'type_[a-ex]',
    # the same set with every member listed
    'type_[abcdex]',
    # type_ followed by any ASCII letter, lower or upper case
    'type_[a-zA-Z]',
]
strings = ['type_a', 'type_b', 'type_x', 'type_Z']

search_pattern(patterns, strings)

Unnamed: 0,type_[a-ex],type_[abcdex],type_[a-zA-Z]
,,,
,,,
,,,
type_a,type_a,type_a,type_a


Don't misuse ```[...]``` to express a choice between patterns! It is a set of single characters, so ```|``` inside it is just another character.

In [12]:
 patterns =  [
    '(公斤|千克).+', # 公斤 or 千克     
    '[公斤|千克].+', # character set containing 公，斤，千, 克 and |
] 
strings = ['购买2公斤苹果', '开公司', '保持克制', '<<|>>']

search_pattern(patterns, strings)

Unnamed: 0,(公斤|千克).+,[公斤|千克].+
,,
,,
,,
购买2公斤苹果,购买2公斤苹果,购买2公斤苹果


### Any character not in set

Use ```[^...]```

In [67]:
# type_ followed by one character that is NOT in the range a-z
patterns = ['type_[^a-z]']
strings = ['type_1', 'type_a', 'type_c']

search_pattern(patterns, strings)

Unnamed: 0,type_[^a-z]
,
,
type_1,type_1


### Any number

Use ```\d```

In [68]:
# Raw strings (r'...') keep the backslash for the regex engine; '\d' in a
# normal string is an invalid Python escape sequence and triggers a warning.
patterns = [
    r'id_\d\d',  # id_ + digit + digit
    'id_[0-9][0-9]',  # the same for ASCII digits
]
strings = ['id_12', 'id_0', 'id']

search_pattern(patterns, strings)

Unnamed: 0,id_\d\d,id_[0-9][0-9]
,,
,,
id_12,id_12,id_12


### Any non-number character

Use ```\D```

In [69]:
# Raw string so that the backslash in \D reaches the regex engine without an
# invalid-escape warning from Python.
patterns = [
    r'e\D',  # e + one character that is not a digit
    'e[^0-9]',  # the same for ASCII digits
]
strings = ['bee', 'tel', 'te1']

search_pattern(patterns, strings)

Unnamed: 0,e\D,e[^0-9]
,,
,,
bee,bee,bee


### Any word character

Use ```\w```, word character means a-z, A-Z, 0-9 and _

In [70]:
# Raw string so that the backslash in \w reaches the regex engine without an
# invalid-escape warning from Python.
patterns = [
    r'\w+',  # one or more word characters
    '[a-zA-Z0-9_]+',  # the same for ASCII text
]
strings = [':id_1.']

search_pattern(patterns, strings)

Unnamed: 0,\w+,[a-zA-Z0-9_]+
:id_1.,:id_1.,:id_1.


### Any non-word characters

Use ```\W```

In [71]:
# Raw string so that the backslash in \W reaches the regex engine without an
# invalid-escape warning from Python.
patterns = [
    r'\W+',  # one or more non-word characters
    '[^a-zA-Z0-9_]+',  # the same for ASCII text
]
strings = ['id_1 + id_2']

search_pattern(patterns, strings)

Unnamed: 0,\W+,[^a-zA-Z0-9_]+
id_1 + id_2,id_1 + id_2,id_1 + id_2


### Any space

Use ```\s```

In [72]:
patterns = [
    # Raw string: '\s' in a normal string is an invalid Python escape sequence.
    r'\s.*\s',  # whitespace + any string + whitespace
    # Left as a normal string: \t \n \f \r are valid Python escapes, so the set
    # holds the real tab/newline/form-feed/carriage-return characters.
    # Close to \s, which additionally matches \v and other Unicode whitespace.
    '[\t\n\f\r ].*[\t\n\f\r ]',
]
strings = ['Monkey D Luffy']

search_pattern(patterns, strings)

Unnamed: 0,\s.*\s,[ ].*[ ]
Monkey D Luffy,Monkey D Luffy,Monkey D Luffy


### Any Non Space

In [119]:
patterns = [
    # Raw string: '\S' in a normal string is an invalid Python escape sequence.
    r'\S.+\S',  # non-whitespace + at least one character + non-whitespace
    # Left as a normal string: \t \n \f \r are valid Python escapes.
    # Similar to the pattern above, but .* also lets the two ends be adjacent.
    '[^\t\n\f\r ].*[^\t\n\f\r ]',
]
strings = ['on the\ntree']

search_pattern(patterns, strings)

Unnamed: 0,\S.+\S,[^ ].*[^ ]
on the tree,on the tree,on the tree


## Escaping

As you have seen, many characters such as ```(```, ```.``` and ```+``` have special meanings in regular expressions. To disable the special meaning and search for the character itself, add ```\``` before it

In [120]:
# Raw strings keep every backslash for the regex engine and avoid Python's
# invalid-escape warnings for \d, \(, \$, \. and \).
patterns = [
    r'($\d+.\d+)',  # unescaped: ( ) form a group, $ is an end anchor, . is any character
    r'\(\$\d+\.\d+\)',  # escaped: ( ) $ . are matched literally; + is still a quantifier
]
strings = ['apple ($3.25)']

search_pattern(patterns, strings)

Unnamed: 0,($\d+.\d+),\(\$\d+\.\d+\)
apple ($3.25),apple ($3.25),apple ($3.25)


## Anchor

Anchors are checked during the search but are not part of the matching result

### followed by

Use ```(?=...)```

In [88]:
# Raw strings avoid Python's invalid-escape warnings for \w and \.
patterns = [
    r'\w+(?=\.)',  # word characters followed by a period; the period is not part of the match
    r'\w+\.',  # word characters + period; the period is part of the match
]
strings = ['Apple juice.']

search_pattern(patterns, strings)

Unnamed: 0,\w+(?=\.),\w+\.
Apple juice.,Apple juice.,Apple juice.


### Not followed by

Use ```(?!...)```

In [89]:
# Raw strings avoid Python's invalid-escape warnings for \w and \.
patterns = [
    r'\w+(?!\.)',  # word characters ending at a position that is not followed by a period
    r'\w+[^\.]',  # word characters + one character that is not a period (that character is part of the match)
]
strings = ['Apple juice.']

search_pattern(patterns, strings)

Unnamed: 0,\w+(?!\.),\w+[^\.]
Apple juice.,Apple juice.,Apple juice.


### Following

Use ```(?<=...)```

In [90]:
# Raw strings avoid Python's invalid-escape warning for \d
patterns = [
    r'(?<=:)\d+',  # digits preceded by a colon; the colon is not part of the match
    r':\d+',  # colon + digits; the colon is part of the match
]
strings = ['apple:10']

search_pattern(patterns, strings)

Unnamed: 0,(?<=:)\d+,:\d+
apple:10,apple:10,apple:10


### not following

Use ```(?<!...)```

In [94]:
# Raw strings avoid Python's invalid-escape warning for \d
patterns = [
    r'(?<!A)\d+',  # digits that are not preceded by A
    r'[^A]\d+',  # any one character except A + digits (that character is part of the match)
]
strings = ['A123 123']

search_pattern(patterns, strings)

Unnamed: 0,(?<!A)\d+,[^A]\d+
A123 123,A123 123,A123 123


### border of word

In [95]:
patterns = [
    # eat with a word boundary on both sides, i.e. a whole-word search
    r'\beat\b',
    # eat anywhere, for comparison
    'eat',
]
strings = ['I eat food', 'beat']

search_pattern(patterns, strings)

Unnamed: 0,\beat\b,eat
,,
I eat food,I eat food,I eat food


Why use ```r``` in ```r'\beat\b'```? In an ordinary Python string ```\b``` has a special meaning (just as ```+``` has a special meaning in a regular expression): it represents a backspace character [(see here)](https://stackoverflow.com/questions/25065608/what-does-backward-slash-b-do-in-python)

To disable this behaviour, add ```r``` in front of the string. (Like we add ```\``` before ```+``` in regular expression)

### not border of word

In [97]:
patterns = [
    # eat with no word boundary on either side, i.e. strictly inside a word
    r'\Beat\B',
    # eat anywhere, for comparison
    'eat',
]
strings = ['I eat food', 'beats']

search_pattern(patterns, strings)

Unnamed: 0,\Beat\B,eat
,,
I eat food,I eat food,I eat food
