In [1]:
import re

In [2]:
"""
Three main functions:
- match - tests if the beginning of a string matches a pattern
- search - tests if a string contains a pattern, by finding the first occurrence
- findall - finds all occurrences of the pattern in a string
""";

<hr>

## When to use `match`?

In [3]:
# Compiled once and reused by the cells below. Shape of the number:
# three dot-separated groups of three digits, a hyphen, then two digits.
pattern_cpf = re.compile(
    pattern=r"\d{3}\.\d{3}\.\d{3}-\d{2}",
)

In [4]:
# Baseline: the input is exactly one number in the expected format, so match() succeeds.
re.match(pattern_cpf, "111.111.111-11")

<re.Match object; span=(0, 14), match='111.111.111-11'>

In [5]:
# Leading text in front of the number: match() only tries position 0,
# so the result is None and the cell displays nothing.
re.match(pattern_cpf, "text_text_text111.111.111-11")

In [6]:
# Trailing text after the number: match() does not require the pattern to reach
# the end of the string, so the first 14 characters still match.
re.match(pattern_cpf, "111.111.111-11text_text_text")

<re.Match object; span=(0, 14), match='111.111.111-11'>

In [7]:
# End-anchored variant: used with .match() it accepts only strings that consist
# of the number and nothing else, i.e. the same as pattern_cpf.fullmatch(...).
# `\Z` matches only at the very end of the string; `$` would also succeed just
# before a trailing newline and therefore accept "111.111.111-11\n".
pattern_cpf_2 = re.compile(r"\d{3}\.\d{3}\.\d{3}-\d{2}\Z")

In [8]:
# The end-anchored pattern still accepts the exact string.
re.match(pattern_cpf_2, "111.111.111-11")

<re.Match object; span=(0, 14), match='111.111.111-11'>

In [9]:
# Trailing text violates the end anchor: the result is None, so nothing is displayed.
re.match(pattern_cpf_2, "111.111.111-11text_text_text")

## Quick regex overview
```
- abc…      Letters
- 123…      Digits
- \d        Any Digit
- \D        Any Non-digit character
- .         Any Character
- \.        Period
- [abc]     Only a, b, or c
- [^abc]    Not a, b, nor c
- [a-z]     Characters a to z
- [0-9]     Numbers 0 to 9
- \w        Any Alphanumeric character (or underscore)
- \W        Any Non-alphanumeric character (except underscore)
- {m}       m Repetitions
- {m,n}     m to n Repetitions
- *         Zero or more repetitions
- +         One or more repetitions
- ?         Optional character
- \s        Any Whitespace
- \S        Any Non-whitespace character
- ^…$       Starts and ends
```

## Simple exercises

In [10]:
# Sample input for the "Simple exercises" cells: receipt-like text in Portuguese
# with accented capitals (e.g. INDÚSTRIA), dotted/hyphenated identifiers, a date,
# a time and comma-decimal amounts.
# NOTE(review): looks like OCR output (e.g. "CNPJICPF", "ITEMIRS)") — confirm.
# The name `text` is rebound by later exercise cells, so re-run this cell before
# re-running the simple exercises.
text = """METALNOX INDÚSTRIA METALÚRGICA LTDA
RUA JOSÉ THEODORO RIBEIRO - 3571, ILHA DA FIGUEIRA
CEP: 89.258-001 JARAGUÁ DO SUL, SC

CNPJ; 78.810.975/0001-72
TE: XXX XKXXXX

20/06/2015 11:55:23 CCF: 012249 COD: 020990
CNPJICPF consumidor: 754.523,157-05
NOME: JOÃO DOS SANTOS
END: RUA GETÚLIO VARGAS, 449 - SÃO PAULO
CUPOM FISCAL

ITEM CÓDIGO DESCRIÇÃO QTD UN VLUNIT(RS) ST VL ITEMIRS)

001 1955 PONTEIRAPARAPER 204250 TI700% 50,00
001 1875  RODIZIOSFAZMM 204280  T1700% 56,00
TOTAL R$ 106,00
Dinheiro 150,00
TROCO 44,00
Vendedor: 000008 000213946-01 - CX 1 -SIP-
Valor trib. aprox. [Fed= 0,44] [Est= 0,50)

Fonte: IBPIFECOMERCIO SP 9013aC
BR
"""

In [11]:
# 1) find words that are uppercased XXXX...

In [12]:
# Every maximal run of the ASCII capitals A-Z.
# Gotchas visible in the output: accented capitals are outside [A-Z], so words
# such as INDÚSTRIA come back split ('IND', 'STRIA'), and the initial of a
# capitalized word is reported on its own ('D' from "Dinheiro").
re.compile(r"[A-Z]+").findall(text)

['METALNOX',
 'IND',
 'STRIA',
 'METAL',
 'RGICA',
 'LTDA',
 'RUA',
 'JOS',
 'THEODORO',
 'RIBEIRO',
 'ILHA',
 'DA',
 'FIGUEIRA',
 'CEP',
 'JARAGU',
 'DO',
 'SUL',
 'SC',
 'CNPJ',
 'TE',
 'XXX',
 'XKXXXX',
 'CCF',
 'COD',
 'CNPJICPF',
 'NOME',
 'JO',
 'O',
 'DOS',
 'SANTOS',
 'END',
 'RUA',
 'GET',
 'LIO',
 'VARGAS',
 'S',
 'O',
 'PAULO',
 'CUPOM',
 'FISCAL',
 'ITEM',
 'C',
 'DIGO',
 'DESCRI',
 'O',
 'QTD',
 'UN',
 'VLUNIT',
 'RS',
 'ST',
 'VL',
 'ITEMIRS',
 'PONTEIRAPARAPER',
 'TI',
 'RODIZIOSFAZMM',
 'T',
 'TOTAL',
 'R',
 'D',
 'TROCO',
 'V',
 'CX',
 'SIP',
 'V',
 'F',
 'E',
 'F',
 'IBPIFECOMERCIO',
 'SP',
 'C',
 'BR']

In [None]:
# 2) find words that are uppercased and are two characters long XX

In [15]:
# Exactly two ASCII capitals with a word boundary on each side, i.e. stand-alone
# two-letter words only (no pairs cut out of longer words).
re.compile(r"\b[A-Z]{2}\b").findall(text)

['DA', 'DO', 'SC', 'TE', 'UN', 'RS', 'ST', 'VL', 'CX', 'SP', 'BR']

In [None]:
# 3) find words that are uppercased and are three characters long or more XXX... 

In [16]:
# Three or more ASCII capitals delimited by word boundaries.
# Gotcha: an accented capital is not in [A-Z] but still counts as a word
# character for \b, so words containing one (JOSÉ, SÃO, INDÚSTRIA, ...) are
# skipped entirely rather than split.
re.compile(r"\b[A-Z]{3,}\b").findall(text)

['METALNOX',
 'LTDA',
 'RUA',
 'THEODORO',
 'RIBEIRO',
 'ILHA',
 'FIGUEIRA',
 'CEP',
 'SUL',
 'CNPJ',
 'XXX',
 'XKXXXX',
 'CCF',
 'COD',
 'CNPJICPF',
 'NOME',
 'DOS',
 'SANTOS',
 'END',
 'RUA',
 'VARGAS',
 'PAULO',
 'CUPOM',
 'FISCAL',
 'ITEM',
 'QTD',
 'VLUNIT',
 'ITEMIRS',
 'PONTEIRAPARAPER',
 'RODIZIOSFAZMM',
 'TOTAL',
 'TROCO',
 'SIP',
 'IBPIFECOMERCIO']

In [None]:
# 4) find capitalized words Xxxxx

In [19]:
# One ASCII capital followed by any number of ASCII lowercase letters, as a
# whole word. Because `*` also allows zero lowercase letters, a lone capital
# qualifies too — that is where the 'R' (from "R$") in the output comes from.
re.compile(r"\b[A-Z][a-z]*\b").findall(text)

['R', 'Dinheiro', 'Vendedor', 'Valor', 'Fed', 'Est', 'Fonte']

In [None]:
# 5) find all tokens (sequence of characters separated by space) with numbers (tips: \d, \S, +, *)

In [20]:
# Whitespace-delimited tokens containing at least one digit: optional
# non-space prefix, the digit(s), optional non-space suffix. Punctuation glued
# to a token stays part of it (e.g. '3571,' and '0,44]').
re.compile(r"\S*\d+\S*").findall(text)

['3571,',
 '89.258-001',
 '78.810.975/0001-72',
 '20/06/2015',
 '11:55:23',
 '012249',
 '020990',
 '754.523,157-05',
 '449',
 '001',
 '1955',
 '204250',
 'TI700%',
 '50,00',
 '001',
 '1875',
 '204280',
 'T1700%',
 '56,00',
 '106,00',
 '150,00',
 '44,00',
 '000008',
 '000213946-01',
 '1',
 '0,44]',
 '0,50)',
 '9013aC']

## Further regex overview
```
- (...)     Matches ..., consumes it and captures it in a group
- (?:...)   Matches ..., consumes it but doesn't capture it
- (?=...)   Matches if ... comes next; doesn't consume or capture it (lookahead)
- (?!...)   Matches if ... does not come next; doesn't consume or capture it (negative lookahead)

- \n n=1,2...  Matches the same text that the n-th capturing group matched (backreference)

```

In [None]:
# 6) find all unique consonants (tips: (?!...), python set)

In [23]:
# Distinct ASCII consonants: the lookahead requires the next character to be a
# non-vowel, then [a-zA-Z] consumes it only if it is an ASCII letter.
# (The negative-lookahead spelling from the tip would be (?![aeiouAEIOU])[a-zA-Z].)
# The set comprehension removes duplicates.
{hit.group() for hit in re.finditer(r"(?=[^aeiouAEIOU])[a-zA-Z]", text)}

{'B',
 'C',
 'D',
 'F',
 'G',
 'H',
 'J',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'X',
 'Z',
 'b',
 'c',
 'd',
 'h',
 'l',
 'm',
 'n',
 'p',
 'r',
 's',
 't',
 'x'}

In [None]:
# 7) find all words with the pattern xyx, where x is a vowel and y is a consonant

In [26]:
# Group 1 captures a vowel, the lookahead plus [a-zA-Z] takes one consonant, and
# \1 demands the very same vowel again; the [a-zA-Z]* on both ends widen the
# match to the surrounding letters. search() reports only the first hit.
re.compile(r"[a-zA-Z]*([aeiouAEIOU])(?=[^aeiouAEIOU])[a-zA-Z]\1[a-zA-Z]*").search(text)

<re.Match object; span=(45, 53), match='THEODORO'>

In [35]:
# Same pattern wrapped in an outer group so that every hit yields
# (whole word, repeated vowel); the vowel is now group 2, hence \2.
# Gotcha: [a-zA-Z] stops at accented letters, so JARAGUÁ is reported as 'JARAGU'.
[
    hit.groups()
    for hit in re.finditer(
        r"([a-zA-Z]*([aeiouAEIOU])(?=[^aeiouAEIOU])[a-zA-Z]\2[a-zA-Z]*)", text
    )
]

[('THEODORO', 'O'),
 ('JARAGU', 'A'),
 ('PONTEIRAPARAPER', 'A'),
 ('RODIZIOSFAZMM', 'I'),
 ('TROCO', 'O')]

<hr>

<hr>

## More exercises

### 1) Find the domains
```
Example
http://www.aaaaa.com/bbbb/cccc?ddd=ee -> aaaaa.com
```

In [36]:
# https://www.hackerrank.com/challenges/detect-the-domain-name/problem?isFullScreen=true
# Input for the domain exercise: an HTML reference list whose links appear both
# as plain "http://..." URLs and URL-encoded ("http%3A%2F%2F...") inside title
# attributes. Rebinds `text`, replacing the sample used by the earlier cells.
text = """
<div class="reflist" style="list-style-type: decimal;">
<ol class="references">
<li id="cite_note-1"><span class="mw-cite-backlink"><b>^ ["Train (noun)"](http://www.askoxford.com/concise_oed/train?view=uk). <i>(definition – Compact OED)</i>. Oxford University Press<span class="reference-accessdate">. Retrieved 2008-03-18</span>.</span><span title="ctx_ver=Z39.88-2004&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrain&rft.atitle=Train+%28noun%29&rft.genre=article&rft_id=http%3A%2F%2Fwww.askoxford.com%2Fconcise_oed%2Ftrain%3Fview%3Duk&rft.jtitle=%28definition+%E2%80%93+Compact+OED%29&rft.pub=Oxford+University+Press&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal" class="Z3988"><span style="display:none;"> </span></span></span></li>
<li id="cite_note-2"><span class="mw-cite-backlink"><b>^</b></span> <span class="reference-text"><span class="citation book">Atchison, Topeka and Santa Fe Railway (1948). <i>Rules: Operating Department</i>. p. 7.</span><span title="ctx_ver=Z39.88-2004&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrain&rft.au=Atchison%2C+Topeka+and+Santa+Fe+Railway&rft.aulast=Atchison%2C+Topeka+and+Santa+Fe+Railway&rft.btitle=Rules%3A+Operating+Department&rft.date=1948&rft.genre=book&rft.pages=7&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook" class="Z3988"><span style="display:none;"> </span></span></span></li>
<li id="cite_note-3"><span class="mw-cite-backlink"><b>^ [Hydrogen trains](http://www.hydrogencarsnow.com/blog2/index.php/hydrogen-vehicles/i-hear-the-hydrogen-train-a-comin-its-rolling-round-the-bend/)</span></li>
<li id="cite_note-4"><span class="mw-cite-backlink"><b>^ [Vehicle Projects Inc. Fuel cell locomotive](http://www.bnsf.com/media/news/articles/2008/01/2008-01-09a.html)</span></li>
<li id="cite_note-5"><span class="mw-cite-backlink"><b>^</b></span> <span class="reference-text"><span class="citation book">Central Japan Railway (2006). <i>Central Japan Railway Data Book 2006</i>. p. 16.</span><span title="ctx_ver=Z39.88-2004&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrain&rft.au=Central+Japan+Railway&rft.aulast=Central+Japan+Railway&rft.btitle=Central+Japan+Railway+Data+Book+2006&rft.date=2006&rft.genre=book&rft.pages=16&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook" class="Z3988"><span style="display:none;"> </span></span></span></li>
<li id="cite_note-6"><span class="mw-cite-backlink"><b>^ ["Overview Of the existing Mumbai Suburban Railway"](http://web.archive.org/web/20080620033027/http://www.mrvc.indianrail.gov.in/overview.htm). _Official webpage of Mumbai Railway Vikas Corporation_. Archived from [the original](http://www.mrvc.indianrail.gov.in/overview.htm) on 2008-06-20<span class="reference-accessdate">. Retrieved 2008-12-11</span>.</span><span title="ctx_ver=Z39.88-2004&rfr_id=info%3Asid%2Fen.wikipedia.org%3ATrain&rft.atitle=Overview+Of+the+existing+Mumbai+Suburban+Railway&rft.genre=article&rft_id=http%3A%2F%2Fwww.mrvc.indianrail.gov.in%2Foverview.htm&rft.jtitle=Official+webpage+of+Mumbai+Railway+Vikas+Corporation&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Ajournal" class="Z3988"><span style="display:none;"> </span></span></span></li>
</ol>
</div>
"""

In [40]:
# Unique host names of the http(s) URLs in `text`, with a leading "www." or
# "ww2." removed (the original alternative `ww2/` could never strip "ww2.").
# The host is restricted to dot-separated labels of letters, digits and hyphens,
# so a URL without a path cannot run on into whatever follows it.
# URL-encoded links ("http%3A%2F%2F...") do not contain "://" and are not matched.
# set() removes duplicates; sorting beforehand had no effect on the resulting
# set, so that step is dropped.
set(re.findall(r"https?://(?:www\.|ww2\.)?([A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+)", text))

{'askoxford.com',
 'bnsf.com',
 'hydrogencarsnow.com',
 'mrvc.indianrail.gov.in',
 'web.archive.org'}

In [None]:
# Expected answer for the domain exercise above: the unique domains in
# alphabetical order, separated by ';'. Kept as a bare string for reference only.
"""
askoxford.com;
bnsf.com;
hydrogencarsnow.com;
mrvc.indianrail.gov.in;
web.archive.org
"""

### 2) Find the emails

In [41]:
# https://www.hackerrank.com/challenges/detect-the-email-addresses/problem?isFullScreen=true
# Input for the e-mail exercise: free-form prose containing three e-mail
# addresses, one of them immediately followed by "!". Rebinds `text`, replacing
# the input of the previous exercise.
text = """
HackerRank is more than just a company
    We are a tight group of hackers, bootstrappers, entrepreneurial thinkers and innovators. We are building an engaged community of problem solvers. Imagine the intelligence and value that a room would hold if it contained hackers/problem solvers from around the world? We're building this online.
Hypothesis: Every hacker loves a particular type of challenge presented in a certain set of difficulty. If we build a large collection of real world challenges in different domains with an engaging interface, it is going to be incredible! Join us to create history.
Available Positions
Product Hacker product@hackerrank.com
Challenge Curator
Product Evangelist
Product Designer
Content Creator
ACM World Finals Hacker
Backend C++ Hacker
Mail us at hackers@hackerrank.com to chat more. Or you can write to us at interviewstreet@hackerrank.com!
HACKERRANK PERKS
Working for a startup is hard work, but there are plenty of benefits of working for a small, fun, growing team.
[Image] Perk: Get tools for the jobAll the Right ToolsWe know that everyone's perfect workspace is unique to them. We will get you set up with whatever equipment you need to start hacking - a new 15” Macbook Pro or iMac, or a computer of your choice plus a display if you need it. Additionally, if you require any software or other tools, we've got it covered.[Image] Perk: Flexible HoursFlexible HoursBecause we work so hard, we encourage our employees to keep flexible hours and don't require them to track their time. A morning scrum and open communication ensures that the job gets done on time, and we rely on the honor system so that you can work on your own pace.[Image] Perk: HealthcareWellness SupportTo work hard, you have to be healthy. We will cover your health, dental, and visual insurance with no wait period. That means instant benefits from the day you're hired.[Image] Perk: Choice of LocationLocation, Location, LocationWe are the first Indian company to be backed by Y-Combinator, and as a result we have a thriving office in Bangalore and a growing office in Mountain View, CA. Depending on your residency or visa status, we will get you situated in one of our two offices, both of which are located in the heart of their country's tech industry.[Image] Perk: Choice of LocationCreative SupportIf you have a cool side project that you want to launch, we will pay for EC2/heroku servers to get it off the ground. Side projects fuel creativity and learning, which are crucial to the HackerRank culture.
CULTURE
The culture of a startup is reflective of the founders’ DNA. Larry Page & Sergey Brin were PhD’s from Stanford and that’s why Google is filled with high scoring graders from top schools and is very hard to get in if you’re not a CS major. Similarly, the hacker culture at Facebook is inspired by Zuckerberg, a hacker, the design culture by Steve Jobs and so on.
The adjective to describe the environment/founders here is relentless hardworkers. It might be a general trait of a startup but I’m pretty sure it’s a notch higher here and defines the culture. This is what has taken us this far. It’s not working in weekends or allnighters that count, but the effort that goes into building something intellectually engaging for hackers and making it fun is high.
You’ll have to embrace randomness and chaos. There’s some level of discipline (eg: daily scrums) but only so much. We push boundaries everyday, stretch our limits but no one complains because there’s a feeling of doing something great at the end of the day, every single day.

"""

In [43]:
# E-mail addresses: a local part made of letters, digits and "_ . + -", an "@",
# then two or more dot-separated domain labels.
# Requiring a label after every dot keeps sentence punctuation out of the match
# (an address followed by "." no longer ends in a stray dot), and allowing
# ". + -" in the local part stops addresses like "first.last@..." from being
# truncated to "last@...".
re.findall(r"[A-Za-z0-9_.+-]+@[A-Za-z0-9_-]+(?:\.[A-Za-z0-9_-]+)+", text)

['product@hackerrank.com',
 'hackers@hackerrank.com',
 'interviewstreet@hackerrank.com']

In [None]:
# Expected answer for the e-mail exercise above: the unique addresses in
# alphabetical order, separated by ';'. Kept as a bare string for reference only.
"""
hackers@hackerrank.com;
interviewstreet@hackerrank.com;
product@hackerrank.com
"""

### 3) Find C code comments

In [45]:
# (modified) https://www.hackerrank.com/challenges/ide-identifying-comments/problem?isFullScreen=true
text = """
 /*This is a program to calculate area of a * circle  * after getting the radius as input from the user*/
#include<stdio.h>
int main()
{
   double radius,area;//variables for storing radius and area
   printf("Enter the radius of the circle whose area is to be calculated\n");
   scanf("%lf",&radius);//entering the value for radius of the circle as float data type
   area=(22.0/7.0)*pow(radius,2);//Mathematical function pow is used to calculate square of radius
   printf("The area of the circle is %lf",area);//displaying the results
   getch();
}
/*A test run for the program was carried out and following output was observed
If 50 is the radius of the circle whose area is to be calculated
The area of the circle is 7857.1429*/
/** final comments**/
"""

In [47]:
# Bodies of all /* ... */ comments. The lazy .*? stops at the first closing */,
# and DOTALL lets "." cross line breaks so multi-line comments are captured whole.
[hit.group(1) for hit in re.finditer(r"/\*(.*?)\*/", text, flags=re.DOTALL)]

['This is a program to calculate area of a * circle  * after getting the radius as input from the user',
 'A test run for the program was carried out and following output was observed\nIf 50 is the radius of the circle whose area is to be calculated\nThe area of the circle is 7857.1429',
 '* final comments*']

In [48]:
# Bodies of /** ... **/ comments only: both delimiters must carry the double asterisk.
[hit.group(1) for hit in re.finditer(r"/\*\*(.*?)\*\*/", text, flags=re.DOTALL)]

[' final comments']

In [50]:
# Text of every // comment: everything after the two slashes up to the end of the line.
# Gotcha: any "//" qualifies, including one inside a string literal or a URL.
[hit.group(1) for hit in re.finditer(r"//([^\n]*)", text)]

['variables for storing radius and area',
 'entering the value for radius of the circle as float data type',
 'Mathematical function pow is used to calculate square of radius',
 'displaying the results']