In [1]:
# import regular expression library

import re

# METACHARACTERS

### 1.  '.'

Matches any single character except a newline (spaces are included).

In [2]:
# '.' consumes exactly one character per match (anything but a newline),
# so all 28 characters of the sentence, spaces included, are listed one by one.
sentence1 = re.compile(r'.').findall('I am learning text analytics')
print(sentence1)

['I', ' ', 'a', 'm', ' ', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g', ' ', 't', 'e', 'x', 't', ' ', 'a', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's']


### 2. '\w'

Matches any single word character (a letter, a digit or an underscore); spaces and punctuation are not matched.

In [3]:
# '\w' consumes one word character (letter, digit or underscore) per match;
# the 4 spaces are skipped, leaving 24 single-character items.
sentence2 = [hit.group(0) for hit in re.finditer(r'\w', 'I am learning text analytics')]
print(sentence2)

['I', 'a', 'm', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g', 't', 'e', 'x', 't', 'a', 'n', 'a', 'l', 'y', 't', 'i', 'c', 's']


### 3. '\w*'

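Matches zero or more consecutive word characters. Spaces are never matched; because zero characters is also a valid match, an empty string is returned at each position where a word ends (before every space and at the end of the string).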

In [4]:
# '\w*' also succeeds on zero characters, so besides the 5 words an empty
# string is reported wherever a word ends (before each space and at the end).
sentence3 = re.compile(r'\w*').findall('I am learning text analytics')
print(sentence3)

['I', '', 'am', '', 'learning', '', 'text', '', 'analytics', '']


### 4. '\w+'

Matches one or more consecutive word characters, i.e. whole words; spaces are never matched and no empty strings are returned.

In [5]:
# '\w+' needs at least one word character, so only the 5 words come back
# and no empty strings are produced.
sentence4 = [hit.group(0) for hit in re.finditer(r'\w+', 'I am learning text analytics')]
print(sentence4)

['I', 'am', 'learning', 'text', 'analytics']


### 5. ^\w+

find first word in string

In [6]:
# '^' pins the match to the start of the string, so only the leading word is returned.
sentence5 = re.compile(r'^\w+').findall('I am learning text analytics')
print(sentence5)

['I']


### 6. \w+$

find last word in string

In [7]:
# '$' pins the match to the end of the string, so only the trailing word is returned.
sentence6 = re.compile(r'\w+$').findall('I am learning text analytics')
print(sentence6)

['analytics']


### 7. \w\w

Finds non-overlapping pairs of two consecutive word characters.

In [8]:
# Matches are non-overlapping pairs of word characters: 'I' is too short to
# form a pair, and the final 's' of 'analytics' is left over.
sentence7 = [pair.group(0) for pair in re.finditer(r'\w\w', 'I am learning text analytics')]
print(sentence7)

['am', 'le', 'ar', 'ni', 'ng', 'te', 'xt', 'an', 'al', 'yt', 'ic']


### 8. \b\w\w

Finds the first two characters of every word that has at least two characters (`\b` anchors the match at a word boundary).

In [9]:
# '\b' requires a word boundary before the pair, so only the first two
# characters of each word are kept; the one-letter word 'I' cannot match.
sentence8 = re.compile(r'\b\w\w').findall('I am learning text analytics')
print(sentence8)

['am', 'le', 'te', 'an']


# Extract domain type of email

In [10]:
# 1

# '@\w+' stops at the first non-word character, so only '@' plus the first
# label of each domain is selected.
sentence9 = re.compile(r'@\w+').findall('user@text.com.my, user@analytics.gov.my, user@textanalytics.edu.my')
print(sentence9)

['@text', '@analytics', '@textanalytics']


In [11]:
# 2

# The dot is escaped (\.) so that it matches only a literal '.'. An unescaped
# '.' is a wildcard for any character and would also accept e.g. '@text com'.
sentence10 = re.findall(r'@\w+\.\w+', 'user@text.com.my, user@analytics.gov.my, user@textanalytics.edu.my')
print(sentence10)

# '@' + first domain label + '.' + second domain label is selected

['@text.com', '@analytics.gov', '@textanalytics.edu']


In [12]:
# 3

# Both dots are escaped (\.) so that they match only a literal '.'. An
# unescaped '.' is a wildcard and would accept any character between labels.
sentence11 = re.findall(r'@\w+\.\w+\.\w+', 'user@text.com.my, user@analytics.gov.my, user@textanalytics.edu.my')
print(sentence11)

# The full domain '@' + label + '.' + label + '.' + label is selected

['@text.com.my', '@analytics.gov.my', '@textanalytics.edu.my']


In [13]:
# 4

# Because the pattern contains one capture group, findall returns only the
# text captured by (\w+\.\w+): the domain type that follows the first label.
# Both dots are escaped (\.) so that they match only a literal '.'.
sentence12 = re.findall(r'@\w+\.(\w+\.\w+)', 'user@text.com.my, user@analytics.gov.my, user@textanalytics.edu.my')
print(sentence12)

['com.my', 'gov.my', 'edu.my']


# Extract date

In [14]:
# 1

# {2} asks for exactly two digits. The years in this text have four digits,
# so each match stops halfway through the year ('2001' is cut to '20').
sentence13 = [hit.group(0) for hit in re.finditer(r'\d{2}-\d{2}-\d{2}', 'Ahmad BIT(IS) 15-05-2001, Johnny BCS(SE) 20-08-2000')]
print(sentence13)

['15-05-20', '20-08-20']


In [15]:
# 2

# {4} asks for exactly four digits in the last field, so the complete
# dd-mm-yyyy date is returned.
sentence14 = re.compile(r'\d{2}-\d{2}-\d{4}').findall('Ahmad BIT(IS) 15-05-2001, Johnny BCS(SE) 20-08-2000')
print(sentence14)

['15-05-2001', '20-08-2000']


In [16]:
# 3

# The whole date must match, but only the parenthesised group (the
# four-digit year) is collected from each match.
sentence15 = [hit.group(1) for hit in re.finditer(r'\d{2}-\d{2}-(\d{4})', 'Ahmad BIT(IS) 15-05-2001, Johnny BCS(SE) 20-08-2000')]
print(sentence15)

['2001', '2000']


# Select words that start with vowels

In [17]:
# 1

# Without a word boundary the match may begin at any vowel that is followed by
# at least one more word character, including vowels in the middle of a word.
sentence16 = re.compile(r'[aeiouAEIOU]\w+').findall('I have eight story books. I often read them in afternoon')
print(sentence16)

['ave', 'eight', 'ory', 'ooks', 'often', 'ead', 'em', 'in', 'afternoon']


In [18]:
# 2

# '\b' forces the vowel to be the first letter of a word. '\w+' still needs
# at least one further character, so the one-letter word 'I' is not returned.
sentence17 = [word.group(0) for word in re.finditer(r'\b[aeiouAEIOU]\w+', 'I have eight story books. I often read them in afternoon')]
print(sentence17)

['eight', 'often', 'in', 'afternoon']


In [19]:
# 3

# The negated class excludes vowels and, through '\W', every non-word character,
# so the first character must be a non-vowel word character. Excluding only
# whitespace ('\s') would let punctuation start a match, e.g. '-dog' in 'cat-dog'.
sentence18 = re.findall(r'\b[^aeiouAEIOU\W]\w+', 'I have eight story books. I often read them in afternoon')
print(sentence18)  # only words that start with a non-vowel are selected

['have', 'story', 'books', 'read', 'them']


# Splitting a string with multiple delimiters

In [20]:
# 1

# The text is cut at every ';' or ','. Whitespace next to a delimiter is
# kept, which is why two of the pieces begin with a space.
sentence19 = re.compile(r'[;,]').split('I have many story books, colouring books; I often read them in the afternoon.')
print(sentence19)

['I have many story books', ' colouring books', ' I often read them in the afternoon.']


In [21]:
# 2

# Every single ';', ',' or whitespace character is a cut point. Where two
# delimiters are adjacent (', ' and '; ') an empty string appears between them.
sentence20 = re.compile(r'[;,\s]').split('I have many story books, colouring books; I often read them in the afternoon.')
print(sentence20)

['I', 'have', 'many', 'story', 'books', '', 'colouring', 'books', '', 'I', 'often', 'read', 'them', 'in', 'the', 'afternoon.']


# Substituting delimiters

In [22]:
# Each ';' and ',' is replaced by a full stop; all other characters are unchanged.
sentence21 = re.compile(r'[;,]').sub('.', 'I have many story books, colouring books; I often read them in the afternoon.')
print(sentence21)

I have many story books. colouring books. I often read them in the afternoon.
