## REGULAR EXPRESSIONS

In [39]:
#Import Regular Expressions
import re
from pathlib import Path

In [6]:
# Sample filenames. Note: `filenames` is rebound to a directory iterator
# in the "Filter Filenames" section further down.
filenames = [
    'nov-12.txt',
    'november-14.txt',
    'Oct-17.txt',
    'Nov-22.txt',
]

### 1 Case: Normal Email

In [7]:
# Sample sentence containing two plain e-mail addresses.
text = (
    "Hi there you here example@example.com "
    "some more text here and there another@example.de"
)

In [8]:
# [a-z]+ matches one or more lowercase letters.
# The "." is not escaped here, so it matches any single character.
pattern = re.compile("[a-z]+@[a-z]+.[a-z]+")
matches = [found.group() for found in pattern.finditer(text)]
print(matches)

['example@example.com', 'another@example.de']


### 2 Case: Email with character

In [9]:
# Same sentence, but the first address has a "+" in its local part.
text2 = (
    "Hi there you here exa+mple@example.com "
    "some more text here and there another@example.de"
)

In [10]:
# [^ ]+ matches one or more characters that are not a space,
# so a "+" in the part before the "@" is accepted as well.
pattern = re.compile("[^ ]+@[a-z]+.[a-z]+")
matches = [found.group() for found in pattern.finditer(text2)]
print(matches)

['exa+mple@example.com', 'another@example.de']


### Meta Characters

In [31]:
# .        Matches any single character (except a newline)
# \        Escapes one of the meta characters to treat it as a regular character
# [...]    Matches a single character or a range that is contained within the brackets. Order inside the brackets does not matter, but outside brackets order does matter
# +        Matches the preceding element one or more times
# ?        Matches the preceding element zero or one time
# *        Matches the preceding element zero or more times
# {m,n}    Matches the preceding element at least m and not more than n times
# ^        Matches the beginning of a line or string
# $        Matches the end of a line or string
# [^...]   Matches a single character or a range that is not contained within the brackets
# (?:...|...)  "Or" operator: matches either alternative inside a non-capturing group
# ()       Groups an expression (put ? after the group to make it optional)

####  .	 Matches any character (except newline character)	"he..o"	

In [13]:
# The "." of the first address is replaced by "=" to show what an
# unescaped dot in the pattern will accept.
text3 = (
    "Hi there you here exa+mple@example=com "
    "some more text here and there another@example.de"
)

In [15]:
# The unescaped "." matches any single character, so "example=com"
# is accepted just like "example.com" would be.
pattern = re.compile("[^ ]+@[a-z]+.[a-z]+")
matches = [found.group() for found in pattern.finditer(text3)]
print(matches)

['exa+mple@example=com', 'another@example.de']


#### \	Signals a special sequence (can also be used to escape special characters)	"\d"	


In [16]:
# Sentence whose first address uses "=" where a "." would be expected.
text4 = (
    "Hi there you here exa+mple@example=com "
    "some more text here and there another@example.de"
)

In [18]:
# "\." matches a literal dot only, so "example=com" is rejected.
# A raw string (r"...") is used so that the backslash reaches the regex
# engine untouched; "\." in a normal string is an invalid escape sequence.
pattern = re.compile(r"[^ ]+@[a-z]+\.[a-z]+")
matches = pattern.findall(text4)
print(matches)

['another@example.de']


#### [...]	A set of characters	"[a-m]"  	

In [21]:
# Put the letters of 'another' into the brackets: [another]+ matches one or
# more characters taken from that set. The order inside the brackets does not
# matter; [naother] would behave the same.
# Raw string so that "\." is passed to the regex engine as written.
pattern = re.compile(r"[another]+@[a-z]+\.[a-z]+")
matches = pattern.findall(text4)
print(matches)

['another@example.de']


#### +	One or more occurrences	"he.+o"	

#### ?	Zero or one occurrence	"he.?o"	

In [22]:
# [^ ]? matches zero or one non-space character before the "@",
# so at most a single character of the local part is captured.
# Raw string so that "\." is passed to the regex engine as written.
pattern = re.compile(r"[^ ]?@[a-z]+\.[a-z]+")
matches = pattern.findall(text4)
print(matches)

['r@example.de']


#### *	Zero or more occurrences	"he.*o"	

In [23]:
# Adds "@blabla.com", which has nothing in front of the "@".
text5 = (
    "Hi there you here exa+mple@example=com @blabla.com "
    "some more text here and there another@example.de"
)

In [24]:
# [^ ]* matches zero or more non-space characters before the "@",
# so "@blabla.com" (empty local part) is matched too.
# Raw string so that "\." is passed to the regex engine as written.
pattern = re.compile(r"[^ ]*@[a-z]+\.[a-z]+")
matches = pattern.findall(text5)
print(matches)

['@blabla.com', 'another@example.de']


#### {}	Exactly the specified number of occurrences	"he.{2}o"	

In [27]:
# [^ ]{4} matches exactly four non-space characters before the "@".
# The "." is left unescaped here, so it also accepts the "=" in "example=com".
pattern = re.compile("[^ ]{4}@[a-z]+.[a-z]+")
matches = [found.group() for found in pattern.finditer(text5)]
print(matches)

['mple@example=com', 'ther@example.de']


#### |	Either or	"falls|stays"	

In [28]:
# Adds an address ending in ".ne" to try out alternation.
text6 = (
    "Hi there you here exa+mple@example=com @blabla.com "
    "some more text here and there another@example.de "
    "and another@example.ne"
)

In [30]:
# (?:com|ne) is a non-capturing group that accepts either "com" or "ne".
# [^ ] without a quantifier takes exactly one character before the "@",
# and the unescaped "." accepts any single character (including "=").
pattern = re.compile("[^ ]@[^ ]+.(?:com|ne)+")
matches = [found.group() for found in pattern.finditer(text6)]
print(matches)

['e@example=com', 'r@example.ne']


# Extract URLs Using Regex

In [32]:
# Folder holding the sample input files. The trailing slash is required:
# later cells build file paths by plain string concatenation (path + 'urls.txt').
path = "C:/Users/kenny/Documents/ProjetoPython/Automation/regex/"

In [33]:
# Read the whole urls.txt file into one string and display it.
content = Path(path + 'urls.txt').read_text()

content

'http://google.com\nhttps://example.com\nhttp://www.wikipedia.com\nhttp://pythonhow.com\nhttps://python.org'

In [34]:
# https?        -> "http" or "https"
# (?:www\.)?    -> optional "www." prefix
# [^ \n]+       -> one or more characters that are neither space nor newline
# \.com         -> a literal ".com" (the backslash makes "." a normal character)
# Raw string: "\." in a normal string is an invalid escape sequence, and the
# regex engine understands "\n" by itself.
pattern = re.compile(r"https?://(?:www\.)?[^ \n]+\.com")
matches = pattern.findall(content)
matches

['http://google.com',
 'https://example.com',
 'http://www.wikipedia.com',
 'http://pythonhow.com']

# Extract IP Addresses

In [36]:
# Read the whole ips.txt file into one string and display it.
content = Path(path + 'ips.txt').read_text()

content

'912.131.120.111\n912.131.134.000\n912.131.129.129'

In [38]:
# Four dot-separated groups of three digits where the third group must be
# 120-129 ("12" followed by one digit).
# Raw string so that each "\." reaches the regex engine as a literal dot.
pattern = re.compile(r"[0-9]{3}\.[0-9]{3}\.12[0-9]\.[0-9]{3}")
matches = pattern.findall(content)
matches

['912.131.120.111', '912.131.129.129']

# Filter Filenames

In [41]:
# List the entries of the "files" folder under `path` and keep only their names.
root_dir = Path(path + 'files')
filenames = root_dir.iterdir()
filenames_str = [entry.name for entry in filenames]

filenames_str

['billy_Nov-13.txt',
 'Nov-12.txt',
 'nov-20.txt',
 'Nov-22.txt',
 'november-14.txt',
 'November-24.txt',
 'Oct-17.txt']

In [42]:
# Keep filenames for November days 1-20, case-insensitively:
#   nov[a-z]*                -> "nov", "Nov", "november", ...
#   (?:[1-9]|1[0-9]|20)      -> day 1 to 20
#   \.txt                    -> literal ".txt" (an unescaped "." would accept any character)
# The pattern is not anchored, so names with a prefix (e.g. "billy_Nov-13.txt") match too.
pattern = re.compile(r"nov[a-z]*-(?:[1-9]|1[0-9]|20)\.txt", re.IGNORECASE)
# search() is enough for a yes/no test; there is no need to collect all matches.
matches = [filename for filename in filenames_str if pattern.search(filename)]
matches

['billy_Nov-13.txt', 'Nov-12.txt', 'nov-20.txt', 'november-14.txt']

# Exercise 1 - Find a Word in Text

<!-- Write a regular expression that returns all the list items that contain the word Delhi. The list is stored in the data variable:

data=[
    "mr Jim Cloudy, Texas, 01091231, 1 dog 1 cat, jim.cloudy@example.com", 
    "mrs Anna Cloudy, Delhi, 2dogs 1fish bathlover@example.com",
    "Mrs. Sarah Prost, Baghdad, +4327629101, 1 hamster, 2 crocodiles",
    "Ms Beta Palm Ontario 08234211 12 cats, beta@example.com",
    "mr. Dog Bells texas 09234211 3 honey badgers alta_bells.example.com",
    "ms. Claudia More, Gujarat, 012311, 3 dogs",
    "mrs Alma Stills Delhi 01231981 1 dog",
    "mr Sen Kumar Delhi 3456 ants" -->

In [43]:
# Write a regular expression that returns all the list items that contain the word Delhi. The list is stored in the data variable:

# data=[
#     "mr Jim Cloudy, Texas, 01091231, 1 dog 1 cat, jim.cloudy@example.com", 
#     "mrs Anna Cloudy, Delhi, 2dogs 1fish bathlover@example.com",
#     "Mrs. Sarah Prost, Baghdad, +4327629101, 1 hamster, 2 crocodiles",
#     "Ms Beta Palm Ontario 08234211 12 cats, beta@example.com",
#     "mr. Dog Bells texas 09234211 3 honey badgers alta_bells.example.com",
#     "ms. Claudia More, Gujarat, 012311, 3 dogs",
#     "mrs Alma Stills Delhi 01231981 1 dog",
#     "mr Sen Kumar Delhi 3456 ants"

In [45]:
# Records used by the exercises below (one free-text line per person).
data = [
    "mr Jim Cloudy, Texas, 01091231, 1 dog 1 cat, jim.cloudy@example.com",
    "mrs Anna Cloudy, Delhi, 2dogs 1fish bathlover@example.com",
    "Mrs. Sarah Prost, Baghdad, +4327629101, 1 hamster, 2 crocodiles",
    "Ms Beta Palm Ontario 08234211 12 cats, beta@example.com",
    "mr. Dog Bells texas 09234211 3 honey badgers alta_bells.example.com",
    "ms. Claudia More, Gujarat, 012311, 3 dogs",
    "mrs Alma Stills Delhi 01231981 1 dog",
    "mr Sen Kumar Delhi 3456 ants",
]

In [48]:
# Keep every record that contains "Delhi" (case-insensitive).
pattern = re.compile(".*Delhi.*", re.IGNORECASE)
matches = [record for record in data if pattern.search(record)]
print(matches)

['mrs Anna Cloudy, Delhi, 2dogs 1fish bathlover@example.com', 'mrs Alma Stills Delhi 01231981 1 dog', 'mr Sen Kumar Delhi 3456 ants']


# Exercise 2 - Find Lines Containing a Word and an Email Address

In [49]:
# Keep records that mention "Delhi" followed later by an e-mail address:
#   [^ ]+@[^ ]+\.[a-z]+  -> non-space text, "@", non-space text, literal ".", letters
# Raw string so that "\." reaches the regex engine as a literal dot.
pattern = re.compile(r".*Delhi.*[^ ]+@[^ ]+\.[a-z]+", re.IGNORECASE)
matches = [match for match in data if pattern.findall(match)]
print(matches)

['mrs Anna Cloudy, Delhi, 2dogs 1fish bathlover@example.com']


# Exercise 3 - Find Lines Containing a Word and a Phone Number

In [52]:
# Keep records that mention "Delhi" followed later by a phone number:
#   [0+]          -> the number starts with "0" or "+"
#                    (inside brackets "|" is a literal character, not "or",
#                    so it must not be part of the set)
#   [0-9]{4,50}   -> followed by 4 to 50 digits
pattern = re.compile(r".*Delhi.*[0+][0-9]{4,50}", re.IGNORECASE)
matches = [match for match in data if pattern.findall(match)]
print(matches)

['mrs Alma Stills Delhi 01231981 1 dog']


# Exercise 4 - Find Lines Containing a Word and Either a Phone Number or an Email Address

In [53]:
# Keep records that mention "Delhi" followed later by either
#   a phone number:  [0+][0-9]{4,50}       ("0" or "+" and then 4-50 digits), or
#   an e-mail:       [^ ]+@[^ ]+\.[a-z]+   (literal "." before the ending)
# Fixes compared with the naive version:
#   * "|" inside brackets is a literal character, so the set is [0+], not [0|+]
#   * the dot of the e-mail part is escaped so it only matches "."
#   * (?:...) is used because the group is needed for alternation only
pattern = re.compile(
    r".*Delhi.*(?:[0+][0-9]{4,50}|[^ ]+@[^ ]+\.[a-z]+)",
    re.IGNORECASE,
)
# search() is enough for a yes/no test on each record.
matches = [match for match in data if pattern.search(match)]
print(matches)

['mrs Anna Cloudy, Delhi, 2dogs 1fish bathlover@example.com', 'mrs Alma Stills Delhi 01231981 1 dog']
