## Regular Expression for NLP

In [1]:
import re

### Extracting Phone Numbers Using RegEx

In [2]:
# Sample chat messages; each one carries at least one phone number and an e-mail address.
chat1 = "jkutty: you ask lot of questions 😠  1235678912, abc@xyz.com"
chat2 = "jkutty: here it is: (123)-567-8912, abc08@xyz.com"
chat3 = "jkutty: yes, phone: 1235678912,9745664831,email: abc.kyc.101@xyz.com"

In [3]:
# Extracting the phone number from the first chat.
# The pattern is a raw string so that `\d` is passed to the regex engine
# untouched instead of being parsed as an (invalid) Python string escape.
pattern = r'\d{10}'  # \d{10} means 10 consecutive digits
matches = re.findall(pattern, chat1)
matches

['1235678912']

In [4]:
# Extracting the phone number from the second chat.
# Format matched: (ddd)-ddd-dddd. The parentheses are escaped because they are
# literal characters here; the raw string keeps the backslashes intact.
pattern = r'\(\d{3}\)-\d{3}-\d{4}'
matches = re.findall(pattern, chat2)
matches

['(123)-567-8912']

In [5]:
# Extracting the phone numbers from the third chat.
# Alternation accepts either 10 consecutive digits or the (ddd)-ddd-dddd form.
# Raw string: the backslashes belong to the regex, not to Python.
pattern = r'\d{10}|\(\d{3}\)-\d{3}-\d{4}'
matches = re.findall(pattern, chat3)
matches

['1235678912', '9745664831']

### Extracting Email from the text using RegEx

In [6]:
# Extracting the e-mail address from the first chat.
# `+` (one or more) is used instead of `*` so that an empty local part or
# domain (e.g. a bare "@.") is not reported as an address.
# Raw string so `\.` reaches the regex engine as an escaped literal dot.
pattern = r'[a-zA-Z0-9]+@[a-z]+\.[a-z]+'
matches = re.findall(pattern, chat1)
matches

['abc@xyz.com']

In [7]:
# Extracting the e-mail address from the second chat.
# `+` (one or more) is used instead of `*` so that an empty local part or
# domain (e.g. a bare "@.") is not reported as an address.
# Raw string so `\.` reaches the regex engine as an escaped literal dot.
pattern = r'[a-zA-Z0-9]+@[a-z]+\.[a-z]+'
matches = re.findall(pattern, chat2)
matches

['abc08@xyz.com']

In [8]:
# Extracting the e-mail address from the third chat.
# The local part may also contain dots and hyphens. The hyphen is placed last
# in the character class so it cannot be read as a range, and the dot needs no
# escaping inside a class. `+` avoids matching an empty local part or domain.
pattern = r'[a-zA-Z0-9.-]+@[a-z]+\.[a-z]+'
matches = re.findall(pattern, chat3)
matches

['abc.kyc.101@xyz.com']

### Retrieving order number from the text using Regex

In [9]:
# Sample chat messages that mention an order number in three different phrasings.
chat1 = "jkutty: Hello, I am having an issue with my order # 412889912"
chat2 = "jkutty: I have a problem with my order number 412889912"
chat3 = "jkutty: My order 412889912 is having an issue, I was charged 300$ when online it says 280$"

In [10]:
# Extracting the order number from chat1.
# "order", then any run of non-digits (\D*), then the captured digits.
# `\d+` requires at least one digit, so an "order" with no number after it
# produces no match rather than an empty string. Raw string keeps `\D`/`\d`
# from being parsed as Python string escapes.
pattern = r'order\D*(\d+)'
matches = re.findall(pattern, chat1)
matches

['412889912']

In [11]:
# Extracting the order number from chat2.
# "order", then any run of non-digits (\D*), then the captured digits.
# `\d+` requires at least one digit, so an "order" with no number after it
# produces no match rather than an empty string. Raw string keeps `\D`/`\d`
# from being parsed as Python string escapes.
pattern = r'order\D*(\d+)'
matches = re.findall(pattern, chat2)
matches

['412889912']

In [12]:
# Extracting the order number from chat3.
# The pattern is stated in this cell so it does not depend on `pattern` still
# holding a value assigned by an earlier cell.
# "order", any run of non-digits, then at least one captured digit.
pattern = r'order\D*(\d+)'
matches = re.findall(pattern, chat3)
matches

['412889912']

## Information Extraction using Regular Expression

In [13]:
# Sample biography text. The extraction cells that follow search it for the
# "Born" line, the "(age NN)" marker, and the line right after that marker.
text='''
Born	Elon Reeve Musk
June 28, 1971 (age 50)
Pretoria, Transvaal, South Africa
Citizenship	
South Africa (1971–present)
Canada (1971–present)
United States (2002–present)
Education	University of Pennsylvania (BS, BA)
Title	
Founder, CEO and Chief Engineer of SpaceX
CEO and product architect of Tesla, Inc.
Founder of The Boring Company and X.com (now part of PayPal)
Co-founder of Neuralink, OpenAI, and Zip2
Spouse(s)	
Justine Wilson
​
​(m. 2000; div. 2008)​
Talulah Riley
​
​(m. 2010; div. 2012)​
​
​(m. 2013; div. 2016)
'''

In [14]:
# Extracting the age from the text.
# Captures the digits that follow "age ". Raw string so `\d` is handed to the
# regex engine rather than parsed as an (invalid) Python string escape.
patterns = r'age (\d+)'
matches = re.findall(patterns, text)
matches

['50']

In [15]:
# Extracting the name from the text: capture whatever follows "Born" up to the
# end of that line (`.` does not cross a newline), then trim the whitespace.
patterns = r'Born(.*)'
matches = re.findall(patterns, text)
matches[0].strip()

'Elon Reeve Musk'

In [16]:
# Extracting the date of birth from the text.
# Skip the rest of the "Born" line, then capture the start of the next line up
# to the literal "(age". Raw string: `\(` is an escaped parenthesis for the
# regex engine, and `\n` is interpreted by `re` as a newline.
patterns = r'Born.*\n(.*)\(age'
matches = re.findall(patterns, text)
matches[0].strip()

'June 28, 1971'

In [17]:
# Extracting the birth place from the text.
# Skip the rest of the line containing "age", then capture the whole next line.
# Written as a raw string like the other patterns; `re` interprets `\n` as a
# newline, so the match is unchanged.
patterns = r'age.*\n(.*)'
matches = re.findall(patterns, text)
matches[0].strip()

'Pretoria, Transvaal, South Africa'

In [18]:
# Helper that simplifies the repeated "findall, then take the first hit" step
def get_pattern_match(patterns, text):
    """Return the first `re.findall` result for `patterns` in `text`.

    Args:
        patterns: Regular expression passed to `re.findall`.
        text: String to search.

    Returns:
        The first element of the `re.findall` result list, or None when the
        list is empty.
    """
    found = re.findall(patterns, text)
    return found[0] if found else None

In [19]:
# Function for extracting personal information from a biography text
def get_personal_information(text):
    """Extract age, full name, birth date and birth place from `text`.

    The patterns look for a line starting with "Born" (the name), a following
    line containing "(age NN)" (birth date and age), and the line after that
    (birth place).

    Args:
        text: Biography text to search.

    Returns:
        dict with keys 'age' (int), 'full_name', 'birth_date' and
        'birth_place' (stripped strings). A field whose pattern does not
        match is returned as None instead of raising.
    """
    # Raw strings keep `\d` and `\(` from being parsed as Python string escapes.
    age = get_pattern_match(r'age (\d+)', text)
    full_name = get_pattern_match(r'Born(.*)', text)
    birth_date = get_pattern_match(r'Born.*\n(.*)\(age', text)
    birth_place = get_pattern_match(r'age.*\n(.*)', text)

    def _stripped(value):
        # get_pattern_match yields None when nothing matched; pass that through.
        return value.strip() if value is not None else None

    return {
        'age': int(age) if age is not None else None,
        'full_name': _stripped(full_name),
        'birth_date': _stripped(birth_date),
        'birth_place': _stripped(birth_place),
    }

### Examples of Regular Expression for text extraction

In [20]:
# Second sample biography, used to exercise get_personal_information.
# NOTE(review): this chained assignment also rebinds `text`, the sample defined
# earlier in the notebook, so re-running the earlier cells that read `text`
# after this one would operate on this string instead — confirm this is intended.
text1=text = '''
Born	Mukesh Dhirubhai Ambani
19 April 1957 (age 64)
Aden, Colony of Aden
(present-day Yemen)[1][2]
Nationality	Indian
Alma mater	
St. Xavier's College, Mumbai
Institute of Chemical Technology (B.E.)
Stanford University (drop-out)
Occupation	Chairman and MD, Reliance Industries
Spouse(s)	Nita Ambani ​(m. 1985)​[3]
Children	3
Parent(s)	
Dhirubhai Ambani (father)
Kokilaben Ambani (mother)
Relatives	Anil Ambani (brother)
Tina Ambani (sister-in-law)
'''

In [21]:
# Run the extractor on the `text1` sample and display the resulting dict.
extraction = get_personal_information(text1)
extraction

{'age': 64,
 'full_name': 'Mukesh Dhirubhai Ambani',
 'birth_date': '19 April 1957',
 'birth_place': 'Aden, Colony of Aden'}

In [22]:
# Third sample biography. Unlike the other samples, the string starts directly
# with the "Born" line (no leading newline).
text3='''Born	William Henry Gates III
October 28, 1955 (age 67)
Seattle, Washington, U.S.
Education	
Harvard University
(dropped out)
Occupations	
Businessmaninvestorphilanthropistprogrammerwriter
Years active	1972–present
Known for	Co-founder of Microsoft and Bill & Melinda Gates Foundation
Title	
Partial list of founded and chaired companies
Spouse	Melinda French
​
​(m. 1994; div. 2021)'''

In [23]:
# Run the extractor on the `text3` sample and display the resulting dict.
extraction2 = get_personal_information(text3)
extraction2

{'age': 67,
 'full_name': 'William Henry Gates III',
 'birth_date': 'October 28, 1955',
 'birth_place': 'Seattle, Washington, U.S.'}