# NLP : Information Extraction using Regex

In [1]:
import re

In [2]:
text="""Born: Ratan Naval Tata 
28 December 1937 (age 85)
Bombay, Bombay Presidency, British India
school: Cornell University (BArch)
Occupation(s): Industrialist, Philanthropist
Title	Chairman Emeritus, Tata Sons and Tata Group[1]
Term: 1991–2012 2016–2017
Predecessor: J. R. D. Tata
Successor: Cyrus Mistry (2012–2016)
Natarajan Chandrasekaran (2017–present)
Parent: Naval Tata
Relatives: Tata family
Awards: 
Order of Australia (2023)
Assam Baibhav (2021)
Honorary Knight Grand Cross of the Order of the British Empire (2014)
Padma Vibhushan (2008)
Maharashtra Bhushan (2006)
Padma Bhushan (2000) """

# --- Regular expression for getting Name

In [3]:
pattern_for_Name="Born.([\dA-Za-z ]+)"

In [4]:
re.findall(pattern_for_Name,text)

[' Ratan Naval Tata ']

# --- Regular expression for getting Birthdate and Birth Place

In [5]:
pattern_for_birthdate='\d+ [a-zA-Z]+ \d{4}' #or \d+ \D+ \d{4}

In [6]:
re.findall(pattern_for_birthdate,text)

['28 December 1937']

In [7]:
pattern_for_place_DOB='(\d+ \D+ \d{4})|\(age.*\n([A-Za-z, ]+)'

In [8]:
re.findall(pattern_for_place_DOB,text) # DOB with Birth Place

[('28 December 1937', ''), ('', 'Bombay, Bombay Presidency, British India')]

In [9]:
pattern_for_place_DOB='(\d+ \D+ \d{4})|\(age.*\n(.*)'

In [10]:
re.findall(pattern_for_place_DOB,text) # DOB with Birth Place

[('28 December 1937', ''), ('', 'Bombay, Bombay Presidency, British India')]

# --- Regular expression for getting age

In [11]:
pattern_for_age='age.(\d+)'

In [12]:
re.findall(pattern_for_age,text)

['85']

# --- Regular expression for getting Awards

In [13]:
pattern_awards='Awards: \n([A-za-z0-9() \n]+)'

In [14]:
re.findall(pattern_awards,text)

['Order of Australia (2023)\nAssam Baibhav (2021)\nHonorary Knight Grand Cross of the Order of the British Empire (2014)\nPadma Vibhushan (2008)\nMaharashtra Bhushan (2006)\nPadma Bhushan (2000) ']

# --- A Python function to get all neccessary detail at a time.

In [15]:
def get_pattern_match(pattern,text):
    matches = re.findall(pattern,text)
    if matches:
        return matches[0]

In [16]:
get_pattern_match('age.(\d+)', text)

'85'

In [17]:
get_pattern_match('Awards: \n([A-za-z0-9() \n]+)', text)

'Order of Australia (2023)\nAssam Baibhav (2021)\nHonorary Knight Grand Cross of the Order of the British Empire (2014)\nPadma Vibhushan (2008)\nMaharashtra Bhushan (2006)\nPadma Bhushan (2000) '

# -- A Python Function for getting all details in single code

 Regex for Information Extraction

In [18]:
def extract_personal_information(text):
    age = get_pattern_match('age (\d+)', text)
    full_name = get_pattern_match('Born(.*)\n', text)
    birth_date = get_pattern_match('Born.*\n(.*)\(age', text)
    birth_place = get_pattern_match('\(age.*\n(.*)', text)
    awards = get_pattern_match('Awards: \n([A-za-z0-9() \n]+)', text)
    return {
        'age': int(age),
        'name': full_name.strip(),
        'birth_date': birth_date.strip(),
        'birth_place': birth_place.strip(),
        'awards': awards.strip()
    }

In [19]:
extract_personal_information(text)

{'age': 85,
 'name': ': Ratan Naval Tata',
 'birth_date': '28 December 1937',
 'birth_place': 'Bombay, Bombay Presidency, British India',
 'awards': 'Order of Australia (2023)\nAssam Baibhav (2021)\nHonorary Knight Grand Cross of the Order of the British Empire (2014)\nPadma Vibhushan (2008)\nMaharashtra Bhushan (2006)\nPadma Bhushan (2000)'}

In [22]:
text = '''
Born	Mukesh Dhirubhai Ambani
19 April 1957 (age 64)
Aden, Colony of Aden
(present-day Yemen)[1][2]
Nationality	Indian
Alma mater	
St. Xavier's College, Mumbai
Institute of Chemical Technology (B.E.)
Stanford University (drop-out)
Occupation	Chairman and MD, Reliance Industries
Spouse(s)	Nita Ambani ​(m. 1985)​[3]
Children	3
Parent(s)	
Dhirubhai Ambani (father)
Kokilaben Ambani (mother)
Relatives	Anil Ambani (brother)
Tina Ambani (sister-in-law)
'''

In [24]:
def extract_personal_information(text):
    age = get_pattern_match('age (\d+)', text)
    full_name = get_pattern_match('Born(.*)\n', text)
    birth_date = get_pattern_match('Born.*\n(.*)\(age', text)
    birth_place = get_pattern_match('\(age.*\n(.*)', text)
    #awards = get_pattern_match('Awards: \n([A-za-z0-9() \n]+)', text)
    return {
        'age': int(age),
        'name': full_name.strip(),
        'birth_date': birth_date.strip(),
        'birth_place': birth_place.strip(),
       # 'awards': awards.strip()
    }

In [25]:
extract_personal_information(text)

{'age': 64,
 'name': 'Mukesh Dhirubhai Ambani',
 'birth_date': '19 April 1957',
 'birth_place': 'Aden, Colony of Aden'}