# NLP Tutorial: Regular Expressions


## (1) Regex in customer support
### Retrieve order number


In [1]:
import re # regular expression
chat = 'Bor Klebber Toledo 14 June 1986 (age 37) Bom Jesus dos Perdões, São Paulo, Brazil Occupation	Actor Years active	2007–present Spouse	Camila Queiroz ​m. 2018)'
# if you want to find one digit use r"\d" 
# if you   want to find more digit like year (2023) use r"\d\d\d\d
# Learn more https://regex101.com/

pattern = '\(\d{3}\)-\d{3}-\d{4}'
pattern2 = '\d{6}'
pattern3 = 'one'
pattern4 = '\d{4}'
matches = re.findall(pattern4,chat)

matches

['1986', '2007', '2018']

# Find all email address form text file

In [2]:
import re

def find_emails(text):
    # Regrular expression pattern for matching email addresses
    pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z]{2,}'
    emails = re.findall(pattern,text)
    return emails

if __name__ == "__main__":
    input_text = """Here are some example email addresses: john.doe@example.com, jane_smith123@yahoo.co.uk,
    info@company.org, contact_us123@example-mail-server.info. Don't forget to contact us!
    """

    found_emails = find_emails(input_text)
    print("Find all email addresses below :")

    for email in found_emails:
        print(email)

Find all email addresses below :
john.doe@example
jane_smith123@yahoo
info@company
contact_us123@example


In [3]:
import re

def find_invoice_numbers(text):
    # Regular expression pattern for matching invoice numbers (assumes alphanumeric format)
    pattern = r'\b[A-Za-z0-9]+\b'

    # Find all occurrences of the pattern in the input text
    invoice_numbers = re.findall(pattern, text)

    return invoice_numbers

# Test the function
if __name__ == "__main__":
    input_text = """
    Here are some example invoice numbers: INV1234, 56789, INV-2021, ABC-INV-12345.
    """

    found_invoice_numbers = find_invoice_numbers(input_text)
    print("Found invoice numbers:")
    for invoice_number in found_invoice_numbers:
        print(invoice_number)


Found invoice numbers:
Here
are
some
example
invoice
numbers
INV1234
56789
INV
2021
ABC
INV
12345


In [4]:
import re

def find_invoice_numbers(text):
    # Regular expression patterns for matching invoice numbers
    patterns = [
        # r'\b[A-Za-z0-9]+\b',                            # Alphanumeric (e.g., ABC123)
        r'\b[A-Z0-9]{6,}+\b',                            # Alphanumeric (e.g., ABC123) at 6 digit and Only upper case letter
        r'\bINV-\d{4}-\d+\b',                            # Numeric with hyphen (e.g., INV-2021-1234)
        r'\bINV_\d+\b',                                 # Numeric with underscore (e.g., INV_56789)
        r'\bINV-\d+-\d{4}\b',                           # Numeric with prefix and suffix (e.g., INV-1234-2021)
        r'\bINV-[A-Za-z]+\b',                           # Alphabetic with hyphen (e.g., INV-XYZ)
        r'\bINV_[A-Za-z]+\b',                           # Alphabetic with prefix (e.g., INV_ABC)
        r'\bINV[A-Za-z0-9#-]+_\d{4}-[A-Za-z]+\b',       # Alphanumeric with special characters (e.g., INV#2021-AB12)
        r'\bINV\d{8}-\d+\b',                            # Numeric with prefix and date (e.g., INV20210801-5678)
        r'\b[A-Za-z]+-\d{4}-\d{2}-\d{2}\b',             # Alphabetic with suffix and date (e.g., ABC-2021-08-01)
        r'\bINV[@A-Za-z0-9_]+\b',                       # Combination of alphanumeric and special characters (e.g., INV@2021_ABC)
    ]

    # Find all occurrences of the patterns in the input text
    invoice_numbers = set()
    for pattern in patterns:
        matches = re.findall(pattern, text)
        invoice_numbers.update(matches)

    return invoice_numbers

# Test the function
if __name__ == "__main__":
    input_text = """
    Here are some example invoice numbers: ABC123, INV-2021-1234, INV_56789,
    INV-1234-2021, INV-XYZ, INV_ABC, INV#2021-AB12, INV20210801-5678,
    ABC-2021-08-01, INV@2021_ABC.
    """

    found_invoice_numbers = find_invoice_numbers(input_text)
    print("Found invoice numbers     :     ")
    for invoice_number in found_invoice_numbers:
        print(invoice_number)


Found invoice numbers     :     
INV-1234-2021
INV-2021-1234
ABC123
INV_56789
INV@2021_ABC
INV20210801-5678
INV20210801
INV_ABC
INV-XYZ
ABC-2021-08-01


In [5]:
# Phone number pattern

# get_pattern_match('(\d{10})|(\(\d{3}\)-\d{3}-\d{4})',chat1)
# '1235678912'
# get_pattern_match('(\d{10})|(\(\d{3}\)-\d{3}-\d{4})', chat2)
# ('', '(123)-567-8912')
# get_pattern_match('(\d{10})|(\(\d{3}\)-\d{3}-\d{4})', chat3)

In [6]:
import re


def extract_personal_information(text):
    age = get_pattern_match('age (\d+)', text)
    full_name = get_pattern_match('Born(.*)\n', text)
    birth_date = get_pattern_match('Born.*\n(.*)\(age', text)
    birth_place = get_pattern_match('\(age.*\n(.*)', text)
    return {
        'age': int(age),
        'name': full_name.strip(),
        'birth_date': birth_date.strip(),
        'birth_place': birth_place.strip()
    }


def get_pattern_match(pattern, text):
    matches = re.findall(pattern, text)
    if matches:
        return matches[0]
    

input_text = ''' Born	Mukesh Dhirubhai Ambani
19 April 1957 (age 64)
Aden, Colony of Aden
(present-day Yemen)[1][2]
Nationality	Indian
Alma mater	
St. Xavier's College, Mumbai
Institute of Chemical Technology (B.E.)
Stanford University (drop-out)
Occupation	Chairman and MD, Reliance Industries
Spouse(s)	Nita Ambani ​(m. 1985)​[3]
Children	3
Parent(s)	
Dhirubhai Ambani (father)
Kokilaben Ambani (mother)
Relatives	Anil Ambani (brother)
Tina Ambani (sister-in-law) '''

extract_personal_information(input_text)

{'age': 64,
 'name': 'Mukesh Dhirubhai Ambani',
 'birth_date': '19 April 1957',
 'birth_place': 'Aden, Colony of Aden'}

## Exercise
### 1. Extract all twitter handles from following text. Twitter handle is the text that appears after https://twitter.com/ and is a single word. Also it contains only alpha numeric characters i.e. A-Z a-z , o to 9 and underscore _

In [7]:
text = '''
Follow our leader Elon musk on twitter here: https://twitter.com/elonmusk, more information 
on Tesla's products can be found at https://www.tesla.com/. Also here are leading influencers 
for tesla related news,
https://twitter.com/teslarati
https://twitter.com/dummy_tesla
https://twitter.com/dummy_2_tesla
'''
pattern = 'https://twitter\.com/([a-zA-Z0-9_]+)'

re.findall(pattern, text)

['elonmusk', 'teslarati', 'dummy_tesla', 'dummy_2_tesla']

## 2. Extract Concentration Risk Types. It will be a text that appears after "Concentration Risk:", In below example, your regex should extract these two strings

### (1) Credit Risk

### (2) Supply Rish


In [8]:
import re
text = '''
Concentration of Risk: Credit Risk
Financial instruments that potentially subject us to a concentration of credit risk consist of cash, cash equivalents, marketable securities,
restricted cash, accounts receivable, convertible note hedges, and interest rate swaps. Our cash balances are primarily invested in money market funds
or on deposit at high credit quality financial institutions in the U.S. These deposits are typically in excess of insured limits. As of September 30, 2021
and December 31, 2020, no entity represented 10 more of our total accounts receivable balance. The risk of concentration for our convertible note
hedges and interest rate swaps is mitigated by transacting with several highly-rated multinational banks.
Concentration of Risk: Supply Risk
We are dependent on our suppliers, including single source suppliers, and the inability of these suppliers to deliver necessary components of our
products in a timely manner at prices, quality levels and volumes acceptable to us, or our inability to efficiently manage these components from these
suppliers, could have a material adverse effect on our business, prospects, financial condition and operating results.
'''

pattern = 'Concentration of Risk: ([^\n]*)'
re.findall(pattern,text)


['Credit Risk', 'Supply Risk']

## 3. Companies in europe reports their financial numbers of semi annual basis and you can have a document like this. To exatract quarterly and semin annual period you can use a regex as shown below



In [10]:
text = '''
Tesla's gross cost of operating lease vehicles in FY2021 Q1 was $4.85 billion.
BMW's gross cost of operating vehicles in FY2021 S1 was $8 billion.
'''

pattern = 'FY(\d{4} (?:Q[1-4]|S[1-2]))'
matches = re.findall(pattern, text)
match es

['2021 Q1', '2021 S1']