In [1]:
import re

In [39]:
# Sample inputs for the extraction code below. Each email is one multi-line
# string laid out as: a "Sender:" line, a "CC:" line, a "BCC:" line, a "Body:"
# marker, one introductory sentence, and then one indented line per event.
# Every event line carries a date (DD-MM-YYYY or YYYY-MM-DD) and a 12-hour
# time with AM/PM; the two may appear in either order.
emails = [
'''
Sender: person1@site29.com
CC: ccperson1@site26.com, ccperson.32@site0.net, cc.person5@udacity.org
BCC: bccperson43@site99.com, bccperson.42@site21.net.uk, bcc.person.7@google.org.pk
Body:
We are happy to invite you to the following events:
    Raj's birthday at 29-10-2023 on 06:00 PM.
    Sheila's morning yoga practice dated 2022-03-31, 09:30 AM
'''
,
'''
Sender: person1@site59.net.pk
CC: ccperson1@site26.com, ccperson.32@site0.net, cc.person5@udacity.org
BCC: bccperson43@site99.com, bccperson.42@site21.net.uk, bcc.person.7@google.org.pk
Body:
You are cordially invited to:
    Avril's business meeting on Environmental Disasters, airing live on 2023-02-08 at 01:00 PM.
    Sheila's yoga practice on 12:30 PM, 31-03-2022
'''
]

In [31]:
# you can find the characteristics of username and domain parts of email here:
# https://snov.io/knowledgebase/what-is-a-valid-email-address-format/
#
# Both parts are written as "run (separator run)*" so there is exactly one way
# to match any given text. The previous "(run sep? run)+" form had nested
# quantifiers, which (a) backtracked exponentially on long alphanumeric text
# with no '@' and (b) could not match local parts such as "j.r.smith" or "a".
# The groups are deliberately capturing: callers that wrap the pattern and use
# re.findall rely on the pattern containing at least one group.
# Only lower-case letters are accepted; lower-case the input first if needed.

# Optional leading '-' or '_', then alphanumeric runs joined by single '.', '-' or '_'.
username_regex = r'[-_]?[a-z0-9]+([._-][a-z0-9]+)*'
# Two or more dot-separated labels; a label is alphanumeric runs joined by hyphens,
# so it can neither start nor end with a hyphen.
domain_regex = r'[a-z0-9]+(-+[a-z0-9]+)*(\.[a-z0-9]+(-+[a-z0-9]+)*)+'

email_regex = f'{username_regex}@{domain_regex}'

In [32]:
# Building blocks for numeric dates with zero-padded, hyphen-separated fields.
# The groups are capturing on purpose: callers that wrap a pattern and use
# re.findall rely on the pattern containing at least one group.
year_regex = r'\d{4}'

# Months 01-12.
jan_sept_regex = r'(0[1-9])'
oct_dec_regex = r'(1[0-2])'
month_regex = f'({jan_sept_regex}|{oct_dec_regex})'

# Days 01-31. The day is not checked against the month, so e.g. 31-02 is accepted.
_1_to_9_days_regex = r'(0[1-9])'
_10_to_29_days_regex = r'([12][0-9])'
_30_to_31_days_regex = r'(3[01])'
day_regex = f'({_1_to_9_days_regex}|{_10_to_29_days_regex}|{_30_to_31_days_regex})'

# A hyphen is only special inside a character class, so it is written bare.
# Writing '\-' in a non-raw f-string is an invalid escape sequence and makes
# newer Python versions emit a warning when the cell is compiled.
year_month_day_date_regex = f'({year_regex}-{month_regex}-{day_regex})'
day_month_year_date_regex = f'({day_regex}-{month_regex}-{year_regex})'

# Either YYYY-MM-DD or DD-MM-YYYY. This is a bare top-level alternation, so
# wrap it in a group before embedding it inside a larger pattern.
date_regex = f'{year_month_day_date_regex}|{day_month_year_date_regex}'

In [33]:
# 12-hour clock time such as "09:30 AM": a zero-padded hour (01-12), a colon,
# a two-digit minute (00-59), one space, then an upper-case AM or PM.
_1_to_9_hours_regex = '(0[1-9])'
_10_to_12_hours_regex = '(1[012])'
hour_regex = '(' + '|'.join([_1_to_9_hours_regex, _10_to_12_hours_regex]) + ')'

minute_regex = '[0-5][0-9]'
ampm_regex = '(AM|PM)'

time_regex = '{}:{} {}'.format(hour_regex, minute_regex, ampm_regex)

In [40]:
def get_matched_strings(regex, string):
    """Return the full text of every non-overlapping match of ``regex`` in ``string``.

    Args:
        regex: Regular-expression pattern. It may contain any number of
            capturing groups (including none) and may be a bare top-level
            alternation such as ``'ab|cd'``.
        string: Text to scan.

    Returns:
        A list of matched substrings in left-to-right order; an empty list
        when nothing matches.
    """
    # group(0) is always the whole match, independent of how many groups the
    # pattern defines. Wrapping the pattern in '(...)' and indexing findall()
    # results returned only the first character of each match whenever the
    # pattern had no inner group, because findall() then yields plain strings.
    return [match.group(0) for match in re.finditer(regex, string)]

In [53]:
def process_email(email):
    """Extract the sender, CC/BCC recipients and event schedule from one email.

    The email is read positionally. After blank lines are dropped, the first
    three lines are taken as the sender, CC and BCC lines, and everything after
    them is the body. The first two body lines are skipped (in the sample
    emails these are the ``Body:`` marker and the introductory sentence); each
    remaining line is treated as one event.

    Args:
        email: The whole email as a single newline-separated string.

    Returns:
        A tuple ``(sender, cc_email_addresses, bcc_email_addresses,
        event_details)`` where ``sender`` is the first address on the sender
        line, the two address values are lists of strings, and
        ``event_details`` is a list of ``[date, time]`` pairs, one per event
        line.

    Raises:
        ValueError: If the email has fewer than three non-blank lines.
        IndexError: If the sender line contains no email address, or an event
            line lacks a date or a time.
    """
    # Drop whitespace-only lines as well as empty ones. A line holding only
    # indentation (or a bare '\r' from CRLF input) would otherwise shift the
    # positional unpacking below or be parsed as an event line.
    email_lines = [line for line in re.split(r'\n+', email) if line.strip()]

    sender_line, cc_line, bcc_line, *body_lines = email_lines

    # Skip the first two body lines; the rest are event lines.
    event_lines = body_lines[2:]

    sender = get_matched_strings(email_regex, sender_line)[0]
    cc_email_addresses = get_matched_strings(email_regex, cc_line)
    bcc_email_addresses = get_matched_strings(email_regex, bcc_line)

    event_details = []
    for event_line in event_lines:
        # Only the first date and the first time on the line are used.
        event_date = get_matched_strings(date_regex, event_line)[0]
        event_time = get_matched_strings(time_regex, event_line)[0]
        event_details.append([event_date, event_time])

    return (sender, cc_email_addresses, bcc_email_addresses, event_details)

In [58]:
def extractFromEmails(emails):
    """Run ``process_email`` on each email and regroup the results by field.

    Args:
        emails: Iterable of email strings.

    Returns:
        A 4-tuple of lists ``(senders, CCs, BCCs, event details)``. Each list
        has one entry per input email, in input order.
    """
    # Process every email first, then project each field into its own list.
    per_email = [process_email(message) for message in emails]

    senders = [sender for sender, _, _, _ in per_email]
    CCs_by_email = [cc_emails for _, cc_emails, _, _ in per_email]
    BCCs_by_email = [bcc_emails for _, _, bcc_emails, _ in per_email]
    event_details_by_email = [event_details for _, _, _, event_details in per_email]

    return senders, CCs_by_email, BCCs_by_email, event_details_by_email

In [55]:
# Pair each extracted list with a label and print the pairs, one per block,
# separated by two blank lines.
section_labels = ['Senders', 'CCs', 'BCCs', "Events' Details"]
labels_packed_results = zip(section_labels, extractFromEmails(emails))
print(*labels_packed_results, sep='\n\n\n')

('Senders', ['person1@site29.com', 'person1@site59.net.pk'])


('CCs', [['ccperson1@site26.com', 'ccperson.32@site0.net', 'cc.person5@udacity.org'], ['ccperson1@site26.com', 'ccperson.32@site0.net', 'cc.person5@udacity.org']])


('BCCs', [['bccperson43@site99.com', 'bccperson.42@site21.net.uk', 'bcc.person.7@google.org.pk'], ['bccperson43@site99.com', 'bccperson.42@site21.net.uk', 'bcc.person.7@google.org.pk']])


("Events' Details", [[['29-10-2023', '06:00 PM'], ['2022-03-31', '09:30 AM']], [['2023-02-08', '01:00 PM'], ['31-03-2022', '12:30 PM']]])


In [56]:
# Demo: splitting on the pattern '\n+' treats each run of consecutive newlines
# as a single separator, so the blank lines between the names produce no empty
# strings in the result.
s = '''Ahsan

Huzaifa

Irshad'''
re.split('\n+', s)

['Ahsan', 'Huzaifa', 'Irshad']

In [57]:
# Run the extraction on the sample emails; as the cell's last expression, the
# returned tuple of four lists is shown as the cell output.
extractFromEmails(emails)

(['person1@site29.com', 'person1@site59.net.pk'],
 [['ccperson1@site26.com', 'ccperson.32@site0.net', 'cc.person5@udacity.org'],
  ['ccperson1@site26.com', 'ccperson.32@site0.net', 'cc.person5@udacity.org']],
 [['bccperson43@site99.com',
   'bccperson.42@site21.net.uk',
   'bcc.person.7@google.org.pk'],
  ['bccperson43@site99.com',
   'bccperson.42@site21.net.uk',
   'bcc.person.7@google.org.pk']],
 [[['29-10-2023', '06:00 PM'], ['2022-03-31', '09:30 AM']],
  [['2023-02-08', '01:00 PM'], ['31-03-2022', '12:30 PM']]])