## Notebook for testing the Outlook API

#### The first attempt was to fetch the message content as text; later I switched to MIME because it returns more information in a single request

In [None]:
from pandas import DataFrame

#### text content

In [1]:
import requests
from datetime import datetime, timezone
import msal
import json
import os
import re
import argparse
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.data.tables import TableServiceClient, TableEntity

# Credentials and targets are read from the environment; os.environ.get
# yields None for any variable that is not set.
OUTLOOK_API_USER_ID = os.environ.get('OUTLOOK_API_USER_ID')
OUTLOOK_API_CLIENT_ID = os.environ.get('OUTLOOK_API_CLIENT_ID')
OUTLOOK_API_SECRET = os.environ.get('OUTLOOK_API_SECRET')
OUTLOOK_CONTENT_CONNECTION_STRING = os.environ.get('OUTLOOK_CONTENT_CONNECTION_STRING')

# Settings consumed by get_token() when it builds the MSAL client.
# NOTE(review): the GUID in the authority URL is hard-coded (presumably the
# Azure AD tenant id) - confirm whether it should also come from the environment.
config = {
    "authority": "https://login.microsoftonline.com/c08b32af-3535-4f63-8a3a-51247cf1f022",
    "client_id": OUTLOOK_API_CLIENT_ID,
    "scope": ["https://graph.microsoft.com/.default"],
    "secret": OUTLOOK_API_SECRET
}


#function to get token
def get_token():
    """Acquire a Microsoft Graph access token for the configured application.

    Builds an MSAL confidential client from the module-level ``config`` dict,
    asks the token cache first and falls back to requesting a new token with
    the client credentials.

    Returns:
        str: The bearer access token.

    Raises:
        RuntimeError: If MSAL does not return an access token. The ``error``
            and ``error_description`` fields of the MSAL result are included
            in the message.
    """
    app = msal.ConfidentialClientApplication(
        config['client_id'], authority=config['authority'], client_credential=config['secret']
    )

    # NOTE(review): the client object is rebuilt on every call, so this cache
    # lookup presumably always misses; it is kept to preserve the call pattern.
    result = app.acquire_token_silent(config['scope'], account=None)
    if not result:
        result = app.acquire_token_for_client(scopes=config['scope'])

    # Fail loudly instead of implicitly returning None, which would end up
    # as "Bearer None" in every later request.
    if "access_token" not in result:
        raise RuntimeError(
            "Could not acquire access token: "
            f"{result.get('error')}: {result.get('error_description')}"
        )
    return result['access_token']

#function to get folder ids by name
def get_folder_id(folder_name, access_token, user_id):
    """Return the id of the top-level mail folder with the given display name.

    Args:
        folder_name: Display name of the folder to look up.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.

    Returns:
        The Graph id of the matching folder.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
        KeyError: If no folder with that display name exists.
    """
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders"
    headers = {'Authorization': f"Bearer {access_token}", 'Content-Type': "application/json"}

    # Graph returns collections in pages; follow @odata.nextLink so folders
    # beyond the first page are found as well.
    folder_ids = {}
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        payload = response.json()
        folder_ids.update({folder['displayName']: folder['id'] for folder in payload['value']})
        url = payload.get('@odata.nextLink')

    return folder_ids[folder_name]


#function to get subfolder ids by name
def get_subfolder_id(subfolder_name, access_token, user_id, folder_id):
    """Return the id of a child folder of ``folder_id`` by display name.

    Args:
        subfolder_name: Display name of the child folder to look up.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.
        folder_id: Graph id of the parent mail folder.

    Returns:
        The Graph id of the matching child folder.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
        KeyError: If no child folder with that display name exists.
    """
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders/{folder_id}/childFolders"
    headers = {'Authorization': f"Bearer {access_token}", 'Content-Type': "application/json"}

    # Follow @odata.nextLink so child folders beyond the first page are seen.
    subfolder_ids = {}
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        payload = response.json()
        subfolder_ids.update({folder['displayName']: folder['id'] for folder in payload['value']})
        url = payload.get('@odata.nextLink')

    return subfolder_ids[subfolder_name]

#function to get email ids by daterange
def get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id, subfolder_id):
    """List the ids of all messages received in ``[start_time, end_time)``.

    Args:
        start_time: Lower bound (inclusive), inserted verbatim into the
            ``$filter`` expression.
        end_time: Upper bound (exclusive), inserted verbatim into the
            ``$filter`` expression.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.
        folder_id: Graph id of the mail folder to search.
        subfolder_id: Graph id of a child folder of ``folder_id``, or None to
            search ``folder_id`` itself.

    Returns:
        list: Message ids in the order Graph returned them.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
    """
    base_url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders/{folder_id}"
    if subfolder_id is not None:
        url = f"{base_url}/childFolders/{subfolder_id}/messages"
    else:
        url = f"{base_url}/messages"

    headers = {'Authorization': f"Bearer {access_token}"}

    # Filter the messages by the receivedDateTime
    params = {
        '$filter': f"receivedDateTime ge {start_time} and receivedDateTime lt {end_time}",
        '$select': 'id',
        '$top': '1000'
    }

    message_ids = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        payload = response.json()
        message_ids.extend(email['id'] for email in payload['value'])
        # More than one page of results: Graph hands back a ready-made URL
        # that already carries the query options, so params are dropped.
        url = payload.get('@odata.nextLink')
        params = None
    return message_ids





#class to get email content as text, sender, recipient, subject, receivedDateTime, conversationId, webLink
class Email:
    '''
    One Outlook message fetched from Microsoft Graph with a plain-text body.

    The message is downloaded in the constructor. ``to_dict`` returns subject,
    cleaned content, sender, recipients, receivedDateTime, conversationId and
    webLink as a flat dict; ``save_to_azure`` stores that dict as a table entity.

    args: message_id, token, user_id
    '''
    def __init__(self, message_id, token, user_id):
        self.message_id = message_id
        self.token = token
        self.user_id = user_id
        self.content = None
        self.sender = None
        self.recipient = None
        self.received_datetime = None
        self.conversation_id = None
        self.web_link = None
        self._subject = None
        self._fetch_email()

    def _fetch_email(self):
        """Download the message as JSON and copy the used fields onto self.

        Raises:
            requests.HTTPError: If Graph answers with a 4xx/5xx status.
        """
        headers = {
            'Authorization': f"Bearer {self.token}",
            # Ask Graph to deliver the body as text instead of HTML
            'Prefer': 'outlook.body-content-type="text"'
        }
        url = f"https://graph.microsoft.com/v1.0/users/{self.user_id}/messages/{self.message_id}"
        response = requests.get(url, headers=headers)
        # Surface HTTP errors here rather than as a KeyError on the error payload below
        response.raise_for_status()
        email_data = response.json()

        self.content = email_data['body']['content']
        self.sender = email_data['from']['emailAddress']['address']
        self.recipient = [recipient['emailAddress']['address'] for recipient in email_data['toRecipients']]
        self.received_datetime = email_data['receivedDateTime']
        self.conversation_id = email_data['conversationId']
        self.web_link = email_data['webLink']
        self._subject = email_data['subject']

    def _clean_content(self, content):
        """Return ``content`` with links, quoted headers and contact details stripped."""
        # Replace multiple newlines with a single newline
        content = re.sub(r'\n+', '\n', content)
        # Replace URLs with '[link removed]'; the dot after "www" is escaped so
        # that ordinary words starting with "www" are left alone
        content = re.sub(r'http\S+|www\.\S+', '[link removed]', content)
        # Drop everything from a quoted "An:" header up to the boilerplate sentence
        content = re.sub(r'An:.*?Sie erreichen Ihr PKM unter folgendem Link', '', content, flags=re.DOTALL)
        content = re.sub(r'An:.*?Betreff:', '', content)  # remove email headers (same line only)
        content = re.sub(r'[\w]+@[\.\w]+', "", content)  # removing email addresses
        # removing addresses: four digits followed by a word (presumably postal code + city)
        content = re.sub(r'\d{4}\s\w+', "", content)
        content = re.sub(r'[PMT:]*\s*\+\d{1,3}\s[(0)]?(?:[()\s]?\d{1,3}){1,10}', "", content)  # removing phone numbers

        return content

    def to_dict(self):
        """Return the message as a flat dict with cleaned content."""
        return {
            'message_id': self.message_id,
            'subject': self._subject,
            'content': self._clean_content(self.content),
            'sender': self.sender,
            'recipient': ', '.join(self.recipient),  # convert list to string
            'received_datetime': self.received_datetime,
            'conversation_id': self.conversation_id,
            'web_link': self.web_link
        }

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4)

    def save_to_azure(self, azure_table):
        """Store the message via ``azure_table.create_entity``."""
        entity = self.to_dict()
        # Azure Table Storage needs a PartitionKey and a RowKey
        entity['PartitionKey'] = self.sender
        entity['RowKey'] = self.message_id
        azure_table.create_entity(entity)



class AzureTable:
    """Small wrapper around the client of a single Azure Storage table."""

    def __init__(self, connection_string=OUTLOOK_CONTENT_CONNECTION_STRING, table_name="outlooktest"):
        self.table_name = table_name
        service_client = TableServiceClient.from_connection_string(connection_string)
        self.table_client = service_client.get_table_client(table_name)

    def create_entity(self, entity):
        """Insert ``entity``; on any failure print a message instead of raising."""
        try:
            self.table_client.create_entity(entity=entity)
        except Exception as error:
            print(f"Could not add entity to table: {error}")

# create the top-level parser
#def parse_args():
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--start_date", help="start_date")
#   parser.add_argument("--end_date", help="end_time")
#   parser.add_argument("--subfolder_name", help="subfolder_name", default=None)
#   #parser.add_argument("azure_table", help="azure_table", default=AzureTable())
#   return parser.parse_args()
#


#### MIME content

In [None]:
import requests
from datetime import datetime, timezone
import msal
import json
import os
import re
import argparse
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.data.tables import TableServiceClient, TableEntity

# Credentials and targets are read from the environment; os.environ.get
# yields None for any variable that is not set.
OUTLOOK_API_USER_ID = os.environ.get('OUTLOOK_API_USER_ID')
OUTLOOK_API_CLIENT_ID = os.environ.get('OUTLOOK_API_CLIENT_ID')
OUTLOOK_API_SECRET = os.environ.get('OUTLOOK_API_SECRET')
OUTLOOK_CONTENT_CONNECTION_STRING = os.environ.get('OUTLOOK_CONTENT_CONNECTION_STRING')

# Settings consumed by get_token() when it builds the MSAL client.
# NOTE(review): the GUID in the authority URL is hard-coded (presumably the
# Azure AD tenant id) - confirm whether it should also come from the environment.
config = {
    "authority": "https://login.microsoftonline.com/c08b32af-3535-4f63-8a3a-51247cf1f022",
    "client_id": OUTLOOK_API_CLIENT_ID,
    "scope": ["https://graph.microsoft.com/.default"],
    "secret": OUTLOOK_API_SECRET
}


#function to get token
def get_token():
    """Acquire a Microsoft Graph access token for the configured application.

    Builds an MSAL confidential client from the module-level ``config`` dict,
    asks the token cache first and falls back to requesting a new token with
    the client credentials.

    Returns:
        str: The bearer access token.

    Raises:
        RuntimeError: If MSAL does not return an access token. The ``error``
            and ``error_description`` fields of the MSAL result are included
            in the message.
    """
    app = msal.ConfidentialClientApplication(
        config['client_id'], authority=config['authority'], client_credential=config['secret']
    )

    # NOTE(review): the client object is rebuilt on every call, so this cache
    # lookup presumably always misses; it is kept to preserve the call pattern.
    result = app.acquire_token_silent(config['scope'], account=None)
    if not result:
        result = app.acquire_token_for_client(scopes=config['scope'])

    # Fail loudly instead of implicitly returning None, which would end up
    # as "Bearer None" in every later request.
    if "access_token" not in result:
        raise RuntimeError(
            "Could not acquire access token: "
            f"{result.get('error')}: {result.get('error_description')}"
        )
    return result['access_token']

#function to get folder ids by name
def get_folder_id(folder_name, access_token, user_id):
    """Return the id of the top-level mail folder with the given display name.

    Args:
        folder_name: Display name of the folder to look up.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.

    Returns:
        The Graph id of the matching folder.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
        KeyError: If no folder with that display name exists.
    """
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders"
    headers = {'Authorization': f"Bearer {access_token}", 'Content-Type': "application/json"}

    # Graph returns collections in pages; follow @odata.nextLink so folders
    # beyond the first page are found as well.
    folder_ids = {}
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        payload = response.json()
        folder_ids.update({folder['displayName']: folder['id'] for folder in payload['value']})
        url = payload.get('@odata.nextLink')

    return folder_ids[folder_name]


#function to get subfolder ids by name
def get_subfolder_id(subfolder_name, access_token, user_id, folder_id):
    """Return the id of a child folder of ``folder_id`` by display name.

    Args:
        subfolder_name: Display name of the child folder to look up.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.
        folder_id: Graph id of the parent mail folder.

    Returns:
        The Graph id of the matching child folder.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
        KeyError: If no child folder with that display name exists.
    """
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders/{folder_id}/childFolders"
    headers = {'Authorization': f"Bearer {access_token}", 'Content-Type': "application/json"}

    # Follow @odata.nextLink so child folders beyond the first page are seen.
    subfolder_ids = {}
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        payload = response.json()
        subfolder_ids.update({folder['displayName']: folder['id'] for folder in payload['value']})
        url = payload.get('@odata.nextLink')

    return subfolder_ids[subfolder_name]

#function to get email ids by daterange
def get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id, subfolder_id=None):
    """List ``(id, webLink)`` for all messages received in ``[start_time, end_time)``.

    Args:
        start_time: Lower bound (inclusive), inserted verbatim into the
            ``$filter`` expression.
        end_time: Upper bound (exclusive), inserted verbatim into the
            ``$filter`` expression.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.
        folder_id: Graph id of the mail folder to search.
        subfolder_id: Graph id of a child folder of ``folder_id``, or None
            (default) to search ``folder_id`` itself.

    Returns:
        list[tuple]: One ``(id, webLink)`` tuple per message, in the order
        Graph returned them.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
    """
    base_url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders/{folder_id}"
    if subfolder_id is not None:
        url = f"{base_url}/childFolders/{subfolder_id}/messages"
    else:
        url = f"{base_url}/messages"

    headers = {'Authorization': f"Bearer {access_token}"}

    # Filter the messages by the receivedDateTime
    params = {
        '$filter': f"receivedDateTime ge {start_time} and receivedDateTime lt {end_time}",
        '$select': 'id, webLink',
        '$top': '1000'
    }

    message_ids = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        payload = response.json()
        message_ids.extend((email['id'], email['webLink']) for email in payload['value'])
        # More than one page of results: Graph hands back a ready-made URL
        # that already carries the query options, so params are dropped.
        url = payload.get('@odata.nextLink')
        params = None
    return message_ids





#class to get email content as text, sender, recipient, subject, receivedDateTime, conversationId, webLink
class oldEmail:
    '''
    Earlier JSON-based message wrapper, kept next to the MIME experiments.

    The message is downloaded in the constructor. ``to_dict`` returns subject,
    cleaned content, sender, recipients, receivedDateTime, conversationId and
    webLink as a flat dict; ``save_to_azure`` stores that dict as a table entity.

    args: message_id, token, user_id
    '''
    def __init__(self, message_id, token, user_id):
        self.message_id = message_id
        self.token = token
        self.user_id = user_id
        self.content = None
        self.sender = None
        self.recipient = None
        self.received_datetime = None
        self.conversation_id = None
        self.web_link = None
        self._subject = None
        self._fetch_email()

    def _fetch_email(self):
        """Download the message as JSON and copy the used fields onto self.

        Raises:
            requests.HTTPError: If Graph answers with a 4xx/5xx status.
        """
        headers = {
            'Authorization': f"Bearer {self.token}",
            # Ask Graph to deliver the body as text instead of HTML
            'Prefer': 'outlook.body-content-type="text"'
        }
        # The fields read below only exist in the JSON representation, so the
        # message resource itself is requested, not its raw MIME ("/$value").
        url = f"https://graph.microsoft.com/v1.0/users/{self.user_id}/messages/{self.message_id}"
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        email_data = response.json()

        self.content = email_data['body']['content']
        self.sender = email_data['from']['emailAddress']['address']
        self.recipient = [recipient['emailAddress']['address'] for recipient in email_data['toRecipients']]
        self.received_datetime = email_data['receivedDateTime']
        self.conversation_id = email_data['conversationId']
        self.web_link = email_data['webLink']
        self._subject = email_data['subject']

    def _clean_content(self, content):
        """Return ``content`` with links, quoted headers and contact details stripped."""
        # Replace multiple newlines with a single newline
        content = re.sub(r'\n+', '\n', content)
        # Replace URLs with '[link removed]'; the dot after "www" is escaped so
        # that ordinary words starting with "www" are left alone
        content = re.sub(r'http\S+|www\.\S+', '[link removed]', content)
        # Drop everything from a quoted "An:" header up to the boilerplate sentence
        content = re.sub(r'An:.*?Sie erreichen Ihr PKM unter folgendem Link', '', content, flags=re.DOTALL)
        content = re.sub(r'An:.*?Betreff:', '', content)  # remove email headers (same line only)
        content = re.sub(r'[\w]+@[\.\w]+', "", content)  # removing email addresses
        # removing addresses: four digits followed by a word (presumably postal code + city)
        content = re.sub(r'\d{4}\s\w+', "", content)
        content = re.sub(r'[PMT:]*\s*\+\d{1,3}\s[(0)]?(?:[()\s]?\d{1,3}){1,10}', "", content)  # removing phone numbers

        return content

    def to_dict(self):
        """Return the message as a flat dict with cleaned content."""
        return {
            'message_id': self.message_id,
            'subject': self._subject,
            'content': self._clean_content(self.content),
            'sender': self.sender,
            'recipient': ', '.join(self.recipient),  # convert list to string
            'received_datetime': self.received_datetime,
            'conversation_id': self.conversation_id,
            'web_link': self.web_link
        }

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4)

    def save_to_azure(self, azure_table):
        """Store the message via ``azure_table.create_entity``."""
        entity = self.to_dict()
        # Azure Table Storage needs a PartitionKey and a RowKey
        entity['PartitionKey'] = self.sender
        entity['RowKey'] = self.message_id
        azure_table.create_entity(entity)



class AzureTable:
    """Small wrapper around the client of a single Azure Storage table."""

    def __init__(self, connection_string=OUTLOOK_CONTENT_CONNECTION_STRING, table_name="outlooktest"):
        self.table_name = table_name
        service_client = TableServiceClient.from_connection_string(connection_string)
        self.table_client = service_client.get_table_client(table_name)

    def create_entity(self, entity):
        """Insert ``entity``; on any failure print a message instead of raising."""
        try:
            self.table_client.create_entity(entity=entity)
        except Exception as error:
            print(f"Could not add entity to table: {error}")

# create the top-level parser
#def parse_args():
#   parser = argparse.ArgumentParser()
#   parser.add_argument("--start_date", help="start_date")
#   parser.add_argument("--end_date", help="end_time")
#   parser.add_argument("--subfolder_name", help="subfolder_name", default=None)
#   #parser.add_argument("azure_table", help="azure_table", default=AzureTable())
#   return parser.parse_args()
#


In [2]:
access_token = get_token()

In [None]:
access_token

In [None]:
folder_id = get_folder_id("Posteingang", access_token, OUTLOOK_API_USER_ID)

In [None]:
#function to get email ids by daterange
def get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id):
    """List the ids of all messages received in ``[start_time, end_time)``.

    Args:
        start_time: Lower bound (inclusive) as a ``dd-mm-YYYY`` string,
            interpreted as midnight UTC.
        end_time: Upper bound (exclusive) as a ``dd-mm-YYYY`` string,
            interpreted as midnight UTC.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.
        folder_id: Graph id of the mail folder to search.

    Returns:
        list: Message ids in the order Graph returned them.

    Raises:
        ValueError: If a date string does not match ``dd-mm-YYYY``.
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
    """
    start_time = datetime.strptime(start_time, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()
    end_time = datetime.strptime(end_time, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders/{folder_id}/messages"

    headers = {'Authorization': f"Bearer {access_token}"}

    # Filter the messages by the receivedDateTime
    params = {
        '$filter': f"receivedDateTime ge {start_time} and receivedDateTime lt {end_time}",
        '$select': 'id',
        '$top': '1000'
    }

    message_ids = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        payload = response.json()
        message_ids.extend(email['id'] for email in payload['value'])
        # More than one page of results: Graph hands back a ready-made URL
        # that already carries the query options, so params are dropped.
        url = payload.get('@odata.nextLink')
        params = None
    return message_ids

In [None]:
message_ids = get_email_ids_by_daterange(  "01-05-2023", "05-05-2023",access_token, OUTLOOK_API_USER_ID, folder_id)


In [None]:
message_ids[0]

In [None]:
def fetch_email(message_id, user_id, access_token):
    """Download one message in MIME format.

    Args:
        message_id: Graph id of the message.
        user_id: Id (or user principal name) of the mailbox owner.
        access_token: Bearer token for Microsoft Graph.

    Returns:
        bytes: The raw response body of the ``/$value`` request.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
    """
    headers = {
        'Authorization': f"Bearer {access_token}"
    }
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/messages/{message_id}/$value"
    response = requests.get(url, headers=headers)
    # Without this check an error payload would be returned as if it were the MIME
    response.raise_for_status()
    return response.content

In [None]:
t = fetch_email(message_ids[2], OUTLOOK_API_USER_ID, access_token)

In [None]:
t

In [None]:
import requests
import html2text
from email.mime.text import MIMEText
from email.mime.multipart import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.parser import Parser
from email.utils import parseaddr
from datetime import datetime

import json
import os
import re
import argparse
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.data.tables import TableServiceClient, TableEntity

class Email:
    """One Outlook message downloaded as MIME and parsed with the stdlib ``email`` package.

    Args:
        message_id: ``(id, webLink)`` tuple for the message.
        token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.

    The constructor downloads the MIME and fills ``sender``, ``recipients``,
    ``received_date``, ``conversation_id``, ``subject`` and ``content``.
    """

    def __init__(self, message_id, token, user_id):
        self.message_id, self.webLink = message_id
        self.token = token
        self.user_id = user_id
        self.mime = self.fetch_mime()
        self.parse_message()
        self.sender = self.get_sender()
        self.recipients = self.get_recipients()
        self.received_date = self.get_received_date()
        self.conversation_id = self.get_conversation_id()
        self.subject = self.get_subject()
        self.content = self.extract_content()

    def fetch_mime(self):
        """Return the raw MIME bytes of the message; raises requests.HTTPError on 4xx/5xx."""
        headers = {'Authorization': f"Bearer {self.token}"}
        url = f"https://graph.microsoft.com/v1.0/users/{self.user_id}/messages/{self.message_id}/$value"
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.content

    def parse_message(self):
        """Parse ``self.mime`` into ``self.message``.

        The bytes are parsed directly: decoding them as UTF-8 first fails or
        corrupts 8bit bodies that use another charset.
        """
        from email import message_from_bytes
        self.message = message_from_bytes(self.mime)

    def _get_message(self):
        """Return the parsed message, parsing ``self.mime`` once on first use."""
        if getattr(self, 'message', None) is None:
            self.parse_message()
        return self.message

    @staticmethod
    def _decode_part(part):
        """Decode a text part using its declared charset, defaulting to UTF-8."""
        payload = part.get_payload(decode=True) or b''
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrongly declared charset: keep what can be read
            return payload.decode('utf-8', errors='replace')

    def get_sender(self):
        """Return the address from the ``From`` header ('' if absent)."""
        return parseaddr(str(self._get_message()['From'] or ''))[1]

    def get_recipients(self):
        """Return the addresses from the ``To`` header ([] if absent)."""
        from email.utils import getaddresses
        header = self._get_message()['To']
        if header is None:
            return []
        # getaddresses copes with commas inside quoted display names,
        # which a plain split(',') would cut in half
        return [address for _, address in getaddresses([str(header)]) if address]

    def get_received_date(self):
        """Return the ``Date`` header as a datetime."""
        from email.utils import parsedate_to_datetime
        # Accepts the RFC 5322 variants (optional weekday, trailing comment)
        # that a fixed strptime format rejects
        return parsedate_to_datetime(str(self._get_message()['Date']))

    def get_conversation_id(self):
        """Return the ``Thread-Index`` header (None if absent)."""
        return self._get_message()['Thread-Index']

    def get_subject(self):
        """Return the ``Subject`` header with RFC 2047 encoded words decoded."""
        from email.header import decode_header, make_header
        raw_subject = self._get_message()['Subject']
        if raw_subject is None:
            return None
        try:
            return str(make_header(decode_header(str(raw_subject))))
        except (LookupError, UnicodeDecodeError):
            return str(raw_subject)

    def extract_content(self):
        """Return the text of the message body.

        Every ``text/plain`` and ``text/html`` part is considered in walk
        order and the last one wins; HTML is converted with html2text.
        Returns '' when the message has no text part.
        """
        content = ''
        # walk() yields the message itself for non-multipart mails
        for part in self._get_message().walk():
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                content = self._decode_part(part)
            elif content_type == 'text/html':
                content = html2text.html2text(self._decode_part(part))
        return content

In [89]:
import requests
from datetime import datetime, timezone
import msal
import json
import os
import re
import argparse
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from azure.data.tables import TableServiceClient, TableEntity

import html2text
from email.mime.text import MIMEText
from email.mime.multipart import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.parser import Parser
from email.utils import parseaddr


# Credentials and targets are read from the environment; os.environ.get
# yields None for any variable that is not set.
OUTLOOK_API_USER_ID = os.environ.get('OUTLOOK_API_USER_ID')
OUTLOOK_API_CLIENT_ID = os.environ.get('OUTLOOK_API_CLIENT_ID')
OUTLOOK_API_SECRET = os.environ.get('OUTLOOK_API_SECRET')
OUTLOOK_CONTENT_CONNECTION_STRING = os.environ.get('OUTLOOK_CONTENT_CONNECTION_STRING')

# Settings consumed by get_token() when it builds the MSAL client.
# NOTE(review): the GUID in the authority URL is hard-coded (presumably the
# Azure AD tenant id) - confirm whether it should also come from the environment.
config = {
    "authority": "https://login.microsoftonline.com/c08b32af-3535-4f63-8a3a-51247cf1f022",
    "client_id": OUTLOOK_API_CLIENT_ID,
    "scope": ["https://graph.microsoft.com/.default"],
    "secret": OUTLOOK_API_SECRET
}


#function to get token
def get_token():
    """Acquire a Microsoft Graph access token for the configured application.

    Builds an MSAL confidential client from the module-level ``config`` dict,
    asks the token cache first and falls back to requesting a new token with
    the client credentials.

    Returns:
        str: The bearer access token.

    Raises:
        RuntimeError: If MSAL does not return an access token. The ``error``
            and ``error_description`` fields of the MSAL result are included
            in the message.
    """
    app = msal.ConfidentialClientApplication(
        config['client_id'], authority=config['authority'], client_credential=config['secret']
    )

    # NOTE(review): the client object is rebuilt on every call, so this cache
    # lookup presumably always misses; it is kept to preserve the call pattern.
    result = app.acquire_token_silent(config['scope'], account=None)
    if not result:
        result = app.acquire_token_for_client(scopes=config['scope'])

    # Fail loudly instead of implicitly returning None, which would end up
    # as "Bearer None" in every later request.
    if "access_token" not in result:
        raise RuntimeError(
            "Could not acquire access token: "
            f"{result.get('error')}: {result.get('error_description')}"
        )
    return result['access_token']

#function to get folder ids by name
def get_folder_id(folder_name, access_token, user_id):
    """Return the id of the top-level mail folder with the given display name.

    Args:
        folder_name: Display name of the folder to look up.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.

    Returns:
        The Graph id of the matching folder.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
        KeyError: If no folder with that display name exists.
    """
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders"
    headers = {'Authorization': f"Bearer {access_token}", 'Content-Type': "application/json"}

    # Graph returns collections in pages; follow @odata.nextLink so folders
    # beyond the first page are found as well.
    folder_ids = {}
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        payload = response.json()
        folder_ids.update({folder['displayName']: folder['id'] for folder in payload['value']})
        url = payload.get('@odata.nextLink')

    return folder_ids[folder_name]


#function to get subfolder ids by name
def get_subfolder_id(subfolder_name, access_token, user_id, folder_id):
    """Return the id of a child folder of ``folder_id`` by display name.

    Args:
        subfolder_name: Display name of the child folder to look up.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.
        folder_id: Graph id of the parent mail folder.

    Returns:
        The Graph id of the matching child folder.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
        KeyError: If no child folder with that display name exists.
    """
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders/{folder_id}/childFolders"
    headers = {'Authorization': f"Bearer {access_token}", 'Content-Type': "application/json"}

    # Follow @odata.nextLink so child folders beyond the first page are seen.
    subfolder_ids = {}
    while url:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        payload = response.json()
        subfolder_ids.update({folder['displayName']: folder['id'] for folder in payload['value']})
        url = payload.get('@odata.nextLink')

    return subfolder_ids[subfolder_name]

#function to get email ids by daterange
def get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id, subfolder_id):
    """List id and metadata for all messages received in ``[start_time, end_time)``.

    Args:
        start_time: Lower bound (inclusive), inserted verbatim into the
            ``$filter`` expression.
        end_time: Upper bound (exclusive), inserted verbatim into the
            ``$filter`` expression.
        access_token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.
        folder_id: Graph id of the mail folder to search.
        subfolder_id: Graph id of a child folder of ``folder_id``, or None to
            search ``folder_id`` itself.

    Returns:
        list[tuple]: One ``(id, webLink, conversationId, receivedDateTime)``
        tuple per message, in the order Graph returned them.

    Raises:
        requests.HTTPError: If Graph answers with a 4xx/5xx status.
    """
    base_url = f"https://graph.microsoft.com/v1.0/users/{user_id}/mailFolders/{folder_id}"
    if subfolder_id is not None:
        url = f"{base_url}/childFolders/{subfolder_id}/messages"
    else:
        url = f"{base_url}/messages"

    headers = {'Authorization': f"Bearer {access_token}"}

    # Filter the messages by the receivedDateTime
    params = {
        '$filter': f"receivedDateTime ge {start_time} and receivedDateTime lt {end_time}",
        '$select': 'id, webLink, conversationId, receivedDateTime',
        '$top': '1000'
    }

    message_ids = []
    while url:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()
        payload = response.json()
        message_ids.extend(
            (email['id'], email['webLink'], email['conversationId'], email['receivedDateTime'])
            for email in payload['value']
        )
        # More than one page of results: Graph hands back a ready-made URL
        # that already carries the query options, so params are dropped.
        url = payload.get('@odata.nextLink')
        params = None
    return message_ids



class EmailMime:
    """One Outlook message downloaded as MIME, cleaned and storable in Azure Table Storage.

    Args:
        message_id: ``(id, webLink, conversationId, receivedDateTime)`` tuple
            for the message.
        token: Bearer token for Microsoft Graph.
        user_id: Id (or user principal name) of the mailbox owner.

    ``web_link``, ``conversation_id`` and ``received_datetime`` are taken from
    the tuple; sender, recipients, subject and content come from the MIME.
    """

    def __init__(self, message_id, token, user_id):
        self.message_id, self.web_link, self.conversation_id, self.received_datetime = message_id
        self.token = token
        self.user_id = user_id
        self.mime = self.fetch_mime()
        self.parse_message()
        self.sender = self.get_sender()
        self.recipients = self.get_recipients()
        self.subject = self.get_subject()
        self.content = self.extract_content()

    def fetch_mime(self):
        """Return the raw MIME bytes of the message; raises requests.HTTPError on 4xx/5xx."""
        headers = {'Authorization': f"Bearer {self.token}"}
        url = f"https://graph.microsoft.com/v1.0/users/{self.user_id}/messages/{self.message_id}/$value"
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.content

    def parse_message(self):
        """Parse ``self.mime`` into ``self.message``.

        The bytes are parsed directly: decoding them as UTF-8 first fails or
        corrupts 8bit bodies that use another charset.
        """
        from email import message_from_bytes
        self.message = message_from_bytes(self.mime)

    def _get_message(self):
        """Return the parsed message, parsing ``self.mime`` once on first use."""
        if getattr(self, 'message', None) is None:
            self.parse_message()
        return self.message

    @staticmethod
    def _decode_part(part):
        """Decode a text part using its declared charset, defaulting to UTF-8."""
        payload = part.get_payload(decode=True) or b''
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset)
        except (LookupError, UnicodeDecodeError):
            # Unknown or wrongly declared charset: keep what can be read
            return payload.decode('utf-8', errors='replace')

    def get_sender(self):
        """Return the address from the ``From`` header ('' if absent)."""
        return parseaddr(str(self._get_message()['From'] or ''))[1]

    def get_recipients(self):
        """Return the addresses from the ``To`` header ([] if absent)."""
        from email.utils import getaddresses
        header = self._get_message()['To']
        if header is None:
            return []
        # getaddresses copes with commas inside quoted display names,
        # which a plain split(',') would cut in half
        return [address for _, address in getaddresses([str(header)]) if address]

    def get_received_date(self):
        """Return the ``Date`` header as a datetime, or None if the header is absent.

        Not used by the constructor, which takes ``received_datetime`` from
        the ``message_id`` tuple instead.
        """
        from email.utils import parsedate_to_datetime
        header = self._get_message()['Date']
        if header is None:
            return None
        return parsedate_to_datetime(str(header))

    def get_conversation_id(self):
        """Return the ``Thread-Index`` header (None if absent).

        Not used by the constructor, which takes ``conversation_id`` from the
        ``message_id`` tuple instead.
        """
        return self._get_message()['Thread-Index']

    def get_subject(self):
        """Return the ``Subject`` header with RFC 2047 encoded words decoded."""
        from email.header import decode_header, make_header
        raw_subject = self._get_message()['Subject']
        if raw_subject is None:
            return None
        try:
            return str(make_header(decode_header(str(raw_subject))))
        except (LookupError, UnicodeDecodeError):
            return str(raw_subject)

    def is_attachment(self, part):
        """Return True when ``part`` carries any Content-Disposition header."""
        return part.get('Content-Disposition') is not None

    def extract_content(self):
        """Return the text of the message body.

        Every ``text/plain`` and ``text/html`` part is considered in walk
        order and the last one wins; HTML is converted with html2text. In
        multipart messages, parts with a Content-Disposition header are
        skipped. Returns '' when the message has no text part.
        """
        message = self._get_message()
        multipart = message.is_multipart()
        content = ''
        # walk() yields the message itself for non-multipart mails
        for part in message.walk():
            if multipart and self.is_attachment(part):
                continue
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                content = self._decode_part(part)
            elif content_type == 'text/html':
                content = html2text.html2text(self._decode_part(part))
        return content

    def _clean_content(self, content):
        """Return ``content`` with links, quoted headers and contact details stripped."""
        # Replace multiple newlines with a single newline
        content = re.sub(r'\n+', '\n', content)
        # Replace URLs with '[link removed]'; the dot after "www" is escaped so
        # that ordinary words starting with "www" are left alone
        content = re.sub(r'http\S+|www\.\S+', '[link removed]', content)
        # Drop everything from a quoted "An:" header up to the boilerplate sentence
        content = re.sub(r'An:.*?Sie erreichen Ihr PKM unter folgendem Link', '', content, flags=re.DOTALL)
        content = re.sub(r'An:.*?Betreff:', '', content)  # remove email headers (same line only)
        content = re.sub(r'[\w]+@[\.\w]+', "", content)  # removing email addresses
        # removing addresses: four digits followed by a word (presumably postal code + city)
        content = re.sub(r'\d{4}\s\w+', "", content)
        content = re.sub(r'[PMT:]*\s*\+\d{1,3}\s[(0)]?(?:[()\s]?\d{1,3}){1,10}', "", content)  # removing phone numbers

        return content

    def to_dict(self):
        """Return the message as a flat dict with cleaned content."""
        return {
            'subject': self.subject,
            'content': self._clean_content(self.content),
            'sender': self.sender,
            'recipients': ', '.join(self.recipients),  # convert list to string
            'received_datetime': str(self.received_datetime),
            'conversation_id': self.conversation_id,
            'web_link': self.web_link
        }

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4)

    def save_to_azure(self, azure_table):
        """Store the message via ``azure_table.create_entity``."""
        entity = self.to_dict()
        # Azure Table Storage needs a PartitionKey and a RowKey
        entity['PartitionKey'] = self.conversation_id
        entity['RowKey'] = self.message_id
        azure_table.create_entity(entity)
    


class AzureTable:
    """Small wrapper around the client of a single Azure Storage table."""

    def __init__(self, connection_string=OUTLOOK_CONTENT_CONNECTION_STRING, table_name="outlookjohannes"):
        self.table_name = table_name
        service_client = TableServiceClient.from_connection_string(connection_string)
        self.table_client = service_client.get_table_client(table_name)

    def create_entity(self, entity):
        """Insert ``entity``; on any failure print a message instead of raising."""
        try:
            self.table_client.create_entity(entity=entity)
        except Exception as error:
            print(f"Could not add entity to table: {error}")

# create the top-level parser
def parse_args():
    """Parse the command-line options of the script.

    Returns:
        argparse.Namespace: With ``start_date`` and ``end_date`` (strings in
        ``dd-mm-YYYY`` format) and ``subfolder_name`` (string or None).

    Both dates are required: the script cannot build its date filter without
    them, so a missing value is reported as a usage error here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--start_date", required=True, help="start date (inclusive), format dd-mm-YYYY")
    parser.add_argument("--end_date", required=True, help="end date (exclusive), format dd-mm-YYYY")
    parser.add_argument("--subfolder_name", help="subfolder_name", default=None)
    return parser.parse_args()

#main function to run the script for a specific time range
def main():
    """Upload all mails of a date range from the "Posteingang" folder to Azure.

    Reads the date range and optional subfolder from the command line, lists
    the matching messages and stores each one as a table entity. A failure on
    a single message is reported and does not stop the run.
    """
    # Parse first so that --help and usage errors need no storage connection
    args = parse_args()
    azure_table = AzureTable()

    start_time = datetime.strptime(args.start_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()
    end_time = datetime.strptime(args.end_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()

    user_id = OUTLOOK_API_USER_ID
    # Get the access token
    access_token = get_token()

    # Get the folder id
    folder_id = get_folder_id("Posteingang", access_token, user_id)

    # Get the subfolder id if a subfolder name is given
    if args.subfolder_name is not None:
        subfolder_id = get_subfolder_id(args.subfolder_name, access_token, user_id, folder_id)
    else:
        subfolder_id = None

    # Get the message ids
    message_ids = get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id, subfolder_id)
    print(f"Found {len(message_ids)} emails")

    # Get the email content
    for message_id in message_ids:
        print(f"Processing email with id: {message_id}")
        try:
            mail = EmailMime(message_id, access_token, user_id)
            mail.save_to_azure(azure_table)
        except Exception as ex:
            # Exception (not a bare except) keeps Ctrl+C working, and the
            # cause is printed so failures can be diagnosed
            print(f"Could not upload email with id: {message_id}: {ex}")


In [69]:
start_date = "01-06-2023"
end_date = "03-06-2023"
subfolder_name = None

In [70]:
# Notebook version of main(): same steps, but driven by the start_date,
# end_date and subfolder_name variables instead of command-line arguments.
azure_table=AzureTable()

# Turn the dd-mm-YYYY strings into timezone-aware (UTC) ISO 8601 strings
# that are inserted into the $filter expression
start_time = datetime.strptime(start_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()
end_time = datetime.strptime(end_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()
user_id = OUTLOOK_API_USER_ID
# Get the access token
access_token = get_token()
# Get the folder id
folder_id = get_folder_id("Posteingang", access_token, user_id)
# Get the subfolder id if a subfolder name is not none
if subfolder_name is not None:
    subfolder_id = get_subfolder_id(subfolder_name, access_token, user_id, folder_id)
else:
    subfolder_id = None
# Get the message ids
message_ids = get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id, subfolder_id)
print(f"Found {len(message_ids)} emails")
# Get the email content


Found 44 emails


In [None]:
# Trial upload: only the first five messages are fetched and stored.
# There is no try/except here, so the first failing message stops the loop.
for message_id in message_ids[:5]:
    print(f"Processing email with id: {message_id}")
    
    # NOTE(review): the name `email` is also used for `import email` in a
    # later cell; whichever cell runs last decides what the name refers to.
    email = EmailMime(message_id, access_token, user_id)
    email.save_to_azure(azure_table)
    #print(email)

In [91]:
email = EmailMime(message_ids[0], access_token, OUTLOOK_API_USER_ID)

In [None]:
access_token = get_token()
access_token

In [None]:
print(email.sender)
print(email.recipients)

print(email.subject)
print(email.conversation_id)

print(email.content)

In [24]:
def fetch_mime(message_id):
    """Download the raw MIME representation of one message from Microsoft Graph.

    Reads the notebook-level globals ``access_token`` and ``user_id``.

    Args:
        message_id: Graph id of the message to download.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: if Graph answers with a 4xx/5xx status.
    """
    headers = {'Authorization': f"Bearer {access_token}"}
    # Single slash after the API version (the previous URL had "v1.0//users").
    url = f"https://graph.microsoft.com/v1.0/users/{user_id}/messages/{message_id}/$value"
    response = requests.get(url, headers=headers)
    # Fail loudly instead of handing an error payload to the MIME parser.
    response.raise_for_status()
    return response.content

In [60]:
message_id, link = message_ids[4]

In [90]:
message_ids = 

In [61]:
mime = fetch_mime(message_id)

In [32]:
def get_charset(self):
    """Parse raw MIME bytes and return the parsed ``Content-Type`` header.

    Args:
        self: the raw MIME message as UTF-8 decodable bytes. The parameter
            name is kept for compatibility; this is a plain function that is
            called as ``get_charset(mime)``, not a method.

    Returns:
        Whatever ``parseaddr`` yields for the ``Content-Type`` header value,
        e.g. ``('', 'multipart/related')``.
        NOTE(review): despite the function name this is the content type,
        not the charset - confirm what callers actually need.
    """
    # Use the argument; the previous version silently read the global `mime`.
    message = Parser().parsestr(self.decode('utf-8'))
    return parseaddr(message['Content-Type'])

In [33]:
get_charset(mime)

('', 'multipart/related')

In [42]:
def extract_content(mime):
    """Extract the text body from a raw MIME message.

    Every ``text/plain`` and ``text/html`` part is decoded with its own
    declared charset (UTF-8 when none is declared). HTML parts are converted
    to text with ``html2text``. As before, when several text parts exist the
    last one encountered wins.

    Args:
        mime: the raw MIME message as UTF-8 decodable bytes.

    Returns:
        The decoded text, or ``None`` when the message has no text part.
    """
    message = Parser().parsestr(mime.decode('utf-8'))
    content = None
    # walk() yields the message itself for single-part mails and every
    # nested part for multipart ones, so one loop covers both cases.
    for part in message.walk():
        content_type = part.get_content_type()
        if content_type not in ('text/plain', 'text/html'):
            continue
        # Each part carries its own charset; never reuse a previous part's.
        charset = part.get_content_charset() or 'utf-8'
        print(charset)
        text = part.get_payload(decode=True).decode(charset)
        if content_type == 'text/html':
            content = html2text.html2text(text)
        else:
            content = text
    return content
    

In [None]:
extract_content(mime)

In [None]:
# NOTE(review): this rebinds the notebook-global name `email` to the stdlib
# package; other cells assign an EmailMime instance to the same name.
import email

# Parse the raw MIME bytes held in `t` into a message object.
# NOTE(review): `t` is not assigned anywhere in the visible cells - confirm
# which variable holds the MIME data before running this cell.
msg = email.message_from_bytes(t)

# Read the From, To and Subject headers of the email
sender = msg['From']
recipient = msg['To']
subject = msg['Subject']

# Get the content of the email
if msg.is_multipart():
    # If the email has multiple parts, iterate over them
    for part in msg.walk():
        content_type = part.get_content_type()
        if content_type == 'text/plain':
            # Plain-text part: decode the payload, always as UTF-8.
            # A later text/plain part overwrites an earlier one.
            content = part.get_payload(decode=True).decode('utf-8')
else:
    # If the email has a single part, get the content
    content_type = msg.get_content_type()
    if content_type == 'text/plain':
        content = msg.get_payload(decode=True).decode('utf-8')
# `content` is only assigned when a text/plain part was found.

In [None]:
msg.is_multipart()

In [None]:
import html2text

In [None]:
#main function to run the script for a specific time range
def main(start_date, end_date, subfolder_name=None):
    """Build an ``Email`` object for every message received in a date range.

    Args:
        start_date: first day of the range, formatted ``dd-mm-YYYY``.
        end_date: last day of the range, formatted ``dd-mm-YYYY``.
        subfolder_name: optional subfolder of "Posteingang" to search in.

    Returns:
        A list with one ``Email`` per message that could be constructed.
        Messages that fail are reported on stdout and skipped.
    """
    email_list = []
    # Unused while the upload step is disabled; kept because constructing it
    # was part of the original flow.
    azure_table = AzureTable()

    # Both dates are interpreted as UTC midnights and sent as ISO 8601 strings.
    start_time = datetime.strptime(start_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()
    end_time = datetime.strptime(end_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()

    user_id = OUTLOOK_API_USER_ID
    # Get the access token
    access_token = get_token()

    # Get the folder id
    folder_id = get_folder_id("Posteingang", access_token, user_id)

    # Get the subfolder id if a subfolder name is not none
    if subfolder_name is not None:
        subfolder_id = get_subfolder_id(subfolder_name, access_token, user_id, folder_id)
    else:
        subfolder_id = None

    # Get the message ids
    message_ids = get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id, subfolder_id)
    print(f"Found {len(message_ids)} emails")

    # Get the email content
    for message_id in message_ids:
        try:
            email = Email(message_id, access_token, user_id)
        except Exception as exc:
            # Catch Exception rather than everything, so Ctrl-C still stops
            # the loop, and report why this message was skipped.
            print(f"Could not upload email with id: {message_id} ({exc!r})")
        else:
            email_list.append(email)
    return email_list

In [None]:
email_json = main("01-05-2023", "05-05-2023", subfolder_name=None)

In [None]:
get_token()

In [None]:
email_json

In [None]:
import aiohttp
import asyncio

class Email:
    '''
    Class to get email content as text, sender, recipient, subject, receivedDateTime, conversationId, webLink
    args: message_id, token, user_id, session (optional aiohttp.ClientSession to reuse across emails)
    returns: once _fetch_email() has been awaited, to_dict() gives the cleaned text content,
             sender, recipient, subject, receivedDateTime, conversationId, webLink as dict
    '''
    def __init__(self, message_id, token, user_id, session=None):
        self.message_id = message_id
        self.token = token
        self.user_id = user_id
        self.content = None
        self.sender = None
        self.recipient = None
        self.received_datetime = None
        self.conversation_id = None
        self.web_link = None
        self._subject = None
        # No session is opened here: a session created per instance was never
        # closed. Either a caller-owned session is passed in, or
        # _fetch_email() opens and closes a temporary one.
        self.session = session

    async def _fetch_email(self):
        '''Request the message with a plain-text body and fill in the attributes.'''
        headers = {
            'Authorization': f"Bearer {self.token}",
            'Prefer': 'outlook.body-content-type="text"'
        }
        url = f"https://graph.microsoft.com/v1.0/users/{self.user_id}/messages/{self.message_id}"
        if self.session is not None:
            # Caller-owned session: use it, but leave closing to the caller.
            email_data = await self._get_json(self.session, url, headers)
        else:
            async with aiohttp.ClientSession() as session:
                email_data = await self._get_json(session, url, headers)

        self.content = email_data['body']['content']
        self.sender = email_data['from']['emailAddress']['address']
        self.recipient = [recipient['emailAddress']['address'] for recipient in email_data['toRecipients']]
        self.received_datetime = email_data['receivedDateTime']
        self.conversation_id = email_data['conversationId']
        self.web_link = email_data['webLink']
        self._subject = email_data['subject']

    @staticmethod
    async def _get_json(session, url, headers):
        '''GET url with the given session and return the decoded JSON body.'''
        async with session.get(url, headers=headers) as response:
            # Raise on 4xx/5xx instead of failing later with a KeyError on
            # the error payload.
            response.raise_for_status()
            return await response.json()

    def _clean_content(self, content):
        '''Strip links, quoted headers, addresses and phone numbers from content.

        Returns None unchanged so an unfetched email can still be serialised.
        '''
        if content is None:
            return None
        # Replace multiple newlines with a single newline
        content = re.sub(r'\n+', '\n', content)
        # Replace URLs with '[link removed]'; the dot after "www" is escaped so
        # that ordinary words starting with "www" are left alone.
        content = re.sub(r'http\S+|www\.\S+', '[link removed]', content)
        content = re.sub(r'An:.*?Sie erreichen Ihr PKM unter folgendem Link', '', content, flags=re.DOTALL)
        content = re.sub(r'An:.*?Betreff:', '', content) # remove email headers
        content = re.sub(r'[\w]+@[\.\w]+', "", content) # removing email addresses
        content = re.sub(r'\d{4}\s\w+', "", content) # removing address (four digits followed by a word)
        content = re.sub(r'[PMT:]*\s*\+\d{1,3}\s[(0)]?(?:[()\s]?\d{1,3}){1,10}', "", content) # removing phone numbers

        return content

    def to_dict(self):
        '''Return the email fields as a dict with cleaned content.'''
        return {
            'message_id': self.message_id,
            'subject': self._subject,
            'content': self._clean_content(self.content),
            'sender': self.sender,
            # convert list to string; empty when nothing has been fetched yet
            'recipient': ', '.join(self.recipient or []),
            'received_datetime': self.received_datetime,
            'conversation_id': self.conversation_id,
            'web_link': self.web_link
        }

    def __str__(self):
        return json.dumps(self.to_dict(), indent=4)


In [None]:
#main function to run the script for a specific time range
async def main(start_date, end_date, subfolder_name=None):
    """Fetch every inbox message in a date range concurrently.

    Args:
        start_date: first day of the range, formatted ``dd-mm-YYYY``.
        end_date: last day of the range, formatted ``dd-mm-YYYY``.
        subfolder_name: optional subfolder of "Posteingang" to search in.

    Returns:
        A list with one ``Email.to_dict()`` result per message.
    """
    # Both dates are interpreted as UTC midnights and sent as ISO 8601 strings.
    start_time = datetime.strptime(start_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()
    end_time = datetime.strptime(end_date, "%d-%m-%Y").replace(tzinfo=timezone.utc).isoformat()

    user_id = OUTLOOK_API_USER_ID
    # Get the access token
    access_token = get_token()

    # Get the folder id
    folder_id = get_folder_id("Posteingang", access_token, user_id)

    # Get the subfolder id if a subfolder name is not none
    if subfolder_name is not None:
        subfolder_id = get_subfolder_id(subfolder_name, access_token, user_id, folder_id)
    else:
        subfolder_id = None

    # Get the message ids
    message_ids = get_email_ids_by_daterange(start_time, end_time, access_token, user_id, folder_id, subfolder_id)
    print(f"Found {len(message_ids)} emails")

    # Get the email content
    emails = [Email(message_id, access_token, user_id) for message_id in message_ids]

    try:
        # Fetch all emails concurrently
        await asyncio.gather(*(email._fetch_email() for email in emails))
    finally:
        # The Email objects were created here, so any HTTP session they hold
        # is released here too, even when a fetch fails.
        sessions = [getattr(email, 'session', None) for email in emails]
        await asyncio.gather(*(session.close() for session in sessions if session is not None))

    return [email.to_dict() for email in emails]


In [None]:
# NOTE(review): the `main` defined just above is declared with `async def`, so
# this call only creates a coroutine object and binds it to `data`; nothing is
# fetched. In the notebook this should read `data = await main(...)`.
data = main("01-05-2023", "05-05-2023", subfolder_name=None)

In [None]:
data

In [None]:
# Create a list of Email instances
emails = [Email(message_id1, token, user_id), Email(message_id2, token, user_id), ...]

# Fetch all emails concurrently
await asyncio.gather(*(email._fetch_email() for email in emails))


In [None]:
# Create a list of dictionaries
data = [email.to_dict() for email in emails]

# Create a DataFrame
df = DataFrame(data)


In [None]:
#load data from azure storage table and create data frame

def load_data(table_name="outlooktest"):
    """Read every entity of an Azure Storage table into a DataFrame.

    Args:
        table_name: name of the table to read; defaults to "outlooktest".

    Returns:
        A DataFrame with one row per entity in the table.

    Raises:
        ValueError: if OUTLOOK_CONTENT_CONNECTION_STRING is not set.
    """
    connect_str = OUTLOOK_CONTENT_CONNECTION_STRING
    if not connect_str:
        # The value comes from os.environ.get and is None when the variable
        # is missing; name the variable instead of failing inside the SDK.
        raise ValueError("OUTLOOK_CONTENT_CONNECTION_STRING is not set")

    # The service client is closed again once all entities have been read.
    with TableServiceClient.from_connection_string(connect_str) as table_service:
        table_client = table_service.get_table_client(table_name)
        # list_entities() is consumed fully before the client is closed.
        documents = list(table_client.list_entities())
    return DataFrame(documents)


In [None]:
df = load_data()