In [7]:
import os 
import pickle
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from base64 import urlsafe_b64decode, urlsafe_b64encode
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.mime.image import MIMEImage
from email.mime.audio import MIMEAudio
from email.mime.base import MIMEBase
from mimetypes import guess_type as guess_mime_type

In [8]:
class EmailDownloader:
    '''
        Downloads messages from a Gmail mailbox through the Gmail API and
        stores them on disk (raw .eml file, HTML body and attachments).

        Attributes:
            scopes (list): OAuth scopes requested during authentication.
            our_gmail (str): The gmail account to be accessed. It is only
                stored; no method of this class reads it.
    '''

    def __init__(self, scopes, our_gmail):
        '''
            The constructor for EmailDownloader class.

            Args:
                scopes (list): OAuth scopes passed to the OAuth flow.
                our_gmail (str): The gmail account to be accessed.
        '''
        self.scopes = scopes
        self.our_gmail = our_gmail

    def gmail_authenticate(self):
        '''
            Authenticates against Gmail and returns the API service object.

            Credentials are cached in 'token.pickle' in the current working
            directory. When the cache is missing or the credentials are not
            valid, they are refreshed (if a refresh token exists) or a new
            OAuth flow is started from 'credentials.json'; the result is
            written back to 'token.pickle'.

            Returns:
                build: The Gmail v1 service object.
        '''
        creds = None

        if os.path.exists('token.pickle'):
            # NOTE: pickle.load can execute arbitrary code embedded in the
            # file. 'token.pickle' must only ever be a file written by this
            # method; never load a token file obtained from someone else.
            with open('token.pickle', 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:

            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())

            else:
                flow = InstalledAppFlow.from_client_secrets_file('credentials.json', scopes=self.scopes)
                creds = flow.run_local_server(port=0)

            # Persist the new/refreshed credentials for the next run.
            with open("token.pickle", "wb") as token:
                pickle.dump(creds, token)
        return build('gmail', 'v1', credentials=creds)

    def search_messages(self, service, query, max_results):
        '''
            Searches the mailbox and returns at most max_results messages,
            following 'nextPageToken' until enough messages were collected
            or no further page exists.

            Args:
                service (build): The service object.
                query (str): The query to be searched.
                max_results (int): The maximum number of results to return.

            Returns:
                list: The entries of the 'messages' field of every page
                    fetched, cut to max_results items.
        '''
        result = service.users().messages().list(userId='me', q=query, maxResults=max_results).execute()
        messages = list(result.get('messages', []))

        while 'nextPageToken' in result and len(messages) < max_results:
            page_token = result['nextPageToken']
            # Only ask for what is still missing.
            remaining_results = max_results - len(messages)
            result = service.users().messages().list(
                userId='me', q=query, pageToken=page_token, maxResults=remaining_results).execute()
            messages.extend(result.get('messages', []))

        return messages[:max_results]

    def clean(self, subject):
        '''
            Builds a folder name from a subject by replacing every
            character that is not alphanumeric with an underscore.

            Args:
                subject (str): The subject of the email.

            Returns:
                str: The cleaned subject.
        '''
        return "".join(c if c.isalnum() else "_" for c in subject)

    def get_size_format(self, size, factor=1024, suffix="B"):
        '''
            Formats a byte count with a unit prefix, e.g. 1536 -> '1.50KB'.

            Args:
                size (int): The number of bytes.
                factor (int): The divisor between two consecutive units.
                suffix (str): The text appended after the unit prefix.

            Returns:
                str: The value with two decimals, unit prefix and suffix.
        '''
        for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
            if size < factor:
                return f"{size:.2f}{unit}{suffix}"
            size /= factor
        return f"{size:.2f}Y{suffix}"

    def save_email_as_eml(self, service, message, folder_name, file_name):
        '''
            Fetches the message in 'raw' format and writes the decoded
            bytes to folder_name/file_name.

            Args:
                service (build): The service object.
                message (dict): The message to be saved; its 'id' is used.
                folder_name (str): The folder the file is written into.
                file_name (str): The file name of the email.
        '''
        raw_msg = service.users().messages().get(userId='me', id=message['id'], format='raw').execute()
        eml_data = urlsafe_b64decode(raw_msg['raw'].encode('ASCII'))
        with open(os.path.join(folder_name, file_name), 'wb') as eml_file:
            eml_file.write(eml_data)

    def _safe_filename(self, filename, default):
        '''
            Reduces a file name taken from an email to its last path
            component so it cannot point outside the target folder.

            Args:
                filename (str): The name announced by the message part
                    (may be None or empty).
                default (str): The name used when nothing usable remains.

            Returns:
                str: A single path component.
        '''
        # The name is attacker-controlled: drop directories (both separator
        # styles) and NUL bytes, and refuse '.' / '..'.
        name = os.path.basename((filename or "").replace("\\", "/").replace("\x00", ""))
        return default if name in ("", ".", "..") else name

    def _is_attachment(self, part_headers):
        '''
            Tells whether the headers contain a Content-Disposition header
            whose value mentions 'attachment'.

            Args:
                part_headers (list): The header dicts of a message part.

            Returns:
                bool: True when such a header is present.
        '''
        for part_header in part_headers:
            # Header names are case-insensitive, so compare in lower case.
            name = (part_header.get("name") or "").lower()
            value = (part_header.get("value") or "").lower()
            if name == "content-disposition" and "attachment" in value:
                return True
        return False

    def _fetch_attachment_data(self, service, message, attachment_id):
        '''
            Downloads the body of an attachment through the API.

            Args:
                service (build): The service object.
                message (dict): The message owning the attachment.
                attachment_id (str): The id found in the part body.

            Returns:
                str: The 'data' field of the response, or None if absent.
        '''
        attachment = service.users().messages().attachments().get(
            id=attachment_id, userId='me', messageId=message['id']).execute()
        return attachment.get("data")

    def parse_parts(self, service, parts, folder_name, message):
        '''
            Walks the parts of an email (recursing into nested parts),
            prints text/plain bodies, saves text/html bodies and saves the
            parts marked as attachments into folder_name.

            Parts without a body, without headers or without data are
            skipped instead of raising.

            Args:
                service (build): The service object.
                parts (list): The list of parts of the email.
                folder_name (str): The existing folder files are saved to.
                message (dict): The message to be parsed; its 'id' is used
                    when an attachment has to be downloaded.
        '''
        if not parts:
            return

        for part in parts:
            mimeType = part.get("mimeType")
            body = part.get("body") or {}
            data = body.get("data")
            attachment_id = body.get("attachmentId")
            part_headers = part.get("headers") or []

            if part.get("parts"):
                self.parse_parts(service, part.get("parts"), folder_name, message)

            if mimeType == "text/plain":
                if data:
                    # The part's charset is not inspected here; undecodable
                    # bytes are replaced so one message cannot abort the run.
                    text = urlsafe_b64decode(data).decode(errors="replace")
                    print(text)

            elif mimeType == "text/html":
                filename = self._safe_filename(part.get("filename"), "index.html")
                # The content is either inline ('data') or has to be fetched
                # separately through its attachment id.
                if not data and attachment_id:
                    data = self._fetch_attachment_data(service, message, attachment_id)
                if data:
                    filepath = os.path.join(folder_name, filename)
                    print("Saving HTML to", filepath)
                    with open(filepath, "wb") as f:
                        f.write(urlsafe_b64decode(data))

            elif self._is_attachment(part_headers):
                fallback_name = f"attachment_{part.get('partId') or 'unnamed'}"
                filename = self._safe_filename(part.get("filename"), fallback_name)
                print("Saving the file:", filename, "size:", self.get_size_format(body.get("size") or 0))
                if attachment_id:
                    data = self._fetch_attachment_data(service, message, attachment_id)
                if data:
                    filepath = os.path.join(folder_name, filename)
                    with open(filepath, "wb") as f:
                        f.write(urlsafe_b64decode(data))

    def read_message(self, service, message, base_folder="emails"):
        '''
            Fetches one message, prints its From/To/Subject/Date headers,
            saves its parts with parse_parts and the whole message with
            save_email_as_eml.

            Files go to base_folder/<cleaned subject>; messages without a
            subject (or with an empty one) go to base_folder/no_subject.
            A message whose payload has no 'parts' is handled by treating
            the payload itself as the only part.

            Args:
                service (build): The service object.
                message (dict): The message to be read; its 'id' is used.
                base_folder (str): The base folder of the email.
        '''
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
        payload = msg['payload']
        headers = payload.get("headers") or []
        # Single-part messages carry their content on the payload itself.
        parts = payload.get("parts") or [payload]
        subject_folder = ""

        for header in headers:
            name = (header.get("name") or "").lower()
            value = header.get("value")
            if name == 'from':
                print("From:", value)
            elif name == "to":
                print("To:", value)
            elif name == "subject":
                subject_folder = self.clean(value or "")
                print("Subject:", value)
            elif name == "date":
                print("Date:", value)

        folder_name = os.path.join(base_folder, subject_folder or "no_subject")
        os.makedirs(folder_name, exist_ok=True)

        self.parse_parts(service, parts, folder_name, message)
        self.save_email_as_eml(service, message, folder_name, f"{message['id']}.eml")

        print("="*50)


In [9]:
if __name__ == "__main__":

    # Define the scopes and the gmail account.
    # NOTE(review): this scope grants full mailbox access, while this cell
    # only lists and reads messages; a narrower read-only scope may be
    # enough -- confirm before changing it, since a cached token.pickle
    # keeps whatever scope it was created with.
    SCOPES = ['https://mail.google.com/']
    our_gmail = os.environ.get('GMAIL')

    # Create an instance of EmailDownloader class.
    email_download = EmailDownloader(SCOPES, our_gmail)
    auth = email_download.gmail_authenticate()

    # Search the inbox through the class's own helper so paging is handled
    # in one place. The operator is written 'in:inbox', with no space after
    # the colon.
    messages = email_download.search_messages(auth, 'in:inbox', 300)

    # Create a folder to save the emails if not exists.
    base_folder = "emails"
    os.makedirs(base_folder, exist_ok=True)

    # Read the messages and save them.
    for message in messages:
        email_download.read_message(auth, message, base_folder)


    # References: https://thepythoncode.com/article/use-gmail-api-in-python

From: "Comunicação UECE" <assecom@uece.br>
Date: Tue, 18 Jun 2024 17:04:42 -0300
Subject: [Geral-Alunos] [NOTÍCIA] Descarbonização na Uece: chegou a hora de encarar o desafio
To: undisclosed-recipients:;
[image: DestaqueDescarbonizaUece.png]

O cenário climático mundial enfrenta desafios significativos devido ao
aumento das emissões de gases de efeito estufa, causando aquecimento
global, eventos climáticos extremos mais frequentes e muitos outros
impactos. Há uma urgência crescente por ações globais coordenadas para
reduzir as emissões, adotar energias renováveis e promover a educação
climática, visando garantir um futuro mais sustentável.

A Universidade Estadual do Ceará (Uece), consciente dessa realidade e de
seu papel na sociedade, dará início ao processo para planejar a adequação
de suas políticas, práticas, infraestrutura e logística à necessária
redução de suas próprias emissões, tendo como base a campanha já aderida
pela instituição, “Race to Zero”, apoiada pela Organização das