In [32]:
import os
import base64
import datetime
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
import PyPDF2
import logging

# Set up logging
logging.basicConfig(filename="attachment_errors.log", level=logging.ERROR, format="%(asctime)s - %(levelname)s - %(message)s")

# If modifying these SCOPES, delete the file token.json.
SCOPES = ['https://www.googleapis.com/auth/gmail.modify']

# Directory to save attachments
attachment_dir = "C:\\Users\\Isabe\\OneDrive\\Bureau\\factures\\exercise_en_cours"

# Whitelist text file (domains only)
whitelist_file = "whitelist.txt"

# Words to search for in the PDFs
search_words = ["facture", "invoice",'INVOICE','Invoice','Facture','Rechnung']

def load_whitelist():
    if os.path.exists(whitelist_file):
        with open(whitelist_file, 'r') as file:
            return [line.strip() for line in file.readlines()]
    return []

def authenticate_gmail():
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    return creds

def fetch_emails(service, whitelist):
    page_token = 0
    while True:
        try:
            results = service.users().messages().list(
                userId='me',
                labelIds=['INBOX'],
                pageToken=page_token
            ).execute()
            
            messages = results.get('messages', [])
            print(f"Found {len(messages)} messages in the inbox.")
            
            for message in messages:
                print(f"Processing message ID: {message['id']}")
                msg = service.users().messages().get(userId='me', id=message['id']).execute()
                
                headers = msg.get('payload', {}).get('headers', [])
                from_email = None
                received_date = None
                for header in headers:
                    if header['name'] == 'From':
                        from_email = header['value']
                        
                    if header['name'] == 'Date':
                        received_date = header['value']
                
                # Convert the email date to a datetime object
                print(received_date)
                #email_date = datetime.datetime.strptime(received_date, '%a, %d %b %Y %H:%M:%S %z')
                date_parts = received_date.split()

                # Extract the first three components (day, month, year)
                date_strip = ' '.join(date_parts[:3])

                domain = from_email.split('@')[-1].strip("<>")

                
                #print(domain)
                if from_email and any(whitelisted_domain in domain for whitelisted_domain in whitelist):
                    print(f"Email from {from_email}:")
                    print("Subject:", get_header(headers, 'Subject'))
                    
                    if process_attachments(service, msg, domain, date_strip):
                        archive_email(service, message['id'], domain)
                    
            page_token = results.get('nextPageToken')
            if not page_token:
                break
        except Exception as e:
            print(f"Error fetching emails: {e}")
            break

def get_header(headers, name):
    for header in headers:
        if header['name'] == name:
            return header['value']
    return None

def process_attachments(service, msg, domain, email_date):
    try:
        found_valid_attachment = False
        
        # Ensure the temp directory exists
        temp_dir = "C:\\Users\\Isabe\\OneDrive\\Bureau\\attachments\\temp"
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        
        parts = msg.get('payload', {}).get('parts', [])
        for part in parts:
            
            if 'filename' in part and part['filename'].lower().endswith('.pdf'):
                att_id = part['body'].get('attachmentId')
                attachment = service.users().messages().attachments().get(
                    userId='me', messageId=msg['id'], id=att_id).execute()
                data = base64.urlsafe_b64decode(attachment['data'])
                
                # Create a temporary file to check if it contains the desired words
                temp_filepath = os.path.join(temp_dir, part['filename'])
                with open(temp_filepath, 'wb') as f:
                    f.write(data)
                
                if is_word_in_pdf(temp_filepath, search_words):
                    #('test'+part['filename'])
                    save_attachment(part['filename'], data, domain, email_date)
                    found_valid_attachment = True
                
                # Clean up the temporary file
                os.remove(temp_filepath)
    except Exception as e:
            print(f"Error fetching emails: {e}")
            
    return found_valid_attachment


def save_attachment(filename, data, domain, email_date):
    try:
        domain_dir = os.path.join(attachment_dir, domain)
        matching_dir = os.path.join(domain_dir, "factures")
        non_matching_dir = os.path.join(domain_dir, "autres")

        for dir_path in [matching_dir, non_matching_dir]:
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

        # Format the email's received date
        date_str = email_date#.strftime("%Y%m%d")
        #print(filename)
        new_filename = f"_{os.path.splitext(filename)[0]}_{date_str}.pdf"
        

        temp_filepath = os.path.join(domain_dir, new_filename)
        with open(temp_filepath, 'wb') as f:
            f.write(data)

        if is_word_in_pdf(temp_filepath, search_words):
            final_filepath = os.path.join(matching_dir, new_filename)
        else:
            final_filepath = os.path.join(non_matching_dir, new_filename)

        os.rename(temp_filepath, final_filepath)
        print(f"Saved attachment to {final_filepath}")

    except FileExistsError as e:
        print(f"FileExistsError: {e} - {new_filename}")
    except Exception as e:
        print(f"Error: {e}")


def is_word_in_pdf(pdf_path, words):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            text = page.extract_text()
            for word in words:
                if word.lower() in text.lower():
                    print('found word'+word)
                    return True
    print('did not find keyword')
    return False

def archive_email(service, message_id, domain):
    try:
        label_name = f"En traitement factures"
        
        labels = service.users().labels().list(userId='me').execute().get('labels', [])
        label_id = None
        for label in labels:
            if label['name'] == label_name:
                label_id = label['id']
                break
        
        if not label_id:
            label = service.users().labels().create(
                userId='me',
                body={'name': label_name, 'labelListVisibility': 'labelShow', 'messageListVisibility': 'show'}
            ).execute()
            label_id = label['id']
        
        service.users().messages().modify(
            userId='me', id=message_id, body={
                'addLabelIds': [label_id],
                'removeLabelIds': ['INBOX']
            }
        ).execute()
        print(f"Email {message_id} archived under {label_name}.")
    except Exception as e:
        print(f"Error archiving email {message_id} under {label_name}: {e}")
 
def main():
    whitelist = load_whitelist()
    creds = authenticate_gmail()
    service = build('gmail', 'v1', credentials=creds)
    fetch_emails(service, whitelist)

#if __name__ == '__main__':
main()


Found 36 messages in the inbox.
Processing message ID: 1935f13ef7207eaa
Sun, 24 Nov 2024 16:49:17 +0000
Processing message ID: 1935e109fc3efaf1
Sun, 24 Nov 2024 12:06:01 +0000
Processing message ID: 1935609d1853ff18
22 Nov 2024 16:41:38 -0600
Email from "'Cisco Webex' via Info (General)" <info@polyrix.com>:
Subject: Your subscription renewal confirmation
Processing message ID: 19355568e80f7c55
Fri, 22 Nov 2024 14:25:28 -0500
Email from Louis-Andre Deschenes <louis-andre.deschenes@polyrix.com>:
Subject: Fwd: Invoice for Your October 9th Order with McMaster-Carr
found wordinvoice
found wordinvoice
Saved attachment to C:\Users\Isabe\OneDrive\Bureau\factures\exercise_en_cours\polyrix.com\factures\_Invoice 34669049_Fri, 22 Nov.pdf
Email 19355568e80f7c55 archived under En traitement factures.
Processing message ID: 193554753c5d3f91
Fri, 22 Nov 2024 14:08:41 -0500
Email from Philippe Lambert <philippe.lambert@polyrix.com>:
Subject: Fwd: Funds transfer remittance advice
Processing message ID: 