Notebook to scrape emails from a given Gmail account

In [32]:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
import base64
import os.path
import pickle

# OAuth scopes requested during the consent flow in get_gmail_service().
# Only the read-only Gmail scope is requested here.
# NOTE(review): a cached token.pickle was presumably issued for whatever scopes
# were in effect when it was created — delete it after changing this list.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

In [33]:
def get_gmail_service():
    """Build a Gmail API (v1) client, reusing cached credentials when possible.

    Credentials are looked up in ``token.pickle`` in the working directory.
    If none are cached, or the cached ones are not valid, they are either
    refreshed (when expired and a refresh token is present) or obtained
    through the installed-app flow configured by ``credentials.json`` and
    ``SCOPES``, using ``run_local_server`` on port 8080. The resulting
    credentials are written back to ``token.pickle``.

    Returns:
        The object returned by ``build('gmail', 'v1', credentials=...)``.
    """
    token_path = 'token.pickle'
    credentials = None

    if os.path.exists(token_path):
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file, so token.pickle must only ever come from a trusted location.
        with open(token_path, 'rb') as token_file:
            credentials = pickle.load(token_file)

    # Fast path: cached credentials are usable as-is, nothing to persist.
    if credentials and credentials.valid:
        return build('gmail', 'v1', credentials=credentials)

    if credentials and credentials.expired and credentials.refresh_token:
        # Expired but refreshable: renew without user interaction.
        credentials.refresh(Request())
    else:
        # Nothing usable cached: run the interactive consent flow.
        credentials = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES
        ).run_local_server(port=8080)

    # Cache the new or refreshed credentials for the next run.
    with open(token_path, 'wb') as token_file:
        pickle.dump(credentials, token_file)

    return build('gmail', 'v1', credentials=credentials)

In [34]:
def _decode_body_data(data):
    """Decode one base64url-encoded body string into text.

    Args:
        data: The base64url payload (``str`` or ``bytes``).

    Returns:
        The decoded text, or ``None`` (after printing the error) when the
        data is not valid base64 or the bytes are not valid UTF-8.
    """
    try:
        # Tolerate input whose trailing '=' padding was stripped: pad the
        # length up to a multiple of 4 before decoding.
        pad_char = b'=' if isinstance(data, bytes) else '='
        padded = data + pad_char * (-len(data) % 4)
        return base64.urlsafe_b64decode(padded).decode()
    except (ValueError, TypeError) as e:
        # binascii.Error and UnicodeDecodeError are both ValueError subclasses.
        print(f"Error decoding email body: {e}")
        return None


def _find_plain_text(parts):
    """Depth-first search of a list of message parts for a text/plain body.

    Returns the first text/plain part that decodes successfully, descending
    into nested ``parts`` lists (e.g. multipart/alternative inside
    multipart/mixed). Returns ``None`` when no such part exists.
    """
    for part in parts:
        if part.get('mimeType') == 'text/plain':
            data = part.get('body', {}).get('data')
            # A text/plain part may carry no inline data (e.g. an attachment);
            # skip it and keep looking instead of giving up.
            if data is not None:
                text = _decode_body_data(data)
                if text is not None:
                    return text
        elif part.get('parts'):
            nested_text = _find_plain_text(part['parts'])
            if nested_text is not None:
                return nested_text
    return None


def get_email_body(payload):
    """Extract the plain-text body from a Gmail message payload dict.

    Args:
        payload: The ``payload`` dict of a message. If it has a ``parts``
            key, the parts (and any parts nested inside them) are searched
            for the first decodable ``text/plain`` part. Otherwise the
            payload's own ``body.data`` is decoded, whatever its MIME type.

    Returns:
        The decoded body text, or ``None`` when no body is found or it
        cannot be decoded (a decode error is printed, not raised).
    """
    if 'parts' in payload:
        return _find_plain_text(payload['parts'] or [])

    data = payload.get('body', {}).get('data')
    if data:
        return _decode_body_data(data)
    return None

In [35]:
def fetch_all_emails(service, max_results=None):
    """Fetch sender, subject and plain-text body for the user's messages.

    Lists messages for ``userId='me'`` (no label filter is applied), following
    ``nextPageToken`` so that every page of results is read, then fetches each
    message and prints a short summary of it.

    Args:
        service: Gmail API client exposing ``users().messages().list/get``,
            such as the one returned by ``get_gmail_service()``.
        max_results: Optional cap on the number of messages to process.
            ``None`` (the default) processes every listed message.

    Returns:
        A list of dicts with keys ``'subject'``, ``'from'`` and ``'body'``
        (``body`` is ``None`` when no plain-text body was found). Returns an
        empty list when there are no messages or when any error occurs; the
        error is printed rather than raised.
    """
    try:
        messages_api = service.users().messages()

        # Collect message references from every result page; a single list()
        # call only returns the first page.
        messages = []
        page_token = None
        while True:
            list_kwargs = {'userId': 'me'}
            if page_token:
                list_kwargs['pageToken'] = page_token
            results = messages_api.list(**list_kwargs).execute()
            messages.extend(results.get('messages', []))

            page_token = results.get('nextPageToken')
            reached_cap = max_results is not None and len(messages) >= max_results
            if not page_token or reached_cap:
                break

        if max_results is not None:
            messages = messages[:max_results]

        if not messages:
            print("No emails found.")
            return []

        email_list = []
        print(f"Found {len(messages)} emails. Processing...")

        for message in messages:
            msg = messages_api.get(userId='me', id=message['id']).execute()
            payload = msg.get('payload', {})

            # Header names are case-insensitive in mail messages, so compare
            # them lowercased. As before, the last matching header wins.
            subject = ''
            sender = ''
            for header in payload.get('headers', []):
                header_name = header.get('name', '').lower()
                if header_name == 'subject':
                    subject = header.get('value', '')
                elif header_name == 'from':
                    sender = header.get('value', '')

            body = get_email_body(payload)

            email_list.append({
                'subject': subject,
                'from': sender,
                'body': body
            })

            print(f"\nEmail:")
            print(f"From: {sender}")
            print(f"Subject: {subject}")
            print(f"Body: {body[:200]}..." if body else "No plain text body found")
            print("-" * 50)

        return email_list

    except Exception as e:
        # Best-effort: report the failure and return an empty result.
        print(f"An error occurred: {e}")
        return []

In [36]:
# Authenticate (using or creating token.pickle) and build the Gmail client.
service = get_gmail_service()
# Fetch the messages; each one is printed as it is processed, and the list of
# {'subject', 'from', 'body'} dicts is kept in `emails`.
emails = fetch_all_emails(service)
print(f"\nTotal emails processed: {len(emails)}")

Found 2 emails. Processing...

Email:
From: urock-request@lists.uchicago.edu
Subject: Welcome to list urock
Body: Welcome to list urock@lists.uchicago.edu
Your subscription email is rsochatbot@gmail.com


Looking forward to the 2022/23 climbing club season! Email will be our primary
form of communication thi...
--------------------------------------------------

Email:
From: UChicago Mailing Lists System <sympa@lists.uchicago.edu>
Subject: University of Chicago Mailing Lists / your environment
Body: Someone, probably you, requested to allocate or renew your password for your
list server account lists.uchicago.edu.
You may ignore this request or click on the following URL in order to choose
you...
--------------------------------------------------

Total emails processed: 2
