In [9]:
!pip3 install numpy pandas



In [10]:
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client google-cloud-storage google-cloud-secret-manager



### FUNCTIONS TO PULL THE EMAILS FROM GMAIL AND SAVE THEM AS LOCAL .EML FILES (THE UPLOAD TO GCS IS NOT DONE IN THIS CELL)

In [None]:
import base64
import datetime
import logging
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Logging setup: INFO and above, timestamped records, sent to two handlers
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[
        logging.FileHandler("gmail_fetch.log"),  # persistent copy in gmail_fetch.log
        logging.StreamHandler(),  # live copy on the console
    ],
)

# OAuth scopes requested when authorizing against Gmail
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

# Email address that is allowed to use this notebook
ALLOWED_EMAIL = "kumarprkhr@gmail.com"  # Replace with the specific email
# Part of the address before the first "@"; used to name per-user files below
USER_ID = ALLOWED_EMAIL.partition("@")[0]
TOKEN_FILE = "{}_token.json".format(USER_ID)

# Folder that fetched emails are written to
INTAKE_EMAIL_FOLDER = "{}_intake_emails".format(USER_ID)


def authenticate_gmail():
    """Return Gmail credentials, reusing, refreshing or creating a token as needed.

    Credentials stored in ``TOKEN_FILE`` are loaded when that file exists.
    If they are missing or not valid, they are either refreshed (when they
    are expired and carry a refresh token) or obtained through a new OAuth
    flow based on ``../backend/credentials.json``. In both of those cases the
    resulting credentials are written back to ``TOKEN_FILE``.

    Returns:
        The credentials object produced by the steps above.
    """
    logging.info("Authenticating Gmail...")

    # Start from whatever was cached by a previous run, if anything
    creds = None
    if os.path.exists(TOKEN_FILE):
        logging.info(f"Loading credentials from {TOKEN_FILE}...")
        creds = Credentials.from_authorized_user_file(TOKEN_FILE, SCOPES)

    if not (creds and creds.valid):
        if not (creds and creds.expired and creds.refresh_token):
            # Nothing usable to refresh: ask the user to log in
            logging.info("No valid credentials found. Starting OAuth flow...")
            flow = InstalledAppFlow.from_client_secrets_file(
                "../backend/credentials.json", SCOPES
            )
            creds = flow.run_local_server(port=0)
        else:
            logging.info("Refreshing expired credentials...")
            creds.refresh(Request())

        # Persist the new or refreshed credentials for the next run
        logging.info(f"Saving credentials to {TOKEN_FILE}...")
        with open(TOKEN_FILE, "w") as token_file:
            token_file.write(creds.to_json())

    logging.info("Authentication successful.")
    return creds


def get_authenticated_email(creds):
    """Look up the email address of the account behind ``creds``.

    Args:
        creds: Credentials passed to the Gmail API client.

    Returns:
        The ``emailAddress`` field of the Gmail profile for user ``"me"``.
    """
    logging.info("Fetching authenticated email address...")

    # Gmail client -> profile request for the authenticated user -> response
    gmail = build("gmail", "v1", credentials=creds)
    profile_request = gmail.users().getProfile(userId="me")
    email_address = profile_request.execute()["emailAddress"]

    logging.info(f"Authenticated email address: {email_address}")
    return email_address


def list_emails_in_time_range(service, start_timestamp, end_timestamp=None):
    """List all messages matching an ``after:``/``before:`` Gmail search query.

    The Gmail ``messages.list`` endpoint returns results one page at a time.
    This function follows ``nextPageToken`` until the last page, so the
    returned list is not truncated to the first page of results.

    Args:
        service: Gmail API client; must expose ``users().messages().list(...)``.
        start_timestamp: Value placed after ``after:`` in the search query.
        end_timestamp: Optional value placed after ``before:``; omitted from
            the query when falsy.

    Returns:
        A list with the ``messages`` entries of every page, in the order the
        API returned them (callers in this notebook read each entry's ``"id"``).
        An empty list when nothing matched.
    """
    logging.info(
        f"Fetching emails between timestamps {start_timestamp} and {end_timestamp}..."
    )
    # Query to filter emails received in the specified time range
    query = f"after:{start_timestamp}"
    if end_timestamp:
        query += f" before:{end_timestamp}"

    messages = []
    page_token = None
    while True:
        request_kwargs = {"userId": "me", "q": query}
        # Only send pageToken once the API has handed one out
        if page_token:
            request_kwargs["pageToken"] = page_token

        results = service.users().messages().list(**request_kwargs).execute()
        messages.extend(results.get("messages", []))

        # A missing nextPageToken marks the final page
        page_token = results.get("nextPageToken")
        if not page_token:
            break

    if not messages:
        logging.info("No emails found in the specified time range.")
        return []

    logging.info(f"Found {len(messages)} emails in the specified time range.")
    return messages


def save_email_as_eml(service, msg_id, folder):
    """Download one message in raw form and write it to ``<folder>/<msg_id>.eml``.

    Args:
        service: Gmail API client; must expose ``users().messages().get(...)``.
        msg_id: Id of the message to fetch; also used as the file name stem.
        folder: Destination directory; created when it does not exist yet.

    Returns:
        None. The decoded message bytes are written to disk.
    """
    logging.info(f"Saving email {msg_id} to folder {folder}...")

    # format="raw" makes the API return the message as URL-safe base64 text
    request = service.users().messages().get(userId="me", id=msg_id, format="raw")
    encoded_body = request.execute()["raw"]
    raw_email = base64.urlsafe_b64decode(encoded_body.encode("ASCII"))

    # Make sure the destination directory is there before writing
    folder_missing = not os.path.exists(folder)
    if folder_missing:
        logging.info(f"Creating folder {folder}...")
        os.makedirs(folder)

    # Binary mode: the bytes are written exactly as decoded
    eml_file_path = os.path.join(folder, f"{msg_id}.eml")
    with open(eml_file_path, "wb") as out:
        out.write(raw_email)

    logging.info(f"Saved email {msg_id} to {eml_file_path}")

def main():
    """Authenticate, verify the account, then save every email in a fixed date range.

    Steps:
      1. Obtain credentials via ``authenticate_gmail``.
      2. Compare the authenticated address with ``ALLOWED_EMAIL``; on a
         mismatch, log an error, delete ``TOKEN_FILE`` if present, and stop.
      3. List messages between the two hard-coded dates and save each one
         into ``INTAKE_EMAIL_FOLDER`` via ``save_email_as_eml``.
    """
    logging.info("Starting Gmail email fetch script...")

    creds = authenticate_gmail()
    authenticated_email = get_authenticated_email(creds)

    # Refuse to continue for any account other than the allowed one
    if authenticated_email != ALLOWED_EMAIL:
        logging.error(f"Authentication failed. Only {ALLOWED_EMAIL} is allowed.")
        # Drop the cached token so the next run has to authenticate again
        if os.path.exists(TOKEN_FILE):
            logging.info(f"Deleting token file {TOKEN_FILE}...")
            os.remove(TOKEN_FILE)
        return

    logging.info(f"Authenticated with email: {authenticated_email}")

    logging.info("Building Gmail API service...")
    service = build("gmail", "v1", credentials=creds)

    # Window to fetch; naive datetimes, so .timestamp() interprets them in local time
    point_a = datetime.datetime(2025, 1, 6)  # Replace with your desired start date
    point_b = datetime.datetime(2025, 1, 27)  # Replace with your desired end date

    # Whole-second Unix timestamps for the search query
    point_a_timestamp, point_b_timestamp = (
        int(moment.timestamp()) for moment in (point_a, point_b)
    )

    logging.info(f"Fetching emails between {point_a} and {point_b}...")
    emails = list_emails_in_time_range(service, point_a_timestamp, point_b_timestamp)

    if not emails:
        logging.info(f"No emails found between {point_a} and {point_b}.")
    else:
        logging.info(
            f"Saving {len(emails)} emails to '{INTAKE_EMAIL_FOLDER}' folder..."
        )
        for email in emails:
            save_email_as_eml(service, email["id"], INTAKE_EMAIL_FOLDER)

    logging.info("Script execution completed.")


# Run the fetch when this code is executed as the main module
if __name__ == "__main__":
    main()


In [2]:
!pip install pyarrow
import pyarrow

Collecting pyarrow
  Downloading pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Downloading pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
Successfully installed pyarrow-19.0.1


In [4]:
import numpy as np
import pandas as pd

# NOTE(review): absolute, machine-specific path; adjust before running elsewhere
downloads = "/Users/pradnyeshchoudhari/Downloads/try2/emails_batch_00004.parquet"

# Read the parquet batch into a DataFrame and preview its first rows
df = pd.read_parquet(downloads)
df.head()

Unnamed: 0,message_id,from_email,to,cc,bcc,subject,date,content,plain_text,html,attachments,thread_id,labels
0,1952e32c938cb0a3,Uber <uber@uber.com>,[pc612001@gmail.com],,,Last call to save 40% off your next 3 orders,2025-02-22T15:07:12+00:00,PCFkb2N0eXBlIGh0bWw-DQo8aHRtbCB4bWxucz0iaHR0cD...,,PCFkb2N0eXBlIGh0bWw-DQo8aHRtbCB4bWxucz0iaHR0cD...,[],1952e32c938cb0a3,"[IMPORTANT, CATEGORY_UPDATES, INBOX]"
1,1952e21e3cd1c6e6,Indeed <alert@indeed.com>,[<pc612001@gmail.com>],,,1 new data+intern job,2025-02-22T14:48:47+00:00,SW5kZWVkIEpvYiBBbGVydAoxIG5ldyBkYXRhK2ludGVybi...,SW5kZWVkIEpvYiBBbGVydAoxIG5ldyBkYXRhK2ludGVybi...,PCFET0NUWVBFIGh0bWwgUFVCTElDICItLy9XM0MvL0RURC...,[],1952e21e3cd1c6e6,"[IMPORTANT, CATEGORY_UPDATES, INBOX]"
2,1952d9169ef90310,"""Aeon+Psyche Daily"" <support@aeon.co>",[<pc612001@gmail.com>],,,Playing the game of conversation,2025-02-22T12:00:32+00:00,KiogVG9kYXkgaW4NCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS...,KiogVG9kYXkgaW4NCi0tLS0tLS0tLS0tLS0tLS0tLS0tLS...,PCFET0NUWVBFIGh0bWw-PGh0bWwgeG1sbnM9Imh0dHA6Ly...,[],1952d9169ef90310,"[IMPORTANT, CATEGORY_UPDATES, INBOX]"
3,1952d81bb3b292bc,Uber Receipts <noreply@uber.com>,[pc612001@gmail.com],,,"Your Instacart restaurant order, powered by Ub...",2025-02-22T11:53:50+00:00,PCFkb2N0eXBlIGh0bWw-PGh0bWw-PGhlYWQ-PG1ldGEgY2...,,PCFkb2N0eXBlIGh0bWw-PGh0bWw-PGhlYWQ-PG1ldGEgY2...,[],1952d81bb3b292bc,"[IMPORTANT, CATEGORY_UPDATES, INBOX]"
4,1952d7c59fdb2703,"""Star Health & Allied Insurance"" <staradmin@st...",[pc612001@gmail.com],,,You Made This Possible - We’re Now the ‘Best H...,2025-02-22T10:45:22+00:00,IA0KICAgICAgICAgIA0KICAgICAgICAgICAgICAgICAgIA...,IA0KICAgICAgICAgIA0KICAgICAgICAgICAgICAgICAgIA...,PGh0bWwgZGlyPSJsdHIiIGxhbmc9ImVuIiB4bWxucz0iaH...,[],1952d7c59fdb2703,"[IMPORTANT, CATEGORY_UPDATES, INBOX]"


In [6]:
import base64

In [10]:
import base64
import pandas as pd

# Function to decode Base64 URL-safe encoded strings
def decode_base64_url_safe(encoded_str):
    """Decode a URL-safe Base64 string (padding optional) into text.

    Args:
        encoded_str: Base64 text using either the URL-safe (``-``/``_``) or
            the standard (``+``/``/``) alphabet, or a null value.

    Returns:
        The decoded bytes interpreted as UTF-8 with undecodable bytes
        dropped; ``None`` when the input is null or decoding fails (the
        error is printed in that case).
    """
    if pd.isnull(encoded_str):
        return None

    # Map the URL-safe alphabet back onto the standard Base64 alphabet
    standard = encoded_str.translate(str.maketrans("-_", "+/"))

    # Top up with '=' so the length is a multiple of 4 (adds nothing if it already is)
    standard += "=" * (-len(standard) % 4)

    try:
        raw_bytes = base64.b64decode(standard)
        return raw_bytes.decode("utf-8", errors="ignore")
    except Exception as e:
        print(f"Error decoding: {e}")
        return None

# Decode the Base64 text in the plain_text and html columns, storing the
# results in new plain_text_decoded / html_decoded columns on the same frame
df['plain_text_decoded'] = df['plain_text'].apply(decode_base64_url_safe)
df['html_decoded'] = df['html'].apply(decode_base64_url_safe)

# Print the first rows of the updated dataframe (plain-text rendering)
print(df.head())

         message_id                                         from_email  \
0  1952e32c938cb0a3                               Uber <uber@uber.com>   
1  1952e21e3cd1c6e6                          Indeed <alert@indeed.com>   
2  1952d9169ef90310              "Aeon+Psyche Daily" <support@aeon.co>   
3  1952d81bb3b292bc                   Uber Receipts <noreply@uber.com>   
4  1952d7c59fdb2703  "Star Health & Allied Insurance" <staradmin@st...   

                       to    cc   bcc  \
0    [pc612001@gmail.com]  None  None   
1  [<pc612001@gmail.com>]  None  None   
2  [<pc612001@gmail.com>]  None  None   
3    [pc612001@gmail.com]  None  None   
4    [pc612001@gmail.com]  None  None   

                                             subject  \
0       Last call to save 40% off your next 3 orders   
1                              1 new data+intern job   
2                   Playing the game of conversation   
3  Your Instacart restaurant order, powered by Ub...   
4  You Made This Possible - 