<a href="https://colab.research.google.com/github/juno-arpit/personal_email_agent/blob/main/PersonalEmailAgent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create a CSV file from the sent-mail mbox export

In [8]:
import mailbox
import csv
from email import policy
from email.parser import BytesParser

def get_body(message):
    """Return the transfer-decoded bytes of an email's plain-text body.

    Args:
        message: An ``email.message.Message`` instance (or subclass).

    Returns:
        For a multipart message, the payload of the first ``text/plain``
        part in ``walk()`` order, or ``None`` when no such part exists.
        For a non-multipart message, the payload itself, whatever its
        content type. The payload is obtained with
        ``get_payload(decode=True)``, i.e. as ``bytes``.
    """
    if not message.is_multipart():
        return message.get_payload(decode=True)

    # walk() already descends into nested multipart containers, so a single
    # pass is enough; re-walking every sub-part only repeats the same scan.
    for part in message.walk():
        if part.get_content_type() == 'text/plain':
            return part.get_payload(decode=True)
    return None

def mbox_to_csv(mbox_file_path, csv_file_path):
    """Export every message of an mbox file to a CSV file.

    One header row is written, followed by one row per message with the
    columns ``Subject, From, Date, To, Message-ID, Body``. The body comes
    from ``get_body``; it is decoded as UTF-8 (undecodable bytes are
    replaced), newlines become spaces and carriage returns are removed so
    each body stays on a single line. Messages without a body get ``''``.

    Args:
        mbox_file_path: Path of the existing mbox file to read.
        csv_file_path: Path of the CSV file to create or overwrite.

    Raises:
        mailbox.NoSuchMailboxError: If ``mbox_file_path`` does not exist.
    """
    # create=False: the default (create=True) would silently create an empty
    # mailbox for a mistyped path and produce a header-only CSV.
    mbox = mailbox.mbox(mbox_file_path, create=False)
    try:
        with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Subject', 'From', 'Date', 'To', 'Message-ID', 'Body'])

            for message in mbox:
                body = get_body(message)
                if body:
                    body = body.decode('utf-8', errors='replace').replace('\n', ' ').replace('\r', '')
                else:
                    body = ''
                writer.writerow([
                    message['subject'],
                    message['from'],
                    message['date'],
                    message['to'],
                    message['message-id'],
                    body
                ])
    finally:
        # Release the underlying file handle even if writing fails.
        mbox.close()

# Usage: export the sent-mail mbox to a flat CSV (one row per message).
mbox_file_path = 'Sent.mbox'  # input: replace with the path to your MBOX file
csv_file_path = 'past_email_mbox.csv'  # output: replace with the desired path for the CSV file
mbox_to_csv(mbox_file_path, csv_file_path)

In [2]:
!pip install openai
!pip install python-dotenv
!pip install google-colab

Collecting openai
  Downloading openai-1.47.0-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.47.0-py3-none-any.whl (375 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.6/375.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━

# Clean the Email Output

In [16]:
import os
# from dotenv import find_dotenv, load_dotenv
# import openai
from openai import OpenAI
from google.colab import userdata
import json
import csv

# load_dotenv(find_dotenv())

client = OpenAI(
    # The key is fetched through google.colab's userdata.get, so this argument
    # is needed here; it is not a default that can simply be omitted.
    api_key=userdata.get('OPENAI_API_KEY')
)

# openai.api_key = os.environ.get("OPENAI_API_KEY")


def parse_email(email_thread, model="gpt-4", context_window=8192, reply_reserve=1024):
    """Ask the chat model to split a raw email thread into a message/reply pair.

    Args:
        email_thread: Raw text of one sent email, including quoted history.
        model: Chat model name passed to the completions endpoint.
        context_window: Total token capacity assumed for ``model``.
        reply_reserve: Tokens kept free for the model's answer.

    Returns:
        The model's reply text, expected (but not guaranteed) to be a JSON
        object with ``original_message`` and ``arpit_reply`` keys. Returns
        ``""`` without calling the API when ``email_thread`` is empty or
        whitespace-only.
    """
    # NOTE: the key was spelled "orignal_message" in the first bullet; it now
    # matches the "original_message" key of the example and of the consumer.
    system_prompt = """
    You are an expert of convert raw email thread into original message / reply pairs.
    You are given a raw email thread that Arpit's reply to others, your goal is to convert it into original message / reply pairs.
    - original_message: the last message sent to Arpit, if it is a long email thread, only take the last message
    - arpit_reply: Arpit's reply to the original message

    if there is only one message in the thread, that should be arpit_reply
    incase there's an error, feel free to omit the conversion, try to give the output as much as possible without any error

    The exported format should look something like
    {
        "original_message": "xxxx",
        "arpit_reply": "xxxx"
    }
    """

    # Nothing to parse: skip the (paid) API call entirely.
    if not email_thread or not email_thread.strip():
        return ""

    # Size the thread locally instead of spending an extra API call. No
    # tokenizer is imported here, so use a deliberately conservative
    # characters-per-token heuristic (English text averages more than 3).
    chars_per_token = 3
    prompt_tokens = len(system_prompt) // chars_per_token + 1
    thread_token_budget = max(context_window - reply_reserve - prompt_tokens, 0)
    max_thread_chars = thread_token_budget * chars_per_token

    # Keep the START of the thread. Assumption: sent-mail bodies put the new
    # reply first and the quoted history after it, so the head holds the pair
    # the prompt asks for and the tail is older quoted text.
    if len(email_thread) > max_thread_chars:
        email_thread = email_thread[:max_thread_chars]

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": email_thread}
        ]
    )

    return response.choices[0].message.content


def process_csv(input_csv_path, output_csv_path):
    """Turn each email body of the input CSV into a message/reply row.

    Every row's ``Body`` column is sent to ``parse_email``; replies that
    parse as a JSON object contribute one ``[original_message, arpit_reply]``
    row to the output CSV. Empty replies, invalid JSON and JSON that is not
    an object are reported on stdout and skipped.

    Args:
        input_csv_path: CSV file with a ``Body`` column.
        output_csv_path: CSV file to create or overwrite with the pairs.

    Raises:
        KeyError: If the input has no ``Body`` column.
        Exception: Anything raised by ``parse_email`` propagates, but only
            after the pairs collected so far have been written out.
    """
    # Read the input up front so the file is closed before the slow loop.
    with open(input_csv_path, newline='', encoding='utf-8') as csvfile:
        rows = list(csv.DictReader(csvfile))

    processed_data = []
    try:
        for row in rows:
            # DictReader fills missing trailing cells with None.
            text = row['Body'] or ''
            json_string = parse_email(text)

            if not json_string:
                print("Skipping empty row or long email thread")
                continue

            print(json_string)
            try:
                json_data = json.loads(json_string)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}, skipping row")
                continue

            # Valid JSON is not necessarily an object (could be a list/str).
            if not isinstance(json_data, dict):
                print("Error decoding JSON: expected an object, skipping row")
                continue

            original_message = json_data.get('original_message', '')
            arpit_reply = json_data.get('arpit_reply', '')
            processed_data.append([original_message, arpit_reply])
    finally:
        # Always persist what was parsed: a failure on a late row must not
        # discard the results of every earlier API call.
        with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)
            csv_writer.writerow(['original_message', 'arpit_reply'])
            csv_writer.writerows(processed_data)


# Paths to your input and output CSV files
# NOTE(review): the mbox export step writes 'past_email_mbox.csv', while this
# reads 'past_email_mbox_final.csv' - presumably a manually cleaned copy of
# that file; confirm, since nothing in the notebook creates it.
input_csv_path = 'past_email_mbox_final.csv'
output_csv_path = 'email_pairs.csv'

# Run the extraction: one chat-model request per input row.
process_csv(input_csv_path, output_csv_path)


{
    "original_message": "This is to request you - regarding the heating nature of the building - 186 property, Sec-28, Gurugram. I know this is not something easy to do and even ask for, but the heat is really not bearable and the tap water is untouchable. If nothing, we can still do something about the water tanks, so at least they do not heat much as they do now. Put them under some kind of shade or something. Also, another request is to help us at least with iron+iron tables on every floor. What have we done to not deserve it while the people at 185 property have it on every floor?",
    "arpit_reply": "Thanks"
}
{
    "original_message": "Dear Resident, We hope you're doing well and staying safe. Please be informed that we cannot track your rent for June 2022 to date due to which the *penalty of INR 1,000* has been deducted from your security deposit as late fees. *Kindly make the payment by EOD along with the late fees to continue your stay & services at CoHo.* If you have alrea

# Extract the important FAQs, if any, from the emails.

In [24]:
!pip install langchain
!pip install langchain-community
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [25]:
import csv
import json
from dotenv import find_dotenv, load_dotenv
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain

# load_dotenv()

# NOTE(review): OpenAI and userdata are not imported in this cell; they come
# from the imports of the earlier "Clean the Email Output" cell, which must
# have been run first.
client = OpenAI(
    # The key is fetched through google.colab's userdata.get, so this argument
    # is needed here; it is not a default that can simply be omitted.
    api_key=userdata.get('OPENAI_API_KEY')
)

# LangChain chat model used by extract_faq below; temperature=0 is passed
# explicitly. `client` above is not referenced by the functions in this cell.
llm = ChatOpenAI(temperature=0, model_name="gpt-4", openai_api_key=userdata.get('OPENAI_API_KEY'))



def load_csv(file_path):
    """Load a CSV file as a list of row dictionaries.

    Args:
        file_path: Path of a UTF-8 encoded CSV file whose first row holds
            the column names.

    Returns:
        A list with one ``dict`` per data row, keyed by column name. An
        input containing only the header yields ``[]``.
    """
    # encoding/newline mirror how the notebook writes its CSV files:
    # utf-8 avoids locale-dependent decoding, and newline='' lets the csv
    # module see line breaks embedded in quoted fields unmodified.
    with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

def extract_faq(text_data):
    """Distill FAQ entries from past email text with a map-reduce chain.

    The text is split into chunks, each chunk is run through ``map_prompt``,
    and the per-chunk results are merged with ``combine_prompt`` using the
    module-level ``llm``.

    Args:
        text_data: The past emails as one string.

    Returns:
        A list of FAQ entries parsed from the model's JSON answer. When the
        model wraps the array in an object (e.g. ``{"FAQs": [...]}``) the
        inner list is returned; a single FAQ object is wrapped in a list.

    Raises:
        json.JSONDecodeError: If the model's answer is not valid JSON.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=20,
        length_function = len,
        is_separator_regex=False)

    texts = text_splitter.split_text(text_data)
    docs = text_splitter.create_documents(texts)


    map_prompt = """
    PAST EMAILS:
    {text}
    ----

    You are a smart AI assistant, above is some past emails from Arpit Rai (a techie living in Gurgaon, India)
    your goal is to learn & extract common FAQ about Arpit Rai (a techie living in Gurgaon, India)
    (include both question & answer, return results in JSON):
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

    combine_prompt = """
    The following is set of FAQ about Arpit Rai (a techie living in Gurgaon, India):
    {text}
    Take these and distill it into a final, consolidated array of faq,
    include both question & answer (in JSON format).

    array of FAQ:
    """
    combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

    summary_chain = load_summarize_chain(llm=llm,
                                        chain_type='map_reduce',
                                        map_prompt=map_prompt_template,
                                        combine_prompt=combine_prompt_template,
                                        verbose=True
                                        )

    output = summary_chain.run(docs)
    faqs = json.loads(output)

    # The prompts only ask for "JSON", so the model may answer with an object
    # wrapping the array instead of the bare array. Downstream code indexes
    # the result as a list of dicts, so normalise to that shape here.
    if isinstance(faqs, dict):
        list_values = [value for value in faqs.values() if isinstance(value, list)]
        faqs = list_values[0] if len(list_values) == 1 else [faqs]

    return faqs

def save_json_to_csv(data, file_name):
    """Write a list of dictionaries to a CSV file.

    The header is the union of all keys in first-seen order, so entries do
    not need identical keys; a key missing from an entry is written as an
    empty cell.

    Args:
        data: Iterable of dictionaries, one per output row.
        file_name: Path of the CSV file to create or overwrite.

    An empty ``data`` produces an empty file (there are no column names to
    write) instead of raising ``IndexError``.
    """
    rows = list(data)

    # dict.fromkeys keeps insertion order while dropping duplicate keys.
    fieldnames = list(dict.fromkeys(key for entry in rows for key in entry))

    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        if not rows:
            return

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


# Load the message/reply pairs written by the previous step.
past_emails = load_csv("email_pairs.csv")

# Keep only Arpit's replies and serialise them as one JSON array string,
# which is the text handed to the FAQ extraction chain.
arpits_replies = [entry["arpit_reply"] for entry in past_emails]
arpits_replies_string = json.dumps(arpits_replies)

# Run the map-reduce FAQ extraction over the serialised replies.
faqs = extract_faq(arpits_replies_string)

# Persist the extracted FAQ entries.
save_json_to_csv(faqs, "faq.csv")




Prompt after formatting:
[32;1m[1;3m
    PAST EMAILS:
    ["Thanks", "I had already paid the amount on the 5th - You just cannot reduce the amount out of your will. Please check with the RM. This is ridiculous. Get your facts right! I will not provide you with the transaction details. Go back and do the background check with your RM yourself.", "Dear Resident,  We hope you're doing well and staying safe.  Please be informed that we cannot track your rent for June 2022 to date due to which the *penalty of INR 1,000* has been deducted from your security deposit as late fees. *Kindly make the payment by EOD along with the late fees to continue your stay & services at CoHo.*  If you have already paid (don't worry), please share the transaction details/screenshot with us to reconcile the same and send you the receipt. Please reach out to payments@coho.in in case you have any queries.  Best regards, Payments Team at CoHo.in ", "Not resolved yet. Please send someone for re installation.", "



Prompt after formatting:
[32;1m[1;3m
    The following is set of FAQ about Arpit Rai (a techie living in Gurgaon, India):
    {
    "FAQs": [
        {
            "Question": "What is Arpit Rai's profession?",
            "Answer": "Arpit Rai is an app developer."
        },
        {
            "Question": "Where does Arpit Rai live?",
            "Answer": "Arpit Rai lives in Gurgaon, India."
        },
        {
            "Question": "Where is Arpit Rai originally from?",
            "Answer": "Arpit Rai is originally from Lucknow."
        },
        {
            "Question": "What is Arpit Rai's current CTC?",
            "Answer": "Arpit Rai's current CTC is 42 LPA."
        },
        {
            "Question": "What is Arpit Rai's expected CTC?",
            "Answer": "Arpit Rai's expected CTC is approximately 55 LPA."
        },
        {
            "Question": "What is Arpit Rai's notice period?",
            "Answer": "Arpit Rai's notice period is 30 days."
        },
