In [None]:
# Integration packages behind the imports used later in this notebook:
# llama_index.llms.openai, llama_index.readers.file and llama_index.program.openai.
%pip install llama-index-llms-openai
%pip install llama-index-readers-file
%pip install llama-index-program-openai

In [None]:
# LlamaIndex
!pip install llama-index

# To get text conents from .eml and .msg file
!pip install "unstructured[msg]"

In [None]:
import logging
import sys, json

# Route INFO-and-above log records to stdout so they show up in cell output.
# basicConfig already attaches one stdout StreamHandler to the root logger;
# attaching a second one made every record print twice. force=True replaces
# any handlers left on the root logger by an earlier run of this cell, so
# re-running the cell does not accumulate handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [None]:
import os
import openai

# The key is read from the environment. For a quick local run, uncomment the
# line below and substitute a real key (do not commit it).
# os.environ["OPENAI_API_KEY"] = "YOUR_KEY_HERE"
# Raises KeyError when OPENAI_API_KEY is not set in the environment.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
from pydantic import BaseModel, Field
from typing import List


# One traded instrument; used as the element type of Etf.stocks.
# NOTE(review): the docstring and Field descriptions are presumably surfaced to
# the LLM through the schema generated from this model, so treat them as
# prompt text rather than ordinary comments — confirm before rewording.
class Instrument(BaseModel):
    """Datamodel for ticker trading details."""

    # Free-form string; nothing here restricts it to Buy/Sell/Hold.
    direction: str = Field(description="ticker trading - Buy, Sell, Hold etc")
    # The "1-4 character" hint lives only in the description; no length
    # validation is applied by this model.
    ticker: str = Field(
        description="Stock Ticker. 1-4 character code. Example: AAPL, TSLS, MSFT, VZ"
    )
    company_name: str = Field(
        description="Company name corresponding to ticker"
    )
    # Declared as float; the recorded outputs in this notebook show whole
    # numbers (e.g. 93654).
    shares_traded: float = Field(description="Number of shares traded")
    # NOTE(review): the description does not say whether this is a percentage
    # or a fraction; recorded outputs show values such as 0.2453.
    percent_of_etf: float = Field(description="Percentage of ETF")


# One ETF and the instruments traded under it; used as the element type of
# EmailData.etfs.
class Etf(BaseModel):
    """ETF trading data model"""

    etf_ticker: str = Field(
        description="ETF Ticker code. Example: ARKK, FSPTX"
    )
    # Kept as a plain string (no date parsing); recorded outputs show values
    # such as "1/12/2024".
    trade_date: str = Field(description="Date of trading")
    # Nested Instrument records for this ETF.
    stocks: List[Instrument] = Field(
        description="List of instruments or shares traded under this etf"
    )


# Top-level extraction result; passed as output_cls to OpenAIPydanticProgram
# later in this notebook.
class EmailData(BaseModel):
    """Data model for email extracted information."""

    # One entry per ETF mentioned in the email, each carrying its own trades.
    etfs: List[Etf] = Field(
        description="List of ETFs described in email having list of shares traded under it"
    )
    # All date/time fields below are plain strings; no parsing or format
    # validation is done by this model.
    trade_notification_date: str = Field(
        description="Date of trade notification"
    )
    sender_email_id: str = Field(description="Email Id of the email sender.")
    email_date_time: str = Field(description="Date and time of email")

In [None]:
# get download_loader
from llama_index.core import download_loader

In [None]:
# Import the reader used to parse the .eml and .msg files
from llama_index.readers.file import UnstructuredReader

# A single reader instance; the Outlook .msg cell further down reuses it.
loader = UnstructuredReader()

# Parse the .eml sample and keep the text of the first document returned.
eml_documents = loader.load_data("../data/email/ark-trading-jan-12-2024.eml")
email_content = eml_documents[0].text

# Heading followed by the extracted text, one per line.
print("\n\n Email contents", email_content, sep="\n")

In [None]:
# Parse the Outlook .msg sample with the existing `loader` and keep the text
# of the first document returned.
msg_documents = loader.load_data("../data/email/ark-trading-jan-12-2024.msg")
msg_content = msg_documents[0].text

# Heading followed by the extracted text, one per line.
print("\n\n Outlook contents", msg_content, sep="\n")

In [None]:
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core import ChatPromptTemplate
from llama_index.core.llms import ChatMessage
from llama_index.llms.openai import OpenAI

In [None]:
# System turn: states the assistant's role and restricts it to the given email.
system_message = ChatMessage(
    role="system",
    content=(
        "You are an expert assitant for extracting insights from email in JSON format. \n"
        "You extract data and returns it in JSON format, according to provided JSON schema, from given email message. \n"
        "REMEMBER to return extracted data only from provided email message."
    ),
)

# User turn: the email text is substituted for {email_msg_content} when the
# program is called with email_msg_content=...
user_message = ChatMessage(
    role="user",
    content=(
        "Email Message: \n" "------\n" "{email_msg_content}\n" "------"
    ),
)

prompt = ChatPromptTemplate(message_templates=[system_message, user_message])

# Chat model used for the extraction.
llm = OpenAI(model="gpt-3.5-turbo-1106")

# Tie the prompt and model to the EmailData schema. verbose=True presumably
# produces the "Function call: ..." lines seen in the recorded cell outputs.
program = OpenAIPydanticProgram.from_defaults(
    prompt=prompt,
    llm=llm,
    output_cls=EmailData,
    verbose=True,
)

In [None]:
# Run the extraction on the .eml text; `output` is the object returned by the
# program, configured above with output_cls=EmailData.
output = program(email_msg_content=email_content)

# Pretty-print the extracted fields as indented JSON under a heading.
print("Output JSON From .eml File: ")
eml_fields = output.dict()
print(json.dumps(eml_fields, indent=2))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Function call: EmailData with args: {"etfs":[{"etf_ticker":"ARKK","trade_date":"1/12/2024","stocks":[{"direction":"Buy","ticker":"TSLA","company_name":"TESLA INC","shares_traded":93654,"percent_of_etf":0.2453},{"direction":"Buy","ticker":"TXG","company_name":"10X GENOMICS INC","shares_traded":159506,"percent_of_etf":0.0907},{"direction":"Buy","ticker":"CRSP","company_name":"CRISPR THERAPEUTICS AG","shares_traded":86268,"percent_of_etf":0.0669},{"direction":"Buy","ticker":"RXRX","company_name":"RECURSION PHARMACEUTICALS","shares_traded":289619,"percent_of_etf":0.0391},{"direction":"Sell","ticker":"HOOD","company_name":"ROBINHOOD MARKETS INC","shares_traded":927,"percent_of_etf":0.0001},{"direction":"Sell","ticker":"EXAS","company_name":"EXA

In [None]:
# Run the same extraction on the Outlook .msg text; this rebinds `output`.
output = program(email_msg_content=msg_content)

# Pretty-print the extracted fields as indented JSON under a heading.
print("Output JSON from .msg file: ")
msg_fields = output.dict()
print(json.dumps(msg_fields, indent=2))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Function call: EmailData with args: {"etfs":[{"etf_ticker":"ARKK","trade_date":"1/12/2024","stocks":[{"direction":"Buy","ticker":"TSLA","company_name":"TESLA INC","shares_traded":93654,"percent_of_etf":0.2453},{"direction":"Buy","ticker":"TXG","company_name":"10X GENOMICS INC","shares_traded":159506,"percent_of_etf":0.0907},{"direction":"Buy","ticker":"CRSP","company_name":"CRISPR THERAPEUTICS AG","shares_traded":86268,"percent_of_etf":0.0669},{"direction":"Buy","ticker":"RXRX","company_name":"RECURSION PHARMACEUTICALS","shares_traded":289619,"percent_of_etf":0.0391},{"direction":"Sell","ticker":"HOOD","company_name":"ROBINHOOD MARKETS INC","shares_traded":927,"percent_of_etf":0.0001},{"direction":"Sell","ticker":"EXAS","company_name":"EXACT SCIENCES CORP","shares_traded":100766,"percent_of_etf":0.0829},{"direction":"