## Convert PDF to string

In [1]:
import pymupdf

In [2]:
def convert_pdf_to_text(filepath, keyword='holdings'):
    pdf_document = pymupdf.open(filepath)
    text = ''

    for page in pdf_document:
        page_text = page.get_text()
        if keyword.lower() in page_text.lower():
            text += page_text

    pdf_document.close()

    return text

## Extract holdings using AI

##### System prompt & user text

In [5]:
system_prompt = """
You are a financial advisor given a string of text from brokerage statements. Please interpret it to identify individual positions in each account. 
If you cannot identify a specific attribute, fill it with 'NA'.

Example input:
'7 of 28\nINVESTMENT REPORT \nJuly 1 – July 31, 2015\u2009\u2002\u2002\nHoldings (continued)\u2002\nDescription\nQuantity\nPrice \nPer Unit\nEnding \nMarket Value\nTotal \nCost Basis\nUnrealized \nGain/Loss\nEst. Annual \nIncome (EAI)\nEst. Yield \n(EY)\n25.00\n525.31\n$13,132.75 \n$9,350.12 \nc\n$3,782.63 \n$304.68 \n2.32%\nCommon Stocks \nAPPLE INC (AAPL) \nAMERCO COM (UHAL)\n30.00\n203.15A\n 6,094.50 \n 4,149.75 \nc\n 1,944.75 \n—\nTotal Common Stock (24% of account holdings)\n $5,517.25 \n$-1,011.12 \n$6,528.37 \n$304.68 \nAccount 111-111111 \nJohn W. Doe - Individual TOD\n*** SAMPLE STATEMENT ***\nFor informational purposes only\n$304.68 \n'

Example output (follow the format exactly and do not collect extra metrics):
{
    "statement_date":2015-07-31,
    "accounts":[
        {
            "account_number":"111-111111",
            "holdings": [
                {
                "description": "APPLE INC",
                "symbol": "AAPL",
                "quantity": 25.00,
                "market_value": 13132.75,
                "total_cost_basis": 9350.12
                },
                {
                "description": "AMERCO COM",
                "symbol": "UHAL",
                "quantity": 30.00,
                "market_value": 6094.50,
                "total_cost_basis": 4149.75
                }
            ]
        }
    ]
}
"""

In [40]:
test_filepath = r'C:\Users\marco\Desktop\panda\statements\fidelity-sample-statement.pdf'
pdf_document = pymupdf.open(test_filepath)
user_text = pdf_document[20].get_text()

##### OpenAI

In [14]:
from openai import OpenAI

client = OpenAI(organization='org-eVEyczFQOXsiXSb1K0yKkpuz')

In [None]:
# Make a request to the OpenAI API
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_text}
    ]
)

# Extract and return the JSON response from the model
response['choices'][0]['message']['content']

##### Google

In [6]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
load_dotenv()
api_key = os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=api_key)

# see https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = {
  'response_mime_type': 'application/json',
}

model = genai.GenerativeModel(
  model_name="gemini-1.5-flash",
  generation_config=generation_config,
  system_instruction=system_prompt
)

def extract_holdings(text):
    response = model.generate_content(text)
    return json.loads(response.text)

In [8]:
test_filepath = r"C:\Users\marco\Downloads\hui_statement_may.pdf"
statement_text = convert_pdf_to_text(test_filepath, 'holdings')

output = extract_holdings(statement_text)

In [9]:
output

{'statement_date': '2024-05-31',
 'accounts': [{'account_number': '242-744213',
   'holdings': [{'description': 'FIDELITY GOVERNMENT MONEY MARKET',
     'symbol': 'SPAXX',
     'quantity': '128.550',
     'market_value': '128.55',
     'total_cost_basis': 'NA'},
    {'description': 'FIDELITY 500 INDEX FUND',
     'symbol': 'FXAIX',
     'quantity': '5.787',
     'market_value': '1062.55',
     'total_cost_basis': '912.72'},
    {'description': 'MONSTER BEVERAGE CORP NEW COM FROM CUSIP 61174X109',
     'symbol': 'NA',
     'quantity': '97.000',
     'market_value': '5036.24',
     'total_cost_basis': '4971.74'}]},
  {'account_number': '243-753451',
   'holdings': [{'description': 'FIDELITY GOVERNMENT MONEY MARKET',
     'symbol': 'SPAXX',
     'quantity': '69.650',
     'market_value': '69.65',
     'total_cost_basis': 'NA'},
    {'description': 'FIDELITY 500 INDEX FUND',
     'symbol': 'FXAIX',
     'quantity': '9.125',
     'market_value': '1675.44',
     'total_cost_basis': '1500.00'

## Convert json to excel

In [10]:
import pandas as pd 

def json_to_dfs(json_data):
    statement_date = json_data['statement_date']
    accounts = json_data['accounts']
    
    dataframes = {}
    
    for account in accounts:
        account_number = account['account_number']
        holdings = account['holdings']
        
        # Create a DataFrame for this account
        df = pd.DataFrame(holdings)
        df['statement_date'] = statement_date
        df['account_number'] = account_number
        
        # Store the DataFrame in the dictionary with the account number as the key
        dataframes[account_number] = df
    
    return dataframes

In [11]:
temp = json_to_dfs(output)

In [26]:
temp['333-333333']

Unnamed: 0,description,symbol,quantity,market_value,total_cost_basis,unrealized_gain_loss,est_annual_income,est_yield,statement_date,account_number
0,NH Portfolio 2015,,1200.291,21221.14,,,,,2015-07-31,333-333333
1,NH Moderate Growth Port,,463.301,7236.76,,,,,2015-07-31,333-333333


## Yfinance

In [13]:
import yfinance as yf
import pandas as pd

In [15]:
tickers_df = pd.DataFrame({
    'symbol': ['AAPL', 'MSFT', 'AAPL', 'GOOGL']
})

tickers_df

Unnamed: 0,symbol
0,AAPL
1,MSFT
2,AAPL
3,GOOGL


In [25]:
def yfinance_enrich(df):
    symbols = df['symbol'].unique().tolist()
    prices_table = yf.download(symbols, period='1d')
    closing_prices = prices_table['Adj Close'].iloc[0]
    df['last_close_price'] = df['symbol'].map(closing_prices)
    return df

In [16]:
yfinance_enrich(tickers_df)
tickers_df

[*********************100%%**********************]  3 of 3 completed


Unnamed: 0,symbol,last_close_price
0,AAPL,230.539993
1,MSFT,453.549988
2,AAPL,230.539993
3,GOOGL,185.070007


In [29]:
temp['242-744213']
# temp['243-753451']

Unnamed: 0,description,symbol,quantity,market_value,total_cost_basis,statement_date,account_number,last_close_price
0,FIDELITY GOVERNMENT MONEY MARKET,SPAXX,128.55,128.55,,2024-05-31,242-744213,1.0
1,FIDELITY 500 INDEX FUND,FXAIX,5.787,1062.55,912.72,2024-05-31,242-744213,195.039993
2,MONSTER BEVERAGE CORP NEW COM FROM CUSIP 61174...,,97.0,5036.24,4971.74,2024-05-31,242-744213,0.41


In [30]:
# Applying the function to each DataFrame in the dictionary using dictionary comprehension
dataframes = {account_number: yfinance_enrich(df) for account_number, df in temp.items()}

[*********************100%%**********************]  3 of 3 completed
[*********************100%%**********************]  12 of 12 completed
