In [None]:
import numpy as np
import pandas as pd

import os
import pdfplumber
import sys

from dotenv import load_dotenv, find_dotenv

# environment settings
# Resolve the .env location once and reuse it for both loading and ROOT_DIR.
dotenv_path = find_dotenv()
if not dotenv_path:
    # find_dotenv() returns '' when no file is found; ROOT_DIR would then be ''
    # and every f'{ROOT_DIR}/data/...' path would point at the filesystem root.
    raise FileNotFoundError('No .env file found; cannot determine ROOT_DIR.')
load_dotenv(dotenv_path)
ROOT_DIR = os.path.dirname(dotenv_path)
if ROOT_DIR not in sys.path:  # re-running this cell must not stack duplicates
    sys.path.append(ROOT_DIR)

# pandas display settings
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

In [None]:
# input PDF: <ROOT_DIR>/data/<pdf_name>.pdf
pdf_name = 'monthly-short'
data_dir = f'{ROOT_DIR}/data'
pdf_path = f'{data_dir}/{pdf_name}.pdf'

# output CSV: same directory and base name as the PDF
csv_path = f'{data_dir}/{pdf_name}.csv'
csv_headers = [
    'symbol',
    'date',
    'side',
    'quantity',
    'price',
    'commission',
    'fees',
]

In [None]:
# Open the PDF at pdf_path; the resulting `pdf` object is reused by the cells
# below for page and table lookups.
# NOTE(review): no matching pdf.close() is visible in this notebook, so the
# handle stays open for the life of the kernel — confirm that is acceptable.
pdf = pdfplumber.open(pdf_path)

In [None]:
# find page where 'TRADE RECORDS' table begins
# start_page is stored as an index into pdf.pages (page_number minus one).
start_page = None
for page in pdf.pages:
    # `or ''` guards against extract_text() yielding None for a page without text
    if 'TRADE RECORDS' in (page.extract_text() or ''):
        start_page = page.page_number - 1
        break  # stop at the first match; later pages may repeat the heading

if start_page is None:
    raise ValueError(f"'TRADE RECORDS' not found in any page of {pdf_path}")

print(start_page)

In [None]:
# extract table from start page of 'TRADE RECORDS' until the next table begins
# A page with exactly one table is printed in full and the scan continues.
# Any other page ends the scan: its first table (if it has one) is printed.
# NOTE(review): the range stops before the last page of the PDF — confirm the
# final page is meant to be skipped.
for i in range(start_page, len(pdf.pages) - 1):
    current_page = pdf.pages[i]
    page_tables = current_page.find_tables()  # detect once per page

    if len(page_tables) == 1:
        print(current_page.extract_table())
        continue

    # Zero tables used to raise IndexError on table[0]; now the scan just stops.
    if page_tables:
        print(page_tables[0].extract())
    break

In [None]:
# Read the table from the page index computed in start_page instead of a
# hard-coded page, so the notebook follows the PDF's actual layout.
tb = pdf.pages[start_page].extract_table()
if tb is None:
    # extract_table() gives None when no table is detected on the page
    raise ValueError(f'No table found on page index {start_page} of {pdf_path}')

In [None]:
# The first extracted row supplies the column labels; the remaining rows are data.
column_titles = tb[0]
record_rows = tb[1:]
df = pd.DataFrame(data=record_rows, columns=column_titles)
df.head(10)

In [None]:
# Build the output columns from the raw table columns.
# date: 'Trade Date' and 'Time' joined by a space, with '00' appended
df['date'] = df['Trade Date'] + ' ' + df['Time'] + '00'

# symbol: the part of 'Symbol & Name' before its first line break.
# expand=False makes str.extract return a Series; the default returns a
# one-column DataFrame labelled 0, whose assignment depends on label alignment.
df['symbol'] = df['Symbol & Name'].str.extract('(^.+(?=\\n))', expand=False)

# commission: filled with a constant 0.0 for every row
df['commission'] = 0.0

# select columns to keep and rename
df = df[['symbol', 'date', 'Buy/Sell', 'Quantity', 'Traded Price', 'commission', 'Comm/Fee/Tax']]
df = df.rename(columns = {'Buy/Sell': 'side', 
                          'Quantity': 'quantity', 
                          'Traded Price': 'price',
                          'Comm/Fee/Tax': 'fees'})

df.head(10)

In [None]:
# convert date to UTC with Zulu format
# Drop the literal 'GMT' so the remaining offset can be parsed by %z.
date_text = df['date'].str.replace('GMT', '', regex=False)

# utc=True converts every parsed value to UTC during parsing, so rows with
# different offsets still produce one tz-aware datetime column.
date_utc = pd.to_datetime(date_text, format = '%d/%m/%Y %H:%M:%S,%z', utc = True)

# ISO-8601 text with a 'Z' suffix; missing timestamps become None.
# Assigning the whole column at once avoids writing values of a new type
# into the existing column in place.
df['date'] = [
    None if pd.isna(ts) else ts.isoformat().replace('+00:00', 'Z')
    for ts in date_utc
]

df.head()

In [None]:
# order rows by the 'date' column, then renumber the index from zero
df = df.sort_values(by = ['date']).reset_index(drop = True)
df

In [None]:
# save csv file
# An existing file at csv_path is left untouched, so an earlier export is
# never overwritten.
if not os.path.exists(csv_path):
    # Write exactly the columns named in csv_headers. index=False keeps the
    # row index from being written as an extra unnamed first column.
    df.to_csv(csv_path, columns=csv_headers, index=False)
else:
    print(f'{csv_path} already exists; skipping export.')

df.head()