In [None]:
import pandas as pd

In [3]:
# Parse every HTML table in the statement file; the first table is the one used below.
trades = pd.read_html('./Statement.htm')
df = trades[0].copy()  # independent copy, so the parsed table in `trades` stays untouched

# Promote the row at this position to be the column header.
header_row_index = 2
df.columns = df.iloc[header_row_index]

# Keep only the rows below the header row.
df = df[header_row_index + 1:]

# Drop the columns that are not needed (drop() returns a new frame; no inplace=True).
df = df.drop(['Ticket', 'Taxes'], axis=1)

# Assign the twelve final column names by position.
df.columns = ['Open Time', 'Type', 'Size', 'Asset', 'Open Price', 'Stop Loss', 'Take Profit', 'Close Time', 'Close Price', 'Commissions', 'Swap', 'PnL']

# Keep only the rows whose 'Type' (case-insensitive) is one of the listed order types.
valid_trade_types = ['buy', 'sell', 'buy limit', 'sell limit', 'buy stop', 'sell stop']
df = df[df['Type'].str.lower().isin(valid_trade_types)]

# Drop the rows whose 'PnL' reads 'cancelled'. The explicit .copy() makes the
# filtered result an independent frame, so the column assignments in the loop
# below do not write into a slice of the earlier frame (SettingWithCopyWarning).
df = df[df['PnL'].str.lower() != 'cancelled'].copy()

# Convert the numeric columns to float. Values are turned into text first so the
# spaces can be stripped; regex=False states that ' ' is a literal character
# rather than relying on the pandas-version-dependent default.
cols_to_float = ['Size', 'Open Price', 'Stop Loss', 'Take Profit', 'Close Price', 'Commissions', 'Swap', 'PnL']
for col in cols_to_float:
    df[col] = df[col].astype(str).str.replace(' ', '', regex=False).astype(float)

# View the last few rows of the DataFrame
df.tail()


Unnamed: 0,Open Time,Type,Size,Asset,Open Price,Stop Loss,Take Profit,Close Time,Close Price,Commissions,Swap,PnL
126,2023.09.12 01:34:54,sell,5.55,usdcad,1.35746,1.35986,1.3531,2023.09.12 05:04:15,1.35903,-27.75,0.0,-641.16
127,2023.09.11 23:37:48,sell,4.41,audusd,0.64307,0.64308,0.64,2023.09.13 02:47:14,0.64309,-14.18,1.76,-8.82
128,2023.09.08 13:02:36,buy,4.4,nzdusd,0.59117,0.58891,0.59393,2023.09.08 19:30:36,0.5889,-13.01,0.0,-998.8
129,2023.09.08 09:03:09,sell,4.2,usdjpy,147.178,147.528,146.159,2023.09.08 15:59:02,147.531,-21.0,0.0,-1004.94
135,2024.01.19 02:30:38,sell,4.95,audusd,0.65877,0.66067,0.6561,,0.65836,-16.3,0.0,202.95


In [None]:
# Locate the header row: the first row whose first cell is the literal 'Ticket'.
# The first column is read with .iloc so the lookup is positional whatever the
# column labels are (plain `row[0]` is a label lookup and only falls back to
# position through a deprecated code path). The row's *position* is stored,
# not its index label, because the value is later passed to df.iloc[...].
# If no row matches, header_row_index keeps the value it already had.
# NOTE(review): this only finds anything if `df` still contains the raw header
# row; if `df` is the already-cleaned frame there is no 'Ticket' cell - confirm
# which frame this cell is meant to scan.
ticket_hits = (df.iloc[:, 0] == 'Ticket').to_numpy().nonzero()[0]
if ticket_hits.size:
    header_row_index = int(ticket_hits[0])


In [None]:
# Use the row at header_row_index as the column header for everything below it.
new_header = df.iloc[header_row_index]
# reset_index() returns a new, independent frame renumbered from 0, so nothing is
# mutated in place on a slice of `df` (the original used inplace=True on the slice).
df2 = df[header_row_index + 1:].reset_index(drop=True)
df2.columns = new_header

In [None]:
# Find the first row whose first ten columns are all NaN and keep only the rows
# above it; if there is no such row, keep everything.
# The columns are selected by position, so no label lookup is involved.
mask = df2.iloc[:, :10].isna().all(axis=1)
# argmax() on the boolean array gives the *position* of the first True, which is
# what .iloc below expects (idxmax() would return an index label instead).
first_nan_index = int(mask.to_numpy().argmax()) if mask.any() else len(df2)
df_final = df2.iloc[:first_nan_index]

In [None]:
# Separate the 'balance' rows (deposits and withdrawals) from the rest so that
# df_final only carries PnL rows; the balance rows are kept in their own frame.
is_balance_row = df_final['Type'].str.lower().isin(['balance'])
df_deposit_withdrawal = df_final[is_balance_row]
df_final = df_final[~is_balance_row]

# Drop cancelled orders/trades, identified by 'cancelled' in the Profit column.
not_cancelled = df_final['Profit'].str.lower() != 'cancelled'
df_final = df_final[not_cancelled]



In [None]:
# Correct dtypes for the calculations later on.
# Take an independent copy first: df_final was produced by filtering/slicing, and
# assigning columns onto such a result can trigger SettingWithCopyWarning.
df_final = df_final.copy()
df_final['Ticket'] = df_final['Ticket'].astype(int)

# Each label is listed once (the original list repeated 'Price', which only made
# the loop convert the same selection twice).
# NOTE(review): if the frame has two columns both labelled 'Price', selecting
# 'Price' returns both of them and both are converted together - confirm.
cols_to_float = ['Size', 'Price', 'S / L', 'T / P', 'Commission', 'Taxes', 'Swap', 'Profit']

# Strip the spaces out of the text values so they can be parsed as floats.
for col in cols_to_float:
    df_final[col] = df_final[col].replace(' ', '', regex=True).astype(float)

# Build `df` as a separate frame with the listed columns typed as float.
conversion_dict = {col: float for col in cols_to_float}
df = df_final.astype(conversion_dict)

In [None]:
# Display the current contents of `df` as the cell output.
df

In [None]:
# Display the last rows of `df_final` as the cell output.
df_final.tail()