In [1]:
import os
import re
import pandas as pd
from pandasql import sqldf
from date_processing import *

def pysqldf(q):
    """Run the SQL text ``q`` through ``sqldf`` against this notebook's globals.

    Passing ``globals()`` is what lets the queries in this notebook refer to
    module-level DataFrames (for example ``data``) by name as tables.
    """
    return sqldf(q, globals())

In [2]:
# Input file and the columns that hold the trade date and the price.
ETF_data_file = 'results/UPRO_simulation.xlsx'
date_column = 'Date'
price_column = 'simulated_UPRO'

# Holding period in years, and the same period expressed as a row offset
# into the price table (it is used as an id offset in the holding join).
# NOTE(review): 252 is presumably the trading-days-per-year approximation — confirm.
hold_duration_years = 10
hold_duration_days = hold_duration_years * 252

# Result file; its name carries the holding period in years.
output_excel_file = 'results/upro_cost_averaging_{}y.xlsx'.format(hold_duration_years)

In [3]:
# Inclusive lower/upper bounds used when the price table is filtered by date.
# The defaults are meant to be wide enough that no row is dropped; narrow
# them to restrict the analysis to a sub-period.
# NOTE(review): the filter compares these strings against the normalized date
# column — presumably 'YYYY-MM-DD' text; confirm the format date_normalize emits.
start_date = '0000-01-01'
end_date = '9999-01-01'

In [4]:
# Load the price history. The reader is picked from the file suffix: a name
# ending in '.csv' goes to pd.read_csv, anything else to pd.read_excel.
# The suffix is tested (not "'.csv' in name") so that a path which merely
# contains '.csv' somewhere, e.g. in a directory name, is not misread as CSV.
if ETF_data_file.lower().endswith('.csv'):
    data = pd.read_csv(
        ETF_data_file,
        )
else:
    data = pd.read_excel(
        ETF_data_file,
        )

# Clean the two columns of interest with the helpers from date_processing.
# NOTE(review): presumably date_normalize yields sortable date strings and
# str_to_float yields floats — confirm against date_processing.
data[date_column] = data[date_column].apply(date_normalize)
data[price_column] = data[price_column].apply(str_to_float)

# Keep only those two columns, under the fixed names `date` and `price`
# that every later query relies on.
data = pysqldf(u"""
    select 
    {} as date,
    {} as price
    from data
    """.format(
    date_column, 
    price_column
   ))

# Restrict to [start_date, end_date] (both inclusive) and sort oldest first.
data = pysqldf(u"""
    select *
    from data
    where date >= '{}' and date <= '{}'
    order by date asc
    """.format(
    start_date, 
    end_date))


# Row position in the date-sorted table; the holding join pairs rows by
# a fixed offset between these ids.
data['id'] = data.index

In [5]:
# Show the last three rows as a quick check of the loaded, sorted table.
data.tail(3)

Unnamed: 0,date,price,id
23759,2022-08-03,44.582106,23759
23760,2022-08-04,44.478139,23760
23761,2022-08-05,44.261208,23761


# Return of cost averaging over the holding period

In [6]:
# Pair every row with the row that lies hold_duration_days positions earlier
# in the table: (date, price) is the end of a holding period and
# (pre_date, pre_price) is its start. Rows without such a predecessor
# drop out because the join is an inner join.
holding_query = u"""
    select 
    data.date,
    data.price,
    pre_data.date as pre_date,
    pre_data.price as pre_price
    from data
    join data as pre_data on pre_data.id+{} = data.id
    """.format(hold_duration_days)
holding = pysqldf(holding_query)

In [7]:
# Show the last three holding periods (end date/price with start date/price).
holding.tail(3)

Unnamed: 0,date,price,pre_date,pre_price
21239,2022-08-03,44.582106,2012-07-27,4.08195
21240,2022-08-04,44.478139,2012-07-30,4.07603
21241,2022-08-05,44.261208,2012-07-31,4.023245


In [None]:
# Expand each holding period into one row per purchase day. A purchase day
# is any table row with pre_date <= date < end date, so the end date itself
# is not bought on. Each purchase spends 1000.0, which buys
# 1000.0 / price shares (window_shares) at that day's price.
# This is a range join: the result has one row per (holding period,
# purchase day) pair and can therefore be very large.
window_data_query = u"""
    select boundary.*,
    window.date as window_date,
    1000.0/window.price as window_shares,
    window.price as window_price
    from holding as boundary
    join data as window
    on window.date >= boundary.pre_date and window.date < boundary.date
    """
window_data = pysqldf(window_data_query)

In [None]:
# Collapse the purchase rows back to one row per holding-period end date:
#   invested_amount - 1000.0 for every purchase day in the window
#   bought_shares   - total shares accumulated over the window
#   price           - the price carried on the window_data rows for that date
window_aggregate_query = u"""
    select date, 
    count(*)*1000.0 as invested_amount,
    sum(window_shares) as bought_shares,
    price
    from window_data 
    group by date
    """
window_aggrated = pysqldf(window_aggregate_query)

In [None]:
# ratio = value of the accumulated shares at the end-of-period price,
# divided by the total amount invested. Rows are ordered oldest first.
window_ratio_query = u"""
    select *,
    bought_shares*price/invested_amount as ratio
    from window_aggrated 
    order by date asc
    """
window_ratio = pysqldf(window_ratio_query)

In [None]:
# Add sold_amount: the value of all accumulated shares at the
# end-of-period price.
# DataFrame.assign overwrites an existing 'sold_amount' column instead of
# adding a second one, so this cell gives the same result when re-run. The
# earlier "select *, ... as sold_amount from window_ratio" form selected
# 'sold_amount' twice on a second run, which broke the median cell below.
window_ratio = window_ratio.assign(
    sold_amount=window_ratio['bought_shares'] * window_ratio['price']
)

In [None]:
# Write the per-date results to output_excel_file, omitting the index column.
window_ratio.to_excel(output_excel_file, index = False)

In [None]:
# Show the three most recent holding-period results.
window_ratio.tail(3)

In [None]:
# Line chart of the shares accumulated per holding period, by end date.
window_ratio.plot(x='date', y=['bought_shares'], grid=True)

In [None]:
# Line chart of the final-value / invested-amount ratio, by end date.
window_ratio.plot(x='date', y=['ratio'], grid=True)

In [None]:
# Show the three earliest holding-period results.
window_ratio.head(3)

In [None]:
# Box plot of the ratio column across all holding periods.
window_ratio.boxplot(column=['ratio']) 

In [None]:
# Median sale proceeds over all holding periods, printed in millions.
sold_amount = window_ratio['sold_amount'].median()
print('sold_amount:\t%0.4f million' % (sold_amount / 1000000))

In [None]:
# Median final-value / invested-amount ratio over all holding periods.
ratio = window_ratio['ratio'].median()
print('Ratio:\t%0.4f' % ratio)

##### END