In [1]:
import pandas as pd
import numpy as np
from finance_ml.multiprocessing import mp_pandas_obj
from finance_ml.labeling import cusum_filter, get_t1, get_events, get_bins
from finance_ml.stats import get_daily_vol


# Load the Google price file and re-index it by the parsed `Date` column,
# keeping `Date` itself as a regular column.
df = pd.read_csv("../datasets/Google.csv")
df = df.set_index(pd.DatetimeIndex(df["Date"].values))
# Closing-price series used by the labeling cells further down.
close = df.loc[:, "Close"]

In [2]:
def mp_num_co_events(close_idx, t1, molecule):
    """Count how many events are active on each bar.

    Parameters
    ----------
    close_idx : pd.Index
        Sorted index of the price bars.
    t1 : pd.Series
        Event end labels indexed by event start labels. Missing ends are
        treated as running until the last bar of ``close_idx``.
    molecule : sequence
        Start labels of the events handled by this call; ``molecule[0]`` is
        used as the earliest label of interest.

    Returns
    -------
    pd.Series
        Number of concurrent events per bar, restricted to the bars between
        ``molecule[0]`` and the latest end among the molecule's events.
    """
    # Events that are still open are assumed to last until the final bar
    t1 = t1.fillna(close_idx[-1])
    # Keep events ending at or after the first molecule label ...
    t1 = t1[t1 >= molecule[0]]
    # ... and starting at or before the latest end among the molecule's events
    last_end = t1.loc[molecule].max()
    t1 = t1.loc[:last_end]
    # Positions in close_idx of the first relevant start and the last end
    start = close_idx.searchsorted(t1.index[0])
    stop = close_idx.searchsorted(t1.max())
    count = pd.Series(0, index=close_idx[start:stop + 1])
    # Label-based slices are inclusive on both ends
    for t_in, t_out in t1.items():
        count.loc[t_in:t_out] += 1
    return count.loc[molecule[0]:last_end]

In [3]:
def mp_sample_tw(t1, num_co_events, molecule):
    """Average uniqueness of each event over its lifespan.

    Parameters
    ----------
    t1 : pd.Series
        Event end labels indexed by event start labels.
    num_co_events : pd.Series
        Number of concurrent events per bar.
    molecule : sequence
        Start labels of the events to weight.

    Returns
    -------
    pd.Series
        Float series indexed by ``molecule``; each value is the mean of
        ``1 / num_co_events`` over the bars from the event's start to its end
        (both inclusive).
    """
    # Explicit float dtype: a dtype-less empty Series is object/deprecated
    wght = pd.Series(index=molecule, dtype=float)
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0
    for t_in, t_out in t1.loc[wght.index].items():
        wght.loc[t_in] = (1. / num_co_events.loc[t_in:t_out]).mean()
    return wght

In [4]:
def get_ind_matrix(bar_idx, t1):
    """Build the bar-by-event indicator matrix.

    Parameters
    ----------
    bar_idx : iterable
        Labels of the bars; becomes the row index.
    t1 : pd.Series
        Event end labels indexed by event start labels.

    Returns
    -------
    pd.DataFrame
        One column per event (numbered 0..n-1 in the order of ``t1``), with 1
        on every bar from the event's start to its end (inclusive) and 0
        elsewhere.
    """
    ind_m = pd.DataFrame(0, index=bar_idx,
                         columns=range(t1.shape[0]))
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0
    for i, (t0_, t1_) in enumerate(t1.items()):
        ind_m.loc[t0_:t1_, i] = 1
    return ind_m

def get_avg_uniquness(ind_m):
    """Return the average uniqueness of every column of an indicator matrix.

    Each row is divided by its row sum, and each column is then averaged over
    its strictly positive entries only.
    """
    concurrency = ind_m.sum(axis=1)
    uniqueness = ind_m.div(concurrency, axis=0)
    # Mask non-positive cells so they do not enter the column means
    return uniqueness.where(uniqueness > 0).mean()

In [5]:
def seq_bootstrap(ind_m, s_length=None):
    """Draw a sample of events by sequential bootstrap.

    Parameters
    ----------
    ind_m : pd.DataFrame
        Indicator matrix (bars x events).
    s_length : int, optional
        Number of draws; defaults to the number of columns of ``ind_m``.

    Returns
    -------
    list
        Drawn column labels, in draw order (repeats are possible).

    Notes
    -----
    At each step every candidate column is appended to the current sample and
    its average uniqueness in that trial sample is computed; the next draw
    uses probabilities proportional to those values. Draws use numpy's global
    random state.
    """
    if s_length is None:
        s_length = ind_m.shape[1]
    phi = []
    while len(phi) < s_length:
        # Build the float Series in one go instead of enlarging a dtype-less
        # empty Series (object dtype on current pandas)
        avg_u = pd.Series(
            [get_avg_uniquness(ind_m[phi + [i]]).iloc[-1]
             for i in ind_m.columns],
            index=ind_m.columns, dtype=float)
        prob = avg_u / avg_u.sum()
        phi += [np.random.choice(ind_m.columns, p=prob.values)]
    return phi

In [6]:
# Toy example: three events given as {start bar: end bar}
t1 = pd.Series({0: 2, 2: 3, 4: 5})
# Bars 0 .. latest event end
bar_idx = range(1 + t1.max())

In [7]:
# Indicator matrix for the toy events, then a uniform draw (with replacement)
# of as many column labels as there are events.
ind_m = get_ind_matrix(bar_idx, t1)
phi = np.random.choice(ind_m.columns, size=len(ind_m.columns))
print(phi)

[0 1 2]


In [8]:
# Average uniqueness of each column selected by the uniform random draw `phi`
get_avg_uniquness(ind_m[phi])

0    0.833333
1    0.750000
2    1.000000
dtype: float64

In [9]:
# Re-draw `phi` with the sequential bootstrap (overwrites the earlier uniform draw)
phi = seq_bootstrap(ind_m)

In [10]:
# Display the column labels drawn by seq_bootstrap
phi

[0, 2, 1]

In [11]:
# Average uniqueness of each column selected by the sequential-bootstrap draw
get_avg_uniquness(ind_m[phi])

0    0.833333
2    1.000000
1    0.750000
dtype: float64

In [12]:
def get_rnd_t1(num_obs, num_bars, max_h):
    """Generate a random series of event start/end bars.

    Parameters
    ----------
    num_obs : int
        Number of random draws. Draws that hit an already-used start bar
        overwrite the earlier one, so the result may have fewer entries.
    num_bars : int
        Start bars are drawn uniformly from ``[0, num_bars)``.
    max_h : int
        Event lengths are drawn uniformly from ``[1, max_h)``; must be > 1.

    Returns
    -------
    pd.Series
        Integer end bars indexed by integer start bars, sorted by start bar.
    """
    ends = {}
    for _ in range(num_obs):
        ix = np.random.randint(0, num_bars)
        val = ix + np.random.randint(1, max_h)
        # Store the draw; without this the function returned an empty series
        ends[ix] = val
    # Integer dtype keeps `range(t1.max() + 1)` usable by callers
    return pd.Series(ends, dtype="int64").sort_index()

In [13]:
def auxMC(num_obs, num_bars, max_h):
    """Run one Monte Carlo trial comparing two bootstrap schemes.

    Random events are generated with ``get_rnd_t1``; one sample is drawn
    uniformly with replacement and one with ``seq_bootstrap``.

    Parameters
    ----------
    num_obs, num_bars, max_h : int
        Forwarded unchanged to ``get_rnd_t1``.

    Returns
    -------
    dict
        ``{'std_u': ..., 'seq_u': ...}``: the mean of the per-column average
        uniqueness for the uniform draw and the sequential draw respectively.
    """
    t1 = get_rnd_t1(num_obs, num_bars, max_h)
    bar_idx = range(t1.max() + 1)
    ind_m = get_ind_matrix(bar_idx, t1)
    # Uniform draw with replacement (keyword fixed: `size`, not `sizez`)
    phi = np.random.choice(ind_m.columns, size=ind_m.shape[1])
    std_u = get_avg_uniquness(ind_m[phi]).mean()
    # Sequential bootstrap draw
    phi = seq_bootstrap(ind_m)
    seq_u = get_avg_uniquness(ind_m[phi]).mean()
    return {'std_u': std_u, 'seq_u': seq_u}

In [14]:
def mp_sample_w(t1, num_co_events, close, molecule):
    """Sample weights from concurrency-adjusted log returns.

    Parameters
    ----------
    t1 : pd.Series
        Event end labels indexed by event start labels.
    num_co_events : pd.Series
        Number of concurrent events per bar.
    close : pd.Series
        Price series; log returns are taken as the first difference of its
        natural log.
    molecule : sequence
        Start labels of the events to weight.

    Returns
    -------
    pd.Series
        Float series indexed by ``molecule``; each value is the absolute value
        of the sum, over the event's bars (start to end inclusive), of the
        log return divided by the number of concurrent events.
    """
    ret = np.log(close).diff()
    # Explicit float dtype: a dtype-less empty Series is object/deprecated
    wght = pd.Series(index=molecule, dtype=float)
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0
    for t_in, t_out in t1.loc[wght.index].items():
        wght.loc[t_in] = (ret.loc[t_in:t_out] / num_co_events.loc[t_in:t_out]).sum()
    return wght.abs()

In [15]:
def get_time_decay(tw, clf_last_w=1.):
    """Apply a piecewise-linear decay to the cumulative sum of ``tw``.

    The cumulative sum (taken in index order) is mapped through a line that
    equals 1 at the last observation. For ``clf_last_w >= 0`` the slope is
    ``(1 - clf_last_w) / total``; otherwise it is
    ``1 / ((clf_last_w + 1) * total)``, where ``total`` is the final
    cumulative value. Negative results are set to 0. The intercept and slope
    are printed before returning.
    """
    cum_w = tw.sort_index().cumsum()
    total = cum_w.iloc[-1]
    if clf_last_w >= 0:
        slope = (1. - clf_last_w) / total
    else:
        slope = 1. / ((clf_last_w + 1) * total)
    const = 1. - slope * total
    decayed = const + slope * cum_w
    # Weights below zero are floored at zero
    decayed[decayed < 0] = 0
    print(const, slope)
    return decayed

In [16]:
# Build events on the real price series.
# NOTE(review): get_daily_vol, cusum_filter, get_t1 and get_events are imported
# from finance_ml; their exact semantics are not visible in this notebook.
vol = get_daily_vol(close)
# presumably the timestamps selected by a CUSUM filter using `vol` as threshold — confirm
sampled_idx = cusum_filter(close, vol)
# NOTE: rebinds `t1`, which the earlier toy example bound to an integer series
t1 = get_t1(close, sampled_idx, num_days=1)
# The same volatility series is passed to get_events as `trgt`
trgt = vol
events = get_events(close, t_events=sampled_idx, trgt=trgt,
                       ptsl=1, t1=t1)
events.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
2018-06-30 12:28:01.838772 100.0% apply_ptslt1 done after 0.03 minutes. Remaining 0.0 minutes.


Unnamed: 0,t1,trgt,t1_type
2004-08-24,2004-08-25,0.036396,t1
2004-08-25,2004-08-26,0.02993,t1
2004-08-31,2004-09-01,0.026605,t1
2004-09-02,2004-09-03,0.024097,t1
2004-09-07,2004-09-08,0.02361,t1


In [18]:
# Show the `t1_type` column of `events`; the recorded output contains the
# values 't1', 'pt' and 'sl' (presumably vertical barrier / profit-taking /
# stop-loss — confirm against finance_ml.labeling.get_events)
events['t1_type']

2004-08-24    t1
2004-08-25    t1
2004-08-31    t1
2004-09-02    t1
2004-09-07    t1
2004-09-15    t1
2004-09-21    t1
2004-09-23    t1
2004-09-28    pt
2004-09-29    t1
2004-09-30    t1
2004-10-06    t1
2004-10-11    t1
2004-10-12    pt
2004-10-18    t1
2004-10-19    sl
2004-10-20    pt
2004-10-21    pt
2004-10-25    t1
2004-10-26    t1
2004-10-27    pt
2004-11-01    t1
2004-11-02    t1
2004-11-04    sl
2004-11-08    t1
2004-11-09    t1
2004-11-11    t1
2004-11-16    t1
2004-11-17    t1
2004-11-22    t1
              ..
2016-10-10    t1
2016-10-11    t1
2016-10-13    t1
2016-10-18    t1
2016-10-19    t1
2016-10-20    t1
2016-10-24    t1
2016-10-25    t1
2016-10-31    t1
2016-11-02    t1
2016-11-03    t1
2016-11-07    pt
2016-11-08    t1
2016-11-09    sl
2016-11-10    t1
2016-11-14    pt
2016-11-15    t1
2016-11-16    t1
2016-11-21    t1
2016-11-28    t1
2016-11-30    sl
2016-12-05    t1
2016-12-06    pt
2016-12-07    t1
2016-12-08    pt
2016-12-12    t1
2016-12-15    t1
2017-01-03    