In [1]:
import pandas as pd

# Load the Google price table and index its rows by a DatetimeIndex built from
# the 'Date' column. The column itself stays in the frame.
df = pd.read_csv("../datasets/Google.csv")
df = df.set_index(pd.DatetimeIndex(df['Date'].values))
# Closing-price series used throughout the rest of the notebook.
close = df["Close"]

In [3]:
def get_embargo_times(times, pct_embargo):
    """Map every bar timestamp to the timestamp at which its embargo ends.

    Parameters
    ----------
    times : pd.Index
        Ordered bar timestamps (anything supporting ``.shape`` and slicing).
    pct_embargo : float
        Embargo width as a fraction of the number of bars.

    Returns
    -------
    pd.Series
        Indexed by ``times``; the value for bar ``k`` is ``times[k + step]``
        where ``step = int(len(times) * pct_embargo)``. The last ``step`` bars
        have no bar ``step`` positions ahead, so they map to ``times[-1]``.
        When ``step`` is 0 each bar maps to itself.
    """
    step = int(times.shape[0] * pct_embargo)
    if step == 0:
        return pd.Series(times, index=times)
    # Bars that do have a bar `step` positions ahead of them.
    head = pd.Series(times[step:], index=times[:-step])
    # The final `step` bars are clamped to the last timestamp. They must be
    # indexed by times[-step:]; indexing by times[:-step] would duplicate the
    # head entries and leave the trailing bars without any embargo time.
    tail = pd.Series(times[-1], index=times[-step:])
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    return pd.concat([head, tail])

In [5]:
# Embargo horizon worth 1% of the bars, computed for every timestamp in the
# closing-price index; show the first few rows.
embg_times = get_embargo_times(times=close.index, pct_embargo=0.01)
embg_times.head()

2004-08-19   2004-10-04
2004-08-20   2004-10-05
2004-08-23   2004-10-06
2004-08-24   2004-10-07
2004-08-25   2004-10-08
dtype: datetime64[ns]

# 7.2

In [9]:
import pandas as pd
import numpy as np
from finance_ml.stats import get_daily_vol
from finance_ml.labeling import get_t1, cusum_filter, get_events

# The daily-volatility series is reused twice below: as the second argument of
# the CUSUM filter and as the `trgt` argument of get_events.
vol = get_daily_vol(close)
sampled_idx = cusum_filter(close, vol)
t1 = get_t1(close, sampled_idx, num_days=1)
side = None
events = get_events(
    close,
    t_events=sampled_idx,
    trgt=vol,
    ptsl=[1, 2],
    t1=t1,
    side=side,
)
events.head()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
2018-07-06 19:26:30.557741 100.0% apply_ptslt1 done after 0.03 minutes. Remaining 0.0 minutes.


Unnamed: 0,t1,trgt,t1_type
2004-08-24,2004-08-25,0.036396,t1
2004-08-25,2004-08-26,0.02993,t1
2004-08-31,2004-09-01,0.026605,t1
2004-09-02,2004-09-03,0.024097,t1
2004-09-07,2004-09-08,0.02361,t1


In [44]:
# Features: every column except 'Date', with NaN rows removed, restricted to
# the event timestamps. Labels: the 't1_type' column on those same rows.
index = events.index
features = features_df = df.drop("Date", axis=1).dropna().loc[index]
label = events.loc[features_df.index, 't1_type']

In [145]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
# Ten repetitions of 10-fold cross-validation with contiguous (unshuffled)
# folds; each repetition uses a freshly constructed random forest.
scores = []
for _ in range(10):
    clf = RandomForestClassifier()
    kfold = KFold(n_splits=10, shuffle=False)
    fold_scores = cross_val_score(estimator=clf, X=features, y=label, cv=kfold)
    scores.append(fold_scores)
# Mean and variance over all repetitions and folds.
print(np.mean(scores), np.var(scores))

0.7217741498441944 0.015327005775199071


In [146]:
# Same experiment as the unshuffled run, but the rows are shuffled before
# being assigned to the ten folds.
scores = []
for _ in range(10):
    clf = RandomForestClassifier()
    kfold = KFold(n_splits=10, shuffle=True)
    fold_scores = cross_val_score(estimator=clf, X=features, y=label, cv=kfold)
    scores.append(fold_scores)
# Mean and variance over all repetitions and folds.
print(np.mean(scores), np.var(scores))

0.7723824685002032 0.0013833184853011958


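Shuffling the data introduces leakage because neighboring observations are similar to each other. When the data is shuffled uniformly, the close neighbors of each test observation end up in the training set, so the training data carries more information that overlaps with the test data and the score is inflated.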

# 7.3

In [197]:
# Largest timestamp stored in t1, displayed as a quick check of its upper bound.
t1.max()

Timestamp('2017-01-10 00:00:00')

In [198]:
from sklearn.model_selection._split import _BaseKFold
import time


def get_train_times(t1, test_times):
    """Drop training observations whose span overlaps any test span.

    Parameters
    ----------
    t1 : pd.Series
        Training observations: the index is the start of each observation and
        the value is its end.
    test_times : pd.Series
        Test observations in the same start -> end layout.

    Returns
    -------
    pd.Series
        A copy of ``t1`` without the rows that overlap a test span. The input
        series is not modified.
    """
    trn = t1.copy(deep=True)
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    for i, j in test_times.items():
        # Training observation starts inside the test span [i, j].
        starts_inside = (i <= trn.index) & (trn.index <= j)
        # Training observation ends inside the test span.
        ends_inside = ((i <= trn) & (trn <= j)).values
        # Training observation envelops the whole test span.
        envelops = (trn.index <= i) & (j <= trn).values
        trn = trn[~(starts_inside | ends_inside | envelops)]
    return trn


class PurgedKFold(_BaseKFold):
    """K-fold splitter for observations that span a time interval.

    Each observation starts at its index timestamp and ends at its value in
    ``t1``. Test folds are contiguous blocks (no shuffling). Training rows
    after a test fold begin ``embargo`` positions past the latest test end
    time, and, when ``purging`` is enabled, training rows overlapping any test
    span are removed through ``get_train_times``.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    t1 : pd.Series
        Start (index) -> end (value) of every observation; must share its
        index with the ``X`` later given to ``split``.
    pct_embargo : float
        Embargo width as a fraction of the number of rows in ``X``.
    purging : bool
        Whether to run the additional overlap purge on the training rows.

    Raises
    ------
    ValueError
        If ``t1`` is not a ``pd.Series``.
    """

    def __init__(self, n_splits=3, t1=None, pct_embargo=0., purging=True):
        if not isinstance(t1, pd.Series):
            raise ValueError('Label through dates must be a pd.Series')
        super(PurgedKFold, self).__init__(n_splits=n_splits, shuffle=False,
                                          random_state=None)
        self.t1 = t1
        self.pct_embargo = pct_embargo
        self.purging = purging

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, test_indices)`` positional arrays per fold.

        Raises ``ValueError`` when ``X`` and ``t1`` do not share the same index.
        """
        # Check the length first so that a size mismatch reports the intended
        # error instead of failing inside the element-wise comparison.
        if X.shape[0] != len(self.t1) or not (X.index == self.t1.index).all():
            raise ValueError('X and t1 must have the same index')
        indices = np.arange(X.shape[0])
        # Embargo width in rows
        embg_size = int(X.shape[0] * self.pct_embargo)
        test_ranges = [(i[0], i[-1] + 1) for i in np.array_split(indices, self.n_splits)]
        for st, end in test_ranges:
            # Test data
            test_indices = indices[st:end]
            # Training data prior to test data: observations that have
            # already ended when the test fold starts.
            t0 = self.t1.index[st]
            train_indices = self.t1.index.searchsorted(self.t1[self.t1 <= t0].index)
            # Add training data after test data. `.iloc` makes the positional
            # lookup explicit instead of relying on `[]` falling back to
            # positions on a non-integer index.
            max_t1_idx = self.t1.index.searchsorted(self.t1.iloc[test_indices].max())
            if max_t1_idx < X.shape[0]:
                train_indices = np.concatenate((train_indices, indices[max_t1_idx + embg_size:]))
            # Purging
            if self.purging:
                # Use the series owned by this splitter; the positions in
                # train_indices/test_indices are only valid for self.t1, not
                # for whatever `t1` happens to exist at notebook level.
                train_t1 = self.t1.iloc[train_indices]
                test_t1 = self.t1.iloc[test_indices]
                train_t1 = get_train_times(train_t1, test_t1)
                train_indices = self.t1.index.searchsorted(train_t1.index)
            yield train_indices, test_indices

In [199]:
from sklearn.metrics import log_loss, accuracy_score
import numpy as np

from finance_ml.sampling import get_sample_tw, get_num_co_events

def cv_score(clf, X, y, sample_weight=None, scoring='neg_log_loss',
             t1=None, n_splits=3, cv_gen=None, pct_embargo=0., purging=False):
    """Score ``clf`` fold by fold and return the per-fold scores.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit``, ``predict``/``predict_proba`` and
        ``classes_``. It is fitted in place on every fold.
    X, y : pandas objects
        Features and labels, sliced positionally with ``.iloc``.
    sample_weight : pandas object, optional
        When given, its training slice is passed to ``fit`` and its test slice
        to the metric.
    scoring : {'neg_log_loss', 'accuracy'}
        Metric to compute; anything else raises ``Exception``.
    t1, n_splits, pct_embargo, purging
        Forwarded to ``PurgedKFold`` when ``cv_gen`` is None.
    cv_gen : splitter, optional
        Object whose ``split(X=...)`` yields ``(train, test)`` positions.

    Returns
    -------
    np.ndarray
        One score per fold.
    """
    if scoring not in ('neg_log_loss', 'accuracy'):
        raise Exception('Wrong scoring method')
    if cv_gen is None:
        cv_gen = PurgedKFold(n_splits=n_splits, t1=t1,
                             pct_embargo=pct_embargo,
                             purging=purging)
    fold_scores = []
    for train_idx, test_idx in cv_gen.split(X=X):
        # Weights are optional, so they travel as keyword dictionaries that
        # stay empty when no weights were supplied.
        fit_kwargs, metric_kwargs = {}, {}
        if sample_weight is not None:
            fit_kwargs['sample_weight'] = sample_weight.iloc[train_idx].values
            metric_kwargs['sample_weight'] = sample_weight.iloc[test_idx].values
        fitted = clf.fit(X=X.iloc[train_idx, :], y=y.iloc[train_idx], **fit_kwargs)
        X_test = X.iloc[test_idx, :]
        y_test = y.iloc[test_idx]
        if scoring == 'neg_log_loss':
            proba = fitted.predict_proba(X_test)
            # The loss is negated so that larger values are better.
            fold_score = -log_loss(y_test, proba, labels=clf.classes_, **metric_kwargs)
        else:
            fold_score = accuracy_score(y_test, fitted.predict(X_test), **metric_kwargs)
        fold_scores.append(fold_score)
    return np.array(fold_scores)

In [214]:
%%time

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
# Restrict the end-time series to the rows that survived feature construction.
t1_ = t1.loc[features.index]

# 100 repetitions of cv_score with a 1% embargo and no purging; keep the mean
# fold score of every repetition.
scores = []
for _ in range(100):
    scores_ = cv_score(clf, features, label,
                       t1=t1_, pct_embargo=0.01, purging=False)
    scores.append(scores_.mean())
print(np.mean(scores), np.var(scores))

-2.8218924493391286 0.3882551420305969
CPU times: user 5.83 s, sys: 3.86 ms, total: 5.83 s
Wall time: 5.83 s


In [215]:
%%time

# 100 repetitions of cv_score without embargo and without purging; keep the
# mean fold score of every repetition.
scores = []
for _ in range(100):
    scores_ = cv_score(clf, features, label,
                       t1=t1_, pct_embargo=0., purging=False)
    scores.append(scores_.mean())
print(np.mean(scores), np.var(scores))

-2.9363593364167415 0.737761782263656
CPU times: user 5.94 s, sys: 7.84 ms, total: 5.95 s
Wall time: 5.95 s


### With Sample Weights

In [216]:
# Build per-event sample weights with the finance_ml sampling helpers.
# NOTE(review): presumably the number of concurrent events per bar and
# uniqueness-based weights - confirm against finance_ml.sampling.
# NOTE(review): cv_score slices sample_weight positionally, so it is assumed
# to be row-aligned with `features` - verify events.index == features.index.
n_co_events = get_num_co_events(close.index, t1, events.index)
sample_weight = get_sample_tw(t1, n_co_events, events.index)

In [217]:
%%time

# 100 repetitions of cv_score with sample weights, a 1% embargo and no
# purging; keep the mean fold score of every repetition.
scores = []
for _ in range(100):
    scores_ = cv_score(clf, features, label,
                       sample_weight=sample_weight,
                       t1=t1_, pct_embargo=0.01, purging=False)
    scores.append(scores_.mean())
print(np.mean(scores), np.var(scores))

-2.8055327356602073 0.5953942615123814
CPU times: user 6.02 s, sys: 0 ns, total: 6.02 s
Wall time: 6.01 s


In [218]:
%%time

# 100 repetitions of cv_score with sample weights, no embargo and no purging;
# keep the mean fold score of every repetition.
scores = []
for _ in range(100):
    scores_ = cv_score(clf, features, label,
                       sample_weight=sample_weight,
                       t1=t1_, pct_embargo=0., purging=False)
    scores.append(scores_.mean())
print(np.mean(scores), np.var(scores))

-2.873647648777405 0.9769345931532711
CPU times: user 8.24 s, sys: 11.6 ms, total: 8.25 s
Wall time: 8.25 s
