In [1]:
import numpy as np
import pandas as pd

In [6]:
def mpNumCoEvents(closeIdx, t1, module=None, molecule=None):
    """
    Compute the number of concurrent events per bar.

    Parameters
    ----------
    closeIdx : pd.Index
        Index of the bars. It is searched with ``searchsorted``, so it is
        assumed to be sorted.
    t1 : pd.Series
        Event end times, indexed by event start times. Missing end times
        are replaced by the last bar in ``closeIdx``.
    module : index-like, optional
        Legacy (misspelled) name of ``molecule``; kept so that existing
        positional and ``module=`` callers keep working.
    molecule : index-like, optional
        Start times of the events to process. Takes precedence over
        ``module`` when both are given.
        +molecule[0] is the date of the first event on which the weight will be computed
        +molecule[-1] is the date of the last event on which the weight will be computed
        Any event that starts before t1[molecule].max() impacts the count.

    Returns
    -------
    pd.Series
        Number of events spanning each bar, from ``molecule[0]`` through
        ``t1[molecule].max()``.

    Raises
    ------
    ValueError
        If neither ``molecule`` nor ``module`` is supplied.
    """
    if molecule is None:
        molecule = module
    if molecule is None:
        raise ValueError("mpNumCoEvents requires `molecule` (the event start times)")

    # 1) find events that span the period [molecule[0], molecule[-1]]
    t1 = t1.fillna(closeIdx[-1])  # unclosed events still must impact other weights
    t1 = t1[t1 >= molecule[0]]  # events that end at or after molecule[0]
    lastEnd = t1.loc[molecule].max()  # latest end among the events being processed
    t1 = t1.loc[:lastEnd]  # events that start at or before t1[molecule].max()

    # 2) count events spanning a bar
    # Scalar lookups avoid building an object-dtype array of timestamps.
    start = closeIdx.searchsorted(t1.index[0])
    end = closeIdx.searchsorted(t1.max())
    count = pd.Series(0, index=closeIdx[start:end + 1])
    for tIn, tOut in t1.items():  # Series.iteritems was removed in pandas 2.0
        count.loc[tIn:tOut] += 1
    return count.loc[molecule[0]:lastEnd]

def mpSampleTW(t1, numCoEvents, molecule):
    """
    Estimate the average uniqueness of a label.

    Parameters
    ----------
    t1 : pd.Series
        Event end times, indexed by event start times.
    numCoEvents : pd.Series
        Number of concurrent events per bar; it is sliced by label from
        each event's start to its end.
    molecule : index-like
        Start times of the events to compute a weight for.

    Returns
    -------
    pd.Series
        Float series indexed by ``molecule``; each value is the mean of
        ``1 / numCoEvents`` over the bars spanned by that event.
    """
    # Explicit dtype: a Series built without data and without dtype emits a
    # warning on older pandas and is object-typed on pandas 2.x.
    weight = pd.Series(index=molecule, dtype=float)
    for tIn, tOut in t1.loc[weight.index].items():  # iteritems was removed in pandas 2.0
        weight.loc[tIn] = (1. / numCoEvents.loc[tIn:tOut]).mean()
    return weight

# numCoEvents = mpPandasObj(mpNumCoEvents, ('molecule', events.index), numThreads, closeIdx=close.index, t1=events['t1'])
# numCoEvents = numCoEvents.loc[~numCoEvents.index.duplicated(keep='last')]
# numCoEvents = numCoEvents.reindex(close.index).fillna(0)
# out['tW'] = mpPandasObj(mpSampleTW, ('molecule', events.index), numThreads, t1=events['t1'], numCoEvents=numCoEvents)


In [7]:
# 4.3 Build an indicator matrix
import pandas as pd
import numpy as np

def getIndMatrix(barIx, t1):
    """
    Build the indicator matrix of events over bars.

    Parameters
    ----------
    barIx : index-like
        Labels of the bars; used as the row index of the result.
    t1 : pd.Series
        Event end labels, indexed by event start labels.

    Returns
    -------
    pd.DataFrame
        One row per bar and one column per event (columns are numbered
        0..len(t1)-1 in the order of ``t1``). Entry is 1 where the bar lies
        in the label slice [start, end] of the event, otherwise 0.
    """
    indM = pd.DataFrame(0, index=barIx, columns=range(t1.shape[0]))
    # Distinct loop names so the `t1` argument is not shadowed;
    # Series.iteritems was removed in pandas 2.0, items() is the replacement.
    for i, (tStart, tEnd) in enumerate(t1.items()):
        indM.loc[tStart:tEnd, i] = 1
    return indM

# 4.4 Compute average uniqueness

def getAvgUniqueness(indM):
    """
    Average uniqueness of each column of an indicator matrix.

    Each entry is divided by its row total (the concurrency of that bar);
    the strictly positive results are then averaged column by column.
    Returns a Series with one value per column of ``indM``.
    """
    concurrency = indM.sum(axis=1)
    uniqueness = indM.div(concurrency, axis=0)
    # Keep only strictly positive entries; everything else becomes NaN and
    # is skipped by mean().
    return uniqueness.where(uniqueness > 0).mean()

def seqBootstrap(indM, sLength=None):
    """
    Generate a sample via sequential bootstrap.

    Parameters
    ----------
    indM : pd.DataFrame
        Indicator matrix with one column per candidate observation.
    sLength : int, optional
        Number of draws; defaults to the number of columns of ``indM``.

    Returns
    -------
    list
        Drawn column labels, in draw order (drawn with replacement).

    Notes
    -----
    Draws use NumPy's global random state (``np.random.choice``).
    """
    if sLength is None:
        sLength = indM.shape[1]
    phi = []
    while len(phi) < sLength:
        # Average uniqueness of each candidate when appended to the columns
        # drawn so far; the candidate is the last column of the reduced matrix.
        candidateU = [getAvgUniqueness(indM[phi + [i]]).iloc[-1] for i in indM]
        # Build the Series in one step with an explicit dtype rather than
        # growing a dtype-less empty Series, which triggers a pandas warning.
        avgU = pd.Series(candidateU, index=indM.columns, dtype=float)
        prob = avgU / avgU.sum()  # draw prob
        phi += [np.random.choice(indM.columns, p=prob)]
    return phi

# 4.6 Example of sequential bootstrap
def main():
    """
    Demo comparing a standard bootstrap draw with a sequential bootstrap draw.

    Builds a three-event indicator matrix, prints a uniform random draw of
    its columns and the mean average uniqueness of that draw, then prints
    the same two things for a draw produced by seqBootstrap.
    """
    eventEnds = pd.Series([2, 3, 5], index=[0, 2, 4])  # start -> end for each observation
    bars = range(eventEnds.max() + 1)  # one bar per integer up to the latest end
    indicator = getIndMatrix(bars, eventEnds)

    # Standard bootstrap: draw columns uniformly, with replacement.
    nEvents = indicator.shape[1]
    standardDraw = np.random.choice(indicator.columns, size=nEvents)
    print(standardDraw)
    standardU = getAvgUniqueness(indicator[standardDraw]).mean()
    print("Standard Uniqueness:", standardU)

    # Sequential bootstrap draw over the same indicator matrix.
    sequentialDraw = seqBootstrap(indicator)
    print(sequentialDraw)
    sequentialU = getAvgUniqueness(indicator[sequentialDraw]).mean()
    print("Sequential Uniqueness", sequentialU)

In [11]:
# Run the bootstrap demo. main() draws with np.random.choice and sets no seed
# itself, so the printed draws and uniqueness values can differ between runs.
main()

[0 0 2]
Standard Uniqueness: 0.6666666666666666
[1, 0, 1]
Sequential Uniqueness 0.5370370370370371


  avgU = pd.Series()
  avgU = pd.Series()
  avgU = pd.Series()
