In [1]:
import sys
import math

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import scipy.stats as stats

from multiprocessing import cpu_count
from tqdm import tqdm, tqdm_notebook

In [2]:
def cprintf(df):
    '''
    Print a short summary of a data frame: its last five rows and its info().

    Parameters
    ----------
    df : pd.DataFrame, or any object exposing to_frame() (e.g. a pd.Series)
        Object to summarize. Anything that is not already a DataFrame is
        converted with to_frame() first.

    Raises
    ------
    ValueError
        If df is not a DataFrame and cannot be converted with to_frame().
    '''
    if not isinstance(df, pd.DataFrame):
        try:
            df = df.to_frame()
        except Exception as exc:
            # Catch Exception instead of using a bare except so that
            # KeyboardInterrupt/SystemExit propagate; chain the original cause.
            raise ValueError('Object cannot be coerced to df.') from exc

    print('-'*79)
    print('Data frame information')
    print('-'*79)
    print(df.tail(5))
    print('-'*50)
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit an extra "None" line.
    df.info()
    print('-'*79)

# Estimating the uniqueness of a label

In [4]:
def mpNumCoEvents(closeIdx, t1, molecule):
    '''
    Compute the number of concurrent events per bar.

    +molecule[0] is the date of the first event on which the weight will be computed
    +molecule[-1] is the date of the last event on which the weight will be computed
    Any event that starts before t1[molecule].max() impacts the count.

    Parameters
    ----------
    closeIdx : index of the bars (presumably a sorted pd.Index, since it is
        passed to searchsorted -- confirm against the caller)
    t1 : pd.Series whose index holds each event's start and whose values hold
        the event's end; missing ends are replaced by the last bar
    molecule : sequence of event start labels (a subset of t1.index)

    Returns
    -------
    pd.Series indexed by bar, holding the number of events spanning that bar,
    restricted to [molecule[0], t1[molecule].max()].
    '''

    #1) find events that span the period [molecule[0], molecule[-1]]

    t1 = t1.fillna(closeIdx[-1]) #unclosed events still must impact other weights
    t1 = t1[t1 >= molecule[0]] #events that end at or after molecule[0]
    lastEnd = t1[molecule].max() #latest end among the molecule's events
    t1 = t1.loc[:lastEnd] #events that start at or before lastEnd

    #2) count events spanning a bar
    iloc = closeIdx.searchsorted(np.array([t1.index[0], t1.max()]))
    count = pd.Series(0, index = closeIdx[iloc[0]:iloc[1] + 1])

    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for tIn, tOut in t1.items():
        count.loc[tIn:tOut] += 1

    return count.loc[molecule[0]:lastEnd]

# Estimating the average uniqueness of a label

In [None]:
def mpSampleTW(t1, numCoEvents, molecule):
    '''
    Derive the average uniqueness of each event over its lifespan.

    Parameters
    ----------
    t1 : pd.Series whose index holds each event's start and whose values hold
        the event's end
    numCoEvents : pd.Series of concurrent-event counts per bar; it is sliced
        by label from each event's start to its end
    molecule : sequence of event start labels (a subset of t1.index)

    Returns
    -------
    pd.Series indexed by molecule; each value is the mean of 1/numCoEvents
    over the bars between the event's start and end (inclusive label slice).
    '''
    # Explicit float dtype: the default dtype of an index-only Series is
    # version-dependent in pandas (older releases warn about it).
    wght = pd.Series(index = molecule, dtype = float)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
    for tIn, tOut in t1.loc[wght.index].items():
        wght.loc[tIn] = (1./numCoEvents.loc[tIn:tOut]).mean()

    return wght
