# Sample Weights

### Loading Libraries

In [1]:
# Randomness
import random


# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
from pandas import Timestamp

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.graph_objects as go
import plotly.io as pio
%matplotlib inline

# Date & Time
from datetime import datetime, timedelta

# Typing
from typing import Tuple, List, Dict, Union, Optional, Any, Generator

# Scikit-Learn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve

# Scientific Statistical Python
from scipy.stats import jarque_bera

### Number of Concurrent Labels

In [2]:
def num_conc_events(closeIdx: np.ndarray, t1: pd.Series, molecule: np.ndarray) -> pd.Series:
    """Count how many label spans are active at each bar.

    Args:
        closeIdx: Sorted bar timestamps. Although annotated as an array, the
            code calls ``closeIdx.searchsorted`` with a ``pd.DatetimeIndex``
            and slices the result into a Series index, so a
            ``pd.DatetimeIndex`` is what works in practice.
        t1: Series whose index is each label's start time and whose values are
            the label's end time. Missing end times are treated as running
            until the last bar in ``closeIdx``.
        molecule: Start times (a subset of ``t1.index``) of the labels this
            call is responsible for.

    Returns:
        Integer Series indexed by bar timestamp, restricted to the range from
        ``molecule[0]`` to the latest end time among the molecule's labels.
    """
    # Open-ended labels are assumed to last until the final bar.
    t1 = t1.fillna(closeIdx[-1])
    # Only spans that end at or after the molecule starts can overlap it.
    t1 = t1[t1 >= molecule[0]]
    # Latest end time among the molecule's own labels; spans that start after
    # it cannot overlap the molecule either.
    last_end = t1[molecule].max()
    t1 = t1.loc[:last_end]

    bounds = closeIdx.searchsorted(pd.DatetimeIndex([t1.index[0], t1.max()]))
    count = pd.Series(0, index=closeIdx[bounds[0]: bounds[1] + 1])
    # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is equivalent.
    for tIn, tOut in t1.items():
        count.loc[tIn: tOut] += 1
    return count.loc[molecule[0]: last_end]

### Estimating The Label's Average Uniqueness

In [3]:
def sample_weights(t1: pd.Series, num_conc_events: pd.Series, molecule: np.ndarray) -> pd.Series:
    """Compute each label's average uniqueness over its lifespan.

    Args:
        t1: Series mapping each label's start time (index) to its end time
            (value).
        num_conc_events: Number of concurrent labels per bar, indexed by bar
            timestamp.
        molecule: Start times of the labels to compute weights for; every
            entry must be present in ``t1.index``.

    Returns:
        Float Series indexed by ``molecule``; each value is the mean of
        ``1 / num_conc_events`` over the bars from the label's start to its
        end (both inclusive, as ``.loc`` slicing is label-based).
    """
    # Start from a float Series so assigning the means never relies on an
    # implicit int -> float upcast.
    weights = pd.Series(0.0, index=molecule, dtype=float)
    # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is equivalent.
    for tIn, tOut in t1.loc[weights.index].items():
        weights.loc[tIn] = (1.0 / num_conc_events.loc[tIn: tOut]).mean()
    return weights

### Sequential Bootstrap

#### Indicator Matrix

In [4]:
def get_ind_matrix(barIdx: np.ndarray, t1: pd.Series) -> pd.DataFrame:
    """Build the bar-by-label indicator matrix.

    Args:
        barIdx: Bar labels used as the row index of the result.
        t1: Series mapping each label's start bar (index) to its end bar
            (value).

    Returns:
        Float DataFrame with one row per bar and one column per label
        (columns are ``0 .. len(t1) - 1`` in ``t1`` order). An entry is 1.0
        when the bar lies within the label's ``[start, end]`` span (inclusive,
        label-based slicing) and 0.0 otherwise.
    """
    # Float-initialised so that writing 1.0 never changes a column's dtype.
    indM = pd.DataFrame(0.0, index=barIdx, columns=range(t1.shape[0]))
    # ``Series.iteritems`` was removed in pandas 2.0; distinct loop names also
    # avoid shadowing the ``t1`` parameter while it is being iterated.
    for col, (start, end) in enumerate(t1.items()):
        indM.loc[start:end, col] = 1.0
    return indM

#### Computing Average Uniqueness

In [6]:
def get_avg_uniqueness(indM: pd.DataFrame) -> pd.Series:
    """Compute the average uniqueness of every column of an indicator matrix.

    Args:
        indM: Indicator matrix with one row per bar and one column per label.

    Returns:
        Series indexed by the columns of ``indM``. Each value is the mean,
        over the rows where the column is active, of the column's entry
        divided by the row total. Note that this is a Series, not a scalar.
    """
    # Row totals: how many labels are active on each bar.
    concurrency = indM.sum(axis=1)
    # Share of each bar attributed to each label; rows with no active label
    # produce NaN (0 / 0) and are dropped by the mask below.
    uniqueness = indM.div(concurrency, axis=0)
    # Average only over bars where the label is actually active.
    return uniqueness[uniqueness > 0].mean()

#### Sequential Bootstrap Sample Return

In [7]:
def seq_bootstrap(indM: pd.DataFrame, sLength: Optional[int] = None) -> np.ndarray:
    """Draw a sample of column labels by sequential bootstrap.

    At every step each candidate column is scored by the average uniqueness
    it would have if appended to the columns drawn so far, and the next
    column is drawn with probability proportional to that score.

    Args:
        indM: Indicator matrix with one column per label.
        sLength: Number of draws; defaults to the number of columns of
            ``indM``.

    Returns:
        Array of the drawn column labels, in draw order (with replacement).
    """
    if sLength is None:
        sLength = indM.shape[1]
    phi = []
    while len(phi) < sLength:
        # Score every candidate in column order. A float array replaces the
        # original dtype-less ``pd.Series()`` grown one element at a time,
        # which is deprecated and yields object dtype.
        avg_uniq = np.array(
            [get_avg_uniqueness(indM[phi + [col]]).iloc[-1] for col in indM],
            dtype=float,
        )
        prob = avg_uniq / avg_uniq.sum()
        phi.append(np.random.choice(indM.columns, p=prob))
    return np.array(phi)

#### Random t1 Series

In [8]:
def gen_rand_t1(numObs: int, numBars: int, maxH: int) -> pd.Series:
    """Generate a random series of label spans on an integer bar grid.

    Args:
        numObs: Number of random draws. Draws that land on the same start bar
            overwrite each other, so the result may hold fewer entries.
        numBars: Start bars are drawn uniformly from ``[0, numBars)``.
        maxH: Span lengths are drawn uniformly from ``[1, maxH)``; must be
            greater than 1 or ``np.random.randint`` raises ``ValueError``.

    Returns:
        Integer Series sorted by index, mapping each start bar to its end bar.
    """
    # Collect in a dict and build the Series once, instead of enlarging a
    # dtype-less ``pd.Series()`` one element at a time. The draw order (start
    # first, then span) is kept so seeded runs give the same values.
    spans = {}
    for _ in range(numObs):
        start = np.random.randint(0, numBars)
        spans[start] = start + np.random.randint(1, maxH)
    return pd.Series(spans, dtype="int64").sort_index()

#### Multi-Threaded Monte Carlo

In [9]:
def main_MC(numObs: int, numBars: int, maxH: int, numIters: int) -> pd.DataFrame:
    """Run ``aux_MC`` repeatedly and stack the results into one DataFrame.

    Args:
        numObs: Forwarded to ``aux_MC``.
        numBars: Forwarded to ``aux_MC``.
        maxH: Forwarded to ``aux_MC``.
        numIters: Number of times ``aux_MC`` is called.

    Returns:
        DataFrame with one row per iteration. Each iteration's result is
        wrapped as a one-row frame, so the row index repeats 0. An empty
        DataFrame is returned when ``numIters`` is 0.
    """
    # NOTE(review): ``aux_MC`` is defined outside this section; it presumably
    # returns a dict-like record per run -- confirm against its definition.
    frames = [
        pd.DataFrame([aux_MC(numObs, numBars, maxH)]) for _ in range(numIters)
    ]
    if not frames:
        # ``pd.concat`` raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    # One concat at the end avoids repeatedly copying the growing frame and
    # avoids concatenating onto an empty DataFrame.
    return pd.concat(frames)

### Return Attribution

#### Sample Weight Determination by Absolute Return Attribution

In [10]:
def sample_return_weights(t1: pd.Series, num_conc_events: pd.Series, close: pd.Series, molecule: np.ndarray) -> pd.Series:
    """Weight each label by the absolute log-return attributed to it.

    Args:
        t1: Series mapping each label's start time (index) to its end time
            (value).
        num_conc_events: Number of concurrent labels per bar, indexed by bar
            timestamp.
        close: Price series indexed by bar timestamp.
        molecule: Start times of the labels to compute weights for; every
            entry must be present in ``t1.index``.

    Returns:
        Float Series indexed by ``molecule``. Each value is the absolute value
        of the sum, over the label's ``[start, end]`` span, of the bar's log
        return divided by the bar's concurrency.
    """
    # Log returns; the first bar has no predecessor and is NaN, which
    # ``sum`` skips.
    ret = np.log(close).diff()
    # Float (not object) so the result supports numeric operations directly.
    weights = pd.Series(index=molecule, dtype=float)
    # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is equivalent.
    for tIn, tOut in t1.loc[weights.index].items():
        weights.loc[tIn] = (ret.loc[tIn: tOut] / num_conc_events.loc[tIn: tOut]).sum()
    return weights.abs()

### Time Decay

#### Time-Decay Factors Implementation 

In [11]:
def get_time_decay(tW: pd.Series, clfLastW: float = 1.0) -> pd.Series:
    """Apply a piecewise-linear decay to cumulative weights.

    The weights in ``tW`` are sorted by index and accumulated; a linear
    function of that running total is then evaluated so that the newest
    observation receives 1.0, and negative results are clipped to 0.

    Args:
        tW: Weights indexed by observation.
        clfLastW: Decay parameter. When non-negative it is the value given to
            the oldest end of the line; when negative, the line is steepened
            so that part of the oldest observations are clipped to 0.

    Returns:
        Series of decay factors indexed like the sorted ``tW``.
    """
    cum_weight = tW.sort_index().cumsum()
    total = cum_weight.iloc[-1]

    if clfLastW < 0:
        slope = 1. / ((clfLastW + 1) * total)
    else:
        slope = (1.0 - clfLastW) / total

    # Intercept chosen so the line evaluates to 1.0 at the final total.
    intercept = 1.0 - slope * total
    decay = intercept + slope * cum_weight
    decay[decay < 0] = 0
    return decay