# `mlfinlab.bet_sizing` - Module Tutorial
The following is a tutorial on how to apply the functions in the `mlfinlab.bet_sizing` module. The exercises from Chapter 10 ("Bet Sizing") of _Advances in Financial Machine Learning_ are used as examples.

In [2]:
# imports
import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from matplotlib import cm
from scipy.stats import norm

# mlfinlab imports
from mlfinlab.bet_sizing import bet_size_probability


----
#### EXERCISE 10.1
Using the formulation in Section 10.3, plot the bet size ($m$) as a function of the maximum predicted probability ($\tilde{p}$) when $||X|| = 2, 3, ..., 10$.

In [None]:
# Exercise 10.1: plot the bet size m against the maximum predicted probability
# for each number of classes ||X|| = 2, ..., 10 (one curve per value).
num_classes_list = list(range(2, 11))  # number of classes, 2 to 10
n = 10_000  # number of points to plot per curve
# One colour per curve, sampled evenly across the coolwarm colormap.
colors = iter(cm.coolwarm(np.linspace(0, 1, len(num_classes_list))))

fig_10_1, ax_10_1 = plt.subplots(figsize=(16, 10))

for num_classes in num_classes_list:
    # Possible range for the maximum predicted probability is [1/||X||, 1].
    min_prob, max_prob = 1 / num_classes, 1
    # endpoint=False keeps P strictly below 1, so P*(1-P) is never zero below.
    P = np.linspace(min_prob, max_prob, n, endpoint=False)
    z = (P - min_prob) / (P * (1 - P))**0.5  # test statistic
    m = 2 * norm.cdf(z) - 1  # bet size
    ax_10_1.plot(P, m, label=f"||X||={num_classes}", linewidth=2, alpha=1, color=next(colors))

ax_10_1.set_ylabel("Bet Size $m=2Z[z]-1$", fontsize=16)
ax_10_1.set_xlabel(r"Maximum Predicted Probability $\tilde{p}=max_i${$p_i$}", fontsize=16)
ax_10_1.set_title("Figure 10.1: Bet Size vs. Maximum Predicted Probability", fontsize=18)
ax_10_1.set_xticks([0.1*i for i in range(11)])
ax_10_1.set_yticks([0.1*i for i in range(11)])
ax_10_1.legend(loc="upper left", fontsize=14, title="Number of bet size labels", title_fontsize=12)
ax_10_1.set_ylim((0, 1.05))
ax_10_1.set_xlim((0, 1.05))
ax_10_1.grid(linewidth=1, linestyle=':')

plt.show()

----
#### EXERCISE 10.2
Draw 10,000 random numbers from a uniform distribution with bounds U[.5, 1.]. (Author's note: These exercises are intended to simulate dynamic bet sizing of a long-only strategy.)

__(a)__ Compute bet sizes _m_ for $||X||=2$.

__(b)__ Assign 10,000 consecutive calendar days to the bet sizes.

__(c)__ Draw 10,000 random numbers from a uniform distribution with bounds U[1, 25].

__(d)__ Form a `pandas.Series` indexed by the dates in 2.b, and with values equal to the index shifted forward by the number of days in 2.c. This is a `t1` object similar to the ones we used in Chapter 3.

__(e)__ Compute the resulting average active bets, following Section 10.4.
 

In [5]:
# Seeded inputs: one predicted probability drawn from U[0.5, 1) per bet, plus a
# random side per bet (-1 with probability 0.3, +1 with probability 0.7).
np.random.seed(0)
sample_size = 10_000
P_t = np.random.uniform(.5, 1., sample_size)
S_t = np.random.choice([-1, 1], size=sample_size, replace=True, p=[0.3, 0.7])

# 10.2(b) one bet per consecutive calendar day, starting at 01-JAN-2000
start_date = dt.datetime(2000, 1, 1)
date_step = dt.timedelta(days=1)
dates = np.array([start_date + date_step * day for day in range(sample_size)])
prob = pd.Series(P_t, index=dates)
side = pd.Series(S_t, index=dates)

# 10.2(c) holding period of each bet, in (fractional) days, drawn from U[1, 25)
shift_list = np.random.uniform(1., 25., sample_size)
shift_dt = np.array([dt.timedelta(days=num_days) for num_days in shift_list])

# 10.2(d) t1: bet start date -> bet end time
dates_shifted = dates + shift_dt
t1 = pd.Series(dates_shifted, index=dates)

# Gather end time, probability and side into one frame, in that column order.
df_events = pd.DataFrame({'t1': t1, 'prob': prob, 'side': side})

print(df_events.head(10))

# Bet sizes for two classes, discretized in steps of 0.1 and averaged over the
# bets that are active at each point in time.
df_bets_1 = bet_size_probability(
    events=df_events,
    prob=df_events['prob'],
    pred=df_events['side'],
    num_classes=2,
    step_size=0.1,
    average_active=True,
    num_threads=6,
)

print(df_bets_1.head(10))

                                   t1      prob  side
2000-01-01 2000-01-11 09:53:29.852657  0.774407     1
2000-01-02 2000-01-03 23:42:22.297070  0.857595    -1
2000-01-03 2000-01-26 03:49:16.058948  0.801382     1
2000-01-04 2000-01-14 17:59:28.843271  0.772442    -1
2000-01-05 2000-01-28 15:54:23.534959  0.711827    -1
2000-01-06 2000-01-24 08:17:21.501801  0.822947     1
2000-01-07 2000-01-30 00:57:08.252962  0.718794     1
2000-01-08 2000-01-28 18:12:07.568200  0.945887     1
2000-01-09 2000-01-25 12:36:45.000972  0.981831    -1
2000-01-10 2000-01-12 15:07:58.351921  0.691721     1
              signal                         t1
2000-01-01  0.488510 2000-01-11 09:53:29.852657
2000-01-02 -0.693816 2000-01-03 23:42:22.297070
2000-01-03  0.550002 2000-01-26 03:49:16.058948
2000-01-04 -0.484193 2000-01-14 17:59:28.843271
2000-01-05 -0.360002 2000-01-28 15:54:23.534959
2000-01-06  0.602473 2000-01-24 08:17:21.501801
2000-01-07  0.373496 2000-01-30 00:57:08.252962
2000-01-08  0.951258 2

2019-07-03 20:36:56.283177 100.0% mp_avg_active_signals done after 0.28 minutes. Remaining 0.0 minutes.


2000-01-01 00:00:00.000000    0.5
2000-01-02 00:00:00.000000   -0.1
2000-01-03 00:00:00.000000    0.1
2000-01-03 23:42:22.297070    0.5
2000-01-04 00:00:00.000000    0.2
2000-01-05 00:00:00.000000    0.0
2000-01-06 00:00:00.000000    0.2
2000-01-07 00:00:00.000000    0.2
2000-01-08 00:00:00.000000    0.3
2000-01-09 00:00:00.000000    0.1
dtype: float64


In [4]:
# Attach the computed bet sizes to a copy of the events; rows are matched on the
# index, so only values at the event start dates land in the new column.
df = df_events.assign(bet_size=df_bets_1)
display(df.head(20), df_bets_1.head(20))

Unnamed: 0,t1,prob,side,bet_size
2000-01-01,2000-01-11 09:53:29.852657,0.774407,1,0.48851
2000-01-02,2000-01-03 23:42:22.297070,0.857595,-1,-0.102653
2000-01-03,2000-01-26 03:49:16.058948,0.801382,1,0.114899
2000-01-04,2000-01-14 17:59:28.843271,0.772442,-1,0.184773
2000-01-05,2000-01-28 15:54:23.534959,0.711827,-1,0.048579
2000-01-06,2000-01-24 08:17:21.501801,0.822947,1,0.159358
2000-01-07,2000-01-30 00:57:08.252962,0.718794,1,0.195048
2000-01-08,2000-01-28 18:12:07.568200,0.945887,1,0.303078
2000-01-09,2000-01-25 12:36:45.000972,0.981831,-1,0.140232
2000-01-10,2000-01-12 15:07:58.351921,0.691721,1,0.160427


2000-01-01 00:00:00.000000    0.488510
2000-01-02 00:00:00.000000   -0.102653
2000-01-03 00:00:00.000000    0.114899
2000-01-03 23:42:22.297070    0.519256
2000-01-04 00:00:00.000000    0.184773
2000-01-05 00:00:00.000000    0.048579
2000-01-06 00:00:00.000000    0.159358
2000-01-07 00:00:00.000000    0.195048
2000-01-08 00:00:00.000000    0.303078
2000-01-09 00:00:00.000000    0.140232
2000-01-10 00:00:00.000000    0.160427
2000-01-11 00:00:00.000000    0.224888
2000-01-11 09:53:29.852657    0.195597
2000-01-12 00:00:00.000000    0.129353
2000-01-12 15:07:58.351921    0.107949
2000-01-13 00:00:00.000000    0.148148
2000-01-14 00:00:00.000000    0.045087
2000-01-14 17:59:28.843271    0.098015
2000-01-15 00:00:00.000000    0.094265
2000-01-16 00:00:00.000000    0.092218
dtype: float64

In [14]:
# Number of entries in the averaged bet-size result.
df_bets_1.shape[0]

20000

In [None]:
# NOTE: this cell reuses `P_t` and `sample_size` from the earlier cell that seeds
# the random generator, so that cell must be run first. The shifts in 10.2(c)
# are drawn again here without reseeding, so this `t1` is not the same as the
# one built in that earlier cell.

# 10.2(a) Compute bet sizes for ||X||=2
z = (P_t - 0.5) / (P_t * (1 - P_t))**0.5  # test statistic
m = 2 * norm.cdf(z) - 1  # bet sizes, x=1


# 10.2(b) assign 10,000 consecutive calendar days
start_date = dt.datetime(2000, 1, 1)  # starting at 01-JAN-2000
date_step = dt.timedelta(days=1)
dates = np.array([start_date + i*date_step for i in range(sample_size)])
bet_sizes = pd.Series(data=m, index=dates)


# 10.2(c) draw 10,000 random numbers from a uniform distribution
shift_list = np.random.uniform(1., 25., sample_size)
shift_dt = np.array([dt.timedelta(days=d) for d in shift_list])


# 10.2(d) create a pandas.Series object
dates_shifted = dates + shift_dt
t1 = pd.Series(data=dates_shifted, index=dates)

# Collect the end times, probabilities and bet sizes into a single DataFrame.
df_events = pd.DataFrame({'t1': t1, 'p': P_t, 'bet_size': bet_sizes})


# 10.2(e) compute the average active bets
# A bet is active at time `now` if it started on or before `now` and ends
# strictly after it. Results are written into preallocated arrays and wrapped
# in Series once, instead of growing an empty Series one element at a time.
bet_start = t1.index.values
bet_end = t1.values
bet_size_values = bet_sizes.values
num_active = np.empty(len(t1), dtype=np.int64)
avg_active = np.empty(len(t1), dtype=np.float64)
for i, now in enumerate(bet_start):
    is_active = (bet_start <= now) & (bet_end > now)
    num_active[i] = is_active.sum()
    # Every shift is at least one day, so the bet opened at `now` is always
    # active and the mean is never taken over an empty selection.
    avg_active[i] = bet_size_values[is_active].mean()

active_bets = pd.Series(num_active, index=t1.index)
avg_bet = pd.Series(avg_active, index=t1.index)

df_events['num_active_bets'] = active_bets
df_events['avg_active_bets'] = avg_bet


print("The first 10 rows of the resulting DataFrame from Exercise 10.2:")
display(df_events.head(10))
print("Summary statistics on the bet size columns:")
display(df_events[['bet_size', 'num_active_bets', 'avg_active_bets']].describe())