In [1]:
import pandas as pd
import numpy as np
import scipy.stats as statsbb
import matplotlib.pyplot as plt

In [2]:
sp500_futures = pd.read_csv("../data/sp500_futures_tick_data.csv")
sp500_futures.head()

Unnamed: 0,date,time,price,volume
0,01/03/2000,08:30:34.000,1496.4,0
1,01/03/2000,08:30:36.000,1496.0,0
2,01/03/2000,08:30:37.000,1495.5,0
3,01/03/2000,08:30:46.000,1495.0,0
4,01/03/2000,08:30:53.000,1495.5,0


## 0. Add SP500 spot price

In [3]:
sp500_original = pd.read_csv('../data/sp500.csv')
# The time is always "00:00:00"
sp500 = pd.DataFrame()
# Remove the time, which is always 00:00:00
sp500['date'] = sp500_original.Date.str.replace("00:00:00", "").str.strip()
# Change date format to match with dates from sp500_futures
sp500['date'] = pd.to_datetime(sp500['date']).dt.strftime('%m/%d/%Y')
# Set the price using the adjusted close
sp500['price'] = sp500_original['Adj Close']

del sp500_original

sp500.head()

Unnamed: 0,date,price
0,01/03/1950,16.66
1,01/04/1950,16.85
2,01/05/1950,16.93
3,01/06/1950,16.98
4,01/09/1950,17.08


In [4]:
sp500_futures = pd.merge(sp500_futures, sp500, on='date') # inner join by default
sp500_futures.rename(columns={'price_x': 'price', 'price_y': 'spot_price'}, inplace=True)
sp500_futures.head()

Unnamed: 0,date,time,price,volume,spot_price
0,01/03/2000,08:30:34.000,1496.4,0,1455.22
1,01/03/2000,08:30:36.000,1496.0,0,1455.22
2,01/03/2000,08:30:37.000,1495.5,0,1455.22
3,01/03/2000,08:30:46.000,1495.0,0,1455.22
4,01/03/2000,08:30:53.000,1495.5,0,1455.22


## 1. Add the vertical barrier (Chapter 3.5)

#### Convert time bars to dollar bars

In [5]:
dollar_size = 10000
sp500_futures['dollar'] = sp500_futures.price * sp500_futures.volume
sp500_futures['dollar_group'] = sp500_futures['dollar'].cumsum().astype(int) // dollar_size
sp500_futures.head()

Unnamed: 0,date,time,price,volume,spot_price,dollar,dollar_group
0,01/03/2000,08:30:34.000,1496.4,0,1455.22,0.0,0
1,01/03/2000,08:30:36.000,1496.0,0,1455.22,0.0,0
2,01/03/2000,08:30:37.000,1495.5,0,1455.22,0.0,0
3,01/03/2000,08:30:46.000,1495.0,0,1455.22,0.0,0
4,01/03/2000,08:30:53.000,1495.5,0,1455.22,0.0,0


In [6]:
def aggregate_dollar_bars(x):
    d = {}
    d['date'] = x['date'].iloc[-1]
    d['time'] = x['time'].iloc[-1]
    d['open'] = x['price'].iloc[0]
    d['close'] = x['price'].iloc[-1]
    d['low'] = x['price'].min()
    d['high'] = x['price'].max()
    d['spot_price'] = x['spot_price'].iloc[-1]
    d['total_volume'] = x['volume'].sum()
    # In edge case, d['total_volume'] can be 0 if all volume in the df is 0
    d['vol_weighted_avg_price'] = x['price'].dot(x['volume']) / d['total_volume'] if d['total_volume'] else 0
    d['dollar'] = x['dollar'].sum()
    d['count'] = x['date'].count()
    return pd.Series(d)

In [7]:
sp500_futures_dollar = sp500_futures.groupby('dollar_group', as_index=False).apply(aggregate_dollar_bars)
sp500_futures_dollar.head()

Unnamed: 0,dollar_group,date,time,open,close,low,high,spot_price,total_volume,vol_weighted_avg_price,dollar,count
0,0,06/30/2003,23:04:16.000,1496.4,972.0,767.5,1574.0,974.5,3,971.966667,2915.9,2812947
1,1,06/30/2003,23:04:34.000,972.0,972.0,972.0,972.0,974.5,9,972.0,8748.0,1
2,10,06/30/2003,23:16:38.000,972.0,971.5,971.5,972.0,974.5,100,971.976,97197.6,8
3,11,06/30/2003,23:21:20.000,971.8,971.8,971.8,971.8,974.5,9,971.8,8746.2,2
4,12,06/30/2003,23:45:14.000,971.8,972.3,971.8,972.3,974.5,10,971.91,9719.1,6


#### Apply CUSUM filter on intraday returns

In [8]:
sp500_futures_dollar['returns'] = np.log(sp500_futures_dollar.close / sp500_futures_dollar.open)
sp500_futures_dollar['return_diff'] = sp500_futures_dollar['returns'].diff().fillna(0)

cusum_threshold = sp500_futures_dollar['returns'].std()
print(cusum_threshold)

0.0007371667092802313


In [9]:
# Use E_{t-1}(y_t) = y_t
def cusum_filter_pos(row, threshold=0.05):
    if row.name == 0:
        cusum_filter_pos.cumsum = 0
        cusum_filter_pos.count = 0
    
    cusum_filter_pos.cumsum = max(0, cusum_filter_pos.cumsum + row.return_diff)
    if cusum_filter_pos.cumsum > threshold:
        cusum_filter_pos.cumsum = 0
        cusum_filter_pos.count =+ 1
    
    return cusum_filter_pos.cumsum, cusum_filter_pos.count


def cusum_filter_neg(row, threshold=0.05):
    if row.name == 0:
        cusum_filter_neg.cumsum = 0
        cusum_filter_neg.count = 0
    
    cusum_filter_neg.cumsum = min(0, cusum_filter_neg.cumsum + row.return_diff)
    if cusum_filter_neg.cumsum < -threshold:
        cusum_filter_neg.cumsum = 0
        cusum_filter_neg.count += 1
    
    return cusum_filter_neg.cumsum, cusum_filter_neg.count

In [10]:
sp500_futures_dollar['cusum_filter_pos'] = sp500_futures_dollar.apply(cusum_filter_pos, args=(cusum_threshold, ), axis=1)
sp500_futures_dollar['cusum_filter_neg'] = sp500_futures_dollar.apply(cusum_filter_neg, args=(cusum_threshold, ), axis=1)

print(sp500_futures_dollar['cusum_filter_pos'].tail())
print(sp500_futures_dollar['cusum_filter_neg'].tail())

1254776    (0, 1)
1254777    (0, 1)
1254778    (0, 1)
1254779    (0, 1)
1254780    (0, 1)
Name: cusum_filter_pos, dtype: object
1254776    (0, 69173)
1254777    (0, 69174)
1254778    (0, 69175)
1254779    (0, 69175)
1254780    (0, 69176)
Name: cusum_filter_neg, dtype: object


#### Add the vertical barrier

In [11]:
# Compute tEvents, which is the timestamp Series selected by CUSUM
# Let's use Negative cause it samples more than Positive
sp500_futures_dollar['prev_cusum_neg'] = sp500_futures_dollar['cusum_filter_neg'].shift(1)
sp500_futures_dollar.dropna(inplace=True)
sp500_futures_dollar.reset_index(inplace=True, drop=True)
sp500_futures_dollar.head()

Unnamed: 0,dollar_group,date,time,open,close,low,high,spot_price,total_volume,vol_weighted_avg_price,dollar,count,returns,return_diff,cusum_filter_pos,cusum_filter_neg,prev_cusum_neg
0,1,06/30/2003,23:04:34.000,972.0,972.0,972.0,972.0,974.5,9,972.0,8748.0,1,0.0,0.431462,"(0, 1)","(0, 0)","(0, 0)"
1,10,06/30/2003,23:16:38.000,972.0,971.5,971.5,972.0,974.5,100,971.976,97197.6,8,-0.000515,-0.000515,"(0, 1)","(-0.0005145356429443377, 0)","(0, 0)"
2,11,06/30/2003,23:21:20.000,971.8,971.8,971.8,971.8,974.5,9,971.8,8746.2,2,0.0,0.000515,"(0.0005145356429443377, 1)","(0, 0)","(-0.0005145356429443377, 0)"
3,12,06/30/2003,23:45:14.000,971.8,972.3,971.8,972.3,974.5,10,971.91,9719.1,6,0.000514,0.000514,"(0, 1)","(0, 0)","(0, 0)"
4,13,06/30/2003,23:45:27.000,972.3,972.3,972.3,972.3,974.5,9,972.3,8750.7,2,0.0,-0.000514,"(0, 1)","(-0.0005143768438087622, 0)","(0, 0)"


In [12]:
sp500_futures_dollar['datetime'] = pd.to_datetime(sp500_futures_dollar.date + " " + sp500_futures_dollar.time)
tEvents = sp500_futures_dollar[sp500_futures_dollar.cusum_filter_neg.str[1] == sp500_futures_dollar.prev_cusum_neg.str[1] + 1].datetime
tEvents.head()

6    2003-07-01 00:03:43
28   2003-07-01 01:41:26
39   2003-07-01 02:10:41
60   2003-07-01 02:40:21
73   2003-07-01 03:10:08
Name: datetime, dtype: datetime64[ns]

In [13]:
days = 1
t1 = sp500_futures_dollar.datetime.searchsorted(tEvents + pd.Timedelta(days=days))
t1 = t1[t1 < sp500_futures_dollar.shape[0]]  # Remove those that are inserted at the end
sp500_futures_dollar['barrier'] = sp500_futures_dollar.index.to_series().apply(lambda x: x in t1)

#### Add the upper and lower barriers

Compute the profit-taking and stop-loss barriers

In [14]:
ptSl = [1, 1]
# unit width of the horizontal barrier
ewm_span = 10
horizontal_barrier_width = 5
n = sp500_futures_dollar.shape[0]
sp500_futures_dollar['exp_std'] = sp500_futures_dollar.close.ewm(span=ewm_span).std()
# Create randomly oscillating profit_taking and stop_loss barriers 
sp500_futures_dollar['profit_taking'] = sp500_futures_dollar.close + sp500_futures_dollar.exp_std * horizontal_barrier_width * np.random.random(n)
sp500_futures_dollar['stop_loss'] = sp500_futures_dollar.close - sp500_futures_dollar.exp_std * horizontal_barrier_width * np.random.random(n)
sp500_futures_dollar.dropna(inplace=True)
sp500_futures_dollar.reset_index(inplace=True, drop=True)

In [15]:
print("Sanity check for profit taking <= close: {}".format((sp500_futures_dollar.profit_taking <= sp500_futures_dollar.close).sum()))
print("Sanity check for stop loss >= close: {}".format((sp500_futures_dollar.stop_loss >= sp500_futures_dollar.close).sum()))
print("Profit taking percentage width:")
print(((sp500_futures_dollar.profit_taking - sp500_futures_dollar.close) / sp500_futures_dollar.close * 100).describe())
print("Stop loss percentage width:")
print(((sp500_futures_dollar.close - sp500_futures_dollar.stop_loss) / sp500_futures_dollar.close * 100).describe())

Sanity check for profit taking <= close: 0
Sanity check for stop loss >= close: 0
Profit taking percentage width:
count    1.254779e+06
mean     1.319411e-01
std      2.808242e-01
min      3.963396e-08
25%      2.908360e-02
50%      6.527537e-02
75%      1.379790e-01
max      2.306908e+01
dtype: float64
Stop loss percentage width:
count    1.254779e+06
mean     1.319532e-01
std      2.810252e-01
min      5.033294e-08
25%      2.907941e-02
50%      6.530170e-02
75%      1.380927e-01
max      2.194701e+01
dtype: float64


For each row, get the index when it first touches either barrier

In [16]:
ABOVE_PROFIT = 1
NORMAL = 0
BELOW_LOSS = -1

def price_barrier_position(x):
    if x.close > x.profit_taking:
        return ABOVE_PROFIT
    elif x.close < x.stop_loss:
        return BELOW_LOSS
    else:
        return NORMAL

sp500_futures_dollar['price_barrier_position'] = sp500_futures_dollar.apply(price_barrier_position, axis=1)

In [17]:
# upper_i = min_{j <= i} price_barrier_position_j == 1
def find_first_touch(series, position):
    result = []
    cur_idx = -1 # not touching even at the end
    for idx, value in reversed(list(series.items())):
        if value == position:
            # Update current idx
            cur_idx = idx
        result.append(cur_idx)

    return pd.Series(reversed(result))

In [18]:
sp500_futures_dollar['first_time_upper_barrier'] = find_first_touch(sp500_futures_dollar.price_barrier_position, ABOVE_PROFIT)
sp500_futures_dollar['first_time_lower_barrier'] = find_first_touch(sp500_futures_dollar.price_barrier_position, BELOW_LOSS)
sp500_futures_dollar['first_time_vertical_barrier'] = find_first_touch(sp500_futures_dollar.barrier, True)

Replace those with index of the first touch = -1 with the largest indext

In [19]:
sp500_futures_dollar["first_time_upper_barrier"] = np.where(sp500_futures_dollar.first_time_upper_barrier == -1, sp500_futures_dollar.shape[0]-1, sp500_futures_dollar.first_time_upper_barrier)
sp500_futures_dollar["first_time_lower_barrier"] = np.where(sp500_futures_dollar.first_time_lower_barrier == -1, sp500_futures_dollar.shape[0]-1, sp500_futures_dollar.first_time_lower_barrier)
sp500_futures_dollar["first_time_vertical_barrier"] = np.where(sp500_futures_dollar.first_time_vertical_barrier == -1, sp500_futures_dollar.shape[0]-1, sp500_futures_dollar.first_time_vertical_barrier)

In [20]:
def assign_labels(df):    
    PROFIT = 1
    NEUTRAL = 0
    LOSS = -1
    
    labels = []
    label_returns = []
    for i in range(df.shape[0]):
        if df.price_barrier_position[i] == NORMAL:
            df_min_idx = min(df.first_time_upper_barrier[i], df.first_time_lower_barrier[i], df.first_time_vertical_barrier[i])
            l_return = df.close[df_min_idx] - df.close[i]
            label_returns.append(l_return)
            
            if df_min_idx == df.first_time_upper_barrier[i]:
                labels.append(PROFIT)
            elif df_min_idx == df.first_time_lower_barrier[i]:
                labels.append(LOSS)
            else:
                # labels.append(NEUTRAL)
                labels.append(np.sign(df.close[df_min_idx] - df.close[i]))
        elif df.price_barrier_position[i] == ABOVE_PROFIT:
            df_min_idx = min(df.first_time_lower_barrier[i], df.first_time_vertical_barrier[i])
            l_return = df.close[df_min_idx] - df.close[i]
            label_returns.append(l_return)
            
            if df_min_idx == df.first_time_lower_barrier[i]:
                labels.append(LOSS)
            else:
                # labels.append(NEUTRAL)
                labels.append(np.sign(df.close[df_min_idx] - df.close[i]))
        else: # df.close[i] == BELOW_LOSS:
            df_min_idx = min(df.first_time_upper_barrier[i], df.first_time_vertical_barrier[i])
            l_return = df.close[df_min_idx] - df.close[i]
            label_returns.append(l_return)
            
            if df_min_idx == df.first_time_upper_barrier[i]:
                labels.append(PROFIT)
            else:
                # labels.append(NEUTRAL)
                labels.append(np.sign(df.close[df_min_idx] - df.close[i]))

    return pd.Series(labels), pd.Series(label_returns)

In [21]:
labels, label_returns = assign_labels(sp500_futures_dollar)

sp500_futures_dollar['labels'] = labels
# Temporarily convert 0 labels to 1. It happens at the index of vertical bars, and the sign of the return is 0
sp500_futures_dollar['labels'] = np.where(sp500_futures_dollar.labels == 0, 1, sp500_futures_dollar.labels) 
sp500_futures_dollar['label_returns'] = label_returns

In [22]:
sp500_futures_dollar['labels'].value_counts()

labels
 1.0    693407
-1.0    561372
Name: count, dtype: int64

In [23]:
sp500_futures_dollar.head()

Unnamed: 0,dollar_group,date,time,open,close,low,high,spot_price,total_volume,vol_weighted_avg_price,...,barrier,exp_std,profit_taking,stop_loss,price_barrier_position,first_time_upper_barrier,first_time_lower_barrier,first_time_vertical_barrier,labels,label_returns
0,10,06/30/2003,23:16:38.000,972.0,971.5,971.5,972.0,974.5,100,971.976,...,False,0.353553,972.875332,970.162398,0,1254778,1254778,424,1.0,11.5
1,11,06/30/2003,23:21:20.000,971.8,971.8,971.8,971.8,974.5,9,971.8,...,False,0.241466,972.750437,971.163314,0,1254778,1254778,424,1.0,11.2
2,12,06/30/2003,23:45:14.000,971.8,972.3,971.8,972.3,974.5,10,971.91,...,False,0.351822,972.796138,971.054563,0,1254778,1254778,424,1.0,10.7
3,13,06/30/2003,23:45:27.000,972.3,972.3,972.3,972.3,974.5,9,972.3,...,False,0.343269,972.941631,970.731361,0,1254778,1254778,424,1.0,10.7
4,14,07/01/2003,00:02:11.000,972.3,972.6,972.0,972.6,982.32,14,972.3,...,False,0.397716,974.342505,970.747197,0,1254778,1254778,424,1.0,10.4


In [24]:
sp500_futures_dollar.to_csv('../data/processed_sp500_futures_dollar.csv', index=False)

#### Using GBT, XGBoost, CatBoost to learn from the dataset

In [25]:
# Features: date, time, price, labels
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier # , Pool, metrics, cv
import xgboost as xgb
from xgboost import XGBClassifier

In [26]:
gbt_classifier = GradientBoostingClassifier()
cat_classifier = CatBoostClassifier(verbose=False)
xgb_classifier = XGBClassifier(verbosity=0)

xgb.set_config(verbosity=0)
Xlabels = ['year', 'month', 'day', 'hour', 'minute', 'second', 'close']
Ylabel = 'labels'

Preparing data for ML models

In [27]:
sp500_futures_dollar['time_format'] = pd.to_datetime(sp500_futures_dollar['time'], format="mixed")
sp500_futures_dollar['date_format'] = pd.to_datetime(sp500_futures_dollar['date'], format="mixed")

In [28]:
sp500_futures_dollar['year'] = sp500_futures_dollar['date_format'].dt.year
sp500_futures_dollar['month'] = sp500_futures_dollar['date_format'].dt.month
sp500_futures_dollar['day'] = sp500_futures_dollar['date_format'].dt.day

sp500_futures_dollar['hour'] = sp500_futures_dollar['time_format'].dt.hour
sp500_futures_dollar['minute'] = sp500_futures_dollar['time_format'].dt.minute
sp500_futures_dollar['second'] = sp500_futures_dollar['time_format'].dt.second

In [29]:
n_splits = 10
expanding_window_split = TimeSeriesSplit(n_splits)

In [30]:
# Training GradientBoostingClassifier
gbt_accuracy_scores = []
for fold, idx in enumerate(expanding_window_split.split(sp500_futures_dollar)):
    train_idx = idx[0]
    test_idx = idx[1]
    Xtrain = sp500_futures_dollar.iloc[train_idx][Xlabels]
    Ytrain = sp500_futures_dollar.iloc[train_idx][Ylabel]
    Xtest = sp500_futures_dollar.iloc[test_idx][Xlabels]
    Ytest = sp500_futures_dollar.iloc[test_idx][Ylabel]
    
    gbt_classifier.fit(Xtrain, Ytrain)
    predictions = gbt_classifier.predict(Xtest)
    accuracy = accuracy_score(predictions, Ytest)
    gbt_accuracy_scores.append(accuracy)
    print("Fold {}: Accuracy={}".format(fold, accuracy))

print("Accuracy scores = {}".format(gbt_accuracy_scores))
print("Average accuracy = {}".format(np.mean(gbt_accuracy_scores)))

Fold 0: Accuracy=0.4517401595511528
Fold 1: Accuracy=0.5001402647497151
Fold 2: Accuracy=0.4246076970281406
Fold 3: Accuracy=0.4677303410186727
Fold 4: Accuracy=0.5362935039887788
Fold 5: Accuracy=0.5389936004207942
Fold 6: Accuracy=0.572692206539844
Fold 7: Accuracy=0.47957394582274043
Fold 8: Accuracy=0.5197334969755414
Fold 9: Accuracy=0.4489436311037083
Accuracy scores = [0.4517401595511528, 0.5001402647497151, 0.4246076970281406, 0.4677303410186727, 0.5362935039887788, 0.5389936004207942, 0.572692206539844, 0.47957394582274043, 0.5197334969755414, 0.4489436311037083]
Average accuracy = 0.49404488471990876


In [31]:
# Training CatBoostClassifier
# Training XGBoostClassifier
cat_accuracy_scores = []
for fold, idx in enumerate(expanding_window_split.split(sp500_futures_dollar)):
    train_idx = idx[0]
    test_idx = idx[1]
    Xtrain = sp500_futures_dollar.iloc[train_idx][Xlabels]
    Ytrain = sp500_futures_dollar.iloc[train_idx][Ylabel]
    Xtest = sp500_futures_dollar.iloc[test_idx][Xlabels]
    Ytest = sp500_futures_dollar.iloc[test_idx][Ylabel]
    
    cat_classifier.fit(Xtrain, Ytrain)
    predictions = cat_classifier.predict(Xtest)
    accuracy = accuracy_score(predictions, Ytest)
    cat_accuracy_scores.append(accuracy)
    print("Fold {}: Accuracy={}".format(fold, accuracy))

print("Accuracy scores = {}".format(cat_accuracy_scores))
print("Average accuracy = {}".format(np.mean(cat_accuracy_scores)))

Fold 0: Accuracy=0.448794599807136
Fold 1: Accuracy=0.5139826422372228
Fold 2: Accuracy=0.43315508021390375
Fold 3: Accuracy=0.48982203909879896
Fold 4: Accuracy=0.5408959410888051
Fold 5: Accuracy=0.5149206627509424
Fold 6: Accuracy=0.5583063031471903
Fold 7: Accuracy=0.5174892609801
Fold 8: Accuracy=0.4807486631016043
Fold 9: Accuracy=0.4735162619444201
Accuracy scores = [0.448794599807136, 0.5139826422372228, 0.43315508021390375, 0.48982203909879896, 0.5408959410888051, 0.5149206627509424, 0.5583063031471903, 0.5174892609801, 0.4807486631016043, 0.4735162619444201]
Average accuracy = 0.4971631454370124


In [32]:
%%capture --no-stderr
# Training XGBoostClassifier
xgb_accuracy_scores = []
for fold, idx in enumerate(expanding_window_split.split(sp500_futures_dollar)):
    train_idx = idx[0]
    test_idx = idx[1]
    Xtrain = sp500_futures_dollar.iloc[train_idx][Xlabels]
    Ytrain = sp500_futures_dollar.iloc[train_idx][Ylabel].apply(lambda x: np.sign(x+1))
    Xtest = sp500_futures_dollar.iloc[test_idx][Xlabels]
    Ytest = sp500_futures_dollar.iloc[test_idx][Ylabel].apply(lambda x: np.sign(x+1))
    
    xgb_classifier.fit(Xtrain, Ytrain)
    predictions = gbt_classifier.predict(Xtest)
    accuracy = accuracy_score(predictions, Ytest)
    xgb_accuracy_scores.append(accuracy)
    print("Fold {}: Accuracy={}".format(fold, accuracy))

print("Accuracy scores = {}".format(xgb_accuracy_scores))
print("Average accuracy = {}".format(np.mean(xgb_accuracy_scores)))

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_