# CNN for Trading - Part 2: From Time-Series Features to Clustered Images

### Loading Libraries

In [38]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import pandas_datareader.data as web

# Data Visualization
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# Warnings
import warnings

# Standard Library Utilities
import sys
from time import time
from pathlib import Path
from random import randint

# Progress Bars
from tqdm import tqdm

# SciPy
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet

# Scikit-Learn
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression

In [39]:
# Shorthand for building multi-level slices with .loc
idx = pd.IndexSlice

# Plain white seaborn theme for the figures drawn below
sns.set_style('white')

# NOTE(review): this silences every warning, including library deprecation
# notices — consider narrowing the filter to specific categories
warnings.filterwarnings('ignore')

In [42]:
# Length of one month in rows of daily data (presumably trading days — confirm)
MONTH = 21

# One year expressed as twelve such months
YEAR = MONTH * 12

In [44]:
# Lower date bound in ISO format (presumably the start of the sample period)
START = '2001-01-01'

# Upper date bound in ISO format.
# NOTE(review): neither bound is referenced in the visible part of this
# file — confirm whether they are still needed
END = '2017-12-31'

In [46]:
# Directory that receives the figures saved by this notebook
results_path = Path('results', 'cnn_for_trading')

# exist_ok=True makes the creation idempotent and avoids the
# check-then-create race of testing .exists() first
results_path.mkdir(parents=True, exist_ok=True)

### Loading Model Data

In [51]:
# Open read-only: the default mode ('a') would create an empty data.h5 when
# the file is missing and only fail afterwards on the key lookup
with pd.HDFStore('data.h5', mode='r') as store:
    features = store.get('features')
    targets = store.get('targets')

In [53]:
# Print a summary of the loaded feature data
features.info()

In [55]:
# Print a summary of the loaded target data
targets.info()

### Feature Selection using Mutual Information

In [58]:
# Mutual information of every feature with the forward return, keyed by
# the horizon t (target column 'r01_fwd' and 'r05_fwd')
mi = {}

for t in tqdm([1, 5]):
    target = f'r{t:02}_fwd'
    df = features.join(targets[target]).dropna()
    # Sample a smaller number to speed up the computation; cap the sample at
    # the rows actually available and fix the seed so that the resulting
    # feature ranking is reproducible across runs
    df = df.sample(n=min(len(df), 100000), random_state=42)
    X = df.drop(target, axis=1)
    y = df[target]
    # The estimator is randomized as well, so it gets a fixed seed too
    mi[t] = pd.Series(mutual_info_regression(X=X, y=y, random_state=42),
                      index=X.columns).sort_values(ascending=False)

In [60]:
# One column per forward-return horizon, one row per feature
mutual_info = pd.DataFrame(mi)

# Pass `key` by keyword: positional arguments after the path are deprecated
# in recent pandas releases
mutual_info.to_hdf('data.h5', key='mutual_info')

In [62]:
# Reload the stored scores so the cells below can run without recomputing them
mutual_info = pd.read_hdf('data.h5', key='mutual_info')

In [64]:
# The indicator name is the part of each feature label after the last '_'
indicator_names = mutual_info.index.to_series().str.split('_').str[-1]

# Average the scores per indicator, then rank them (rank 1 = highest average)
mean_mi = mutual_info.groupby(indicator_names).mean()
mi_ranks = mean_mi.rank(ascending=False)

# Order the indicators by their rank for the 1-day horizon (column 1)
mi_by_indicator = mi_ranks.sort_values(by=1)

In [66]:
# Box plot of the mutual information scores, one box per horizon column
mutual_info.boxplot()
sns.despine()
plt.show()

In [68]:
# Average 1-day mutual information per indicator; the indicator name is the
# part of each feature label after the last '_'
indicator_names = mutual_info.index.to_series().str.split('_').str[-1]
mean_mi_1d = mutual_info.groupby(indicator_names)[1].mean().sort_values()

# Horizontal bars, sorted ascending so the largest value ends up on top
mean_mi_1d.plot.barh(title='Mutual Information with 1-Day Forward Returns')
sns.despine()
plt.tight_layout()
plt.savefig(results_path / 'mutual_info_cnn_features', dpi=300)
plt.show()

In [70]:
# Names of the 15 indicators with the best 1-day mutual-information rank
# (mi_by_indicator is already sorted by that rank)
best_features = mi_by_indicator.head(15).index

In [72]:
# Number of selected indicators
size = len(best_features)

### Hierarchical Feature Clustering

In [75]:
# Keep only the columns that belong to one of the selected indicators.
# NOTE(review): filter(like=...) is a substring match, so an indicator whose
# name is a prefix of another one would pull in that one's columns as well —
# confirm the indicator names cannot collide
selected_frames = [features.filter(like=f'_{name}') for name in best_features]
features = pd.concat(selected_frames, axis=1)

In [77]:
# Map every original column name to '<NN>_<indicator>', where NN is the
# 1-based position of the column among that indicator's sorted column names
new_cols = {}

for feature in best_features:
    fnames = sorted(features.filter(like=f'_{feature}').columns.tolist())
    renamed = [f'{i:02}_{feature}' for i in range(1, len(fnames) + 1)]
    new_cols.update(zip(fnames, renamed))

# The axis must be passed by keyword: pandas >= 2.0 rejects positional
# arguments to sort_index with a TypeError
features = features.rename(columns=new_cols).sort_index(axis=1)

In [79]:
# Print a summary of the renamed and column-sorted feature data
features.info()

### Hierarchical Clustering

In [82]:
def cluster_features(data, labels, ax, title):
    """Draw a Ward-linkage dendrogram of the rows of `data` and return the leaf order.

    Args:
        data: 2-D array-like; every row is one item to cluster. The columns
            are standardized before the linkage is computed.
        labels: One label per row of `data`, shown on the dendrogram leaves.
        ax: Matplotlib axes the dendrogram is drawn on.
        title: Title set on `ax`.

    Returns:
        The leaf labels in the left-to-right order of the dendrogram.
    """
    # Standardize each column to zero mean and unit variance
    scaled = StandardScaler().fit_transform(data)
    # Hierarchical clustering of the rows with Ward's method
    Z = linkage(scaled, 'ward')
    dend = dendrogram(Z,
                      labels=labels,
                      orientation='top',
                      leaf_rotation=0.,
                      leaf_font_size=8.,
                      ax=ax)
    ax.set_title(title)
    # 'ivl' holds the labels in the order in which the leaves are drawn
    return dend['ivl']

In [84]:
fig, axes = plt.subplots(figsize=(15, 4), ncols=2)

# Drop incomplete rows once and reuse the array for both clusterings
values = features.dropna().values

# Derive the grid dimensions instead of hard-coding them: one column per
# (parameter, indicator) pair, ordered parameter-major by the sorted
# '<NN>_<indicator>' column names
n_indicators = len(best_features)
n_params, leftover = divmod(values.shape[1], n_indicators)
if leftover:
    raise ValueError(
        f'{values.shape[1]} feature columns cannot be arranged into '
        f'{n_indicators} indicators with an equal number of parameters')

# Cluster the indicators: one row per indicator after the transpose
labels = sorted(best_features)
title = 'Column Features: Indicators'
col_order = cluster_features(values.reshape(-1, n_indicators).T,
                             labels,
                             axes[0],
                             title)

# Cluster the parameter settings: swap the parameter and indicator axes so
# that, after flattening and transposing, there is one row per parameter
labels = list(range(1, n_params + 1))
title = 'Row Features: Indicator Parameters'
by_parameter = (values.reshape(-1, n_params, n_indicators)
                .transpose((0, 2, 1))
                .reshape(-1, n_params).T)
row_order = cluster_features(by_parameter, labels, axes[1], title)

axes[0].set_xlabel('Indicators')
axes[1].set_xlabel('Parameters')
sns.despine()
fig.tight_layout()
fig.savefig(results_path / 'cnn_clustering', dpi=300)
plt.show()

In [86]:
# Column names in clustered order: parameters (row_order) vary slowest,
# indicators (col_order) fastest
feature_order = [f'{i:02}_{j}' for i in row_order for j in col_order]

In [88]:
# Rearrange the columns into the clustered order
features = features.loc[:, feature_order]

In [90]:
# Downcast each column to the smallest float dtype that can hold its values
features = features.apply(pd.to_numeric, downcast='float')

In [92]:
# Print a summary of the reordered, downcast feature data
features.info()

In [94]:
# Persist the clustered feature grid; `key` is passed by keyword because
# positional arguments after the path are deprecated in recent pandas releases
features.to_hdf('data.h5', key='img_data')