In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from random import randint, shuffle, sample
from bokeh.io import show, output_notebook, output_file, save
from bokeh.models import Band, ColumnDataSource, HoverTool, Legend, Row, Column
from bokeh.models.callbacks import CustomJS
from bokeh.transform import transform
from bokeh.plotting import figure
from bokeh.models import DataRange1d, Range1d, Step, LinearColorMapper, SingleIntervalTicker
from bokeh.palettes import OrRd, Blues, Reds, Category10, Dark2, Set2, Category20
from bokeh.models.widgets import (DatePicker, Panel, Tabs, Select, 
                                  Slider, DataTable, DateFormatter, TableColumn, HTMLTemplateFormatter,
                                 StringFormatter, Button, Div)

import sys
sys.path.append('../src')
import htmltext
import yfinance as yf   


def chunkIt(seq, num):
    """Split ``seq`` into ``num`` consecutive, near-equal slices.

    Chunk boundaries are computed with integer arithmetic
    (``len(seq) * k // num``) instead of repeatedly adding a float step, so
    rounding drift can never produce an extra or a missing chunk.

    Args:
        seq: Any sliceable sequence (list, tuple, str, ...).
        num: Number of chunks to produce; expected to be a positive whole
            number. A fractional value >= 1 is truncated with ``int()``.

    Returns:
        A list of ``num`` slices of ``seq``, in order, whose lengths differ
        by at most one and which concatenate back to ``seq``. When ``num`` is
        larger than ``len(seq)`` some slices are empty. An empty ``seq``
        yields ``[]``.

    Raises:
        ZeroDivisionError: If ``num`` is zero (same exception type the
            float-division implementation raised).
        ValueError: If ``num`` is below 1 for a non-empty ``seq``; a negative
            value previously made the loop run forever.
    """
    total = len(seq)
    if num == 0:
        raise ZeroDivisionError("num must be non-zero")
    if total == 0:
        return []
    if num < 1:
        raise ValueError("num must be a positive number of chunks")

    count = int(num)
    # Boundary k sits at floor(total * k / count); the final one is exactly
    # `total`, so every element lands in exactly one chunk.
    return [
        seq[total * k // count: total * (k + 1) // count]
        for k in range(count)
    ]

# Pull S&P 100 stock data

In [3]:
# Constituent tables are scraped from Wikipedia at run time, so this cell needs
# network access and depends on the current table layout of both pages.
table = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

# First table on the S&P 500 page: keep only the ticker and its GICS sector.
df_stock_cats = table[0].loc[:, ['Symbol', 'GICS Sector']]

# Table at position 2 on the S&P 100 page supplies the member tickers.
sp100_tables = pd.read_html('https://en.wikipedia.org/wiki/S%26P_100')
sp100 = list(sp100_tables[2]['Symbol'])

# Row count of the sector table; used later to slice df_stock_cats.
num_stocks = len(df_stock_cats)

In [4]:
# One-off download, kept commented out for reference. For every row of
# df_stock_cats whose symbol is in the sp100 list it pulls 24 months of closing
# prices from yfinance, keeps the non-empty series, and writes the combined
# frame to ../data/sp100.csv.
# data = []
# var_names = []
# for i, row in list(df_stock_cats.iterrows())[:num_stocks]:
#     sym = row['Symbol']
#     if (sym in sp100):
#         ser = yf.Ticker(sym).history(period="24mo")['Close'].rename(sym)
#         if (len(ser) > 0):
#             var_names.append(sym)
#             data.append(ser)
        
# df = pd.concat(data, axis=1)
# df.to_csv('../data/sp100.csv')

- BRK.B: No data found, symbol may be delisted


In [5]:
# Load the cached closing prices and index them by their 'Date' column.
prices_csv = '../data/sp100.csv'
raw_prices = pd.read_csv(prices_csv, parse_dates=['Date'])
df = raw_prices.set_index('Date')

# One entry per price column; these names key every derived column later on.
var_names = [column for column in df.columns]

In [6]:
# Build the plotting frame: prices, their EWMA and +/- sigma control limits.
EWMA_ALPHA = 0.1   # smoothing factor passed to DataFrame.ewm
SIGMA_WIDTH = 3    # half-width of the control band, in residual std-devs

# fill missing data
# Work from the raw price columns only, so this cell can be re-run even after
# the derived columns below have been appended to `df`.
# bfill()/ffill() replace fillna(method=...), which pandas has deprecated.
prices = df[var_names].bfill().ffill()

# Calculate EWMA
ewma = prices.ewm(alpha=EWMA_ALPHA).mean()

# Spread of the (EWMA - price) residual over the whole loaded history, one
# value per symbol. ddof=0 reproduces what np.std returned for these series.
resid_std = (ewma - prices).std(ddof=0)

# Calculate UL and LL: EWMA +/- SIGMA_WIDTH * residual std.
# The columns are collected in a dict and concatenated once rather than being
# inserted into the frame two at a time for every symbol.
limits = {}
for name in var_names:
    band = SIGMA_WIDTH * resid_std[name]
    limits[f'{name}_usl'] = ewma[name] + band
    limits[f'{name}_lsl'] = ewma[name] - band

# Column order: prices, then every *_ewm, then *_usl/*_lsl pairs per symbol.
df = pd.concat([prices, ewma.add_suffix('_ewm'), pd.DataFrame(limits)], axis=1)

# Default plotting data: the plotVar* columns start as copies of the first
# symbol's series; the plot glyphs read these fixed column names.
first_var = var_names[0]
df['plotVar'] = df[first_var]
df['plotVar_ewm'] = df[f'{first_var}_ewm']
df['plotVar_usl'] = df[f'{first_var}_usl']
df['plotVar_lsl'] = df[f'{first_var}_lsl']
df.index.name = 'date'

# Pre-formatted date text, used by the hover tooltip.
df['date_str'] = df.index.strftime('%Y-%m-%d')

# Input 1: ColumnDS
cds_tsplot = ColumnDataSource(df.reset_index())


In [7]:
# Sector -> list of tickers, taken from the S&P 500 constituents table.
sector_symbols = df_stock_cats[:num_stocks].groupby('GICS Sector')['Symbol'].apply(list)
_categories = dict(sector_symbols)

# The ten Category10 colours, followed by "white" for an eleventh sector.
sector_palette = list(Category10[10]) + ["white"]
categories = {
    sector: {'vars': symbols, 'color': sector_palette[position]}
    for position, (sector, symbols) in enumerate(_categories.items())
}

# Input 2: Variable metadata
# Every plotted variable starts uncategorised with a white background.
var_meta = {symbol: {'category': '', 'color': 'white'} for symbol in var_names}

for sector, info in categories.items():
    for symbol in info['vars']:
        # Sector lists cover the whole constituents table; tickers that have
        # no price column (not in var_names) are skipped.
        if symbol not in var_meta:
            continue
        var_meta[symbol]['category'] = sector
        var_meta[symbol]['color'] = info['color']

In [8]:
# Preview the first rows of the combined frame (prices, *_ewm, *_usl/*_lsl, plotVar*, date_str).
df.head()

Unnamed: 0_level_0,MMM,ABT,ABBV,ACN,ADBE,ALL,GOOGL,GOOG,MO,AMZN,...,WBA_lsl,DIS_usl,DIS_lsl,WFC_usl,WFC_lsl,plotVar,plotVar_ewm,plotVar_usl,plotVar_lsl,date_str
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-12-03,194.705399,71.924355,82.779366,162.138977,255.259995,85.162231,1116.359985,1106.430054,47.948757,1772.359985,...,73.507105,129.436753,97.48822,55.653695,44.608131,194.705399,194.705399,212.106375,177.304422,2018-12-03
2018-12-04,188.587051,69.241852,80.030663,159.602203,245.820007,82.897896,1062.469971,1050.819946,46.804459,1668.400024,...,72.104694,127.95596,96.007426,54.457042,43.411478,188.587051,191.485216,208.886192,174.08424,2018-12-04
2018-12-06,189.034714,69.280586,79.83622,154.781387,250.630005,81.640984,1078.079956,1068.72998,46.804459,1699.189941,...,71.68006,127.992326,96.043793,53.824307,42.778743,189.034714,190.580972,207.981948,173.179995,2018-12-06
2018-12-07,184.893661,67.740784,76.857712,153.031906,238.0,80.412872,1046.579956,1036.579956,46.615173,1629.130005,...,70.996466,127.588121,95.639588,53.28662,42.241056,184.893661,188.927203,206.328179,171.526227,2018-12-07
2018-12-10,184.968277,68.680161,77.43222,153.294342,244.089996,79.90435,1053.180054,1039.550049,45.909672,1641.030029,...,70.582526,127.318429,95.369896,52.636867,41.591303,184.968277,187.960456,205.361432,170.55948,2018-12-10


In [9]:
# (rows, columns) of the combined frame.
df.shape

(505, 405)

In [10]:
# Shape after dropping rows that contain any NaN; it equals df.shape when the fill step left no gaps.
df.dropna().shape

(505, 405)

# Random Walks

In [11]:
# # Define parameters for the walk
# dims = 25
# step_n = 365*3
# step_set = [-1,0, 1]
# np.random.seed(1)
# shock_num = 5

# # Simulate steps in 1D
# step_shape = (step_n,dims)
# steps = np.random.choice(a=step_set, size=step_shape)
# path = np.concatenate([steps]).cumsum(0)

# Create Inputs

Create the two inputs:
- Column data source with time series data and limits
- Variable metadata dictionary

In [12]:
# dt = pd.date_range(datetime.today()-pd.Timedelta(f'{step_n-1} days'), datetime.today(), normalize=True)
# var_names = [f'x{i}' for i in range(dims)]

# # DataFrame
# df = pd.DataFrame(path, index=dt, columns=var_names)

# for i in var_names:
#     for j in range(shock_num):
#         df[i] = df[i] + np.random.randint(5,10)*(np.arange(0, df.shape[0]) > np.random.randint(df.shape[0]))
#         df[i] = df[i] + np.random.randint(-10,-5)*(np.arange(0, df.shape[0]) > np.random.randint(df.shape[0]))

# # Create a variable very similar to x0
# df[f'x{dims}'] = df['x0'] + 1*np.random.rand(df.shape[0]) + 50
# for i in range(5):
#     df[f'x{dims}'] = df[f'x{dims}'] + np.random.randint(-5,5)*(np.arange(0, df.shape[0]) > np.random.randint(df.shape[0]))
# var_names.append(f'x{dims}')

        
# # Calculate EWMA
# df_ewm = df.ewm(alpha=0.1).mean()
# df_ewm.columns = [i+'_ewm' for i in df_ewm.columns]
# df = pd.concat([df, df_ewm], axis=1)

# # Calculate UL and LL using benchmark period
# for i in var_names:
#     df[f'{i}_usl'] = df[f'{i}_ewm'] + 3*np.std(df[f'{i}_ewm'] - df[f'{i}']) 
#     df[f'{i}_lsl'] = df[f'{i}_ewm'] - 3*np.std(df[f'{i}_ewm'] - df[f'{i}']) 


# # Default plotting data
# df['plotVar'] = df.loc[:,var_names[0]]
# df['plotVar_ewm'] = df.loc[:,f'{var_names[0]}_ewm']
# df['plotVar_usl'] = df.loc[:,f'{var_names[0]}_usl']
# df['plotVar_lsl'] = df.loc[:,f'{var_names[0]}_lsl']
# df.index.name='date'

# # Input 1: ColumnDS
# cds_tsplot = ColumnDataSource(df.reset_index())

# # Variable categorization
# cat_num = 3
# categories = {}
# var_groups = chunkIt(sample(var_names, len(var_names)), cat_num)

# for i,j in enumerate(Set2[cat_num]):
#     categories[i]={'vars':var_groups[i], 'color':j}

# # Input 2: Variable metadata
# var_meta = {
#     name:{'category':'', 'color':'white'}
#     for name in var_names
# }

# for k,v in categories.items():
#     for var in v['vars']:
#         var_meta[var]['category'] = k
#         var_meta[var]['color'] = v['color']   

In [13]:
# df.head()

In [14]:
# var_meta

# Time Series Plot

In [15]:
p = figure(
    title="S&P 100 stocks",
    x_axis_type='datetime',
    plot_height=300,
    plot_width=550,
    active_drag='box_select',
    tools='pan, box_select, box_zoom, reset',
)

# Selected series: a line plus zero-size markers on the same points.
# NOTE(review): the invisible markers presumably give box_select point glyphs
# to hit-test -- confirm before removing them.
p.line(x='date', y='plotVar', source=cds_tsplot,
       color='black', nonselection_line_color='grey', line_width=2, alpha=0.7)
p.circle(x='date', y='plotVar', source=cds_tsplot,
         color='black', nonselection_fill_color='grey', size=0, alpha=0.8)

# EWMA of the selected series.
p.line(x='date', y='plotVar_ewm', source=cds_tsplot,
       color='#7E8A97', nonselection_line_color='grey', line_width=2, alpha=0.9)

# Upper and lower limits share one look; each gets a line and zero-size markers.
limit_line_style = dict(color='#e73360', nonselection_line_color='#E97171', line_width=2, alpha=0.9)
limit_marker_style = dict(color='#e73360', nonselection_fill_color='#E97171', size=0, alpha=0.8)
for limit_column in ('plotVar_usl', 'plotVar_lsl'):
    p.line(x='date', y=limit_column, source=cds_tsplot, **limit_line_style)
    p.circle(x='date', y=limit_column, source=cds_tsplot, **limit_marker_style)

p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = var_names[0]

# Tooltip shows the pre-formatted date string and the plotted value.
price_hover = HoverTool(
    tooltips=[
        ("Date", "@date_str"),
        ("Price", "$@plotVar{0.2f}"),
    ]
)
p.add_tools(price_hover)

# Handle on the y axis so a callback can relabel it when the variable changes.
plot_yaxis = p.yaxis[0]

# Select widgets

In [16]:
# Dropdown choosing which variable the time-series plot shows; starts on the first one.
select = Select(
    title="Variable",
    options=var_names,
    value=var_names[0],
    width=200,
)

# Dropdown choosing the distance measure; 'dtw' is preselected.
select_dist = Select(
    title="Distance Type",
    options=['euclid', 'dtw'],
    value='dtw',
    width=200,
)

# DataTable

In [17]:
# JS fragment for the "group" column: one `if` per variable, mapping the row's
# `variable` field to that variable's colour from var_meta.
var_rule = ''.join(
    """if(variable == "{var}" ){{return("{color}")}} \n""".format(var=name, color=meta['color'])
    for name, meta in var_meta.items()
)

# JS fragment for the "slope" column: one colour for positive slopes, another
# for negative ones. This string is only ever inserted as a format *value*, so
# its doubled braces are not collapsed and reach the template as written.
slope_rule = """
    if(slope > 0 ){{return("#68B0AB")}}
    else if(slope < 0 ){{return("#FF7E67")}}
"""

# One table row per variable; slope/dist start empty and the limit counts at 0.
row_count = len(var_meta)
selection_summary_data = dict(
    variable=list(var_meta),
    group=[meta['category'] for meta in var_meta.values()],
    slope=[None] * row_count,
    usl=[0] * row_count,
    lsl=[0] * row_count,
    dist=[None] * row_count,
)
cds_selection_summary_data = ColumnDataSource(selection_summary_data)

# Plain left-aligned cell used by the columns without a background colour.
cell_format = HTMLTemplateFormatter(
    template="""
    <div style="
    color: #4B5D67; 
    text-align: left;"
    > <%= value %> </div>"""
)

# Cell whose background is chosen by slope_rule.
template_slope="""
    <div style="background:<%= 
        (function colorfromint(){{{x}}}()) %>; 
        color: #4B5D67;
        text-align: left;
    "> 
    <%= value %>
    </div>
    """.format(x=slope_rule)

# Cell whose background is chosen by var_rule.
template_var="""
    <div style="background:<%= 
        (function colorfromint(){{{x}}}()) %>; 
        color: black;
        text-align: left;
    "> 
    <%= value %>
    </div>
    """.format(x=var_rule)

slope_format = HTMLTemplateFormatter(template=template_slope)
group_format = HTMLTemplateFormatter(template=template_var)

# Column order as displayed; the usl/lsl fields are titled 'ucl'/'lcl'.
columns = [
    TableColumn(field="variable", title="variable", formatter=cell_format, width=10),
    TableColumn(field='dist', title='distance', formatter=cell_format, width=10),
    TableColumn(field='slope', title='slope', formatter=slope_format, width=10),
    TableColumn(field="group", title="group", formatter=group_format, width=350),
    TableColumn(field='usl', title='ucl', formatter=cell_format, width=1),
    TableColumn(field='lsl', title='lcl', formatter=cell_format, width=1),
]

data_table = DataTable(
    source=cds_selection_summary_data,
    columns=columns,
    width=400,
    height=270,
    fit_columns=True,
    selectable=True,
    sortable=True,
    css_classes=["sum-table"],
)


# `CustomJS` Callbacks

In [18]:
# Objects handed to the browser-side callbacks, keyed by the name the JS code
# uses for them. They are listed explicitly instead of being looked up by name
# with eval(), which only resolves at notebook top level and hides the
# dependencies from readers and tooling.
common_args = dict(
    cds_tsplot=cds_tsplot,
    select=select,
    cds_selection_summary_data=cds_selection_summary_data,
    var_meta=var_meta,
)
distance_args = dict(common_args, select_dist=select_dist)

# Clicking a row of the summary table switches the plot to that row's variable.
# `selected.indices` is an array: indexing the column with the whole array
# yields `undefined` for empty or multi-row selections, so only the first
# selected row is used and an empty selection leaves the dropdown untouched.
cds_selection_summary_data.selected.js_on_change(
    'indices',
    CustomJS(
        args=dict(select=select, cds_selection_summary_data=cds_selection_summary_data),
        code="""
            const indices = cds_selection_summary_data.selected.indices
            if (indices.length > 0) {
                select.value = cds_selection_summary_data.data['variable'][indices[0]]
                select.change.emit()
            }
    """
    )
)

# Changing the dropdown points the plotVar* columns at the chosen variable's
# series and relabels the y axis.
select.js_on_change(
    "value",
    CustomJS(
        args=dict(common_args, plot_yaxis=plot_yaxis),
        code="""
            const select_value = cb_obj.value
            cds_tsplot.data['plotVar'] = cds_tsplot.data[select_value]
            cds_tsplot.data['plotVar_ewm'] = cds_tsplot.data[select_value+"_ewm"]
            cds_tsplot.data['plotVar_usl'] = cds_tsplot.data[select_value+"_usl"]
            cds_tsplot.data['plotVar_lsl'] = cds_tsplot.data[select_value+"_lsl"]
            cds_tsplot.change.emit()
            plot_yaxis.axis_label = select_value;
        """
    )
)

# "Cluster segments": runs the htmltext.sum_calc snippet (with an alert text
# filled into its `alert` placeholder) followed by htmltext.plot_cluster.
button = Button(label='Cluster segments', button_type="primary", width=100, css_classes=['cluster'])
button.js_on_click(
    CustomJS(
        args=dict(distance_args),
        code=f""" 
            {htmltext.sum_calc.format(alert='alert("Select a segment of the time series from the plot below using the box select tool");')}
            {htmltext.plot_cluster}
        """
    )
)

# "Calculate distance": runs the htmltext.dist_calc snippet.
button_dtw = Button(label='Calculate distance', button_type="primary", width=100)
button_dtw.js_on_click(
    CustomJS(
        args=dict(distance_args),
        code=f""" 
            {htmltext.dist_calc}
        """
    )
)

# Selecting points on the plot re-runs htmltext.sum_calc with an empty `alert`.
cds_tsplot.selected.js_on_change(
    'indices',
    CustomJS(
        args=dict(common_args),
        code=f"""            
            {htmltext.sum_calc.format(alert="")}
        """
    )
)


In [19]:
# Static HTML blocks for the page; the markup strings come from the local htmltext module.
div_head = Div(text=htmltext.div_head)
div_social = Div(text=htmltext.div_social)

# Create Dashboard

In [20]:
# Destination of the standalone HTML report and its page title.
output_file('../report/ts-cluster.html', title='TimeString')

# Pieces are built in the order they appear on the page, then nested.
left_gutter = htmltext.space(8)
controls_row = Row(select, select_dist)

# Left: plot above the cluster button. Right: summary table above the distance button.
plot_panel = Column(p, Row(button))
table_panel = Column(data_table, button_dtw)
content_row = Row(plot_panel, table_panel)

main_panel = Column(div_head, controls_row, content_row)

dash = Column(
    div_social,
    Row(left_gutter, main_panel),
)

# Write the page using the template supplied by htmltext.
save(dash, template=htmltext.template)

'/Users/hasannagib/Desktop/projects/timestring/report/ts-cluster.html'