In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm

import sys
sys.path.append('../')
from timelab import MultiColumn, from_data, from_xy_data
from timelab.utils import smash_array

In [2]:
# Load the preprocessed price frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('processed.pkl')

In [3]:
# Restrict the frame to the ticker(s) under study. The commented line is the
# three-ticker variant; the active line keeps only the 'MAS' column group.
# df = df[['LNC', 'MAS', 'CSX']]
df = df[['MAS']]

In [4]:
# Preview the first five rows of the selected frame.
df.head(5)

Unnamed: 0_level_0,MAS,MAS,MAS,MAS,MAS
Unnamed: 0_level_1,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2005-12-21,18.991695,19.154659,18.903944,19.085712,1384200.0
2005-12-22,19.085703,19.29881,18.910202,18.929007,1576900.0
2005-12-23,18.95409,18.95409,18.684571,18.816196,2090500.0
2005-12-27,18.929008,18.991687,18.80365,18.847525,1609000.0
2005-12-28,18.853798,18.935281,18.747245,18.803656,2026300.0


In [5]:
# xdata preprocessing

# Turn every column (prices and Volume alike) into row-over-row percentage
# change, then drop rows containing NaN (the first row has no predecessor).
xdata = df.pct_change().dropna()
# Wrap the result in timelab's MultiColumn container.
# NOTE(review): presumably a DataFrame-like type for (unit, channel) columns — confirm.
xdata = MultiColumn(xdata)
xdata.head()

Unnamed: 0_level_0,MAS,MAS,MAS,MAS,MAS
Unnamed: 0_level_1,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2005-12-22,0.00495,0.007526,0.000331,-0.008211,0.139214
2005-12-23,-0.006896,-0.017862,-0.011932,-0.00596,0.325702
2005-12-27,-0.001323,0.001984,0.006373,0.001665,-0.230328
2005-12-28,-0.003973,-0.00297,-0.003,-0.002328,0.259354
2005-12-29,-0.002659,-0.000331,-0.001337,-0.000667,-0.240734


In [6]:
# ydata preprocessing

# Start from a copy of the percentage-change features.
ydata = xdata.copy()

# Binary label stored under the name 'Raise': 1.0 where the 'High' percentage
# change is strictly positive, 0.0 otherwise.
# NOTE(review): the recorded output of this cell is
# "AttributeError: 'MultiColumn' object has no attribute 'channel_apply'",
# so as recorded this cell does not run. Later cells still use `ydata`, which
# suggests it came from a different (unsaved) execution — confirm the intended API.
ydata = ydata.channel_apply(lambda x: (x['High'] > 0).astype(float), 'Raise')
# Align the labels to the feature index.
ydata = ydata.loc[xdata.index]
ydata = MultiColumn(ydata)
ydata.head()

AttributeError: 'MultiColumn' object has no attribute 'channel_apply'

In [9]:
# Windowing parameters handed to timelab's from_xy_data.
# NOTE(review): presumably lookback = past steps per X window, horizon = steps
# per y window, gap = steps separating them — confirm against timelab.
lookback = 7
horizon = 1
gap = 0

# Build the paired X/y panel; the trailing expression displays its summary.
panel = from_xy_data(xdata, ydata, lookback=lookback, horizon=horizon, gap=gap)
panel

100%|██████████| 3739/3739 [00:00<00:00, 276738.65it/s]

TimePanel
size                            3739
lookback                           7
horizon                            1
gap                                0
num_xunits                         1
num_yunits                         1
num_xchannels                      5
num_ychannels                      1
start            2005-12-22 00:00:00
end              2020-11-10 00:00:00
Name: TimePanel, dtype: object





<TimePanel, size 3739>

In [10]:
# Split the panel into train and test panels.
# NOTE(review): 0.2 is presumably the test fraction, and whether the split is
# chronological is not visible here — confirm against timelab.
train, test = panel.train_test_split(0.2)

In [11]:
# Class balance of the labels over the whole panel (train and test together).
pd.Series(panel.y.flatten()).value_counts()

1.0    1887
0.0    1852
dtype: int64

In [12]:
# Should find freq [business days]... Missing calendar?
# panel.find_freq()

In [13]:
# y is 1D so flatten is enough
# y = panel.y.flatten()

In [14]:
# Break each panel into a list of per-unit panels (iterated one per unit when
# the model is built); split_yunits=False is passed so only the X side is split.
# NOTE(review): exact semantics of split_units live in timelab — confirm.
train_panels = train.split_units(split_yunits=False)
test_panels = test.split_units(split_yunits=False)

100%|██████████| 1/1 [00:00<00:00, 73.13it/s]
100%|██████████| 1/1 [00:00<00:00, 202.89it/s]


In [18]:
# Build one convolutional branch per unit, then merge all branches into a
# single sigmoid head shaped like one target sample.
inputs = {}
hidden = {}

for panel_ in tqdm(train_panels):
    name = panel_.xunits[0]

    # Remove unit dimension - it is already being processed separately.
    X = smash_array(panel_.X)

    inputs[name] = tf.keras.Input(shape=(X.shape[1:]), name='input.' + name)

    # Convolve along the time dimension with 10*lookback filters and a kernel
    # spanning `lookback` steps.
    # NOTE(review): assumes axis 1 of X is the lookback (time) axis — confirm.
    hidden[name] = tf.keras.layers.SeparableConv1D(10*lookback, lookback, name='conv.' + name, activation=tf.nn.relu)(inputs[name])
    hidden[name] = tf.keras.layers.Flatten(name='flatten.' + name)(hidden[name])
    hidden[name] = tf.keras.layers.Dense(5, activation=tf.nn.relu, name='dense.' + name)(hidden[name])

x = tf.keras.layers.concatenate(list(hidden.values()))
# Reshape needs exactly as many values as one target sample holds, so the Dense
# width is the product of all non-batch target dimensions. Using only
# panel.y.shape[1] works solely when the remaining dimensions multiply to 1.
x = tf.keras.layers.Dense(int(np.prod(panel.y.shape[1:])), activation=tf.nn.sigmoid)(x)
outputs = tf.keras.layers.Reshape(panel.y.shape[1:])(x)

# Inputs are passed in dict insertion order, i.e. the order of train_panels.
model = tf.keras.Model(inputs=list(inputs.values()), outputs=outputs)
model.compile(optimizer="Adam", loss="binary_crossentropy", metrics=["binary_crossentropy", 'accuracy'])

  0%|          | 0/1 [00:00<?, ?it/s]

In [19]:
# One input array per unit, in train_panels order, which is the same order in
# which the model's inputs were created.
x_train = [smash_array(i.X) for i in train_panels]
y_train = train.y

# NOTE(review): assumes test_panels lists units in the same order as train_panels — confirm.
x_test = [smash_array(i.X) for i in test_panels]
y_test = test.y

# The test split doubles as the validation set; H keeps the per-epoch history.
H = model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100)

Epoch 1/100
 8/94 [=>............................] - ETA: 0s - loss: 0.6925 - binary_crossentropy: 0.6925 - accuracy: 0.5039 

2021-09-20 16:54:56.566037: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
11/94 [==>...........................] - ETA: 0s - loss: 0.6919 - binary_crossentropy: 0.6919 - accuracy: 0.5369

2021-09-20 16:54:57.406312: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 

In [20]:
# Per-epoch training curves recorded by model.fit.
loss = pd.DataFrame({'loss': H.history['loss'], 'val_loss': H.history['val_loss']})
# Label the accuracy curves as accuracy. They were previously keyed
# 'loss'/'val_loss', which mislabelled the plot legend.
acc = pd.DataFrame({'accuracy': H.history['accuracy'], 'val_accuracy': H.history['val_accuracy']})
# Only the accuracy frame is plotted here; `loss` is built but not drawn.
acc.plot()

In [None]:
# Timestamped log directory and a TensorBoard callback writing to it.
# NOTE(review): no visible model.fit call passes this callback (callbacks=[...]),
# so nothing is logged as written.
logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Candlestick chart of an OHLC frame with one vertical marker line per entry
# in `trues.index`.
# NOTE(review): neither `data` nor `trues` is defined in any visible cell, so
# this cell fails on a fresh kernel — confirm where they should come from.
fig = go.Figure(data=[go.Candlestick(x=data.index,
                open=data['Open'],
                high=data['High'],
                low=data['Low'],
                close=data['Close'])
                ])


fig.update_layout(xaxis_rangeslider_visible=False)

# yref='paper' with y0=0, y1=1 makes each line span the full plot height at x=index.
shapes = []
for index in trues.index:
    shapes.append(dict(type='line', yref='paper', y0=0, y1=1, xref='x',
                 x0=index, x1=index))

fig.update_layout(shapes=shapes)

fig.show()

In [None]:
# Re-split into train/test (overwrites the earlier `train`/`test`) and show the array shapes.
# NOTE(review): `stock_panel` is not defined in any visible cell, and the method
# is spelled `split_train_test` here while an earlier cell calls
# `panel.train_test_split` — confirm which name the timelab API provides.
train, test = stock_panel.split_train_test(0.2)
print(train.X.shape)
print(train.y.shape)

In [None]:
# Single-input baseline: Conv1D -> Flatten -> Dense(10) -> sigmoid, reshaped to
# the shape of one target sample. Rebinds `inputs`, `x`, `outputs` and `model`.
inputs = tf.keras.Input(shape=(train.X.shape[1:]))

x = tf.keras.layers.Conv1D(2, 2)(inputs)
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(10, activation=tf.nn.relu)(x)
x = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(x)
outputs = tf.keras.layers.Reshape(train.y.shape[1:])(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Name the AUC metric explicitly. With the bare string 'AUC' Keras auto-names
# the metric, and rebuilding the model in the same kernel yields 'auc_1',
# 'auc_2', ... so history lookups by 'auc'/'val_auc' stop working on re-run.
model.compile(optimizer="Adam", loss="binary_crossentropy",
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name="auc")])
model.summary()

In [None]:
# Train silently (verbose=0) for 120 epochs, using the test split as
# validation data; rebinds H to the new training history.
H = model.fit(train.X, train.y, epochs=120, validation_data=(test.X, test.y), verbose=0)

In [None]:
# Training vs. validation loss per epoch, taken from the fit history.
losses = pd.DataFrame({'train_loss': H.history['loss'], 'val_loss': H.history['val_loss']})
losses.plot()

In [None]:
# Training vs. validation AUC per epoch, read from the history keys 'auc' and 'val_auc'.
# NOTE(review): Keras may suffix auto-generated metric names (e.g. 'auc_1')
# when a model is rebuilt in the same kernel — confirm the keys in H.history.
aucs = pd.DataFrame({'train_auc': H.history['auc'], 'val_auc': H.history['val_auc']})
aucs.plot()

In [None]:
# Score the test windows.
preds = model.predict(test.X)

# Put flattened true labels and predicted scores side by side, indexed by test.yindex.
# NOTE(review): flatten() lines up with test.yindex only if there is one target
# value per sample — confirm.
pred_data = pd.DataFrame(index=test.yindex, 
                         data={'ytrue': test.y.flatten(), 
                               'ypred':preds.flatten()}).astype(float)

# Boolean flag for scores above 0.9.
pred_data['pred'] = pred_data.ypred > 0.9

In [None]:
# Draw the predicted scores as point markers against the index.
# Series.plot rejects kind='scatter' (scatter is DataFrame-only and raises
# ValueError), so a marker-only line style is used instead.
pred_data.ypred.plot(style='.')

In [None]:
# Decision threshold on the predicted score; show only rows scoring above it.
thresh = 0.9
pred_data[pred_data.ypred > thresh]

In [None]:
# Compare true labels with the thresholded predictions.
# NOTE(review): `classification_report` is not imported in any visible cell —
# presumably sklearn.metrics.classification_report; confirm and import it.
print(classification_report(pred_data.ytrue, pred_data.ypred > thresh))

In [None]:
# Option to add multilevel - one per column instead of one above all
# Label: if (tomorrow high - today close) > 0

In [None]:
# xdata.loc[:, (slice(None), ['Volume'])] = xdata.loc[:, (slice(None), ['Volume'])].shift()

In [None]:
import plotly.graph_objects as go

def pair_plot(pair, xunit, yunit, xchannels=None, ychannels=None):
    """Plot selected x and y channels of one pair as Plotly line traces.

    x channels are drawn as solid lines named ``x_<channel>`` and y channels
    as dotted lines named ``y_<channel>``; each trace gets a colour picked at
    random from ``cmap1``.

    Args:
        pair: object exposing ``xframe`` and ``yframe``.
            NOTE(review): presumably DataFrame-like frames of a timelab pair — confirm.
        xunit: unit selected from ``pair.xframe``.
        yunit: unit selected from ``pair.yframe``.
        xchannels: channels to draw from the x frame. ``None`` (the default)
            draws every column of the selected x frame.
        ychannels: channels to draw from the y frame; ``None`` behaves likewise.

    Notes:
        ``select`` and ``cmap1`` are not defined in the visible cells and must
        already exist in the notebook namespace.
    """
    import random  # local import: no visible cell imports `random`

    x = select(pair.xframe, units=[xunit], channels=xchannels)
    y = select(pair.yframe, units=[yunit], channels=ychannels)

    # The default None used to reach enumerate(None) and raise TypeError.
    # Fall back to whatever columns the selection returned.
    # NOTE(review): assumes `select` returns a frame with a `.columns` attribute — confirm.
    if xchannels is None:
        xchannels = list(x.columns)
    if ychannels is None:
        ychannels = list(y.columns)

    fig = go.Figure()

    for channel in xchannels:
        c = random.choice(cmap1)
        fig.add_trace(go.Scatter(x=x.index, y=x[channel], name=f"x_{channel}",
                                 line=dict(width=2, color=c)))

    for channel in ychannels:
        c = random.choice(cmap1)
        fig.add_trace(go.Scatter(x=y.index, y=y[channel], name=f"y_{channel}",
                                 line=dict(width=2, dash='dot', color=c)))

    fig.update_layout(title='', xaxis_title='Timestamps', yaxis_title='Values')
    fig.show()

In [None]:
# Plot the first pair of the panel for unit 'MSFT' on both the x and y side.
# No channel lists are passed, so xchannels and ychannels arrive as None.
# NOTE(review): the visible cells only select the 'MAS' ticker, so 'MSFT'
# presumably belongs to a different dataset — confirm.
pair_plot(panel.pairs[0], 'MSFT', 'MSFT')

In [None]:
# Line plot of the 'High' channel for unit 'QCOM'.
# NOTE(review): the visible cells build xdata from the 'MAS' ticker only, so
# 'QCOM' would not be present on a fresh run — confirm.
xdata['QCOM']['High'].plot()

In [None]:
import re
import requests
import pandas as pd

def get_url_mails(url, timeout=10):
    """Download ``url`` and return every e-mail-like string in the response body.

    Args:
        url: address fetched with an HTTP GET.
        timeout: seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the call indefinitely.

    Returns:
        list[str]: regex matches in order of appearance, duplicates included.

    Errors raised by ``requests.get`` (including the timeout) are not caught
    here and propagate to the caller.
    """
    EMAIL_REGEX = r"[\w\.-]+@[\w\.-]+"

    response = requests.get(url, timeout=timeout)
    # The pattern has no capture groups, so findall already yields the list of
    # matched strings; no manual accumulation is needed.
    return re.findall(EMAIL_REGEX, response.text)

def get_mails(urls):
    """Collect the distinct e-mail strings found at each URL into one table.

    Args:
        urls: iterable of URLs; each is fetched through ``get_url_mails``.

    Returns:
        pandas.DataFrame: one column per URL holding that URL's unique
        matches in sorted order. Columns of different length are aligned on
        a shared integer index, so shorter columns are padded with NaN.
    """
    columns = {}
    for url in urls:
        # sorted() makes the row order reproducible; a bare set has no stable
        # ordering between runs.
        columns[url] = pd.Series(sorted(set(get_url_mails(url))))
    return pd.DataFrame(columns)

def get_mails_from_csv(csv_filename, output_filename):
    """Read URLs from a CSV, scrape them, and save the resulting table as CSV.

    The URLs are taken from the ``'Email'`` column of ``csv_filename``; the
    frame returned by ``get_mails`` is written to ``output_filename``.
    """
    source_table = pd.read_csv(csv_filename)
    mail_table = get_mails(source_table['Email'])
    mail_table.to_csv(output_filename)

# Scrape the URLs listed in the input CSV and write the result to 'myout.csv'
# in the current working directory.
# NOTE(review): the input is a hard-coded absolute path inside a personal home
# directory, so this call only works on that machine and exposes a username.
get_mails_from_csv('/Users/rodrigonader/Downloads/pesquisa.csv', 'myout.csv')