<a href="https://colab.research.google.com/github/hmghaly/km/blob/main/stock_trading_support.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Start

In [1]:
from google.colab import drive
import os
drive.mount('/content/drive')
cwd='/content/drive/MyDrive/stocks' #directory where we keep the data
os.chdir(cwd)
!pwd

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/13dgt1IEtslxXGld9lXtheEMLLQEWiZh4/stocks


In [2]:
!ls /content/drive/MyDrive/stocks

 AAPL.csv			  results
 archive.zip			 'Robinhood transcactions.gsheet'
 daily				  robin.ipynb
 daily-script.py		  robin_stocks
 data				  stock_info-aug28.txt
 models				  stock_info_sp500.txt
 prediction-2022-09-01.gsheet	  stock_info.txt
 prediction-2022-09-01.tsv	  stock_market_data
 prediction-2022-09-06.gsheet	  stock_symbols.txt
 prediction-2022-09-06.tsv	  stock-trading-support.ipynb
 pypy3.9-v7.3.8-linux64		  test-numpy-pypy.py
 pypy3.9-v7.3.8-linux64.tar.bz2   valid_stock_25_less_aug_28.txt


#Network Definition - Aug 2022
https://github.com/hmghaly/word_align/blob/master/rnn_utils.py

In [2]:
import torch
from torch import nn
import torch.optim as optim
import random
import dill as pickle

torch.manual_seed(1)
random.seed(1)

device = torch.device('cpu')
#device = torch.device('cuda')

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size,num_layers, matching_in_out=False, init_val=None, apply_sigmoid=False, apply_softmax=False, batch_size=1):
    super(RNN, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.apply_softmax=apply_softmax
    self.apply_sigmoid=apply_sigmoid
    self.init_val=init_val
    self.matching_in_out = matching_in_out #length of input vector matches the length of output vector 
    self.lstm = nn.LSTM(input_size, hidden_size,num_layers)
    self.hidden2out = nn.Linear(hidden_size, output_size)
    if self.apply_softmax: self.softmax =nn.Softmax(dim=2)
    if self.apply_sigmoid: self.sigmoid =nn.Sigmoid() 
    
    #self.sigmoid = torch.sigmoid(dim=1)
    self.hidden = self.init_hidden()
  def forward(self, feature_list):
    self.hidden = self.init_hidden() ### check
    feature_list=torch.tensor(feature_list)
    feature_list=feature_list.to(device) #### <<<<<<<<<<<<<<<<< 
    if self.matching_in_out:
      lstm_out, _ = self.lstm( feature_list.view(len( feature_list), 1, -1))
      output_scores = self.hidden2out(lstm_out.view(len( feature_list), -1))
      if self.apply_sigmoid: output_scores=self.sigmoid(output_scores).to(device)
      elif self.apply_softmax: output_scores=self.softmax(output_scores).to(device)
      #output_scores = torch.sigmoid(output_space) #we'll need to check if we need this sigmoid
      return output_scores #output_scores
    else:
      outs=[]
      for i in range(len(feature_list)):
        cur_ft_tensor=feature_list[i]#.view([1,1,self.input_size])
        cur_ft_tensor=cur_ft_tensor.view([1,1,self.input_size])
        lstm_out, self.hidden = self.lstm(cur_ft_tensor, self.hidden)
        outs=self.hidden2out(lstm_out)
        if self.apply_sigmoid: outs = self.sigmoid(outs).to(device) #self.sigmoid =nn.Sigmoid()
        elif self.apply_softmax: outs = self.softmax(outs).to(device)
        
      return outs
  def init_hidden(self):
    #return torch.rand(self.num_layers, self.batch_size, self.hidden_size)
    if self.init_val!=None:
      h1=torch.ones(self.num_layers, self.batch_size, self.hidden_size)*self.init_val
      h2=torch.ones(self.num_layers, self.batch_size, self.hidden_size)*self.init_val
    else:
      h1=torch.rand(self.num_layers, self.batch_size, self.hidden_size)
      h2=torch.rand(self.num_layers, self.batch_size, self.hidden_size)
    return (h1.to(device),h2.to(device))

def to_tensor(list1):
  return torch.tensor(list1,dtype=torch.float32)

def split_train_test(all_data0,train_ratio=0.8):
  train_size=int(len(all_data0)*train_ratio)
  train_set0,tes_set0=all_data0[:train_size],all_data0[train_size:]
  return train_set0,tes_set0  

def out2labels(rnn_flat_out,label_list): #a flat rnn output to split into slices, and get the label weights for each slice
  final_list=[]
  n_slices=int(len(rnn_flat_out)/len(label_list))
  for i0 in range(n_slices):
    i1=i0+1
    cur_slice=rnn_flat_out[i0*len(label_list):i1*len(label_list)]
    tmp_list=[]
    for lb0,cs0 in zip(label_list,cur_slice): tmp_list.append((lb0,cs0))
    tmp_list.sort(key=lambda x:-x[-1])
    final_list.append(tmp_list)
  return final_list

def dill_pickle(obj0,fpath0):
  pickle_fopen=open(fpath0,"wb")
  pickle.settings['recurse'] = True
  pickle.dump(obj0,pickle_fopen)
  pickle_fopen.close()

def dill_unpickle(fpath0):
  pickle_fopen0=open(fpath0,"rb")
  tmp_dict=pickle.load(pickle_fopen0)
  pickle_fopen0.close()
  return tmp_dict

def log_something(text0,fpath0):
  fopen0=open(fpath0,"a")
  fopen0.write(text0+"\n")
  fopen0.close()
  
n_input=1
n_output=3
n_hidden =64#64
n_layers=2
LR=0.01

#rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False).to(device)
rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False,init_val=None, apply_sigmoid=False).to(device)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all cnn parameters


#print(rnn.init_hidden())
input_tensor=torch.rand(10,n_input)
print(input_tensor.shape)
rnn_out=rnn(input_tensor)
print(rnn_out)
actual_out=[1.,0.,1.]
print(rnn)
for a in range(100):
  #rnn.zero_grad()
  input_list=[-2.,5.2,-1.1,3.2,2,8.9,-7.3]
  input_tensor=torch.tensor(input_list)
  actual_out_tensor=torch.tensor(actual_out).to(device)
#   #input_tensor=torch.rand((5, n_input)).to(device)
  rnn_output = rnn(input_tensor).to(device)
  print("Input:",input_tensor)
  #print("Output:", output.shape)
#   #rnn_output_list=rnn_output.ravel().tolist()
  rnn_output_list=rnn_output.tolist()
  print(rnn_output_list)
  #print([round(v,4) for v in rnn_output_list])
  loss = loss_func(actual_out_tensor.ravel(), rnn_output.ravel()) #calculate the loss, difference between the output and the desired outcome tensors
  loss.backward()
  optimizer.step()
  print("-------")

torch.Size([10, 1])
tensor([[[ 0.1250, -0.0037, -0.0528]]], grad_fn=<ViewBackward0>)
RNN(
  (lstm): LSTM(1, 64, num_layers=2)
  (hidden2out): Linear(in_features=64, out_features=3, bias=True)
)
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.10608354955911636, 0.013464339077472687, -0.05862850695848465]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.3377465307712555, -0.05398379638791084, 0.17546875774860382]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.70696622133255, -0.0033850669860839844, 0.5571586489677429]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[1.5057740211486816, 0.22393426299095154, 1.3384796380996704]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[1.672014832496643, 0.06658178567886353, 1.6787805557250977]]]
-------
Input: tensor([-2.0000,  5.2000, -1



-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.20360167324543, -0.1833760142326355, 0.32594364881515503]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.19432374835014343, -0.1244024783372879, 0.272468626499176]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.2308298945426941, -0.07646417617797852, 0.27593427896499634]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.28298142552375793, -0.03757026046514511, 0.3025377690792084]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.33755892515182495, -0.008276678621768951, 0.34007972478866577]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.9000, -7.3000])
[[[0.38406533002853394, 0.022359002381563187, 0.3842792212963104]]]
-------
Input: tensor([-2.0000,  5.2000, -1.1000,  3.2000,  2.0000,  8.90

#Main functions - Aug 2022

In [3]:
from types import prepare_class
import pandas as pd
from random import shuffle, seed
import os, re, random
import numpy.polynomial as poly

random.seed(1)



def get_pairs(list1): #turn the flat list of high-lows into a lit of pairs
  out=[]
  for i0 in range(0,len(list1),2): out.append((list1[i0],list1[i0+1]))
  return out


def gen_labels(max_val0=5):
  str_vals=[]
  for val_i in range(-max_val0-1,max_val0+2): str_vals.append(str(int(val_i)))
  return str_vals


def transpose(list1):
  return list(map(list, zip(*list1)))

def get_poly(list1,deg0=2):
  indexes0=list(range(1,len(list1)+1))
  c = poly.Polynomial.fit(indexes0, list1, deg = deg0)
  return c.convert().coef.tolist()

def get_f0(list1):
  avg0=sum(list1)/len(list1)
  counter=0
  for i0 in range(1,len(list1)):
    prev_val,cur_val=list1[i0-1],list1[i0]
    if prev_val<avg0 and cur_val>avg0: counter+=1
  avg_freq=counter/len(list1)
  return avg_freq

# def get_pd_col_data(pd_frame0,col_names0):
#   all_col_data=[]
#   for index0,row_dict0 in pd_frame0.iterrows():
#     cur_list=[row_dict0.get(v,0.) for v in col_names0]
#     for cl in cur_list: print(cl,type(cl))
    
#     all_col_data.append(cur_list)
#   return all_col_data

def get_pd_col_data(pd_frame0,col_names0):
  all_col_data=[]
  for index0,row_dict0 in pd_frame0.iterrows():
    cur_list=[row_dict0.get(v,0.) for v in col_names0]
    valid_row=True
    for cl in cur_list: 
      if not type(cl) is float: valid_row=False
    if not valid_row: continue
    all_col_data.append(cur_list)
  return all_col_data  

def normalize_ft_labels_new(prev_2d_data0,next_2d_data0,params0={}): #get a chunk of data and normalize both the features and labels to last closing value of the features chunk
  last_val=prev_2d_data0[-1][-1]
  normalized_prev_list,normalized_next_list=[],[]
  normalized_prev_list=normalize_analyze_features(prev_2d_data0,params0)
  # for item0 in prev_2d_data0:
  #   normalized_item=[(v-last_val)/last_val for v in item0]
  #   normalized_prev_list.append(normalized_item)
  for item0 in next_2d_data0:
    normalized_item=[(v-last_val)/last_val for v in item0]
    normalized_next_list.append(normalized_item)
  return normalized_prev_list,normalized_next_list

def normalize_analyze_features(prev_2d_data0,params0={}): #get summary of features based on min/max/avg/std-dev/polynomial weights/F0 .. etc
  cur_summary=params0.get("summary",{})
  last_val=prev_2d_data0[-1][-1]
  normalized_prev_list_items=[]
  final_list=[]
  for item0 in prev_2d_data0:
    normalized_item=[(v-last_val)/last_val for v in item0]
    normalized_prev_list_items.append(normalized_item)
  if cur_summary=={} or cur_summary==None: return normalized_prev_list_items
  normalized_prev_list_items_transpose=transpose(normalized_prev_list_items)
  for ti in normalized_prev_list_items_transpose:
    cur_tmp_list=[]
    mean_val=sum(ti)/len(ti)
    max_val,min_val=max(ti),min(ti)
    if cur_summary.get("mean",False): cur_tmp_list.append(mean_val)
    if cur_summary.get("max",False): cur_tmp_list.append(max_val)
    if cur_summary.get("min",False): cur_tmp_list.append(min_val)
    poly_deg=cur_summary.get("poly",0)
    if poly_deg>0: cur_tmp_list.extend(get_poly(ti,poly_deg))
    if cur_summary.get("f0",False): cur_tmp_list.append(get_f0(ti))
    final_list.append(cur_tmp_list)
  final_list_transpose=transpose(final_list)
  if cur_summary.get("last-item-norm",False): final_list_transpose.append(normalized_prev_list_items[-1])
  if cur_summary.get("last-item-raw",False): final_list_transpose.append(prev_2d_data0[-1])
  return final_list_transpose




def get_csv_data_new(csv_fpath0,params0={}):
  columns0 =params0.get("columns",["High","Low","Open","Close"])  
  pd_df=pd.read_csv(csv_fpath0)
  data_2d = get_pd_col_data(pd_df,columns0) #pd_df.filter(columns)
  return data_2d

def get_data_full_new(csv_fpath0,params0={}):
  prev_n0=params0.get("prev_n",30)
  next_n0=params0.get("next_n",5)
  cur_csv_data0=get_csv_data_new(csv_fpath0,params0)
  slice_size0=prev_n0+next_n0
  all_data_items=[]
  for i0 in range(len(cur_csv_data0)-slice_size0):
    prev_slice=cur_csv_data0[i0:i0+prev_n0]
    next_slice=cur_csv_data0[i0+prev_n0:i0+prev_n0+next_n0]
    normalized_prev0,normalized_next0=normalize_ft_labels_new(prev_slice,next_slice,params0)
    all_data_items.append((normalized_prev0,normalized_next0))
  return all_data_items

def get_data_full_from_list_new(cur_csv_data0,params0={}):
  prev_n0=params0.get("prev_n",30)
  next_n0=params0.get("next_n",5)
  #cur_csv_data0=get_csv_data_new(csv_fpath0,params0)
  slice_size0=prev_n0+next_n0
  all_data_items=[]
  for i0 in range(len(cur_csv_data0)-slice_size0):
    prev_slice=cur_csv_data0[i0:i0+prev_n0]
    next_slice=cur_csv_data0[i0+prev_n0:i0+prev_n0+next_n0]
    normalized_prev0,normalized_next0=normalize_ft_labels_new(prev_slice,next_slice,params0)
    all_data_items.append((normalized_prev0,normalized_next0))
  return all_data_items


def get_label_spec_dict(label0): # e.g. "day:0;low:<0"
  tmp_dict0={}
  lb_split=label0.split(";")
  for sp0 in lb_split:
    colon_split=sp0.split(":")
    tmp_dict0[colon_split[0]]=colon_split[1]
  return tmp_dict0

def eval_pred2(rnn_out0,actual_outcome0):
  eval_list=[]
  rnn_out_flat=rnn_out0.ravel()
  for ac0,pred0 in zip(actual_outcome0,rnn_out_flat):
    cur_pred0=pred0.item()
    if ac0==1: dist0=1-cur_pred0
    else: dist0=cur_pred0
    eval_list.append(dist0)
    #print("ac0,pred0 >>>",ac0,cur_pred0, "dist0",dist0)
  return sum(eval_list)/len(eval_list)

#===================================


def normalize_ft_vals(prev_2d_data,flatten=False):
  last_val=prev_2d_data[-1][-1]
  #print("last_val", last_val)
  final_list=[]
  for item0 in prev_2d_data:
    normalized_item=[(v-last_val)/last_val for v in item0]
    if flatten: final_list.extend(normalized_item)
    else: final_list.append(normalized_item)
    
  return final_list

def percent_bin_vals(ratio0,max_val0=5):
  percent=ratio0*100
  if abs(percent)<=max_val0: return str(int(percent))
  elif percent<-max_val0: return str(-max_val0-1)
  else: return str(max_val0+1)


def one_hot_labels(label_list0,standard_labels0):
  oh_list=[]
  for lb0 in label_list0: 
    tmp_oh=[1. if v==lb0 else 0. for v in standard_labels0]
    oh_list.extend(tmp_oh)
  return oh_list

def normalize_next_labels(next_2d_data0,last_closing_val0,max_percent_val0=5):
  normalized_high_low_labels=[]
  for item0 in next_2d_data0:
    high0,low0,open0,close0=item0
    high_ratio0=(high0-last_closing_val0)/last_closing_val0
    low_ratio0=(low0-last_closing_val0)/last_closing_val0
    high_label0=percent_bin_vals(high_ratio0,max_val0=max_percent_val0)
    low_label0=percent_bin_vals(low_ratio0,max_val0=max_percent_val0)
    normalized_high_low_labels.extend((high_label0,low_label0))
  return normalized_high_low_labels

def prep_ft_labels(data0,params0={}):
  #prev_n0=20,next_n0=10,max_percent0=5
  prev_n0=params0.get("prev_n",30)
  next_n0=params0.get("next_n",10)
  max_percent0=params0.get("max_percent",5)
  flatten0=params0.get("flatten",True)

  slice_size0=prev_n0+next_n0
  all_data_items=[]
  for i0 in range(len(data0)-slice_size0):
    prev_slice=data0[i0:i0+prev_n0]
    next_slice=data0[i0+prev_n0:i0+prev_n0+next_n0]
    ref_val=prev_slice[-1][-1]
    cur_ft_list=normalize_ft_vals(prev_slice,flatten0)
    normalized_next_list=normalize_next_labels(next_slice,ref_val,max_percent_val0=max_percent0)
    all_data_items.append((cur_ft_list,normalized_next_list,ref_val))
  return all_data_items


def get_data_full(csv_fpath0,params0={}):
  columns0 =params0.get("columns",["High","Low","Open","Close"])  
  prev_n0=params0.get("prev_n",30)
  next_n0=params0.get("next_n",10)
  max_percent0=params0.get("max_percent",5)
  standard_labels0=params0.get("standard_labels",gen_labels(max_percent0))

  cur_data0=[]
  fname0=os.path.split(csv_fpath0)[-1]
  standard_labels0=gen_labels(max_percent0)
  pd_df=pd.read_csv(csv_fpath0)
  data_2d = get_pd_col_data(pd_df,columns0) #pd_df.filter(columns)
  tmp_data_items=prep_ft_labels(data_2d,params0)
  for cur_ft_vals0,cur_labels0,ref0 in tmp_data_items:
    oh_labels0=one_hot_labels(cur_labels0,standard_labels0)
    cur_data0.append((cur_ft_vals0,oh_labels0,cur_labels0,ref0,fname0))
  return cur_data0

def eval_pred(rnn_out0,actual_outcome0,standard_labels0,gain_threshold0=5):
  gain_loss=0
  rnn_out_flat=rnn_out0.ravel()
  preds=out2labels(rnn_out_flat,standard_labels0)
  eval_list=[]
  for ac0,pred0 in zip(actual_outcome0,preds):
    cur_pred=[(v[0],round(v[1].item(),4)) for v in pred0]
    eval_list.append((ac0,cur_pred[0]))
  for i0 in range(0,len(eval_list),2):
    cur_high=eval_list[i0]
    ac_high,pred_high_wt=cur_high
    pred_high,pred_wt=pred_high_wt
    if i0>0 and int(pred_high)>=gain_threshold0: 
      #print(i0/2, "Sell Decision - predicted: %s - actual: %s"%(pred_high,ac_high))
      gain_loss=int(ac_high)
      break
  return gain_loss


#Loading Data - Aug 2022

In [None]:

seed(0)
cur_prev_n,cur_next_n=30,10
cur_max_percent_val=5

n_files=50


def get_specs_from_next_data(normalized_next0,params0={}):
  pred_labels0=params0.get("pred_labels",[])
  cols0=params0.get("columns",[])

  label_val_dict0={}
  label_spec_dict0={}
  for a in pred_labels0:
    label_val_dict0[a]=0.
    spec0=get_label_spec_dict(a)
    label_spec_dict0[a]=spec0
  for day0,item0 in enumerate(normalized_next0):
    #high0,low0,open0,close0=item0
    local_dict={}
    for col_i,col_name0 in enumerate(cols0):
      col_name_lower0=col_name0.lower()
      cur_val0=item0[col_i]
      local_dict[col_name_lower0]=cur_val0
    # print(item0)
    # print(local_dict)
    #   local_dict["low"]=low0
    #   local_dict["open"]=open0
    #   local_dict["close"]=close0

    # local_dict["high"]=high0
    # local_dict["low"]=low0
    # local_dict["open"]=open0
    # local_dict["close"]=close0
    for lb0 in pred_labels0:
      corr_tmp_dict=label_spec_dict0[lb0]
      if not str(day0) in corr_tmp_dict.get("day",""): continue
      for ld_key,ld_val in local_dict.items():
        corr_rule=corr_tmp_dict.get(ld_key)
        if corr_rule==None: continue
        comparator=corr_rule[0]
        compared_to=int(corr_rule[1:])
        ld_val_percent=100*ld_val
        outcome=0.
        if comparator==">" and ld_val_percent>=compared_to: outcome=1.
        if comparator=="<" and ld_val_percent<=compared_to: outcome=1.
        if comparator=="=" and int(ld_val_percent)==compared_to: outcome=1.
        if outcome==1: label_val_dict0[lb0]=1.
  return [label_val_dict0.get(v,0.) for v in pred_labels0] #label_val_dict0

cur_params={}
cur_params["prev_n"]=30#15#30
cur_params["next_n"]=5#10
cur_params["max_percent"]=5
#cur_params["columns"]=["High","Low","Open","Close"]
cur_params["columns"]=["High","Low","Close"]
cur_params["gain_threshold"]=5
cur_params["flatten"]=False
cur_params["standard_labels"]=gen_labels(cur_params["max_percent"])
#cur_params["pred_labels"]=["day:0;low:<0","day:0;low:<-2","day:0;low:<2","day:1234;high:>0","day:1234;high:>2","day:1234;high:>5","day:4;close:<0"]
#cur_params["pred_labels"]=["day:0;low:<0","day:1234;high:>3","day:4;close:<0"]
cur_params["pred_labels"]=["day:1234;high:>3"]
cur_params["summary"]={}
cur_params["summary"]["mean"]=True
cur_params["summary"]["max"]=True
cur_params["summary"]["min"]=True
cur_params["summary"]["std-dev"]=False
cur_params["summary"]["f0"]=True
cur_params["summary"]["last-item-raw"]=True
cur_params["summary"]["last-item-norm"]=True
#cur_params["summary"]["poly"]=2



#columns = ["High","Low","Open","Close"]

root_dir='stock_market_data/sp500/csv'
files=['ABC', 'ADP', 'A', 'ABT', 'ABMD', 'ADI', 'ABBV', 'AAPL', 'ADSK', 'ADM', 'ACN', 'AAP', 'AAL', 'ALGN', 'APH', 'AOS', 'AWK', 'ALLE', 'AME', 'APD', 'ARE', 'AIZ', 'ALB', 'APA', 'ALK', 'AEE', 'AMGN', 'ANTM', 'AEP', 'AON', 'AKAM', 'AXP', 'AMD', 'AMAT', 'AMP', 'ANET', 'AJG', 'AZO', 'ATVI', 'AMZN', 'AMT', 'AVB', 'ALTR', 'AVY', 'CAH', 'CDNS', 'BIO', 'CDE', 'BXP', 'BK', 'BEN', 'C', 'BMRA', 'BAX', 'BLK', 'BF-A', 'BDX', 'BR', 'BSHI', 'CB', 'CAG', 'BIIB', 'BAC', 'BMY', 'CCI', 'BSX', 'CAT', 'BRK-A', 'BBY', 'BA', 'BWA', 'CME', 'CNWT', 'CF', 'CTXS', 'D', 'CTSH', 'CHD', 'CFG', 'DFS', 'CPICQ', 'DG', 'CRM', 'CHRW', 'CLX', 'DGX', 'CPB', 'COTY', 'CHTR', 'COP', 'CNC', 'CNP', 'DE', 'COO', 'CUK', 'CPRT', 'COST', 'CINF', 'CMG', 'CL', 'CTQ', 'CTAS', 'CMI', 'CSCO', 'COWN', 'DAL', 'DTE', 'ENS', 'EQIX', 'DRE', 'DOV', 'DHI', 'EW', 'ES', 'EQR', 'DIS', 'DPZ', 'FANG', 'EXR', 'EMR', 'DLTR', 'EMN', 'FAST', 'DVA', 'EBAY', 'EA', 'DRI', 'EOG', 'EL', 'ESS', 'EIX', 'DXCM', 'EFX', 'F', 'ECL', 'ED', 'GS-PJ', 'GILD', 'GIS', 'FMBM', 'FPLPF', 'GM', 'FBHS', 'HBAN', 'FLS', 'FIS', 'FE', 'FRT', 'FRMC', 'FFIV', 'GWW', 'GRMN', 'GGG', 'FN', 'GOOG', 'GPC', 'FLT', 'FITB', 'FCX', 'FISV', 'GPN', 'FMC', 'FRC', 'HAL', 'FDX', 'FCGN', 'FB', 'GE', 'FTI', 'GD', 'HAS', 'HD', 'INTU', 'IFF', 'IRM', 'ICE', 'HLT', 'IDXX', 'HII', 'ILMN', 'HTLF', 'HPQ', 'HON', 'IBM', 'IPGP', 'HCA', 'HRL', 'IR', 'HSY', 'HOLX', 'ISRG', 'HPE', 'HRB', 'HSIC', 'INTH', 'HFC', 'HBI', 'HUM', 'IP', 'HST', 'IEX', 'HES', 'KSU', 'LNT', 'KRA', 'KHC', 'KR', 'KMB', 'JKHY', 'JNJ', 'IT', 'LEG', 'ITW', 'KSS', 'KEY', 'JNPR', 'LKQ', 'KIM', 'IVZ', 'KO', 'LNC', 'JBHT', 'LDOS', 'KMX', 'LMT', 'K', 'JPM', 'KGNR', 'KACPF', 'JCI', 'LH', 'KEYS', 'LBTYA', 'MSFT', 'MKTX', 'LYB', 'MCO', 'MRO', 'MDLZ', 'MLM', 'LVS', 'LRCX', 'MSCI', 'MOS', 'MRK', 'MET', 'MGM', 'MNST', 'MMC', 'MO', 'MCHP', 'LYV', 'MHK', 'MDT', 'LUV', 'MCK', 'MS-PF', 'MMM', 'MAA', 'MCD', 'MPC', 'MAR', 'LOW', 'MRCR', 'NOV', 'PEG', 'NVRO', 'NTRR', 'MU', 'NTRA', 'PAYX', 'NSC', 'NRG', 'ODFL', 'NTAP', 'PBCT', 'NFLX', 'ORLY', 'OMC', 'NTRS', 'NCTKF', 'NOXL', 'OKE', 'NI', 'NVR', 'NOC', 'O', 'NOW', 'PCAR', 'NEE', 'NLSN', 'NWL', 'MSI', 'NDAQ', 'NMHLY', 'OXY', 'NOK', 'NEOG', 'NCLH', 'RF', 'PSX', 'RE', 'PPG', 'ROK', 'PXD', 'RIBT', 'RCL', 'REGN', 'RMD', 'PKI', 'RL', 'RJF', 'PG', 'QRVO', 'REG', 'PHM', 'PNWRF', 'PKG', 'PNW', 'PLD', 'PVH', 'PM', 'PNR', 'PWR', 'PH', 'RLI', 'PEP', 'PRU', 'PFE', 'RHI', 'ROST', 'TAP', 'TEL', 'SRG', 'SLG', 'RSG', 'SYK', 'SNPS', 'SCHW', 'SHW', 'RXMD', 'SEGXF', 'SWKS', 'SBUX', 'RSNHF', 'SWK', 'SONC', 'ROP', 'STZ-B', 'TCYSF', 'STT', 'SPG', 'SYF', 'T', 'STX', 'SIVB', 'SO', 'ROL', 'TJX', 'SEE', 'SLB', 'SRE', 'VZ', 'UNP', 'TMUS', 'TRAUF', 'V', 'TW', 'VRSK', 'TWTR', 'URI', 'ULTA', 'UPS', 'UDR', 'TSN', 'UAL', 'TSCO', 'TTWO', 'VRSN', 'UA', 'TMO', 'WBA', 'TXN', 'UNM', 'USB', 'TXT', 'VMC', 'WAT', 'UHS', 'UEEC', 'VTR', 'TYL', 'TROW', 'TRV', 'VFC', 'WYNN', 'WSPOF', 'WU', 'YUM', 'XYL', 'WST', 'WRK', 'WEC', 'WM', 'ZTS', 'ZBH', 'XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']
sample_files=["AAPL","GOOG","FB","AMZN","EA","IBM","MSFT","GM","UPS","PG"]
sample_files=list(set(sample_files+files[:n_files]))
print("sample_files",len(sample_files), sample_files)

all_data=[]
for fname in sample_files:
  cur_fname= fname+".csv"
  print(cur_fname)
  cur_fpath=os.path.join(root_dir,cur_fname)
  #tmp_data=get_data_full(cur_fpath,cur_params)
  tmp_data=get_data_full_new(cur_fpath,cur_params)
  all_data.extend(tmp_data)

shuffle(all_data)
print(len(all_data))
train_data,test_data=split_train_test(all_data)
print("train_data",len(train_data))
print("test_data",len(test_data))
# for ad in all_data[:5]:
#   for a0 in ad:
#     print(a0)
#   print("----")

item0=train_data[0]
train0,test0=item0
for tr in train0:
  print(tr)

for ts in test0:
  print(ts)  
# import numpy as np
# test=all_data[0][0]
# test_array=np.array(test)
#print(test_array.shape)

sample_files 58 ['ADM', 'ANET', 'AEP', 'ALGN', 'ALTR', 'AON', 'AIZ', 'BIO', 'AME', 'ATVI', 'AMAT', 'GOOG', 'AMZN', 'AMD', 'AMP', 'APD', 'AAPL', 'AMT', 'PG', 'AAL', 'AOS', 'BK', 'AKAM', 'ADSK', 'CAH', 'APA', 'AJG', 'ARE', 'AXP', 'ALK', 'GM', 'ADI', 'AMGN', 'ACN', 'AEE', 'AWK', 'ANTM', 'BXP', 'AVB', 'ABC', 'CDNS', 'FB', 'ALB', 'ABT', 'EA', 'AAP', 'AVY', 'APH', 'MSFT', 'ABBV', 'ALLE', 'AZO', 'CDE', 'A', 'IBM', 'ABMD', 'ADP', 'UPS']
ADM.csv
ANET.csv
AEP.csv
ALGN.csv
ALTR.csv
AON.csv
AIZ.csv
BIO.csv
AME.csv
ATVI.csv
AMAT.csv
GOOG.csv
AMZN.csv
AMD.csv
AMP.csv
APD.csv
AAPL.csv
AMT.csv
PG.csv
AAL.csv
AOS.csv
BK.csv
AKAM.csv
ADSK.csv
CAH.csv
APA.csv
AJG.csv
ARE.csv
AXP.csv
ALK.csv
GM.csv
ADI.csv
AMGN.csv
ACN.csv
AEE.csv
AWK.csv
ANTM.csv
BXP.csv
AVB.csv
ABC.csv
CDNS.csv
FB.csv
ALB.csv
ABT.csv
EA.csv
AAP.csv
AVY.csv
APH.csv
MSFT.csv
ABBV.csv
ALLE.csv
AZO.csv
CDE.csv
A.csv
IBM.csv
ABMD.csv
ADP.csv
UPS.csv
448973
train_data 359178
test_data 89795
[-0.03532456252957017, -0.06752805784967221, -0.0490

#New Training - Aug 2022

In [None]:
import torch, random, time, math
torch.manual_seed(1)
random.seed(1)

#params={}
# def extract_features():
#   return

def eval_pred2(rnn_out0,actual_outcome0):
  eval_list=[]
  rnn_out_flat=rnn_out0.ravel()
  for ac0,pred0 in zip(actual_outcome0,rnn_out_flat):
    cur_pred0=pred0.item()
    if ac0==1: dist0=1-cur_pred0
    else: dist0=cur_pred0
    eval_list.append(dist0)
  return sum(eval_list)/len(eval_list)


exp_name="new-stock-pred-test29-0"
exp_name="new-stock-pred-test29-64-L2"
exp_name="new-stock-pred-test29-64-L2-LR1e-6"
exp_name="new-stock-pred-test39-64-L2-LR1e-6"
exp_name="new-stock-pred-test39-128-L2-LR1e-6"
exp_name="new-stock-unflattened-000001"
exp_name="new-stock-unflattened-128-000001"
exp_name="new-stock-unflattened-sample4-32-000001"
exp_name="new-stock-unflattened-sample-30-10-128-000001"
exp_name="new-stock-unflattened-sample-30-10-128-0000001"
exp_name="new-stock-unflattened-sample19-30-10-128-0000001"
exp_name="new-stock-unflattened-sample19-30-10-128-00000001"
exp_name="test-batches-1layer-000001-256"
exp_name="batches-1layer-000001-256-5day-pred-39stocks"
exp_name="batches-1layer-0000001-256-5day-pred-39stocks"
exp_name="batches-2layer-0000001-256-5day-pred-39stocks"
exp_name="batches-limited-preds-29-256-2"
exp_name="batches-limited-preds-48-128-1-0000001"
exp_name="test-batches-limited-preds-58-128-6layer-00000001"
#exp_name="test-batches-limited-preds-15prev-58-512-1layer-0000001"
#exp_name="test-batches-limited-preds-15prev-58-1024-1layer-00000001"
exp_name="test-batches-limited-preds-15prev-10-1024-1layer-00000001"
exp_name="test-batches-3preds-20prev-29-1024-1layer-000000001"
exp_name="test-batches-init-zeros-3preds-20prev-29-1024-1layer-000000001"
#exp_name="test-batches-init-zeros-3preds-30prev-48-512-2layer-00000005"
exp_name="test-batches-init-zeros-1preds-30prev-10-1024-1layer-0000001"
exp_name="test-batches-init-zeros-1preds-30prev-19-512-1layer-00000001"
exp_name="test-batches-init-rand-1preds-15prev-29-512-1layer-00000001"
exp_name="test-batches-init-rand-1preds-15prev-39-512-1layer-0000001"
exp_name="test-batches-sigmoid-1preds-30prev-10-64-1layer-0000001"
exp_name="test-batches-sigmoid-1preds-30prev-10-512-1layer-0000001"
exp_name="test-batches-sigmoid-1preds-30prev-10-512-1layer-0000001"
exp_name="test-batches-sigmoid-1preds-15prev-29-512-1layer-0000001"
exp_name="test-batches-sigmoid-1preds-15prev-29-512-3layer-0000001"
exp_name="test-batches-sigmoid-1preds-10prev-48-512-1layer-00000001"
exp_name="test2-batches-summary-sigmoid-1preds-10prev-10-64-1layer-00000001"
exp_name="test2-batches-summary-sigmoid-1preds-30prev-29-64-1layer-00000001"
exp_name="test2-batches-summary-sigmoid-1preds-30prev-29-64-1layer-000000001"
exp_name="test2-batches-summary-sigmoid-1preds-30prev-29-64-1layer-0000001"
exp_name="test2-batches-summary-sigmoid-1preds-30prev-29-64-2layer-0000001"
exp_name="test2-batches-summary-sigmoid-1preds-30prev-58-64-2layer-0000001"
exp_name="test2-batches-summary-sigmoid-1preds-30prev-58-128-2layer-0000001"

n_layers=2#4#3
n_hidden=128 #1024#512#256#256 #128 #128 #256 #64#64
#LR=0.0000001 #0.0000001
#LR=0.0000001
#LR=0.00000001
#LR=0.0000001
#LR=0.000001
#LR=0.000000001
LR=0.0000001
n_epochs=100
n_data=None #number of items per source
train_batch_size=10000
cur_matching_in_out=False
cur_init_zeros=False
cur_apply_sigmoid=True
train_ratio=0.8
model_dir="models"
#output_labels=standard_labels=ipa_symbol_list #combined_ipa_list
standard_labels=cur_params["standard_labels"]

n_batches=math.floor(len(train_data)/train_batch_size)
if n_batches==0: test_batch_size=len(test_data)
else: test_batch_size=math.floor(len(test_data)/n_batches)



item0=train_data[0]
ft_list,lb_list=item0[:2]
ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
extracted_out_specs=get_specs_from_next_data(lb_list,cur_params)
lb_list_tensor=torch.tensor(extracted_out_specs,dtype=torch.float32)
n_input=ft_list_tensor.shape[-1]
n_output=lb_list_tensor.shape[-1]

#get_specs_from_next_data()
print("ft_list",ft_list)
print("lb_list",lb_list)
print("extracted_out_specs",extracted_out_specs)
print("ft_list_tensor",ft_list_tensor.shape)
print("lb_list_tensor",lb_list_tensor.shape)
#hello
loss_func = nn.MSELoss()
rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=cur_matching_in_out,apply_sigmoid=cur_apply_sigmoid).to(device)
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR) 



print(ft_list_tensor.shape)
print(lb_list_tensor.shape)
exp_dir_path=os.path.join(model_dir,exp_name)
if not os.path.exists(exp_dir_path): os.makedirs(exp_dir_path)
tmp_model_dir=os.path.join(exp_dir_path,"tmp") 
if not os.path.exists(tmp_model_dir): os.makedirs(tmp_model_dir)


log_fpath=os.path.join(exp_dir_path,"log.txt")
log_something(str(rnn),log_fpath)



#ft_list_tensor=ft_list_tensor.reshape([1,n_input])
rnn_out=rnn(ft_list_tensor)
print("rnn_out",rnn_out.shape)

for epoch_i in range(n_epochs):
  PATH=os.path.join(exp_dir_path,"model-%s.model"%epoch_i) #tmp_path
  if os.path.exists(PATH):
    try: checkpoint = torch.load(PATH)
    except: checkpoint = dill_unpickle(PATH)
    rnn.load_state_dict(checkpoint['model_state_dict'])
    print("loaded model for this epoch",PATH)
    rnn.train()
    continue  
  rnn.zero_grad()
  train_loss_items,test_loss_items=[],[]
  train_eval_items,test_eval_items=[],[]
  
  for batch_i0 in range(n_batches+1):
    t0=time.time()
    batch_train_eval_dict,batch_test_eval_dict={},{}
    
    #pred_count,correct_count=0,0
    batch_i1=batch_i0+1
    cur_train_items=train_data[batch_i0*train_batch_size:batch_i1*train_batch_size]
    cur_test_items=test_data[batch_i0*test_batch_size:batch_i1*test_batch_size]
    print("batch_i0",batch_i0, "cur_train_items",len(cur_train_items),"cur_test_items",len(cur_test_items))
    tmp_path=os.path.join(tmp_model_dir, "model-batch-%s.model"%batch_i0)
    if os.path.exists(tmp_path):
      try: checkpoint = torch.load(tmp_path)
      except: checkpoint = dill_unpickle(tmp_path)
      rnn.load_state_dict(checkpoint['model_state_dict'])
      print("loaded model for this epoch",tmp_path)
      rnn.train()
      continue  


    for item_i,item0 in enumerate(cur_train_items):
      if item_i%5000==0: print("training",item_i, "out of:",len(cur_train_items))
      ft_list,lb_list=item0[:2]
      ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
      extracted_out_specs=get_specs_from_next_data(lb_list,cur_params)
      lb_list_tensor=torch.tensor(extracted_out_specs,dtype=torch.float32)

      #actual_outcome=item0[2]
      
      #lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
      lb_tensor_flat=lb_list_tensor.ravel()
      rnn_out=rnn(ft_list_tensor)
      rnn_out_flat=rnn_out.ravel()
      loss = loss_func(rnn_out_flat,lb_tensor_flat) #calculate the loss, difference between the output and the desired outcome tensors
      loss.backward()
      optimizer.step() 
      train_loss_items.append(loss.item())
      #cur_eval_item=eval_pred(rnn_out,actual_outcome,cur_params["standard_labels"])
      #cur_eval_item=0
      cur_eval_item=eval_pred2(rnn_out,extracted_out_specs)
      train_eval_items.append(cur_eval_item)

    for item_i,item0 in enumerate(cur_test_items):
      if item_i%5000==0: print("testing",item_i, "out of:",len(cur_test_items))
      ft_list,lb_list=item0[:2]
      #actual_outcome=item0[2]
      ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
      extracted_out_specs=get_specs_from_next_data(lb_list,cur_params)
      lb_list_tensor=torch.tensor(extracted_out_specs,dtype=torch.float32)

      
      #lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
      lb_tensor_flat=lb_list_tensor.ravel()
      rnn_out=rnn(ft_list_tensor)
      rnn_out_flat=rnn_out.ravel()
      if True: #item_i<20:
        for lb,a,b in zip(cur_params["pred_labels"],extracted_out_specs,rnn_out_flat.tolist()):
          key0=(lb,int(a))
          tmp_dict=batch_test_eval_dict.get(lb,{})
          tmp_dict[int(a)]=tmp_dict.get(int(a),[])+[b]
          batch_test_eval_dict[lb]=tmp_dict#batch_test_eval_dict.get(key0,[])+[b]
          if item_i<20: print(lb,"actual:", a, "predicted:", round(b,4))
        # print("actual:",extracted_out_specs)
        # print("predict",[round(v,4) for v in rnn_out_flat.tolist()])
        #print("-----")
      loss = loss_func(rnn_out_flat,lb_tensor_flat) #calculate the loss, difference between the output and the desired outcome tensors
      test_loss_items.append(loss.item())
      #cur_eval_item=eval_pred(rnn_out,actual_outcome,cur_params["standard_labels"])
      cur_eval_item=eval_pred2(rnn_out,extracted_out_specs)
      
      test_eval_items.append(cur_eval_item)

    train_batch_loss_avg=sum(train_loss_items[-len(cur_train_items):])/len(cur_train_items)  
    test_batch_loss_avg=sum(test_loss_items[-len(cur_test_items):])/len(cur_test_items)
    train_batch_eval_avg=sum(train_eval_items[-len(cur_train_items):])/len(cur_train_items)  
    test_batch_eval_avg=sum(test_eval_items[-len(cur_test_items):])/len(cur_test_items)
    some_eval_list=sorted(list(batch_test_eval_dict.items()))
    total_diff=0
    for a,b_dict in some_eval_list:
      vals_0=b_dict.get(0,[])
      vals_1=b_dict.get(1,[])
      avg_vals_0,avg_vals_1=0,0
      if vals_0: avg_vals_0=sum(vals_0)/len(vals_0)
      if vals_1: avg_vals_1=sum(vals_1)/len(vals_1)
      diff01=avg_vals_1-avg_vals_0
      total_diff+=diff01

      #avg_b=sum(b)/len(b)#round(,4)
      #print(a,round(avg_b,6), len(b),b[:5])
      #print(a,"avg_vals_0", round(avg_vals_0,6),"avg_vals_1", round(avg_vals_1,6),"diff01",round(diff01,6))
    print("total_diff", round(total_diff*len(cur_test_items),4))


    train_loss_avg=sum(train_loss_items)/len(train_loss_items)  
    test_loss_avg=sum(test_loss_items)/len(test_loss_items)
    train_eval_avg=sum(train_eval_items)/len(train_eval_items)  
    test_eval_avg=sum(test_eval_items)/len(test_eval_items)

    cur_checkpoint={
              'epoch': epoch_i,
              'n_input': n_input,
              'n_hidden': n_hidden,
              'n_layers': n_layers,
              'n_output': n_output,
              'output_labels': standard_labels,
              'model_state_dict': rnn.state_dict(),
              'LR': LR,
              'matching_in_out':cur_matching_in_out,
              'train_loss': train_loss_avg,
              'test_loss': test_loss_avg,
              'train_eval': train_eval_avg,
              'test_eval': test_eval_avg,
              'feature_extraction_parameters':cur_params,
              'parameters':cur_params,
              'feature_extraction_function':normalize_analyze_features,
              'label_extraction_function':get_specs_from_next_data   
              }
    dill_pickle(cur_checkpoint, tmp_path)
    t1=time.time()
    elapsed=round(t1-t0,1)
    line="Epoch: %s - batch: %s out of %s - train_loss_avg: %s -  test_loss_avg: %s - train_eval_avg: %s - test_eval_avg: %s - elpased: %s"%(epoch_i,batch_i0,n_batches,round(train_batch_loss_avg,4),round(test_batch_loss_avg,4),round(train_batch_eval_avg,4),round(test_batch_eval_avg,4),elapsed)
    print(line)
    print(tmp_path)
    log_something(line,log_fpath)
    print("==========")

  



  #train_eval_avg,test_eval_avg=0,0  
  #print("epoch_i",epoch_i,"train_loss_avg",round(train_loss_avg,4),"test_loss_avg",round(test_loss_avg,4)) 
  line="Epoch: %s - train_loss_avg: %s -  test_loss_avg: %s - train_eval_avg: %s - test_eval_avg: %s"%(epoch_i,round(train_loss_avg,4),round(test_loss_avg,4),round(train_eval_avg,4),round(test_eval_avg,4))
  #line="Epoch # %s  - Batch: %s / %s -  train loss: %s - test loss: %s - train eval: %s - test eval: %s - elapsed: %s"%(epoch0, batch_i0, n_batches, avg_train_loss,avg_test_loss, avg_train_eval,avg_test_eval, elapsed)
  print(line)
  log_something(line,log_fpath)
  dill_pickle(cur_checkpoint, PATH)
  print("model saved")
  for f in os.listdir(tmp_model_dir):
    tmp_fpath=os.path.join(tmp_model_dir,f)
    os.remove(tmp_fpath)
  print("deleted temporary files")
  print("-----------")



    #torch.save(cur_checkpoint, tmp_path)
  
  




ft_list [[-0.03532456252957017, -0.06752805784967221, -0.049088585560913035], [0.02996251952621438, 0.008988755857864374, 0.020412060187353077], [-0.1357677603632792, -0.1822096656289118, -0.17977522431170953], [0.06666666666666667, 0.06666666666666667, 0.06666666666666667], [0.016853997599312324, -0.00842694522244508, 0.0], [18.100000381469727, 17.649999618530273, 17.799999237060547]]
lb_list [[0.004119873223460046, -0.020599258962878064, -0.0007490094109442812], [0.008988755857864374, -0.01498125976310729, 0.0009364224953136013], [0.015917575103998525, -0.007865134587025785, 0.009363367717758682], [0.005617999199770775, -0.03183515020799745, -0.014044944422215854], [-0.01647938573941802, -0.04007489665491754, -0.032771465548888885]]
extracted_out_specs [0.0]
ft_list_tensor torch.Size([6, 3])
lb_list_tensor torch.Size([1])
torch.Size([6, 3])
torch.Size([1])




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
day:1234;high:>3 actual: 0.0 predicted: 0.1709
day:1234;high:>3 actual: 0.0 predicted: 0.5783
day:1234;high:>3 actual: 1.0 predicted: 0.5777
day:1234;high:>3 actual: 0.0 predicted: 0.4018
day:1234;high:>3 actual: 1.0 predicted: 0.5553
day:1234;high:>3 actual: 0.0 predicted: 0.4384
day:1234;high:>3 actual: 0.0 predicted: 0.4871
day:1234;high:>3 actual: 1.0 predicted: 0.3801
day:1234;high:>3 actual: 0.0 predicted: 0.1678
day:1234;high:>3 actual: 1.0 predicted: 0.5968
day:1234;high:>3 actual: 1.0 predicted: 0.3595
day:1234;high:>3 actual: 1.0 predicted: 0.436
day:1234;high:>3 actual: 0.0 predicted: 0.4919
day:1234;high:>3 actual: 1.0 predicted: 0.6277
day:1234;high:>3 actual: 0.0 predicted: 0.4452
day:1234;high:>3 actual: 0.0 predicted: 0.6624
day:1234;high:>3 actual: 0.0 predicted: 0.4414
total_diff 189.463
Epoch: 12 - batch: 3 out of 35 - train_loss_avg: 0.22 -  test_loss_avg: 0.2228 - train_eval_avg: 0.4386 - test_eval_av

#Testing and evaluating - Aug 2022

In [19]:
#New
import torch, random
torch.manual_seed(1)
random.seed(1)


def eval_pred1(rnn_out0,actual_outcome0,standard_labels0,gain_threshold0=5):
  gain_loss=0
  rnn_out_flat=rnn_out0.ravel()
  preds=out2labels(rnn_out_flat,standard_labels0)
  eval_list=[]
  for ac0,pred0 in zip(actual_outcome0,preds):
    cur_pred=[(v[0],round(v[1].item(),4)) for v in pred0]
    eval_list.append((ac0,cur_pred))
  for i0 in range(0,len(eval_list),2):
    cur_high=eval_list[i0]
    cur_low=eval_list[i0+1]
    high_ac,high_preds=cur_high
    low_ac,low_preds=cur_low
    sorted_high_preds=[(int(v[0]),v[1]) for v in high_preds]
    sorted_high_preds=sorted(sorted_high_preds,key=lambda x:int(x[0]))
    sum_high_pred_vals=sum([v[1] for v in sorted_high_preds])
    sorted_low_preds=[(int(v[0]),v[1]) for v in low_preds]
    sorted_low_preds=sorted(sorted_low_preds,key=lambda x:int(x[0]))
    sum_low_pred_vals=sum([v[1] for v in sorted_low_preds])

    #print("actual high:",high_ac,"sorted_high_preds",sorted_high_preds)
    print(int(i0/2), "actual high:",high_ac, "actual low:",low_ac)
    for hi,hi_lb_wt in enumerate(sorted_high_preds):
      hi_lb0,hi_wt0=hi_lb_wt
      low_lb0,low_wt0=low_lb_wt=sorted_low_preds[hi]
      
      if hi_lb0<0: hi_cum_wt=sum([v[1] for v in sorted_high_preds[:hi+1]])
      elif hi_lb0>0: hi_cum_wt=sum([v[1] for v in sorted_high_preds[hi:]])
      else: hi_cum_wt=hi_wt0
      avg_hi_cum_wt=hi_cum_wt/sum_high_pred_vals

      if low_lb0<0: low_cum_wt=sum([v[1] for v in sorted_low_preds[:hi+1]])
      elif low_lb0>0: low_cum_wt=sum([v[1] for v in sorted_low_preds[hi:]])
      else: low_cum_wt=low_wt0
      avg_low_cum_wt=low_cum_wt/sum_low_pred_vals

      #print(hi,hi_lb_wt,round(avg_cum_wt,4))
      print("high:", hi_lb_wt,round(avg_hi_cum_wt,4),"low:", low_lb_wt,round(avg_low_cum_wt,4))
  return gain_loss


def eval_pred2(rnn_out0,actual_outcome0):
  eval_list=[]
  rnn_out_flat=rnn_out0.ravel()
  for ac0,pred0 in zip(actual_outcome0,rnn_out_flat):
    cur_pred0=pred0.item()
    if ac0==1: dist0=1-cur_pred0
    else: dist0=cur_pred0
    eval_list.append(dist0)
    #print("ac0,pred0 >>>",ac0,cur_pred0, "dist0",dist0)
  return sum(eval_list)/len(eval_list)


epoch_i=3
exp_name="new-stock-unflattened-sample19-30-10-128-0000001"

epoch_i=0
exp_name="test-batches-1layer-000001-256"

model_dir="models"
exp_dir_path=os.path.join(model_dir,exp_name)
tmp_path=os.path.join(exp_dir_path,"model-%s.model"%epoch_i)

tmp_path='models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-25.model'
tmp_path='models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-70.model'
tmp_path='models/batches-limited-preds-small-16-1/tmp/model-batch-44.model'
tmp_path='models/batches-limited-preds-29-256-2/tmp/model-batch-24.model'
tmp_path='models/batches-limited-preds-48-128-1/tmp/model-batch-29.model'
tmp_path='models/test-batches-limited-preds-15prev-58-512-1layer-0000001/tmp/model-batch-413.model'
tmp_path='models/test2-batches-summary-sigmoid-1preds-30prev-29-64-2layer-0000001/tmp/model-batch-8.model'
tmp_path='models/test2-batches-summary-sigmoid-1preds-30prev-29-64-2layer-0000001/model-15.model'
tmp_path='models/test2-batches-summary-sigmoid-1preds-30prev-58-64-2layer-0000001/model-8.model' 
#initial: 12:70, 11:73 10:70 9:71 8:78 7:75 6:70 5:69
#file_i=80: 12:56, 11:63, 8:57
#file_i=110: 12:67, 11:73, 8:88
#file_i=120: 12:69, 11:73, 8:81
#file_i=150: 12:76, 11:76, 8:67
#file_i=200: 12:67, 11:75, 8:92
#file_i=250: 12:72, 11:77, 8:92
tmp_path='models/test2-batches-summary-sigmoid-1preds-30prev-58-128-2layer-0000001/model-13.model' 
#file_i=250: 15: 87, 14:87, 13:87, 12:89, 11: 83, 10:87
#file_i=80: 12: 69, 13: 69.3
#file_i=110: 12: 76, 13: 90, 14: 75, 15: 83
#file_i=120: 12: 67, 13: 75, 14: 73, 15: 67
#file_i=200: 12: 83, 13: 80, 14: 81, 15: 78
#file_i=200: 12: 84, 13: 75, 14: 83, 15: 84

file_i=120 #48

try: checkpoint = torch.load(tmp_path)
except: checkpoint = dill_unpickle(tmp_path)
rnn = RNN(checkpoint["n_input"], checkpoint["n_hidden"] , checkpoint["n_output"] , checkpoint["n_layers"] , checkpoint["matching_in_out"]).to(device)
n_input=checkpoint["n_input"]
standard_labels=checkpoint["output_labels"]
cur_parameters=checkpoint["parameters"]
pred_labels=cur_parameters["pred_labels"]
cur_parameters=checkpoint["feature_extraction_parameters"]
label_func=checkpoint['label_extraction_function']
feature_func=checkpoint['feature_extraction_function']
rnn.load_state_dict(checkpoint['model_state_dict'])
rnn.eval()

root_dir='stock_market_data/sp500/csv'
files=['ABC', 'ADP', 'A', 'ABT', 'ABMD', 'ADI', 'ABBV', 'AAPL', 'ADSK', 'ADM', 'ACN', 'AAP', 'AAL', 'ALGN', 'APH', 'AOS', 'AWK', 'ALLE', 'AME', 'APD', 'ARE', 'AIZ', 'ALB', 'APA', 'ALK', 'AEE', 'AMGN', 'ANTM', 'AEP', 'AON', 'AKAM', 'AXP', 'AMD', 'AMAT', 'AMP', 'ANET', 'AJG', 'AZO', 'ATVI', 'AMZN', 'AMT', 'AVB', 'ALTR', 'AVY', 'CAH', 'CDNS', 'BIO', 'CDE', 'BXP', 'BK', 'BEN', 'C', 'BMRA', 'BAX', 'BLK', 'BF-A', 'BDX', 'BR', 'BSHI', 'CB', 'CAG', 'BIIB', 'BAC', 'BMY', 'CCI', 'BSX', 'CAT', 'BRK-A', 'BBY', 'BA', 'BWA', 'CME', 'CNWT', 'CF', 'CTXS', 'D', 'CTSH', 'CHD', 'CFG', 'DFS', 'CPICQ', 'DG', 'CRM', 'CHRW', 'CLX', 'DGX', 'CPB', 'COTY', 'CHTR', 'COP', 'CNC', 'CNP', 'DE', 'COO', 'CUK', 'CPRT', 'COST', 'CINF', 'CMG', 'CL', 'CTQ', 'CTAS', 'CMI', 'CSCO', 'COWN', 'DAL', 'DTE', 'ENS', 'EQIX', 'DRE', 'DOV', 'DHI', 'EW', 'ES', 'EQR', 'DIS', 'DPZ', 'FANG', 'EXR', 'EMR', 'DLTR', 'EMN', 'FAST', 'DVA', 'EBAY', 'EA', 'DRI', 'EOG', 'EL', 'ESS', 'EIX', 'DXCM', 'EFX', 'F', 'ECL', 'ED', 'GS-PJ', 'GILD', 'GIS', 'FMBM', 'FPLPF', 'GM', 'FBHS', 'HBAN', 'FLS', 'FIS', 'FE', 'FRT', 'FRMC', 'FFIV', 'GWW', 'GRMN', 'GGG', 'FN', 'GOOG', 'GPC', 'FLT', 'FITB', 'FCX', 'FISV', 'GPN', 'FMC', 'FRC', 'HAL', 'FDX', 'FCGN', 'FB', 'GE', 'FTI', 'GD', 'HAS', 'HD', 'INTU', 'IFF', 'IRM', 'ICE', 'HLT', 'IDXX', 'HII', 'ILMN', 'HTLF', 'HPQ', 'HON', 'IBM', 'IPGP', 'HCA', 'HRL', 'IR', 'HSY', 'HOLX', 'ISRG', 'HPE', 'HRB', 'HSIC', 'INTH', 'HFC', 'HBI', 'HUM', 'IP', 'HST', 'IEX', 'HES', 'KSU', 'LNT', 'KRA', 'KHC', 'KR', 'KMB', 'JKHY', 'JNJ', 'IT', 'LEG', 'ITW', 'KSS', 'KEY', 'JNPR', 'LKQ', 'KIM', 'IVZ', 'KO', 'LNC', 'JBHT', 'LDOS', 'KMX', 'LMT', 'K', 'JPM', 'KGNR', 'KACPF', 'JCI', 'LH', 'KEYS', 'LBTYA', 'MSFT', 'MKTX', 'LYB', 'MCO', 'MRO', 'MDLZ', 'MLM', 'LVS', 'LRCX', 'MSCI', 'MOS', 'MRK', 'MET', 'MGM', 'MNST', 'MMC', 'MO', 'MCHP', 'LYV', 'MHK', 'MDT', 'LUV', 'MCK', 'MS-PF', 'MMM', 'MAA', 'MCD', 'MPC', 'MAR', 'LOW', 'MRCR', 'NOV', 'PEG', 'NVRO', 'NTRR', 'MU', 'NTRA', 'PAYX', 'NSC', 'NRG', 'ODFL', 'NTAP', 'PBCT', 'NFLX', 'ORLY', 'OMC', 'NTRS', 'NCTKF', 'NOXL', 'OKE', 'NI', 'NVR', 'NOC', 'O', 'NOW', 'PCAR', 'NEE', 'NLSN', 'NWL', 'MSI', 'NDAQ', 'NMHLY', 'OXY', 'NOK', 'NEOG', 'NCLH', 'RF', 'PSX', 'RE', 'PPG', 'ROK', 'PXD', 'RIBT', 'RCL', 'REGN', 'RMD', 'PKI', 'RL', 'RJF', 'PG', 'QRVO', 'REG', 'PHM', 'PNWRF', 'PKG', 'PNW', 'PLD', 'PVH', 'PM', 'PNR', 'PWR', 'PH', 'RLI', 'PEP', 'PRU', 'PFE', 'RHI', 'ROST', 'TAP', 'TEL', 'SRG', 'SLG', 'RSG', 'SYK', 'SNPS', 'SCHW', 'SHW', 'RXMD', 'SEGXF', 'SWKS', 'SBUX', 'RSNHF', 'SWK', 'SONC', 'ROP', 'STZ-B', 'TCYSF', 'STT', 'SPG', 'SYF', 'T', 'STX', 'SIVB', 'SO', 'ROL', 'TJX', 'SEE', 'SLB', 'SRE', 'VZ', 'UNP', 'TMUS', 'TRAUF', 'V', 'TW', 'VRSK', 'TWTR', 'URI', 'ULTA', 'UPS', 'UDR', 'TSN', 'UAL', 'TSCO', 'TTWO', 'VRSN', 'UA', 'TMO', 'WBA', 'TXN', 'UNM', 'USB', 'TXT', 'VMC', 'WAT', 'UHS', 'UEEC', 'VTR', 'TYL', 'TROW', 'TRV', 'VFC', 'WYNN', 'WSPOF', 'WU', 'YUM', 'XYL', 'WST', 'WRK', 'WEC', 'WM', 'ZTS', 'ZBH', 'XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']

fname="%s.csv"%files[file_i]
cur_fpath=os.path.join(root_dir,fname)
tmp_data=get_data_full_new(cur_fpath,cur_parameters)
#tmp_data=get_data_full(cur_fpath,cur_parameters)

arbitrary_i=3
gain_loss_items=[]
arbitrary_items=[]
random_items=[]
eval_items=[]
correct_counter,total_counter=0,0
threshold=0.75
#for item_i in range(len(tmp_data)):
for item_i in range(len(tmp_data)):  
  if item_i%500==0:print("item_i",item_i, "out of:", len(tmp_data))
  #print("item_i",item_i)
  item0=tmp_data[item_i]
  ft_list,lb_list=item0[:2]
  ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
  extracted_out_specs=label_func(lb_list,cur_parameters)
  lb_list_tensor=torch.tensor(extracted_out_specs,dtype=torch.float32)
  lb_tensor_flat=lb_list_tensor.ravel()
  rnn_out=rnn(ft_list_tensor)
  rnn_out_flat=rnn_out.ravel()
  rnn_out_flat_item=rnn_out_flat.item()
  cur_eval=eval_pred2(rnn_out,extracted_out_specs)
  eval_items.append(cur_eval)

  if rnn_out_flat_item>threshold:
    total_counter+=1
    print("extracted_out_specs",extracted_out_specs)
    print("rnn_out",round(rnn_out_flat_item,4))
    if int(round(extracted_out_specs[0]))==int(round(rnn_out_flat_item)):
      print(">>> correct")
      correct_counter+=1
    else: print("xxx incorrect")
    print("=====")

print("cur_fpath",cur_fpath)
print("model_path",tmp_path)
avg_eval=sum(eval_items)/len(eval_items)
print("average eval:", round(avg_eval,4))
accuracy=0
if total_counter>0:
  accuracy=float(correct_counter)/total_counter
  print("accuracy",round(accuracy,4))
  #overall_gain_loss=sum(gain_loss_items)
if len(gain_loss_items)>0:
  avg_gain_loss=sum(gain_loss_items)/len(gain_loss_items)
  n_gain_items=len([v for v in gain_loss_items if v>0])
  n_loss_items=len([v for v in gain_loss_items if v<0])
  percent_gain_items=round(100*n_gain_items/len(gain_loss_items))
  percent_loss_items=round(100*n_loss_items/len(gain_loss_items))

  avg_arbitrary=sum(arbitrary_items)/len(arbitrary_items)
  avg_random=sum(random_items)/len(random_items)

  print(fname, "epoch",epoch_i)
  print("overall_gain_loss",overall_gain_loss)
  print("avg_gain_loss",round(avg_gain_loss,2))
  print("avg_arbitrary",round(avg_arbitrary,2))
  print("avg_random",round(avg_random,2))
  print("n_gain_items",n_gain_items)
  print("n_loss_items",n_loss_items)
  print("percent_gain_items",percent_gain_items)
  print("percent_loss_items",percent_loss_items)


item_i 0 out of: 6750




extracted_out_specs [1.0]
rnn_out 0.7595
>>> correct
=====
extracted_out_specs [1.0]
rnn_out 0.7926
>>> correct
=====
extracted_out_specs [0.0]
rnn_out 0.8143
xxx incorrect
=====
extracted_out_specs [0.0]
rnn_out 0.7657
xxx incorrect
=====
extracted_out_specs [0.0]
rnn_out 0.7676
xxx incorrect
=====
extracted_out_specs [1.0]
rnn_out 0.8072
>>> correct
=====
extracted_out_specs [1.0]
rnn_out 0.912
>>> correct
=====
item_i 500 out of: 6750
extracted_out_specs [0.0]
rnn_out 0.8446
xxx incorrect
=====
extracted_out_specs [0.0]
rnn_out 0.7914
xxx incorrect
=====
extracted_out_specs [0.0]
rnn_out 0.8003
xxx incorrect
=====
extracted_out_specs [1.0]
rnn_out 0.8383
>>> correct
=====
extracted_out_specs [1.0]
rnn_out 0.8075
>>> correct
=====
extracted_out_specs [1.0]
rnn_out 0.777
>>> correct
=====
extracted_out_specs [1.0]
rnn_out 0.8343
>>> correct
=====
extracted_out_specs [1.0]
rnn_out 0.7897
>>> correct
=====
extracted_out_specs [1.0]
rnn_out 0.7602
>>> correct
=====
extracted_out_specs [1

In [None]:
import torch
train_item,test_item=item0[:2]
train_item_tensor=to_tensor(train_item)
train_item_tensor_avg=torch.mean(train_item_tensor,dim=0)
train_item_tensor_min=torch.min(train_item_tensor,dim=0)
train_item_tensor_max=torch.max(train_item_tensor,dim=0)
print(train_item_tensor)
print(train_item_tensor_avg)
print(train_item_tensor_min.values)
print(train_item_tensor_max.values)

tensor([[-0.0026, -0.0240, -0.0233],
        [-0.0110, -0.0238, -0.0122],
        [-0.0111, -0.0179, -0.0157],
        [-0.0100, -0.0197, -0.0177],
        [-0.0136, -0.0318, -0.0160],
        [ 0.0061, -0.0178,  0.0044],
        [ 0.0092,  0.0001,  0.0066],
        [ 0.0093,  0.0023,  0.0053],
        [ 0.0077, -0.0011,  0.0034],
        [ 0.0066, -0.0008,  0.0037],
        [ 0.0120,  0.0000,  0.0112],
        [ 0.0160,  0.0045,  0.0059],
        [ 0.0094,  0.0020,  0.0034],
        [ 0.0071,  0.0007,  0.0012],
        [ 0.0050, -0.0019,  0.0000]])
tensor([ 0.0027, -0.0086, -0.0026])
tensor([-0.0136, -0.0318, -0.0233])
tensor([0.0160, 0.0045, 0.0112])


In [None]:
import numpy.polynomial as poly
# x = [1, 2, 3, 4, 5]
# y = [16, 42.25, 81, 132.25, 196]
# c = poly.Polynomial.fit(x, y, deg = 2)

# def transpose(list1):
#   return list(map(list, zip(*list1)))

# def get_poly(list1,deg0=2):
#   indexes0=list(range(1,len(list1)+1))
#   c = poly.Polynomial.fit(indexes0, list1, deg = deg0)
#   return c.convert().coef.tolist()

# def get_f0(list1):
#   avg0=sum(list1)/len(list1)
#   counter=0
#   for i0 in range(1,len(list1)):
#     prev_val,cur_val=list1[i0-1],list1[i0]
#     if prev_val<avg0 and cur_val>avg0: counter+=1
#   avg_freq=counter/len(list1)
#   return avg_freq




cur_params={}
cur_params["summary"]={}
cur_params["summary"]["mean"]=True
cur_params["summary"]["max"]=True
cur_params["summary"]["min"]=True
cur_params["summary"]["std-dev"]=False
cur_params["summary"]["f0"]=True
cur_params["summary"]["last-item-raw"]=True
cur_params["summary"]["last-item-norm"]=True
cur_params["summary"]["poly"]=2
cur_params["columns"]=["High","Low","Close"]




# item0=train_data[0]
# train_item,test_item=item0[:2]
# train_item_transpose=transpose(train_item)
# print("train_item")
# for ti in train_item:
#   print(ti)
# print("---")
# final_list=[]
# cur_summary=params0.get("summary",{})
# for ti in train_item_transpose:
#   cur_tmp_list=[]
#   #print(ti)
#   mean_val=sum(ti)/len(ti)
#   max_val,min_val=max(ti),min(ti)
#   if cur_summary.get("mean",False): cur_tmp_list.append(mean_val)
#   if cur_summary.get("max",False): cur_tmp_list.append(max_val)
#   if cur_summary.get("min",False): cur_tmp_list.append(min_val)
#   poly_deg=cur_summary.get("poly",0)
#   if poly_deg>0: cur_tmp_list.extend(get_poly(ti,poly_deg))
#   if cur_summary.get("f0",False): cur_tmp_list.append(get_f0(ti))
#   final_list.append(cur_tmp_list)
    

  # print("mean,max,min",mean_val,max_val,min_val)
  # poly0=get_poly(ti)
  # print("poly0",poly0)
  # f0_test=get_f0(ti)

# for fl in final_list:
#   print(fl)



# root_dir='stock_market_data/sp500/csv'
# sample_files=["AAPL","GOOG","FB","AMZN","EA","IBM","MSFT","GM","UPS","PG"]

# all_data=[]
# for fname in sample_files:
#   cur_fname= fname+".csv"
#   print(cur_fname)
#   cur_fpath=os.path.join(root_dir,cur_fname)
#   print(cur_fpath)
csv_fpath0='stock_market_data/sp500/csv/MSFT.csv'
test=get_csv_data_new(csv_fpath0,cur_params)

cur_raw_ft_list=test[:20]
for ts in cur_raw_ft_list:
  print(ts)
print("------")
cur_params["summary"]={}
cur_summary=normalize_analyze_features(cur_raw_ft_list,cur_params)
print("cur_summary")
for a in cur_summary:
  print(a)

# test=[1,2,3,4,5]
# test=[1,5,16,22]
# test=[7,8,9,10]
# poly0=get_poly(test,2)
# print(poly0)

# test2=[1,4,1,3,1,5,2]
# test2_avg=sum(test2)/len(test2)
# print("avg",test2_avg)
# f0_test=get_f0(test2)
# print(test2)

[0.1015629991889, 0.0885419994592666, 0.0972220003604888]
[0.1024309992790222, 0.0972220003604888, 0.1006940007209777]
[0.1032989993691444, 0.1006940007209777, 0.1024309992790222]
[0.1032989993691444, 0.0989580005407333, 0.0998260006308555]
[0.1006940007209777, 0.0972220003604888, 0.0980900004506111]
[0.0980900004506111, 0.0946180000901222, 0.0954860001802444]
[0.0972220003604888, 0.0911459997296333, 0.0928819999098777]
[0.0928819999098777, 0.0894099995493888, 0.0902779996395111]
[0.0920139998197555, 0.0894099995493888, 0.0920139998197555]
[0.0954860001802444, 0.0911459997296333, 0.0946180000901222]
[0.0963540002703666, 0.0946180000901222, 0.0963540002703666]
[0.0963540002703666, 0.09375, 0.0954860001802444]
[0.0954860001802444, 0.0946180000901222, 0.0946180000901222]
[0.0972220003604888, 0.0946180000901222, 0.0954860001802444]
[0.0989580005407333, 0.0963540002703666, 0.0963540002703666]
[0.0972220003604888, 0.0963540002703666, 0.0963540002703666]
[0.0972220003604888, 0.092881999909877

In [None]:
#OLD
import torch, random, time, math
torch.manual_seed(1)
random.seed(1)

#params={}
def extract_features():
  return
exp_name="new-stock-pred-test29-0"
exp_name="new-stock-pred-test29-64-L2"
exp_name="new-stock-pred-test29-64-L2-LR1e-6"
exp_name="new-stock-pred-test39-64-L2-LR1e-6"
exp_name="new-stock-pred-test39-128-L2-LR1e-6"
exp_name="new-stock-unflattened-000001"
exp_name="new-stock-unflattened-128-000001"
exp_name="new-stock-unflattened-sample4-32-000001"
exp_name="new-stock-unflattened-sample-30-10-128-000001"
exp_name="new-stock-unflattened-sample-30-10-128-0000001"
exp_name="new-stock-unflattened-sample19-30-10-128-0000001"
exp_name="new-stock-unflattened-sample19-30-10-128-00000001"
exp_name="test-batches-1layer-000001-256"
exp_name="batches-1layer-000001-256-5day-pred-39stocks"
exp_name="batches-1layer-0000001-256-5day-pred-39stocks"
exp_name="batches-2layer-0000001-256-5day-pred-39stocks"

n_layers=2#4#3
n_hidden=256 #128 #128 #256 #64#64
#LR=0.0000001 #0.0000001
#LR=0.0000001
#LR=0.00000001
LR=0.0000001
n_epochs=100
n_data=None #number of items per source
train_batch_size=1000
cur_matching_in_out=False
train_ratio=0.8
model_dir="models"
#output_labels=standard_labels=ipa_symbol_list #combined_ipa_list
standard_labels=cur_params["standard_labels"]

n_batches=math.floor(len(train_data)/train_batch_size)
if n_batches==0: test_batch_size=len(test_data)
else: test_batch_size=math.floor(len(test_data)/n_batches)



item0=train_data[0]
ft_list,lb_list=item0[:2]
ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
n_input=ft_list_tensor.shape[-1]
n_output=lb_list_tensor.shape[0]

loss_func = nn.MSELoss()
rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=cur_matching_in_out).to(device)
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR) 



print(ft_list_tensor.shape)
print(lb_list_tensor.shape)
exp_dir_path=os.path.join(model_dir,exp_name)
if not os.path.exists(exp_dir_path): os.makedirs(exp_dir_path)
tmp_model_dir=os.path.join(exp_dir_path,"tmp") 
if not os.path.exists(tmp_model_dir): os.makedirs(tmp_model_dir)


log_fpath=os.path.join(exp_dir_path,"log.txt")
log_something(str(rnn),log_fpath)



#ft_list_tensor=ft_list_tensor.reshape([1,n_input])
rnn_out=rnn(ft_list_tensor)
print("rnn_out",rnn_out.shape)

for epoch_i in range(n_epochs):
  PATH=os.path.join(exp_dir_path,"model-%s.model"%epoch_i) #tmp_path
  if os.path.exists(PATH):
    try: checkpoint = torch.load(PATH)
    except: checkpoint = dill_unpickle(PATH)
    rnn.load_state_dict(checkpoint['model_state_dict'])
    print("loaded model for this epoch",PATH)
    rnn.train()
    continue  
  rnn.zero_grad()
  train_loss_items,test_loss_items=[],[]
  train_eval_items,test_eval_items=[],[]
  for batch_i0 in range(n_batches+1):
    t0=time.time()
    
    #pred_count,correct_count=0,0
    batch_i1=batch_i0+1
    cur_train_items=train_data[batch_i0*train_batch_size:batch_i1*train_batch_size]
    cur_test_items=test_data[batch_i0*test_batch_size:batch_i1*test_batch_size]
    print("batch_i0",batch_i0, "cur_train_items",len(cur_train_items),"cur_test_items",len(cur_test_items))
    tmp_path=os.path.join(tmp_model_dir, "model-batch-%s.model"%batch_i0)
    if os.path.exists(tmp_path):
      try: checkpoint = torch.load(tmp_path)
      except: checkpoint = dill_unpickle(tmp_path)
      rnn.load_state_dict(checkpoint['model_state_dict'])
      print("loaded model for this epoch",tmp_path)
      rnn.train()
      continue  


    for item_i,item0 in enumerate(cur_train_items):
      if item_i%5000==0: print("training",item_i, "out of:",len(cur_train_items))
      ft_list,lb_list=item0[:2]
      actual_outcome=item0[2]
      ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
      lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
      lb_tensor_flat=lb_list_tensor.ravel()
      rnn_out=rnn(ft_list_tensor)
      rnn_out_flat=rnn_out.ravel()
      loss = loss_func(rnn_out_flat,lb_tensor_flat) #calculate the loss, difference between the output and the desired outcome tensors
      loss.backward()
      optimizer.step() 
      train_loss_items.append(loss.item())
      cur_eval_item=eval_pred(rnn_out,actual_outcome,cur_params["standard_labels"])
      train_eval_items.append(cur_eval_item)

    for item_i,item0 in enumerate(cur_test_items):
      if item_i%5000==0: print("testing",item_i, "out of:",len(cur_test_items))
      ft_list,lb_list=item0[:2]
      actual_outcome=item0[2]
      ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
      lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
      lb_tensor_flat=lb_list_tensor.ravel()
      rnn_out=rnn(ft_list_tensor)
      rnn_out_flat=rnn_out.ravel()
      loss = loss_func(rnn_out_flat,lb_tensor_flat) #calculate the loss, difference between the output and the desired outcome tensors
      test_loss_items.append(loss.item())
      cur_eval_item=eval_pred(rnn_out,actual_outcome,cur_params["standard_labels"])
      test_eval_items.append(cur_eval_item)

    train_batch_loss_avg=sum(train_loss_items[-len(cur_train_items):])/len(cur_train_items)  
    test_batch_loss_avg=sum(test_loss_items[-len(cur_test_items):])/len(cur_test_items)
    train_batch_eval_avg=sum(train_eval_items[-len(cur_train_items):])/len(cur_train_items)  
    test_batch_eval_avg=sum(test_eval_items[-len(cur_test_items):])/len(cur_test_items)

    train_loss_avg=sum(train_loss_items)/len(train_loss_items)  
    test_loss_avg=sum(test_loss_items)/len(test_loss_items)
    train_eval_avg=sum(train_eval_items)/len(train_eval_items)  
    test_eval_avg=sum(test_eval_items)/len(test_eval_items)

    cur_checkpoint={
              'epoch': epoch_i,
              'n_input': n_input,
              'n_hidden': n_hidden,
              'n_layers': n_layers,
              'n_output': n_output,
              'output_labels': standard_labels,
              'model_state_dict': rnn.state_dict(),
              'LR': LR,
              'matching_in_out':cur_matching_in_out,
              'train_loss': train_loss_avg,
              'test_loss': test_loss_avg,
              'train_eval': train_eval_avg,
              'test_eval': test_eval_avg,
              'feature_extraction_parameters':cur_params,
              'feature_extraction_function':extract_features   
              }
    dill_pickle(cur_checkpoint, tmp_path)
    t1=time.time()
    elapsed=round(t1-t0,1)
    line="Epoch: %s - batch: %s out of %s - train_loss_avg: %s -  test_loss_avg: %s - train_eval_avg: %s - test_eval_avg: %s - elpased: %s"%(epoch_i,batch_i0,n_batches,round(train_batch_loss_avg,4),round(test_batch_loss_avg,4),round(train_batch_eval_avg,4),round(test_batch_eval_avg,4),elapsed)
    print(line)
    print(tmp_path)
    log_something(line,log_fpath)

  



  #train_eval_avg,test_eval_avg=0,0  
  #print("epoch_i",epoch_i,"train_loss_avg",round(train_loss_avg,4),"test_loss_avg",round(test_loss_avg,4)) 
  line="Epoch: %s - train_loss_avg: %s -  test_loss_avg: %s - train_eval_avg: %s - test_eval_avg: %s"%(epoch_i,round(train_loss_avg,4),round(test_loss_avg,4),round(train_eval_avg,4),round(test_eval_avg,4))
  #line="Epoch # %s  - Batch: %s / %s -  train loss: %s - test loss: %s - train eval: %s - test eval: %s - elapsed: %s"%(epoch0, batch_i0, n_batches, avg_train_loss,avg_test_loss, avg_train_eval,avg_test_eval, elapsed)
  print(line)
  log_something(line,log_fpath)
  dill_pickle(cur_checkpoint, PATH)
  print("model saved")
  for f in os.listdir(tmp_model_dir):
    tmp_fpath=os.path.join(tmp_model_dir,f)
    os.remove(tmp_fpath)
  print("deleted temporary files")
  print("-----------")



    #torch.save(cur_checkpoint, tmp_path)
  
  




torch.Size([30, 4])
torch.Size([130])
rnn_out torch.Size([1, 1, 130])
batch_i0 0 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-0.model
batch_i0 1 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-1.model
batch_i0 2 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-2.model
batch_i0 3 cur_train_items 1000 cur_test_items 250




loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-3.model
batch_i0 4 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-4.model
batch_i0 5 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-5.model
batch_i0 6 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-6.model
batch_i0 7 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-7.model
batch_i0 8 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-8.model
batch_i0 9 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5d

In [None]:
#OLD
import torch, random, time, math
torch.manual_seed(1)
random.seed(1)

#params={}
def extract_features():
  return
exp_name="new-stock-pred-test29-0"
exp_name="new-stock-pred-test29-64-L2"
exp_name="new-stock-pred-test29-64-L2-LR1e-6"
exp_name="new-stock-pred-test39-64-L2-LR1e-6"
exp_name="new-stock-pred-test39-128-L2-LR1e-6"
exp_name="new-stock-unflattened-000001"
exp_name="new-stock-unflattened-128-000001"
exp_name="new-stock-unflattened-sample4-32-000001"
exp_name="new-stock-unflattened-sample-30-10-128-000001"
exp_name="new-stock-unflattened-sample-30-10-128-0000001"
exp_name="new-stock-unflattened-sample19-30-10-128-0000001"
exp_name="new-stock-unflattened-sample19-30-10-128-00000001"
exp_name="test-batches-1layer-000001-256"
exp_name="batches-1layer-000001-256-5day-pred-39stocks"
exp_name="batches-1layer-0000001-256-5day-pred-39stocks"
exp_name="batches-2layer-0000001-256-5day-pred-39stocks"

n_layers=2#4#3
n_hidden=256 #128 #128 #256 #64#64
#LR=0.0000001 #0.0000001
#LR=0.0000001
#LR=0.00000001
LR=0.0000001
n_epochs=100
n_data=None #number of items per source
train_batch_size=1000
cur_matching_in_out=False
train_ratio=0.8
model_dir="models"
#output_labels=standard_labels=ipa_symbol_list #combined_ipa_list
standard_labels=cur_params["standard_labels"]

n_batches=math.floor(len(train_data)/train_batch_size)
if n_batches==0: test_batch_size=len(test_data)
else: test_batch_size=math.floor(len(test_data)/n_batches)



item0=train_data[0]
ft_list,lb_list=item0[:2]
ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
n_input=ft_list_tensor.shape[-1]
n_output=lb_list_tensor.shape[0]

loss_func = nn.MSELoss()
rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=cur_matching_in_out).to(device)
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR) 



print(ft_list_tensor.shape)
print(lb_list_tensor.shape)
exp_dir_path=os.path.join(model_dir,exp_name)
if not os.path.exists(exp_dir_path): os.makedirs(exp_dir_path)
tmp_model_dir=os.path.join(exp_dir_path,"tmp") 
if not os.path.exists(tmp_model_dir): os.makedirs(tmp_model_dir)


log_fpath=os.path.join(exp_dir_path,"log.txt")
log_something(str(rnn),log_fpath)



#ft_list_tensor=ft_list_tensor.reshape([1,n_input])
rnn_out=rnn(ft_list_tensor)
print("rnn_out",rnn_out.shape)

for epoch_i in range(n_epochs):
  PATH=os.path.join(exp_dir_path,"model-%s.model"%epoch_i) #tmp_path
  if os.path.exists(PATH):
    try: checkpoint = torch.load(PATH)
    except: checkpoint = dill_unpickle(PATH)
    rnn.load_state_dict(checkpoint['model_state_dict'])
    print("loaded model for this epoch",PATH)
    rnn.train()
    continue  
  rnn.zero_grad()
  train_loss_items,test_loss_items=[],[]
  train_eval_items,test_eval_items=[],[]
  for batch_i0 in range(n_batches+1):
    t0=time.time()
    
    #pred_count,correct_count=0,0
    batch_i1=batch_i0+1
    cur_train_items=train_data[batch_i0*train_batch_size:batch_i1*train_batch_size]
    cur_test_items=test_data[batch_i0*test_batch_size:batch_i1*test_batch_size]
    print("batch_i0",batch_i0, "cur_train_items",len(cur_train_items),"cur_test_items",len(cur_test_items))
    tmp_path=os.path.join(tmp_model_dir, "model-batch-%s.model"%batch_i0)
    if os.path.exists(tmp_path):
      try: checkpoint = torch.load(tmp_path)
      except: checkpoint = dill_unpickle(tmp_path)
      rnn.load_state_dict(checkpoint['model_state_dict'])
      print("loaded model for this epoch",tmp_path)
      rnn.train()
      continue  


    for item_i,item0 in enumerate(cur_train_items):
      if item_i%5000==0: print("training",item_i, "out of:",len(cur_train_items))
      ft_list,lb_list=item0[:2]
      actual_outcome=item0[2]
      ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
      lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
      lb_tensor_flat=lb_list_tensor.ravel()
      rnn_out=rnn(ft_list_tensor)
      rnn_out_flat=rnn_out.ravel()
      loss = loss_func(rnn_out_flat,lb_tensor_flat) #calculate the loss, difference between the output and the desired outcome tensors
      loss.backward()
      optimizer.step() 
      train_loss_items.append(loss.item())
      cur_eval_item=eval_pred(rnn_out,actual_outcome,cur_params["standard_labels"])
      train_eval_items.append(cur_eval_item)

    for item_i,item0 in enumerate(cur_test_items):
      if item_i%5000==0: print("testing",item_i, "out of:",len(cur_test_items))
      ft_list,lb_list=item0[:2]
      actual_outcome=item0[2]
      ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
      lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
      lb_tensor_flat=lb_list_tensor.ravel()
      rnn_out=rnn(ft_list_tensor)
      rnn_out_flat=rnn_out.ravel()
      loss = loss_func(rnn_out_flat,lb_tensor_flat) #calculate the loss, difference between the output and the desired outcome tensors
      test_loss_items.append(loss.item())
      cur_eval_item=eval_pred(rnn_out,actual_outcome,cur_params["standard_labels"])
      test_eval_items.append(cur_eval_item)

    train_batch_loss_avg=sum(train_loss_items[-len(cur_train_items):])/len(cur_train_items)  
    test_batch_loss_avg=sum(test_loss_items[-len(cur_test_items):])/len(cur_test_items)
    train_batch_eval_avg=sum(train_eval_items[-len(cur_train_items):])/len(cur_train_items)  
    test_batch_eval_avg=sum(test_eval_items[-len(cur_test_items):])/len(cur_test_items)

    train_loss_avg=sum(train_loss_items)/len(train_loss_items)  
    test_loss_avg=sum(test_loss_items)/len(test_loss_items)
    train_eval_avg=sum(train_eval_items)/len(train_eval_items)  
    test_eval_avg=sum(test_eval_items)/len(test_eval_items)

    cur_checkpoint={
              'epoch': epoch_i,
              'n_input': n_input,
              'n_hidden': n_hidden,
              'n_layers': n_layers,
              'n_output': n_output,
              'output_labels': standard_labels,
              'model_state_dict': rnn.state_dict(),
              'LR': LR,
              'matching_in_out':cur_matching_in_out,
              'train_loss': train_loss_avg,
              'test_loss': test_loss_avg,
              'train_eval': train_eval_avg,
              'test_eval': test_eval_avg,
              'feature_extraction_parameters':cur_params,
              'feature_extraction_function':extract_features   
              }
    dill_pickle(cur_checkpoint, tmp_path)
    t1=time.time()
    elapsed=round(t1-t0,1)
    line="Epoch: %s - batch: %s out of %s - train_loss_avg: %s -  test_loss_avg: %s - train_eval_avg: %s - test_eval_avg: %s - elpased: %s"%(epoch_i,batch_i0,n_batches,round(train_batch_loss_avg,4),round(test_batch_loss_avg,4),round(train_batch_eval_avg,4),round(test_batch_eval_avg,4),elapsed)
    print(line)
    print(tmp_path)
    log_something(line,log_fpath)

  



  #train_eval_avg,test_eval_avg=0,0  
  #print("epoch_i",epoch_i,"train_loss_avg",round(train_loss_avg,4),"test_loss_avg",round(test_loss_avg,4)) 
  line="Epoch: %s - train_loss_avg: %s -  test_loss_avg: %s - train_eval_avg: %s - test_eval_avg: %s"%(epoch_i,round(train_loss_avg,4),round(test_loss_avg,4),round(train_eval_avg,4),round(test_eval_avg,4))
  #line="Epoch # %s  - Batch: %s / %s -  train loss: %s - test loss: %s - train eval: %s - test eval: %s - elapsed: %s"%(epoch0, batch_i0, n_batches, avg_train_loss,avg_test_loss, avg_train_eval,avg_test_eval, elapsed)
  print(line)
  log_something(line,log_fpath)
  dill_pickle(cur_checkpoint, PATH)
  print("model saved")
  for f in os.listdir(tmp_model_dir):
    tmp_fpath=os.path.join(tmp_model_dir,f)
    os.remove(tmp_fpath)
  print("deleted temporary files")
  print("-----------")



    #torch.save(cur_checkpoint, tmp_path)
  
  




torch.Size([30, 4])
torch.Size([130])
rnn_out torch.Size([1, 1, 130])
batch_i0 0 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-0.model
batch_i0 1 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-1.model
batch_i0 2 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-2.model
batch_i0 3 cur_train_items 1000 cur_test_items 250




loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-3.model
batch_i0 4 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-4.model
batch_i0 5 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-5.model
batch_i0 6 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-6.model
batch_i0 7 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-7.model
batch_i0 8 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-8.model
batch_i0 9 cur_train_items 1000 cur_test_items 250
loaded model for this epoch models/batches-2layer-0000001-256-5d

In [None]:
import torch, random
torch.manual_seed(1)
random.seed(1)

def eval_pred1(rnn_out0,actual_outcome0,standard_labels0,gain_threshold0=5):
  gain_loss=0
  rnn_out_flat=rnn_out0.ravel()
  preds=out2labels(rnn_out_flat,standard_labels0)
  eval_list=[]
  for ac0,pred0 in zip(actual_outcome0,preds):
    cur_pred=[(v[0],round(v[1].item(),4)) for v in pred0]
    eval_list.append((ac0,cur_pred))
  for i0 in range(0,len(eval_list),2):
    cur_high=eval_list[i0]
    cur_low=eval_list[i0+1]
    high_ac,high_preds=cur_high
    low_ac,low_preds=cur_low
    sorted_high_preds=[(int(v[0]),v[1]) for v in high_preds]
    sorted_high_preds=sorted(sorted_high_preds,key=lambda x:int(x[0]))
    sum_high_pred_vals=sum([v[1] for v in sorted_high_preds])
    sorted_low_preds=[(int(v[0]),v[1]) for v in low_preds]
    sorted_low_preds=sorted(sorted_low_preds,key=lambda x:int(x[0]))
    sum_low_pred_vals=sum([v[1] for v in sorted_low_preds])

    #print("actual high:",high_ac,"sorted_high_preds",sorted_high_preds)
    print(int(i0/2), "actual high:",high_ac, "actual low:",low_ac)
    for hi,hi_lb_wt in enumerate(sorted_high_preds):
      hi_lb0,hi_wt0=hi_lb_wt
      low_lb0,low_wt0=low_lb_wt=sorted_low_preds[hi]
      
      if hi_lb0<0: hi_cum_wt=sum([v[1] for v in sorted_high_preds[:hi+1]])
      elif hi_lb0>0: hi_cum_wt=sum([v[1] for v in sorted_high_preds[hi:]])
      else: hi_cum_wt=hi_wt0
      avg_hi_cum_wt=hi_cum_wt/sum_high_pred_vals

      if low_lb0<0: low_cum_wt=sum([v[1] for v in sorted_low_preds[:hi+1]])
      elif low_lb0>0: low_cum_wt=sum([v[1] for v in sorted_low_preds[hi:]])
      else: low_cum_wt=low_wt0
      avg_low_cum_wt=low_cum_wt/sum_low_pred_vals

      #print(hi,hi_lb_wt,round(avg_cum_wt,4))
      print("high:", hi_lb_wt,round(avg_hi_cum_wt,4),"low:", low_lb_wt,round(avg_low_cum_wt,4))
    
    
    #print("cur_high",cur_high,"cur_low",cur_low)
    #print()


    # ac_high,pred_high_wt=cur_high
    # pred_high,pred_wt=pred_high_wt
    # print(i0/2,cur_high)
    # if i0>0 and int(pred_high)>=gain_threshold0: 
    #   #print(i0/2, "Sell Decision - predicted: %s - actual: %s"%(pred_high,ac_high))
    #   gain_loss=int(ac_high)
    #   break
  return gain_loss


#epoch_i=5
file_i=126 #48


#exp_name="new-stock-pred-test2"

# epoch_i=4
# exp_name="new-stock-pred-test29-64-L2-LR1e-6"

# epoch_i=4
# exp_name="new-stock-pred-test39-64-L2-LR1e-6"

#epoch_i=40
# epoch_i=20#22
# exp_name="new-stock-unflattened-000001"

epoch_i=3
#epoch_i=2
#exp_name="new-stock-unflattened-128-000001"
#exp_name="new-stock-unflattened-sample-30-10-128-0000001"
exp_name="new-stock-unflattened-sample19-30-10-128-0000001"

epoch_i=0
exp_name="test-batches-1layer-000001-256"

model_dir="models"
exp_dir_path=os.path.join(model_dir,exp_name)
tmp_path=os.path.join(exp_dir_path,"model-%s.model"%epoch_i)

tmp_path='models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-25.model'
tmp_path='models/batches-2layer-0000001-256-5day-pred-39stocks/tmp/model-batch-70.model'
try: checkpoint = torch.load(tmp_path)
except: checkpoint = dill_unpickle(tmp_path)
rnn = RNN(checkpoint["n_input"], checkpoint["n_hidden"] , checkpoint["n_output"] , checkpoint["n_layers"] , checkpoint["matching_in_out"]).to(device)
n_input=checkpoint["n_input"]
standard_labels=checkpoint["output_labels"]
cur_parameters=checkpoint["feature_extraction_parameters"]
rnn.load_state_dict(checkpoint['model_state_dict'])
rnn.eval()

root_dir='stock_market_data/sp500/csv'
files=['ABC', 'ADP', 'A', 'ABT', 'ABMD', 'ADI', 'ABBV', 'AAPL', 'ADSK', 'ADM', 'ACN', 'AAP', 'AAL', 'ALGN', 'APH', 'AOS', 'AWK', 'ALLE', 'AME', 'APD', 'ARE', 'AIZ', 'ALB', 'APA', 'ALK', 'AEE', 'AMGN', 'ANTM', 'AEP', 'AON', 'AKAM', 'AXP', 'AMD', 'AMAT', 'AMP', 'ANET', 'AJG', 'AZO', 'ATVI', 'AMZN', 'AMT', 'AVB', 'ALTR', 'AVY', 'CAH', 'CDNS', 'BIO', 'CDE', 'BXP', 'BK', 'BEN', 'C', 'BMRA', 'BAX', 'BLK', 'BF-A', 'BDX', 'BR', 'BSHI', 'CB', 'CAG', 'BIIB', 'BAC', 'BMY', 'CCI', 'BSX', 'CAT', 'BRK-A', 'BBY', 'BA', 'BWA', 'CME', 'CNWT', 'CF', 'CTXS', 'D', 'CTSH', 'CHD', 'CFG', 'DFS', 'CPICQ', 'DG', 'CRM', 'CHRW', 'CLX', 'DGX', 'CPB', 'COTY', 'CHTR', 'COP', 'CNC', 'CNP', 'DE', 'COO', 'CUK', 'CPRT', 'COST', 'CINF', 'CMG', 'CL', 'CTQ', 'CTAS', 'CMI', 'CSCO', 'COWN', 'DAL', 'DTE', 'ENS', 'EQIX', 'DRE', 'DOV', 'DHI', 'EW', 'ES', 'EQR', 'DIS', 'DPZ', 'FANG', 'EXR', 'EMR', 'DLTR', 'EMN', 'FAST', 'DVA', 'EBAY', 'EA', 'DRI', 'EOG', 'EL', 'ESS', 'EIX', 'DXCM', 'EFX', 'F', 'ECL', 'ED', 'GS-PJ', 'GILD', 'GIS', 'FMBM', 'FPLPF', 'GM', 'FBHS', 'HBAN', 'FLS', 'FIS', 'FE', 'FRT', 'FRMC', 'FFIV', 'GWW', 'GRMN', 'GGG', 'FN', 'GOOG', 'GPC', 'FLT', 'FITB', 'FCX', 'FISV', 'GPN', 'FMC', 'FRC', 'HAL', 'FDX', 'FCGN', 'FB', 'GE', 'FTI', 'GD', 'HAS', 'HD', 'INTU', 'IFF', 'IRM', 'ICE', 'HLT', 'IDXX', 'HII', 'ILMN', 'HTLF', 'HPQ', 'HON', 'IBM', 'IPGP', 'HCA', 'HRL', 'IR', 'HSY', 'HOLX', 'ISRG', 'HPE', 'HRB', 'HSIC', 'INTH', 'HFC', 'HBI', 'HUM', 'IP', 'HST', 'IEX', 'HES', 'KSU', 'LNT', 'KRA', 'KHC', 'KR', 'KMB', 'JKHY', 'JNJ', 'IT', 'LEG', 'ITW', 'KSS', 'KEY', 'JNPR', 'LKQ', 'KIM', 'IVZ', 'KO', 'LNC', 'JBHT', 'LDOS', 'KMX', 'LMT', 'K', 'JPM', 'KGNR', 'KACPF', 'JCI', 'LH', 'KEYS', 'LBTYA', 'MSFT', 'MKTX', 'LYB', 'MCO', 'MRO', 'MDLZ', 'MLM', 'LVS', 'LRCX', 'MSCI', 'MOS', 'MRK', 'MET', 'MGM', 'MNST', 'MMC', 'MO', 'MCHP', 'LYV', 'MHK', 'MDT', 'LUV', 'MCK', 'MS-PF', 'MMM', 'MAA', 'MCD', 'MPC', 'MAR', 'LOW', 'MRCR', 'NOV', 'PEG', 'NVRO', 'NTRR', 'MU', 'NTRA', 'PAYX', 'NSC', 'NRG', 'ODFL', 'NTAP', 'PBCT', 'NFLX', 'ORLY', 'OMC', 'NTRS', 'NCTKF', 'NOXL', 'OKE', 'NI', 'NVR', 'NOC', 'O', 'NOW', 'PCAR', 'NEE', 'NLSN', 'NWL', 'MSI', 'NDAQ', 'NMHLY', 'OXY', 'NOK', 'NEOG', 'NCLH', 'RF', 'PSX', 'RE', 'PPG', 'ROK', 'PXD', 'RIBT', 'RCL', 'REGN', 'RMD', 'PKI', 'RL', 'RJF', 'PG', 'QRVO', 'REG', 'PHM', 'PNWRF', 'PKG', 'PNW', 'PLD', 'PVH', 'PM', 'PNR', 'PWR', 'PH', 'RLI', 'PEP', 'PRU', 'PFE', 'RHI', 'ROST', 'TAP', 'TEL', 'SRG', 'SLG', 'RSG', 'SYK', 'SNPS', 'SCHW', 'SHW', 'RXMD', 'SEGXF', 'SWKS', 'SBUX', 'RSNHF', 'SWK', 'SONC', 'ROP', 'STZ-B', 'TCYSF', 'STT', 'SPG', 'SYF', 'T', 'STX', 'SIVB', 'SO', 'ROL', 'TJX', 'SEE', 'SLB', 'SRE', 'VZ', 'UNP', 'TMUS', 'TRAUF', 'V', 'TW', 'VRSK', 'TWTR', 'URI', 'ULTA', 'UPS', 'UDR', 'TSN', 'UAL', 'TSCO', 'TTWO', 'VRSN', 'UA', 'TMO', 'WBA', 'TXN', 'UNM', 'USB', 'TXT', 'VMC', 'WAT', 'UHS', 'UEEC', 'VTR', 'TYL', 'TROW', 'TRV', 'VFC', 'WYNN', 'WSPOF', 'WU', 'YUM', 'XYL', 'WST', 'WRK', 'WEC', 'WM', 'ZTS', 'ZBH', 'XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']

fname="%s.csv"%files[file_i]
cur_fpath=os.path.join(root_dir,fname)
tmp_data=get_data_full(cur_fpath,cur_parameters)

arbitrary_i=3
gain_loss_items=[]
arbitrary_items=[]
random_items=[]
#for item_i in range(len(tmp_data)):
for item_i in range(100):  
  if item_i%500==0:print("item_i",item_i, "out of:", len(tmp_data))
  #print("item_i",item_i)
  item0=tmp_data[item_i]
  ft_list,lb_list=item0[:2]
  actual_labels=item0[2]
  actual_pairs=get_pairs(actual_labels)
  cur_pair=actual_pairs[arbitrary_i]
  arbitrary_items.append(int(cur_pair[0]))
  random_i=random.randint(2,4)
  random_pair=actual_pairs[random_i]
  random_items.append(int(random_pair[0]))  
  #print(cur_pair)
  ft_list_tensor=torch.tensor(ft_list,dtype=torch.float32)
  #ft_list_tensor=ft_list_tensor.reshape([1,n_input])
  lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
  lb_tensor_flat=lb_list_tensor.ravel()
  rnn_out=rnn(ft_list_tensor)
  cur_gain_loss=eval_pred1(rnn_out,actual_labels,standard_labels)
  gain_loss_items.append(cur_gain_loss)
  print("------")


overall_gain_loss=sum(gain_loss_items)
avg_gain_loss=sum(gain_loss_items)/len(gain_loss_items)
n_gain_items=len([v for v in gain_loss_items if v>0])
n_loss_items=len([v for v in gain_loss_items if v<0])
percent_gain_items=round(100*n_gain_items/len(gain_loss_items))
percent_loss_items=round(100*n_loss_items/len(gain_loss_items))

avg_arbitrary=sum(arbitrary_items)/len(arbitrary_items)
avg_random=sum(random_items)/len(random_items)

print(fname, "epoch",epoch_i)
print("overall_gain_loss",overall_gain_loss)
print("avg_gain_loss",round(avg_gain_loss,2))
print("avg_arbitrary",round(avg_arbitrary,2))
print("avg_random",round(avg_random,2))
print("n_gain_items",n_gain_items)
print("n_loss_items",n_loss_items)
print("percent_gain_items",percent_gain_items)
print("percent_loss_items",percent_loss_items)


item_i 0 out of: 6706
0 actual high: 0 actual low: -3
high: (-6, -0.0065) -0.0081 low: (-6, 0.0099) 0.01
high: (-5, 0.0382) 0.0393 low: (-5, 0.0215) 0.0318
high: (-4, -0.0197) 0.0149 low: (-4, -0.0224) 0.0091
high: (-3, -0.0234) -0.0141 low: (-3, 0.0478) 0.0574
high: (-2, 0.0068) -0.0057 low: (-2, 0.08) 0.1383
high: (-1, 0.0214) 0.0208 low: (-1, 0.3261) 0.4681
high: (0, 0.5202) 0.6448 low: (0, 0.5382) 0.5442
high: (1, 0.1761) 0.3344 low: (1, 0.0066) -0.0123
high: (2, 0.0836) 0.1161 low: (2, 0.0014) -0.019
high: (3, 0.0018) 0.0125 low: (3, 0.0211) -0.0204
high: (4, -0.0349) 0.0103 low: (4, -0.0373) -0.0418
high: (5, -0.007) 0.0535 low: (5, -0.0002) -0.004
high: (6, 0.0502) 0.0622 low: (6, -0.0038) -0.0038
1 actual high: -2 actual low: -6
high: (-6, 0.025) 0.026 low: (-6, 0.0374) 0.0435
high: (-5, 0.0115) 0.038 low: (-5, -0.034) 0.004
high: (-4, 0.0242) 0.0632 low: (-4, 0.0366) 0.0465
high: (-3, 0.0144) 0.0782 low: (-3, 0.0598) 0.116
high: (-2, 0.026) 0.1053 low: (-2, 0.1027) 0.2354
high



0 actual high: 1 actual low: -2
high: (-6, -0.007) -0.0087 low: (-6, 0.01) 0.0102
high: (-5, 0.0383) 0.0391 low: (-5, 0.0216) 0.0321
high: (-4, -0.0197) 0.0145 low: (-4, -0.0221) 0.0096
high: (-3, -0.0235) -0.0148 low: (-3, 0.048) 0.0584
high: (-2, 0.0064) -0.0069 low: (-2, 0.0797) 0.1393
high: (-1, 0.0213) 0.0197 low: (-1, 0.3242) 0.4684
high: (0, 0.5174) 0.6456 low: (0, 0.5352) 0.5434
high: (1, 0.175) 0.3347 low: (1, 0.0064) -0.0118
high: (2, 0.0834) 0.1163 low: (2, 0.0019) -0.0183
high: (3, 0.0021) 0.0122 low: (3, 0.0212) -0.0202
high: (4, -0.035) 0.0096 low: (4, -0.0376) -0.0417
high: (5, -0.0072) 0.0533 low: (5, 0.0002) -0.0036
high: (6, 0.0499) 0.0623 low: (6, -0.0037) -0.0038
1 actual high: 3 actual low: 0
high: (-6, 0.0249) 0.0261 low: (-6, 0.0369) 0.0432
high: (-5, 0.0113) 0.0379 low: (-5, -0.0336) 0.0039
high: (-4, 0.0237) 0.0628 low: (-4, 0.0365) 0.0466
high: (-3, 0.0147) 0.0782 low: (-3, 0.0592) 0.1159
high: (-2, 0.0262) 0.1056 low: (-2, 0.1025) 0.2358
high: (-1, 0.0526) 0.

KeyboardInterrupt: ignored

#New processing - Aug 22

In [None]:

cur_params={}
cur_params["prev_n"]=30#30
cur_params["next_n"]=5#10
cur_params["max_percent"]=5
cur_params["columns"]=["High","Low","Open","Close"]
#cur_params["labels"]=["day:0;low:<0","day:0;low:<-2","day:1234;high:>2","day:1234;high:>5","day:34;high:>2","day:34;high:>5","day:4;close:<0","day:4;close:>0","day:4;close:<-3","day:4;close:>3"]
#cur_params["labels"]=["day:0;low:<0","day:0;low:<-2","day:0;low:<2","day:1234;high:>2","day:1234;high:>5","day:4;close:<0","day:4;close:>0","day:4;close:<-3","day:4;close:>3"]
cur_params["pred_labels"]=["day:0;low:<0","day:0;low:<-2","day:0;low:<2","day:1234;high:>0","day:1234;high:>2","day:1234;high:>5","day:4;close:<0"]
cur_params["gain_threshold"]=5
cur_params["flatten"]=False
cur_params["standard_labels"]=gen_labels(cur_params["max_percent"])


def get_label_spec_dict(label0): # e.g. "day:0;low:<0"
  tmp_dict0={}
  lb_split=label0.split(";")
  for sp0 in lb_split:
    colon_split=sp0.split(":")
    tmp_dict0[colon_split[0]]=colon_split[1]
  return tmp_dict0

def get_specs_from_next_data(normalized_next0,params0={}):
  pred_labels0=params0.get("pred_labels",[])
  label_val_dict0={}
  label_spec_dict0={}
  for a in pred_labels0:
    label_val_dict0[a]=0.
    spec0=get_label_spec_dict(a)
    label_spec_dict0[a]=spec0
  for day0,item0 in enumerate(normalized_next0):
    high0,low0,open0,close0=item0
    local_dict={}
    local_dict["high"]=high0
    local_dict["low"]=low0
    local_dict["open"]=open0
    local_dict["close"]=close0
    for lb0 in pred_labels0:
      corr_tmp_dict=label_spec_dict0[lb0]
      if not str(day0) in corr_tmp_dict.get("day",""): continue
      for ld_key,ld_val in local_dict.items():
        corr_rule=corr_tmp_dict.get(ld_key)
        if corr_rule==None: continue
        comparator=corr_rule[0]
        compared_to=int(corr_rule[1:])
        ld_val_percent=100*ld_val
        outcome=0.
        if comparator==">" and ld_val_percent>=compared_to: outcome=1.
        if comparator=="<" and ld_val_percent<=compared_to: outcome=1.
        if comparator=="=" and int(ld_val_percent)==compared_to: outcome=1.
        if outcome==1: label_val_dict0[lb0]=1.
  return label_val_dict0


file_i=125 #48
root_dir='stock_market_data/sp500/csv'
files=['ABC', 'ADP', 'A', 'ABT', 'ABMD', 'ADI', 'ABBV', 'AAPL', 'ADSK', 'ADM', 'ACN', 'AAP', 'AAL', 'ALGN', 'APH', 'AOS', 'AWK', 'ALLE', 'AME', 'APD', 'ARE', 'AIZ', 'ALB', 'APA', 'ALK', 'AEE', 'AMGN', 'ANTM', 'AEP', 'AON', 'AKAM', 'AXP', 'AMD', 'AMAT', 'AMP', 'ANET', 'AJG', 'AZO', 'ATVI', 'AMZN', 'AMT', 'AVB', 'ALTR', 'AVY', 'CAH', 'CDNS', 'BIO', 'CDE', 'BXP', 'BK', 'BEN', 'C', 'BMRA', 'BAX', 'BLK', 'BF-A', 'BDX', 'BR', 'BSHI', 'CB', 'CAG', 'BIIB', 'BAC', 'BMY', 'CCI', 'BSX', 'CAT', 'BRK-A', 'BBY', 'BA', 'BWA', 'CME', 'CNWT', 'CF', 'CTXS', 'D', 'CTSH', 'CHD', 'CFG', 'DFS', 'CPICQ', 'DG', 'CRM', 'CHRW', 'CLX', 'DGX', 'CPB', 'COTY', 'CHTR', 'COP', 'CNC', 'CNP', 'DE', 'COO', 'CUK', 'CPRT', 'COST', 'CINF', 'CMG', 'CL', 'CTQ', 'CTAS', 'CMI', 'CSCO', 'COWN', 'DAL', 'DTE', 'ENS', 'EQIX', 'DRE', 'DOV', 'DHI', 'EW', 'ES', 'EQR', 'DIS', 'DPZ', 'FANG', 'EXR', 'EMR', 'DLTR', 'EMN', 'FAST', 'DVA', 'EBAY', 'EA', 'DRI', 'EOG', 'EL', 'ESS', 'EIX', 'DXCM', 'EFX', 'F', 'ECL', 'ED', 'GS-PJ', 'GILD', 'GIS', 'FMBM', 'FPLPF', 'GM', 'FBHS', 'HBAN', 'FLS', 'FIS', 'FE', 'FRT', 'FRMC', 'FFIV', 'GWW', 'GRMN', 'GGG', 'FN', 'GOOG', 'GPC', 'FLT', 'FITB', 'FCX', 'FISV', 'GPN', 'FMC', 'FRC', 'HAL', 'FDX', 'FCGN', 'FB', 'GE', 'FTI', 'GD', 'HAS', 'HD', 'INTU', 'IFF', 'IRM', 'ICE', 'HLT', 'IDXX', 'HII', 'ILMN', 'HTLF', 'HPQ', 'HON', 'IBM', 'IPGP', 'HCA', 'HRL', 'IR', 'HSY', 'HOLX', 'ISRG', 'HPE', 'HRB', 'HSIC', 'INTH', 'HFC', 'HBI', 'HUM', 'IP', 'HST', 'IEX', 'HES', 'KSU', 'LNT', 'KRA', 'KHC', 'KR', 'KMB', 'JKHY', 'JNJ', 'IT', 'LEG', 'ITW', 'KSS', 'KEY', 'JNPR', 'LKQ', 'KIM', 'IVZ', 'KO', 'LNC', 'JBHT', 'LDOS', 'KMX', 'LMT', 'K', 'JPM', 'KGNR', 'KACPF', 'JCI', 'LH', 'KEYS', 'LBTYA', 'MSFT', 'MKTX', 'LYB', 'MCO', 'MRO', 'MDLZ', 'MLM', 'LVS', 'LRCX', 'MSCI', 'MOS', 'MRK', 'MET', 'MGM', 'MNST', 'MMC', 'MO', 'MCHP', 'LYV', 'MHK', 'MDT', 'LUV', 'MCK', 'MS-PF', 'MMM', 'MAA', 'MCD', 'MPC', 'MAR', 'LOW', 'MRCR', 'NOV', 'PEG', 'NVRO', 'NTRR', 'MU', 'NTRA', 'PAYX', 'NSC', 'NRG', 'ODFL', 'NTAP', 'PBCT', 'NFLX', 'ORLY', 'OMC', 'NTRS', 'NCTKF', 'NOXL', 'OKE', 'NI', 'NVR', 'NOC', 'O', 'NOW', 'PCAR', 'NEE', 'NLSN', 'NWL', 'MSI', 'NDAQ', 'NMHLY', 'OXY', 'NOK', 'NEOG', 'NCLH', 'RF', 'PSX', 'RE', 'PPG', 'ROK', 'PXD', 'RIBT', 'RCL', 'REGN', 'RMD', 'PKI', 'RL', 'RJF', 'PG', 'QRVO', 'REG', 'PHM', 'PNWRF', 'PKG', 'PNW', 'PLD', 'PVH', 'PM', 'PNR', 'PWR', 'PH', 'RLI', 'PEP', 'PRU', 'PFE', 'RHI', 'ROST', 'TAP', 'TEL', 'SRG', 'SLG', 'RSG', 'SYK', 'SNPS', 'SCHW', 'SHW', 'RXMD', 'SEGXF', 'SWKS', 'SBUX', 'RSNHF', 'SWK', 'SONC', 'ROP', 'STZ-B', 'TCYSF', 'STT', 'SPG', 'SYF', 'T', 'STX', 'SIVB', 'SO', 'ROL', 'TJX', 'SEE', 'SLB', 'SRE', 'VZ', 'UNP', 'TMUS', 'TRAUF', 'V', 'TW', 'VRSK', 'TWTR', 'URI', 'ULTA', 'UPS', 'UDR', 'TSN', 'UAL', 'TSCO', 'TTWO', 'VRSN', 'UA', 'TMO', 'WBA', 'TXN', 'UNM', 'USB', 'TXT', 'VMC', 'WAT', 'UHS', 'UEEC', 'VTR', 'TYL', 'TROW', 'TRV', 'VFC', 'WYNN', 'WSPOF', 'WU', 'YUM', 'XYL', 'WST', 'WRK', 'WEC', 'WM', 'ZTS', 'ZBH', 'XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']
fname="%s.csv"%files[file_i]
cur_fpath=os.path.join(root_dir,fname)

start_i=751
n_prev=30
n_next=5

cur_data0=get_csv_data_new(cur_fpath)
print(len(cur_data0), cur_data0[0])
prev0=cur_data0[start_i:start_i+n_prev]
next0=cur_data0[start_i+n_prev:start_i+n_prev+n_next]
pv,nx=normalize_ft_labels_new(prev0,next0)

for n0 in nx:
  print([round(v,4) for v in n0])
cur_label_val_dict=get_specs_from_next_data(nx,cur_params)
for a in cur_params["pred_labels"]:
  print(a, cur_label_val_dict[a])


8165 [0.53125, 0.515625, 0.0, 0.515625]
[0.0152, -0.0227, 0.0076, 0.0076]
[0.053, 0.0152, 0.0303, 0.0492]
[0.0682, 0.0455, 0.0455, 0.0606]
[-0.0076, -0.053, -0.0152, -0.0076]
[0.0152, -0.0379, 0.0, -0.0227]
day:0;low:<0 1.0
day:0;low:<-2 1.0
day:0;low:<2 1.0
day:1234;high:>0 1.0
day:1234;high:>2 1.0
day:1234;high:>5 1.0
day:4;close:<0 1.0


In [None]:
root_dir='stock_market_data/sp500/csv'
files=['ABC', 'ADP', 'A', 'ABT', 'ABMD', 'ADI', 'ABBV', 'AAPL', 'ADSK', 'ADM', 'ACN', 'AAP', 'AAL', 'ALGN', 'APH', 'AOS', 'AWK', 'ALLE', 'AME', 'APD', 'ARE', 'AIZ', 'ALB', 'APA', 'ALK', 'AEE', 'AMGN', 'ANTM', 'AEP', 'AON', 'AKAM', 'AXP', 'AMD', 'AMAT', 'AMP', 'ANET', 'AJG', 'AZO', 'ATVI', 'AMZN', 'AMT', 'AVB', 'ALTR', 'AVY', 'CAH', 'CDNS', 'BIO', 'CDE', 'BXP', 'BK', 'BEN', 'C', 'BMRA', 'BAX', 'BLK', 'BF-A', 'BDX', 'BR', 'BSHI', 'CB', 'CAG', 'BIIB', 'BAC', 'BMY', 'CCI', 'BSX', 'CAT', 'BRK-A', 'BBY', 'BA', 'BWA', 'CME', 'CNWT', 'CF', 'CTXS', 'D', 'CTSH', 'CHD', 'CFG', 'DFS', 'CPICQ', 'DG', 'CRM', 'CHRW', 'CLX', 'DGX', 'CPB', 'COTY', 'CHTR', 'COP', 'CNC', 'CNP', 'DE', 'COO', 'CUK', 'CPRT', 'COST', 'CINF', 'CMG', 'CL', 'CTQ', 'CTAS', 'CMI', 'CSCO', 'COWN', 'DAL', 'DTE', 'ENS', 'EQIX', 'DRE', 'DOV', 'DHI', 'EW', 'ES', 'EQR', 'DIS', 'DPZ', 'FANG', 'EXR', 'EMR', 'DLTR', 'EMN', 'FAST', 'DVA', 'EBAY', 'EA', 'DRI', 'EOG', 'EL', 'ESS', 'EIX', 'DXCM', 'EFX', 'F', 'ECL', 'ED', 'GS-PJ', 'GILD', 'GIS', 'FMBM', 'FPLPF', 'GM', 'FBHS', 'HBAN', 'FLS', 'FIS', 'FE', 'FRT', 'FRMC', 'FFIV', 'GWW', 'GRMN', 'GGG', 'FN', 'GOOG', 'GPC', 'FLT', 'FITB', 'FCX', 'FISV', 'GPN', 'FMC', 'FRC', 'HAL', 'FDX', 'FCGN', 'FB', 'GE', 'FTI', 'GD', 'HAS', 'HD', 'INTU', 'IFF', 'IRM', 'ICE', 'HLT', 'IDXX', 'HII', 'ILMN', 'HTLF', 'HPQ', 'HON', 'IBM', 'IPGP', 'HCA', 'HRL', 'IR', 'HSY', 'HOLX', 'ISRG', 'HPE', 'HRB', 'HSIC', 'INTH', 'HFC', 'HBI', 'HUM', 'IP', 'HST', 'IEX', 'HES', 'KSU', 'LNT', 'KRA', 'KHC', 'KR', 'KMB', 'JKHY', 'JNJ', 'IT', 'LEG', 'ITW', 'KSS', 'KEY', 'JNPR', 'LKQ', 'KIM', 'IVZ', 'KO', 'LNC', 'JBHT', 'LDOS', 'KMX', 'LMT', 'K', 'JPM', 'KGNR', 'KACPF', 'JCI', 'LH', 'KEYS', 'LBTYA', 'MSFT', 'MKTX', 'LYB', 'MCO', 'MRO', 'MDLZ', 'MLM', 'LVS', 'LRCX', 'MSCI', 'MOS', 'MRK', 'MET', 'MGM', 'MNST', 'MMC', 'MO', 'MCHP', 'LYV', 'MHK', 'MDT', 'LUV', 'MCK', 'MS-PF', 'MMM', 'MAA', 'MCD', 'MPC', 'MAR', 'LOW', 'MRCR', 'NOV', 'PEG', 'NVRO', 'NTRR', 'MU', 'NTRA', 'PAYX', 'NSC', 'NRG', 'ODFL', 'NTAP', 'PBCT', 'NFLX', 'ORLY', 'OMC', 'NTRS', 'NCTKF', 'NOXL', 'OKE', 'NI', 'NVR', 'NOC', 'O', 'NOW', 'PCAR', 'NEE', 'NLSN', 'NWL', 'MSI', 'NDAQ', 'NMHLY', 'OXY', 'NOK', 'NEOG', 'NCLH', 'RF', 'PSX', 'RE', 'PPG', 'ROK', 'PXD', 'RIBT', 'RCL', 'REGN', 'RMD', 'PKI', 'RL', 'RJF', 'PG', 'QRVO', 'REG', 'PHM', 'PNWRF', 'PKG', 'PNW', 'PLD', 'PVH', 'PM', 'PNR', 'PWR', 'PH', 'RLI', 'PEP', 'PRU', 'PFE', 'RHI', 'ROST', 'TAP', 'TEL', 'SRG', 'SLG', 'RSG', 'SYK', 'SNPS', 'SCHW', 'SHW', 'RXMD', 'SEGXF', 'SWKS', 'SBUX', 'RSNHF', 'SWK', 'SONC', 'ROP', 'STZ-B', 'TCYSF', 'STT', 'SPG', 'SYF', 'T', 'STX', 'SIVB', 'SO', 'ROL', 'TJX', 'SEE', 'SLB', 'SRE', 'VZ', 'UNP', 'TMUS', 'TRAUF', 'V', 'TW', 'VRSK', 'TWTR', 'URI', 'ULTA', 'UPS', 'UDR', 'TSN', 'UAL', 'TSCO', 'TTWO', 'VRSN', 'UA', 'TMO', 'WBA', 'TXN', 'UNM', 'USB', 'TXT', 'VMC', 'WAT', 'UHS', 'UEEC', 'VTR', 'TYL', 'TROW', 'TRV', 'VFC', 'WYNN', 'WSPOF', 'WU', 'YUM', 'XYL', 'WST', 'WRK', 'WEC', 'WM', 'ZTS', 'ZBH', 'XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']
file_i=0
fname="%s.csv"%files[file_i]
cur_fpath=os.path.join(root_dir,fname)
tmp_data=get_data_full(cur_fpath)
print(len(tmp_data))
print(tmp_data[0])
# for tmp0 in tmp_data[:5]:
#   for t_ in tmp_data:
#     print(t_)
#   print("----")

6725
([0.09195402298850575, 0.034482758620689655, 0.08045977011494253, 0.08620689655172414, 0.09195402298850575, 0.04597701149425287, 0.06896551724137931, 0.04597701149425287, 0.08045977011494253, 0.04597701149425287, 0.04597701149425287, 0.06321839080459771, 0.06896551724137931, 0.04597701149425287, 0.06896551724137931, 0.04597701149425287, 0.06896551724137931, 0.04597701149425287, 0.04597701149425287, 0.05747126436781609, 0.06896551724137931, 0.04597701149425287, 0.04597701149425287, 0.05172413793103448, 0.05747126436781609, 0.04597701149425287, 0.04597701149425287, 0.05172413793103448, 0.05747126436781609, 0.04597701149425287, 0.04597701149425287, 0.05747126436781609, 0.08620689655172414, 0.04597701149425287, 0.04597701149425287, 0.08620689655172414, 0.08620689655172414, 0.05747126436781609, 0.06321839080459771, 0.08045977011494253, 0.06896551724137931, 0.034482758620689655, 0.05747126436781609, 0.04597701149425287, 0.034482758620689655, 0.0, 0.034482758620689655, 0.0057471264367816

In [None]:
random.randint(2,10)

3

In [None]:
ANET.csv
overall_gain_loss 3022
avg_gain_loss 1.59
n_gain_items 1151
n_loss_items 471
percent_gain_items 61
percent_loss_items 25

In [None]:
def get_pairs(list1):
  out=[]
  for i0 in range(0,len(list1),2): out.append((list1[i0],list1[i0+1]))
  return out

list0=["a","b","c","d"]
get_pairs(list0)

[('a', 'b'), ('c', 'd')]

In [None]:
gen_labels(3)
percent_bin_vals(0.174,10)


'11'

In [None]:
train_i=130
for train_i in range(100):
  print(train_i)
  prev_vals,next_vals=cur_train0[train_i]
  prev_vals=[round(v,2) for v in prev_vals]
  next_vals=[round(v,2) for v in next_vals]
  prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
  min_val=min(next_percents)
  min_val_index=next_percents.index(min_val)
  max_val=max(next_percents)
  max_val_index=next_percents.index(max_val)
  rebound_max_val=max(next_percents[min_val_index:])
  rebound_diff=round(rebound_max_val-min_val,2)

  print("prev_vals:", prev_vals)
  print("next_vals:", next_vals)
  #print(prev_percents)
  print("Next percent differences:", next_percents)
  print("min val:",min_val, "@ index:",min_val_index)
  print("max val:",max_val, "@ index:",max_val_index)
  print("max rebound val:",rebound_max_val)
  print("rebound_diff:",rebound_diff)
  print("---------")

#print("min val:",min_next_diff_val, "@ index:",min_val_index, "max val:",max_next_diff_val, "rebound_max_val:",rebound_max_val)

0
prev_vals: [1.72, 1.73, 1.72, 1.75, 1.77, 1.76, 1.75, 1.74, 1.75, 1.75, 1.77, 1.75, 1.76, 1.77, 1.81, 1.76, 1.73, 1.74, 1.73, 1.72]
next_vals: [1.73, 1.72, 1.75, 1.75, 1.76, 1.78, 1.8, 1.79, 1.8, 1.81]
Next percent differences: [0.58, 0.0, 1.74, 1.74, 2.33, 3.49, 4.65, 4.07, 4.65, 5.23]
min val: 0.0 @ index: 1
max val: 5.23 @ index: 9
max rebound val: 5.23
rebound_diff: 5.23
---------
1
prev_vals: [1.73, 1.72, 1.75, 1.77, 1.76, 1.75, 1.74, 1.75, 1.75, 1.77, 1.75, 1.76, 1.77, 1.81, 1.76, 1.73, 1.74, 1.73, 1.72, 1.73]
next_vals: [1.72, 1.75, 1.75, 1.76, 1.78, 1.8, 1.79, 1.8, 1.81, 1.79]
Next percent differences: [-0.58, 1.16, 1.16, 1.73, 2.89, 4.05, 3.47, 4.05, 4.62, 3.47]
min val: -0.58 @ index: 0
max val: 4.62 @ index: 8
max rebound val: 4.62
rebound_diff: 5.2
---------
2
prev_vals: [1.72, 1.75, 1.77, 1.76, 1.75, 1.74, 1.75, 1.75, 1.77, 1.75, 1.76, 1.77, 1.81, 1.76, 1.73, 1.74, 1.73, 1.72, 1.73, 1.72]
next_vals: [1.75, 1.75, 1.76, 1.78, 1.8, 1.79, 1.8, 1.81, 1.79, 1.79]
Next percent 

#Main Functions

In [None]:
import os
import pandas as pd
import numpy as np

#define functions to extract features and process labels
def get_diff_percent(val_list,ref_val0): #ref_val is the present val
  out_vals=[]
  for val0 in val_list:
    try:
      diff0=val0-ref_val0
      precent0=100*(diff0/ref_val0)
      out_vals.append(round(precent0,2))
    except: pass
  return out_vals

class io_cls: #input to output: category >< onehot
  def __init__(self,spacing=2,max_val=10): #for a general purpose, this can be where we define the labels
    self.spacing=spacing
    self.max_val=max_val
    self.all_labels=[]
    for mv_val0 in range(-max_val,max_val+1,spacing):self.all_labels.append(str(mv_val0))
    self.n_labels=len(self.all_labels)
  def one_hot(self,val_list,ref_val): #and this is when we convert from categorical to one hot
    self.diff_list=[]
    self.one_hot_list=[]
    for val0 in val_list:
      diff0=val0-ref_val
      precent0=100*(diff0/ref_val)
      precent0_norm=int(round(precent0/self.spacing)*self.spacing) #int(round(spacing*precent0)/spacing)
      if precent0_norm<=-self.max_val: diff_str=str(-self.max_val)
      elif precent0_norm>=self.max_val: diff_str=str(self.max_val)
      else: diff_str=str(precent0_norm)
      self.diff_list.append(diff_str)
      tmp_one_hot_vals=[0.]*len(self.all_labels)
      if diff_str in self.all_labels: 
        tmp_i=self.all_labels.index(diff_str)
        tmp_one_hot_vals[tmp_i]=1.
      self.one_hot_list.append(tmp_one_hot_vals)
    return self.one_hot_list
  def out2labels(self,rnn_flat_out): #a flat rnn output to split into slices, and get the label weights for each slice - and then from one hot to categorical
    final_list=[]
    n_slices=int(len(rnn_flat_out)/len(self.all_labels))
    for i0 in range(n_slices):
      i1=i0+1
      cur_slice=rnn_flat_out[i0*len(self.all_labels):i1*len(self.all_labels)]
      tmp_list=[]
      for lb0,cs0 in zip(self.all_labels,cur_slice): tmp_list.append((lb0,cs0))
      tmp_list.sort(key=lambda x:-x[-1])
      final_list.append(tmp_list)
    return final_list

#Getting the input
def get_norm_close(fpath,prev_n0=20,next_n0=10,train_ratio=0.75):
  pd_df=pd.read_csv(fpath)
  close_col=pd_df["Close"].fillna(0)
  #close_col=pd_df.dropna(subset=['Close'], how='all', inplace=True)
  #close_col = pd_df[pd_df['Close'].notna()]
  data_len=len(close_col)
  all_data=[]
  for test_i in range(prev_n0,len(close_col)-next_n0):
    prev_items=close_col[test_i-prev_n0:test_i].to_list() #[0,1,2,3,4,5,6,7,8,9] predict the closing today and the following next_n-1 days
    next_items=close_col[test_i:test_i+next_n0].to_list()
    all_data.append((prev_items,next_items))    
  train_size=int(train_ratio*data_len)
  train_data=all_data[:train_size]
  test_data=all_data[train_size:]
  return train_data,test_data

def get_prev_next_vals(list_vals,prev_n0=20,next_n0=10):
  all_data=[]
  for test_i in range(prev_n0,len(list_vals)-next_n0):
    prev_vals=list_vals[test_i-prev_n0:test_i] #[0,1,2,3,4,5,6,7,8,9] predict the closing today and the following next_n-1 days
    next_vals=list_vals[test_i:test_i+next_n0]
    all_data.append((prev_vals,next_vals))  
    #prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])  
  return all_data

def get_prev_next_percent(list_vals0,prev_n0=20,next_n0=10):
  new_data=[]
  tmp_data=get_prev_next_vals(list_vals0,prev_n0,next_n0)
  for prev0,next0 in tmp_data:
    prev_percents,next_percents=get_diff_percent(prev0,prev0[-1]),get_diff_percent(next0,prev0[-1]) 
    new_data.append((prev_percents,next_percents)) 
  return new_data

  




def extract_labels(next_percents0):
  if next_percents0==[]: return [0.,0.]
  min_val0=min(next_percents0)
  #print("min_val0",min_val0)
  min_val_index=next_percents0.index(min_val0)
  max_val=max(next_percents0[1:])
  max_val_index=next_percents0.index(max_val)
  rebound_max_val=max(next_percents0[min_val_index:])
  rebound_diff=round(rebound_max_val-min_val0,2)
  max_greater_than_5=0.
  found_minus_5=False
  found_rebound_greater_than_5=False
  cur_min_val=None
  for i0, percent_val in enumerate(next_percents0):
    if percent_val<-5 and found_minus_5==False: 
      found_minus_5=True
      if cur_min_val==None or percent_val<cur_min_val: cur_min_val=percent_val
      #print("percent_val",percent_val,"found_minus_5",found_minus_5)
      continue
    if cur_min_val!=None and percent_val-cur_min_val>5: 
      found_rebound_greater_than_5=True
      break
  min_5_rebound_greater_than_5=0.
  if max_val>5: max_greater_than_5=1.
  if found_rebound_greater_than_5: min_5_rebound_greater_than_5=1.
  actual_out0=[max_greater_than_5,min_5_rebound_greater_than_5]
  return actual_out0

cur_test_list=[12,9,15,10,13,14,12]
cur_ref_val=12
test_out=get_diff_percent(cur_test_list,cur_ref_val)
print(test_out)

[0.0, -25.0, 25.0, -16.67, 8.33, 16.67, 0.0]


#Starting Training

In [None]:
import time, math, random
from random import shuffle

model_name="exp6-pred1-combined-stocks"
model_name="exp6-pred1-combined-stocks1-3layer-full"
model_name="exp6-pred1-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-4layer-batches2"
n_input=1
n_output=1
n_output=2 #>5, min <5 & rebound >5
n_hidden =64#64
n_layers=4#3
n_epochs=100
LR=0.0000001
prev_n,next_n=20,10
n_train,n_test=None,None
#n_train,n_test=1000,50
train_batch_size=10000

test_cutoff_val=0.5

torch.manual_seed(1)
random.seed(1)




root_dir='stock_market_data/sp500/csv'
initial_files=["AAPL","GOOG","FB","AMZN","EA","IBM","MSFT","GM","UPS","PG"]
#cur_path=os.path.join(root_dir,"AAPL.csv")
all_files=[v.split(".")[0] for v in os.listdir(root_dir) if v.endswith(".csv")]
additional_files=[v for v in all_files if not v in initial_files]
sample_files=initial_files+additional_files[:90]

#cur_train0,cur_test0=get_norm_close("stock_market_data/sp500/csv/AAPL.csv",prev_n,next_n) #stock_market_data/sp500/csv/AAPL.csv



#rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False).to(device)
rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False).to(device)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all cnn parameters


model_dir=os.path.join(cwd,"models", model_name) 
tmp_model_dir=os.path.join(cwd,"models", model_name,"tmp") 
if not os.path.exists(tmp_model_dir): os.makedirs(tmp_model_dir)
log_fpath=os.path.join(model_dir,"log.txt")
log_fopen=open(log_fpath,"a")
log_fopen.write(str(rnn)+"\n")

print("loading data")
all_training,all_testing=[],[]
for fname in sample_files:
  cur_path=os.path.join(root_dir,fname+".csv")
  cur_train0,cur_test0=get_norm_close(cur_path,prev_n,next_n,train_ratio=0.8)
  if n_train!=None: cur_train0=cur_train0[:n_train]
  if n_test!=None: cur_test0=cur_test0[:n_test]
  all_training.extend(cur_train0)
  all_testing.extend(cur_test0)
shuffle(all_training)
shuffle(all_testing)
print("all_training", len(all_training),"all_testing",len(all_testing))
n_batches=math.floor(len(all_training)/train_batch_size)
test_batch_size=math.floor(len(all_testing)/n_batches)


for epoch0 in range(n_epochs):
  PATH=os.path.join(model_dir, "model-%s.model"%epoch0)
  if os.path.exists(PATH):
    checkpoint = torch.load(PATH)
    rnn.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print("loaded model for this epoch",PATH)
    for a,b in  checkpoint.items():
      if "loss" in a.lower(): print(a,round(b,6))
    continue  
  print("epoch0",epoch0)
  for batch_i0 in range(n_batches+1):
    t0=time.time()
    pred_count,correct_count=0,0
    batch_i1=batch_i0+1
    cur_train_items=all_training[batch_i0*train_batch_size:batch_i1*train_batch_size]
    cur_test_items=all_testing[batch_i0*test_batch_size:batch_i1*test_batch_size]
    print("batch_i0",batch_i0, "cur_train_items",len(cur_train_items),"cur_test_items",len(cur_test_items))
    tmp_path=os.path.join(tmp_model_dir, "model-batch-%s.model"%batch_i0)
    if os.path.exists(tmp_path):
      checkpoint = torch.load(tmp_path)
      rnn.load_state_dict(checkpoint['model_state_dict'])
      optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
      print("loaded model for this epoch",tmp_path)
      continue  

    total_train_loss,total_test_loss=0,0
    train_counter,test_counter=0,0
    test_pred_counter,test_correct_counter=0,0 #how many test items reach the cutoff val for prediction, how many are correctly predicted
    #for train_i in range(1500):
    for train_i, train_item in enumerate(cur_train_items):
      if train_i%2000==0: print("train_i",train_i)
      #print(train_i)
      prev_vals,next_vals=train_item# cur_train0[train_i]
      prev_vals=[round(v,2) for v in prev_vals]
      next_vals=[round(v,2) for v in next_vals]
      prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
      if prev_percents==[] or next_percents==[]: continue
      actual_out=extract_labels(next_percents)
      # print("cur_actual", cur_actual, next_percents)
      # print("--------")
      # continue
      # min_val=min(next_percents)
      # min_val_index=next_percents.index(min_val)
      # max_val=max(next_percents)
      # max_val_index=next_percents.index(max_val)
      # rebound_max_val=max(next_percents[min_val_index:])
      # rebound_diff=round(rebound_max_val-min_val,2)
      
      # max_gr_5=0.
      # if max_val>5: max_gr_5=1.
      # actual_out=[max_gr_5]
      #prev_percents=[math.log(v) for v in prev_percents] #testing
      #if prev_percents==[]: continue
      input_tensor=torch.tensor(prev_percents)
      actual_out_tensor=torch.tensor(actual_out).to(device)
      rnn_output = rnn(input_tensor).to(device)
      rnn_output_list=rnn_output.tolist()
      loss = loss_func(actual_out_tensor.ravel(), rnn_output.ravel()) #calculate the loss, difference between the output and the desired outcome tensors
      # if epoch0>3 and train_i<50:
      #   print("rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
      #print(loss)
      loss.backward()
      optimizer.step()
      total_train_loss+=loss.item()
      train_counter+=1

    for test_i, test_item in enumerate(cur_test_items):
      #print(train_i)
      if test_i%1000==0: print("test_i",test_i)
      rnn.zero_grad()
      prev_vals,next_vals=test_item# cur_train0[train_i]
      prev_vals=[round(v,2) for v in prev_vals]
      next_vals=[round(v,2) for v in next_vals]
      prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
      if prev_percents==[] or next_percents==[]: continue
      actual_out=extract_labels(next_percents)

      # continue
      # min_val=min(next_percents)
      # min_val_index=next_percents.index(min_val)
      # max_val=max(next_percents)
      # max_val_index=next_percents.index(max_val)
      # rebound_max_val=max(next_percents[min_val_index:])
      # rebound_diff=round(rebound_max_val-min_val,2)
      
      # max_gr_5=0.
      # if max_val>5: max_gr_5=1.
      # actual_out=[max_gr_5]
      input_tensor=torch.tensor(prev_percents)
      actual_out_tensor=torch.tensor(actual_out).to(device)
      rnn_output = rnn(input_tensor).to(device)
      rnn_output_list=rnn_output.ravel().tolist()
      loss = loss_func(actual_out_tensor.ravel(), rnn_output.ravel()) #calculate the loss, difference between the output and the desired outcome tensors
      predicted_increase,predicted_rebound=rnn_output_list
      actual_increase,actual_rebound=actual_out
      if predicted_increase>0.6 or predicted_rebound>0.4:
        pred_count+=1
        if actual_increase>0.5 or actual_rebound>0.5: correct_count+=1
        tmp_rnn_output_list=[round(v,2) for v in rnn_output_list]
        print(test_i, "rnn out:",tmp_rnn_output_list, "actual:", actual_out, "loss:", round(loss.item(),6))


      # if test_i<50:
      #   print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
      #print(loss)
      pred_val=sum(rnn_output_list)/len(rnn_output_list)
      if pred_val>=test_cutoff_val and False:
        test_pred_counter+=1
        #if pred_val>0.5
        print("pred:", round(pred_val,2), "actual:", sum(actual_out))
        print("prev_percents",prev_percents)
        print("next_percents",next_percents)
        print("-------------")
      total_test_loss+=loss.item()
      test_counter+=1


    avg_train_loss=round(total_train_loss/train_counter,6)
    avg_test_loss=round(total_test_loss/test_counter,6)
    correct_ratio=0
    if pred_count>0: correct_ratio=round(correct_count/pred_count,2)
    print("pred_count",pred_count,"correct_count",correct_count,"correct_ratio",correct_ratio)
    # print("epoch0",epoch0, fname, "avg_train_loss",avg_train_loss, "avg_test_loss",avg_test_loss)
    # print("-------") pred_count, correct_count, correct_ratio
    
    t1=time.time()
    elapsed=round(t1-t0,2) 
    t0=time.time()    
    line="Epoch # %s - Batch: %s -  train loss: %s - test loss: %s - Correctness :%s/%s (ratio: %s) - elapsed: %s"%(epoch0, batch_i0, avg_train_loss,avg_test_loss,  correct_count, pred_count, correct_ratio, elapsed)
    #line="Epoch # %s  -  train loss: %s - test loss: %s - elapsed: %s"%(epoch0, avg_train_loss,avg_test_loss, elapsed)
    print(line)
    log_fopen=open(log_fpath,"a")
    log_fopen.write(line+"\n")
    log_fopen.close() 
    cur_checkpoint={
            'epoch': epoch0,
            'n_input': n_input,
            'n_hidden': n_hidden,
            'n_layers': n_layers,
            'n_output': n_output,
            'LR': LR,
            'model_state_dict': rnn.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': avg_train_loss,
            'test_loss': avg_test_loss
            }
    torch.save(cur_checkpoint, tmp_path)
  
  torch.save(cur_checkpoint, PATH)  
  print("model saved")
  for f in os.listdir(tmp_model_dir):
    tmp_fpath=os.path.join(tmp_model_dir,f)
    os.remove(tmp_fpath)
  print("deleted temporary files")
  print("-----------")


NameError: ignored

#Testing on actual data

In [None]:
import torch
e0=8
model_name="exp5-pred1-3-test"
model_name="exp6-pred1-combined-stocks"
model_name="exp6-pred1-combined-stocks1-3layer-full" 
model_name="exp6-pred1-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-4layer-batches2"

pred_cutoff_val=0.6
torch.manual_seed(1)
random.seed(1)

# e0=8
# model_name="exp6-pred2-combined-stocks1-4layer-batches2"
# model_dir=os.path.join(cwd,"models", model_name) 
# PATH=os.path.join(model_dir, "model-%s.model"%e0)
def load_model(model_fpath0):
  checkpoint = torch.load(model_fpath0)
  rnn0 = RNN(checkpoint["n_input"], checkpoint["n_hidden"] , checkpoint["n_output"] , checkpoint["n_layers"] , matching_in_out=False).to(device)
  rnn0.load_state_dict(checkpoint['model_state_dict'])
  rnn0.eval()
  return rnn0

def predict(input_list,rnn_obj):
  input_tensor=torch.tensor(input_list)
  rnn_output = rnn_obj(input_tensor)
  rnn_output_list=rnn_output.ravel().tolist()
  return rnn_output_list

model_dir=os.path.join(cwd,"models", model_name) 
PATH=os.path.join(model_dir, "model-%s.model"%e0)
checkpoint = torch.load(PATH)
rnn = RNN(checkpoint["n_input"], checkpoint["n_hidden"] , checkpoint["n_output"] , checkpoint["n_layers"] , matching_in_out=False).to(device)
rnn.load_state_dict(checkpoint['model_state_dict'])
rnn.eval()
root_dir='stock_market_data/sp500/csv'
sample_test_files=['XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']
#sample_test_files=["AAPL","GOOG","FB","AMZN","EA","IBM","MSFT","GM","UPS","PG"]
for fname in sample_test_files[:2]:
  print(fname)
  pred_count,correct_count=0,0
  cur_fpath=os.path.join(root_dir,fname+".csv")
  #cur_train0,cur_test0=get_norm_close("stock_market_data/sp500/csv/AAPL.csv") #stock_market_data/sp500/csv/AAPL.csv
  cur_train0,cur_test0=get_norm_close(cur_fpath) #stock_market_data/sp500/csv/AAPL.csv
  for test_i, test_item in enumerate(cur_test0):
    #print(train_i)
    rnn.zero_grad()
    prev_vals,next_vals=test_item# cur_train0[train_i]
    prev_vals=[round(v,2) for v in prev_vals]
    next_vals=[round(v,2) for v in next_vals]
    prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
    actual_out=extract_labels(next_percents)
    # min_val=min(next_percents)
    # min_val_index=next_percents.index(min_val)
    # max_val=max(next_percents)
    # max_val_index=next_percents.index(max_val)
    # rebound_max_val=max(next_percents[min_val_index:])
    # rebound_diff=round(rebound_max_val-min_val,2)
    
    # max_gr_5=0.
    # if max_val>5: max_gr_5=1.
    # actual_out=[max_gr_5]
    
    input_tensor=torch.tensor(prev_percents)
    actual_out_tensor=torch.tensor(actual_out)
    rnn_output = rnn(input_tensor)
    rnn_output_list=rnn_output.ravel().tolist()
    predicted_increase,predicted_rebound=rnn_output_list
    actual_increase,actual_rebound=actual_out
    # predicted_val=sum(rnn_output_list)
    # actual_val=sum(actual_out)
    # if test_i<50:
    #   print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
    #   #print(loss)
    #   print("--------")
    # continue

    if predicted_increase>pred_cutoff_val or predicted_rebound>0.35:
      pred_count+=1
      if actual_increase>0.5 or actual_rebound>0.5: correct_count+=1
      #print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
      print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out)
      #if actual_increase>0.5: correct_count+=1
      # print(fname, "predicted_val:",round(predicted_val,2), "actual_out:",round(actual_val,2))
      # print("next_percents",next_percents)
      # #print(rnn_output_list)
      # #print(sum(rnn_output_list))
      # print("------")
  correct_ratio=0    
  if pred_count>0: correct_ratio=round(correct_count/pred_count,2)
  print(">>>>", fname, "pred_count",pred_count,"correct_count",correct_count,"correct_ratio",correct_ratio)
  print("=========")


XOM




4 rnn out: [0.6026967167854309, 0.21063825488090515] actual: [0.0, 0.0]
2771 rnn out: [0.6120272278785706, 0.22972239553928375] actual: [1.0, 0.0]
2777 rnn out: [0.6584183573722839, 0.2904338240623474] actual: [0.0, 0.0]
2778 rnn out: [0.7015856504440308, 0.3120870292186737] actual: [0.0, 0.0]
2779 rnn out: [0.6679953932762146, 0.3043203353881836] actual: [0.0, 0.0]
2780 rnn out: [0.6524690389633179, 0.28235986828804016] actual: [0.0, 0.0]
2781 rnn out: [0.7352421879768372, 0.3719955384731293] actual: [0.0, 1.0]
2782 rnn out: [0.7012118697166443, 0.3452860116958618] actual: [0.0, 1.0]
2783 rnn out: [0.7353832721710205, 0.37432312965393066] actual: [1.0, 1.0]
2784 rnn out: [0.6801297664642334, 0.3489380180835724] actual: [1.0, 1.0]
2785 rnn out: [0.7323333621025085, 0.37931200861930847] actual: [1.0, 1.0]
2786 rnn out: [0.6868031620979309, 0.35476288199424744] actual: [1.0, 1.0]
2787 rnn out: [0.7067278623580933, 0.3570493459701538] actual: [1.0, 0.0]
2788 rnn out: [0.7088838219642639, 

In [None]:
train_batch_size=3000
n_batches=math.floor(len(all_training)/train_batch_size)
test_batch_size=math.floor(len(all_testing)/n_batches)
print("train_batch_size",train_batch_size, "test_batch_size",test_batch_size, "n_batches",n_batches)

train_batch_size 3000 test_batch_size 736 n_batches 101


In [None]:
all_files=[v.split(".")[0] for v in os.listdir(root_dir) if v.endswith(".csv")]

In [None]:
all_files[-10:]
#print(test_batch_size)

['XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']

#Robinhood - Sep 22

In [4]:
!pip install robin_stocks

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Robinhood login - Sep 22

In [5]:
#we should set the environ variables first
!export robinhood_username=""
!export robinhood_password=""

import robin_stocks as rs
import os 

robin_user = os.environ.get("robinhood_username") #email!
robin_pass = os.environ.get("robinhood_password") #ss!
#rs.robinhood.l
rs.robinhood.login(username=robin_user,
         password=robin_pass,
         expiresIn=86400,
         by_sms=True)

Robinhood username: hmghaly@gmail.com
Robinhood password: ··········


{'access_token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJleHAiOjE2NjMxMDk4NjMsInRva2VuIjoidndicDhtMjdidldqTkZUOWxacWVQNjd0ZkRpUkpMIiwidXNlcl9pZCI6IjY1NDcxZTU3LWE3NmYtNGU3Yy1hMWM3LWY5Nzc2NzMyNjkwNCIsImRldmljZV9oYXNoIjoiMzYzN2I5NDA0NjI4MmViNTU0ZTgwMGI3MzJlMWRiNTAiLCJzY29wZSI6ImludGVybmFsIiwiZGN0IjoxNjQ3OTU4NDEzLCJzZXJ2aWNlX3JlY29yZHMiOlt7ImhhbHRlZCI6ZmFsc2UsInNlcnZpY2UiOiJudW1tdXNfdXMiLCJzaGFyZF9pZCI6MSwic3RhdGUiOiJhdmFpbGFibGUifSx7ImhhbHRlZCI6ZmFsc2UsInNlcnZpY2UiOiJicm9rZWJhY2tfdXMiLCJzaGFyZF9pZCI6OSwic3RhdGUiOiJhdmFpbGFibGUifV0sInVzZXJfb3JpZ2luIjoiVVMiLCJvcHRpb25zIjpmYWxzZSwibGV2ZWwyX2FjY2VzcyI6ZmFsc2V9.GGB3RshsZl0F_PYRihnP8ViwewZlps2FfsMVZCWzAJOlkPcCG7U7PBnBBYXxahIhp7QwAF2aYHiUBFlS5mTf-2iBl_yfinQ_cIKkCqlkb_qk15a0vpNAARjsu1wkUOy9dahP6UpK7d5t8aG0tyVY1x2Po0xjL4HqSP5ZfRMjIvaDKgHdiVuPhR0bYjfbx9YGvBLRa88tV8L347qn_kwhZeo9xAJQaLyFse4ASf0Rmkg8aWz-FmAHbvlKAPqgdFCss3NR7rWmoFDlVqr2qosZw71V7W_1a7i1C1Aoac8wtj5QWf7n7G-a2jV7om2ePpZoFWNzI63d_anyn5FTI5w4Nw',
 'expires_in': 635718,
 'token_type': 'Bear

#Collecting Stock info - Aug 22
using our initial lists and robinhood queries

In [None]:
import json
root_dir='stock_market_data/sp500/csv'
stock_info_fname="stock_info-aug28.txt"
all_symbols=[]
stock_groups=["sp500","nyse","nasdaq","forbes2000"]
for stg in stock_groups:
  cur_root_dir=os.path.join("stock_market_data",stg,"csv")
  cur_symbols=[v.split(".")[0] for v in os.listdir(cur_root_dir)]
  all_symbols.extend(cur_symbols)
all_symbols=sorted(list(set(all_symbols)))
all_stock_info_items=[]
# cur_symbols=[v.split(".")[0] for v in os.listdir(root_dir)]
print(len(all_symbols),all_symbols[:10])
for sym0 in all_symbols:
  cur_stock_info=rs.robinhood.stocks.find_instrument_data(sym0)
  print(sym0)
  for info1 in cur_stock_info:
    if info1 in all_stock_info_items: continue
    all_stock_info_items.append(info1)
stock_info_fopen=open(stock_info_fname,"w")
print("total number of stocks found:", len(all_stock_info_items))
for a in all_stock_info_items:
  #print(a["symbol"])
  json_content=json.dumps(a)
  stock_info_fopen.write(json_content+"\n")
stock_info_fopen.close()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ELS
Found 1 results
ELTK
Found 1 results
ELUXY
No results found for that keyword
ELVAF
Found 3 results
ELY
Found 1 results
EMCF
Found 9 results
EME
Found 3 results
EMF
No results found for that keyword
EMITF
Found 1 results
EMKR
Found 10 results
EML
No results found for that keyword
EMMS
Found 2 results
EMN
Found 1 results
EMO
Found 2 results
EMR
No results found for that keyword
EMSHF
Found 2 results
ENB
No results found for that keyword
ENBP
Found 1 results
ENDP
No results found for that keyword
ENGIY
No results found for that keyword
ENLAY
Found 4 results
ENPH
Found 7 results
ENS
Found 1 results
ENSG
Found 2 results
ENTA
Found 1 results
ENTG
Found 3 results
ENTR
Found 1 results
ENVA
Found 3 results
ENVI
No results found for that keyword
ENZN
Found 1 results
EOD
Found 1 results
EOG
Found 1 results
EONGY
Found 3 results
EOS
Found 1 results
EPAM
No results found for that keyword
EPAY
Found 1 results
EPD
Found 6 results
EP

#Get daily data for stocks - Aug 22
Run every day to get the historic data for stocks of interest - identify stocks less than a certain value

In [None]:
#identifying only valid stocks
import json

max_price=25
stock_info_fname="stock_info-aug28.txt"
stock_info_fopen=open(stock_info_fname)
valid_stocks=[]
for f_i,f_line in enumerate(stock_info_fopen):
  if f_i%50==0 and f_i>0: 
    print(f_i)

  cur_info_dict=json.loads(f_line)
  if cur_info_dict==None: continue
  sym0=cur_info_dict.get("symbol") 
  if sym0==None: continue
  is_tradeable=cur_info_dict["tradeable"] 
  if not is_tradeable: continue
  latest_price_val=None
  latest_price=rs.robinhood.stocks.get_latest_price(sym0)
  if len(latest_price)>0 and latest_price!=None and latest_price[0]!=None: latest_price_val=float(latest_price[0])
  if latest_price_val==None: continue
  if latest_price_val>max_price: continue
  #print(sym0, latest_price_val)
  cur_info_dict["latest_price"]=latest_price_val
  valid_stocks.append((sym0, cur_info_dict, latest_price_val))

stock_info_fopen.close()
valid_stocks.sort(key=lambda x:x[-1])

valid_stock_fname="valid_stock_25_less_aug_28.txt"
valid_stock_fopen=open(valid_stock_fname,"w")

for a in valid_stocks: 
  valid_stock_fopen.write(json.dumps(a[1])+"\n")
valid_stock_fopen.close()
print("valid_stocks",len(valid_stocks))

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
valid_stocks 2662


#Getting the actual daily values of valid stocks - Sep 22

In [6]:
#We need to have the stock info file ready "stock_info.txt"
import json
from datetime import datetime
date_str=datetime.today().strftime('%Y-%m-%d')
daily_dir="daily"
daily_dir_path=os.path.join(cwd,daily_dir)
if not os.path.exists(daily_dir_path): os.makedirs(daily_dir_path)
daily_fpath=os.path.join(daily_dir_path,date_str+".txt")
daily_fopen=open(daily_fpath,"w")
#stock_info_fname="stock_info-aug28.txt"
stock_info_fname="valid_stock_25_less_aug_28.txt"
stock_info_fopen=open(stock_info_fname)
for f_i,f_line in enumerate(stock_info_fopen):
  #if f_i%50: print(f_i)
  cur_info_dict=json.loads(f_line)
  if cur_info_dict==None: continue
  sym0=cur_info_dict.get("symbol") 
  if sym0==None: continue
  is_tradeable=cur_info_dict["tradeable"] 
  print(sym0, is_tradeable)
  if not is_tradeable: continue
  cur_stock_data = rs.robinhood.stocks.get_stock_historicals([sym0], interval="day", span="3month")
  cur_stock_data_json=json.dumps(cur_stock_data)
  price_line="%s\t%s\n"%(sym0,cur_stock_data_json)
  daily_fopen.write(price_line)
  # print(cur_stock_data)
  # print("-----")
  #if f_i>10: break

stock_info_fopen.close()
daily_fopen.close()

AMPE True
TMBR True
ALNA True
CSCW True
TTOO True
KPRX True
RMED True
MOHO True
IRS+ True
VGFC True
ATHX True
NBRV True
EYESW True
400 Client Error: Bad Request for url: https://api.robinhood.com/quotes/historicals/?symbols=EYESW&interval=day&span=3month&bounds=regular
ALRN True
OXBRW True
OBSV True
SLRX True
VBLT True
QTNT True
TRVN True
PIXY True
CLXT True
NBY True
CNSP True
TENX True
SPCB True
CFRX True
OBLG True
GFAI True
AUMN True
NILE True
TYME True
HGEN True
RIBT True
MRKR True
TOPS True
ESPGY True
ACRX True
ADMP True
CETX True
ENDP True
404 Client Error: Not Found for url: https://api.robinhood.com/quotes/historicals/?symbols=ENDP&interval=day&span=3month&bounds=regular
CHEK True
OTIC True
AKBA True
BTOG True
LOGC True
EXN True
NRBO True
FNHC True
ACOR True
GRIL True
ENSC True
HOTH True
MARK True
AGTC True
RSLS True
GLMD True
REVB True
YCBD True
XIN True
AGRX True
ASTC True
SNMP True
GMBL True
DXF True
AXU True
UAMY True
NYMX True
DAVE True
ICLK True
IDRA True
SFET True
MEIP Tr

#Final Robinhood testing - Aug 22

In [None]:
#cur_stock_data = rs.robinhood.stocks.get_stock_historicals(["AAPL"], interval="day", span="3month")
cur_stock_data = rs.robinhood.stocks.get_stock_historicals(["MSFT"], interval="day", span="3month")
print(len(cur_stock_data))
print(cur_stock_data[0])
print(cur_stock_data[-1])
high_low_open_close_data=[(float(v["high_price"]),float(v["low_price"]),float(v["open_price"]),float(v["close_price"])) for v in cur_stock_data]
for a in high_low_open_close_data[:20]:
  print(a)


epoch_i=3
exp_name="new-stock-unflattened-sample19-30-10-128-0000001"

model_dir="models"
exp_dir_path=os.path.join(model_dir,exp_name)
tmp_path=os.path.join(exp_dir_path,"model-%s.model"%epoch_i)
try: checkpoint = torch.load(tmp_path)
except: checkpoint = dill_unpickle(tmp_path)
rnn = RNN(checkpoint["n_input"], checkpoint["n_hidden"] , checkpoint["n_output"] , checkpoint["n_layers"] , checkpoint["matching_in_out"]).to(device)
n_input=checkpoint["n_input"]
standard_labels=checkpoint["output_labels"]
cur_parameters=checkpoint["feature_extraction_parameters"]
rnn.load_state_dict(checkpoint['model_state_dict'])
rnn.eval()

start_i=0
n_prev=30
n_next=10
prev_vals=high_low_open_close_data[start_i:start_i+n_prev]
next_vals=high_low_open_close_data[start_i+n_prev:start_i+n_prev+n_next]
test_ft=normalize_ft_vals(prev_vals)
test_labels=normalize_next_labels(next_vals,prev_vals[-1][-1])
print("test_labels",test_labels)

ft_list_tensor=torch.tensor(test_ft,dtype=torch.float32)
#lb_list_tensor=torch.tensor(lb_list,dtype=torch.float32)
#lb_tensor_flat=lb_list_tensor.ravel()
rnn_out=rnn(ft_list_tensor)
print(rnn_out)
rnn_out_flat=rnn_out.ravel()
preds=out2labels(rnn_out_flat,standard_labels)
eval_list=[]
for ac0,pred0 in zip(test_labels,preds):
  cur_pred=[(v[0],round(v[1].item(),4)) for v in pred0]
  eval_list.append((ac0,cur_pred[0]))
for i0 in range(0,len(eval_list),2):
  cur_high=eval_list[i0]
  cur_low=eval_list[i0+1]
  ac_high,pred_high_wt=cur_high
  ac_low,pred_low_wt=cur_low
  pred_high,pred_wt=pred_high_wt
  print("ac_high",ac_high, "pred_high",pred_high,"pred_wt", pred_wt,"cur_low",cur_low)

  # if i0>0 and int(pred_high)>=gain_threshold0: 
  #   #print(i0/2, "Sell Decision - predicted: %s - actual: %s"%(pred_high,ac_high))
  #   gain_loss=int(ac_high)
  #   break



# print(test_ft)
# print(test_ft[-1])
# print(test_labels)

63
{'begins_at': '2022-05-23T00:00:00Z', 'open_price': '255.490000', 'close_price': '260.650000', 'high_price': '261.500000', 'low_price': '253.430000', 'volume': 33175379, 'session': 'reg', 'interpolated': False, 'symbol': 'MSFT'}
{'begins_at': '2022-08-22T00:00:00Z', 'open_price': '282.080000', 'close_price': '277.750000', 'high_price': '282.460000', 'low_price': '277.220000', 'volume': 25061070, 'session': 'reg', 'interpolated': False, 'symbol': 'MSFT'}
(261.5, 253.43, 255.49, 260.65)
(261.33, 253.5, 257.89, 259.62)
(264.58, 257.125, 258.14, 262.52)
(267.11, 261.4294, 262.27, 265.9)
(273.34, 267.56, 268.48, 273.24)
(274.77, 268.93, 272.53, 271.87)
(277.69, 270.04, 275.195, 272.42)
(274.65, 261.6, 264.45, 274.58)
(273.45, 268.41, 270.31, 270.02)
(274.18, 267.22, 272.06, 268.75)
(273.13, 265.94, 266.635, 272.5)
(273.0, 269.61, 271.71, 270.41)
(272.7081, 264.63, 267.78, 264.79)
(260.58, 252.53, 260.58, 252.99)
(249.0242, 241.53, 245.11, 242.26)
(245.74, 241.51, 243.86, 244.49)
(255.3, 



#Apply model to daily data - Sep 22

In [8]:
import json
model1='models/test2-batches-summary-sigmoid-1preds-30prev-58-64-2layer-0000001/model-8.model' 
model2='models/test2-batches-summary-sigmoid-1preds-30prev-58-64-2layer-0000001/model-8.model' 
model3='models/test2-batches-summary-sigmoid-1preds-30prev-58-128-2layer-0000001/model-13.model' 
cur_model_fpath=model3

try: checkpoint = torch.load(cur_model_fpath)
except: checkpoint = dill_unpickle(cur_model_fpath)
rnn = RNN(checkpoint["n_input"], checkpoint["n_hidden"] , checkpoint["n_output"] , checkpoint["n_layers"] , checkpoint["matching_in_out"]).to(device)
cur_parameters=checkpoint["parameters"]
pred_labels=cur_parameters["pred_labels"]
cur_parameters=checkpoint["feature_extraction_parameters"]
label_func=checkpoint['label_extraction_function']
feature_func=checkpoint['feature_extraction_function']
prev_n=cur_parameters["prev_n"]
rnn.load_state_dict(checkpoint['model_state_dict'])
rnn.eval()

pred_threshold=0.75
daily_fpath='daily/2022-09-06.txt'
daily_fopen=open(daily_fpath)
out_pred="prediction-2022-09-06.tsv"
out_pred_open=open(out_pred,"w")
headers=["Stock","Link","Last Closing","Accuracy","Accuracy for prediction>%s"%pred_threshold,"Prediction"]
header_line="\t".join(headers)+"\n"
out_pred_open.write(header_line)
for f_i,f_line in enumerate(daily_fopen):
  total_count,correct_count=0,0
  threshold_count=0
  #print(f_line)
  sym0,price_json=f_line.strip().split("\t")
  price_obj_list=json.loads(price_json)
  #price_list=[float(v["close_price"]) for v in price_obj_list]
  try: 
    price_list=[(float(v["high_price"]),float(v["low_price"]),float(v["close_price"])) for v in price_obj_list]
    last_closing_price=price_list[-1][-1]
    normalized_items=get_data_full_from_list_new(price_list,cur_parameters)
  except: continue
  
  
  for item0 in normalized_items:
    ft_list,lb_list=item0[:2]
    ft_list_tensor=to_tensor(ft_list) #torch.tensor(ft_list,dtype=torch.float32)
    extracted_out_specs=label_func(lb_list,cur_parameters)
    lb_list_tensor=to_tensor(extracted_out_specs) #torch.tensor(extracted_out_specs,dtype=torch.float32)
    lb_tensor_flat=lb_list_tensor.ravel()
    rnn_out=rnn(ft_list_tensor)
    total_count+=1
    if extracted_out_specs[0]==1 and rnn_out.item()>pred_threshold:
      threshold_count+=1
    if int(round(extracted_out_specs[0]))==int(round(rnn_out.item())):
      correct_count+=1
    else: pass
      #print("xxx incorrect:","actual:",extracted_out_specs,"predicted:",rnn_out)

    #print("actual:",extracted_out_specs,"predicted:",rnn_out,"cur_eval",cur_eval)
  ft_for_prediction=price_list[-prev_n:]
  normalized_ft_for_prediction=feature_func(ft_for_prediction)
  cur_pred=rnn(normalized_ft_for_prediction)
  cur_pred_val=cur_pred.item()

  accuracy=0
  threshold_accuracy=0
  if total_count>0:
    accuracy=correct_count/total_count
    threshold_accuracy=threshold_count/total_count
  if accuracy<0.5: continue
  if cur_pred_val<0.5: continue
  cur_link='https://finance.yahoo.com/quote/'+sym0
  row_items=[sym0,cur_link,round(last_closing_price,4),round(accuracy,4),round(threshold_accuracy,4),round(cur_pred_val,4)]
  row_items=[str(v) for v in row_items]
  row_line="\t".join(row_items)+"\n"
  out_pred_open.write(row_line)

  
  
  print(sym0, "last_closing_price",last_closing_price, "accuracy",round(accuracy,4),"threshold_accuracy",round(threshold_accuracy,4),"pred:",  round(cur_pred_val,4))
  # for a in price_list[:5]:
  #   print(a)
  # close_prices=[float(v["close_price"]) for v in price_obj_list]
  #if f_i>20: break
out_pred_open.close()
daily_fopen.close()



ALNA last_closing_price 0.1047 accuracy 0.6071 threshold_accuracy 0.0714 pred: 0.47
CSCW last_closing_price 0.1123 accuracy 0.75 threshold_accuracy 0.0 pred: 0.5165
TTOO last_closing_price 0.1149 accuracy 0.5357 threshold_accuracy 0.0714 pred: 0.6164
KPRX last_closing_price 0.215 accuracy 0.6786 threshold_accuracy 0.1071 pred: 0.8235
RMED last_closing_price 0.16 accuracy 0.6786 threshold_accuracy 0.2143 pred: 0.5164
MOHO last_closing_price 0.146 accuracy 0.6429 threshold_accuracy 0.3571 pred: 1.0562
IRS+ last_closing_price 3.88 accuracy 0.6071 threshold_accuracy 0.0357 pred: 0.4549
VGFC last_closing_price 0.1508 accuracy 0.6786 threshold_accuracy 0.0714 pred: 0.8488
NBRV last_closing_price 0.1635 accuracy 0.5357 threshold_accuracy 0.0 pred: 0.5649
ALRN last_closing_price 0.1749 accuracy 0.7857 threshold_accuracy 0.2857 pred: 0.6437
OXBRW last_closing_price 0.20021 accuracy 0.6786 threshold_accuracy 0.0 pred: 0.6936
OBSV last_closing_price 0.1661 accuracy 0.5 threshold_accuracy 0.25 pre

In [None]:
#rs.robinhood.stocks.get_name_by_symbol("FB")
rs.robinhood.get_news("AAPL")

[{'api_source': 'benzinga',
  'author': '',
  'num_clicks': 0,
  'preview_image_url': 'https://images.robinhood.com/PH1Za-x7lZFcMnfEaU9_ACxtpE4/aHR0cHM6Ly9pbWFnZXMucm9iaW5ob29kLmNvbS9nN1VVTG85NUxRX1l3STNoRVBSUVVtUmU1SEkvYUhSMGNITTZMeTl6TG5scGJXY3VZMjl0TDNWMUwyRndhUzl5WlhNdk1TNHlMMmhYY25aVlVqZFpWR0ZqVTNOb09XOU1hMU5RZFZFdExYNUNMMkZFTURCTlJFRTNaSG93TWsxRVFUZFpXRUozWVZkUk9XVllVbWhaTW1nMVlqSTBMUzlvZEhSd2N6b3ZMMjFsWkdsaExucGxibVp6TG1OdmJTOWxiaTlDWlc1NmFXNW5ZUzgwWWpZME1XUXhNVGRpTURGaE56QmtZelE0Wm1ZMU9EZ3pNbUU1T0RObFpn',
  'published_at': '2022-08-23T14:21:28Z',
  'relay_url': 'https://news.robinhood.com/f16d014e-13ae-340b-bb62-ac306c07b75f/',
  'source': 'Benzinga',
  'summary': '',
  'title': 'These Investment Wines Have Outperformed Apple, Ford And Tesla Over The Past Year',
  'updated_at': '2022-08-23T18:03:20.439469Z',
  'url': 'https://finance.yahoo.com/news/investment-wines-outperformed-apple-ford-142128903.html?.tsrc=rss',
  'uuid': 'f16d014e-13ae-340b-bb62-ac306c07b75f',
  'related_in

#Let's apply our algorithm on the daily data

In [None]:
import json
import torch
from datetime import datetime

e0=8
model_name="exp6-pred2-combined-stocks1-4layer-batches2"
model_dir=os.path.join(cwd,"models", model_name) 
PATH=os.path.join(model_dir, "model-%s.model"%e0)
#def predict(input_list, model_fpath):

daily_dir="daily"
date_str=datetime.today().strftime('%Y-%m-%d')
daily_dir_path=os.path.join(cwd,daily_dir)
if not os.path.exists(daily_dir_path): os.makedirs(daily_dir_path)
daily_fpath=os.path.join(daily_dir_path,date_str+".txt")
# daily_fname='2022-03-21.txt'
# daily_fpath=os.path.join(daily_dir_path,daily_fname)
daily_fopen=open(daily_fpath)

for f_i,f_line in enumerate(daily_fopen):
  total_count,correct_count=0,0
  sym0,price_json=f_line.strip().split("\t")
  price_obj_list=json.loads(price_json)
  close_prices=[float(v["close_price"]) for v in price_obj_list]
  #print(sym0,len(close_prices), "prices:", close_prices)
  percent_pairs=get_prev_next_percent(close_prices)
  for prev0,next0 in percent_pairs:

    try: prediction=predict(prev0, PATH)
    except: continue
    actual=extract_labels(next0)
    if prediction[0]<0.6: continue
    total_count+=1
    if actual[0]>0.5: correct_count+=1
    # print("Symbol:",sym0)
    # print("prev0",prev0)
    # print("next0",next0)
    # print("prediction:",[round(v,2) for v in prediction])
    # print("actual:",actual)
    # print("----")
  last_close_prices=close_prices[-20:]
  
  try: last_close_prices_precentages=get_diff_percent(last_close_prices,last_close_prices[-1])
  except: continue
  try: latest_price=float(rs.robinhood.stocks.get_latest_price(sym0)[0])
  except: latest_price=0

  tomorrow_pred=predict(last_close_prices_precentages, PATH)
  tomorrow_pred=[round(v,2) for v in tomorrow_pred]
  #if f_i>100: break
  correct_ratio=0
  if total_count>0: correct_ratio=round(correct_count/total_count,2)
  if correct_ratio==0: continue
  if tomorrow_pred[0]<0.6: continue
  if correct_ratio<0.7: continue
  yesterday_closing=last_close_prices[-1]
  print("Symbol:",sym0, total_count,correct_count,"correct_ratio:",correct_ratio, "tomorrow_pred",tomorrow_pred, "yesterday_closing:",yesterday_closing, "latest_price:",latest_price)
  print("--------")
daily_fopen.close()
#get_prev_next_percent, get_prev_next_vals
print("finished evaluating stocks")



Symbol: AACG 13 12 correct_ratio: 0.92 tomorrow_pred [0.7, 0.29] yesterday_closing: 1.415 latest_price: 1.3208
--------
Symbol: ABEO 14 14 correct_ratio: 1.0 tomorrow_pred [0.6, 0.46] yesterday_closing: 0.2883 latest_price: 0.2973
--------
Symbol: ACCD 14 10 correct_ratio: 0.71 tomorrow_pred [0.61, 0.49] yesterday_closing: 15.68 latest_price: 16.76
--------
Symbol: ACXP 4 4 correct_ratio: 1.0 tomorrow_pred [0.68, 0.31] yesterday_closing: 3.42 latest_price: 3.39
--------
Symbol: ADCT 10 8 correct_ratio: 0.8 tomorrow_pred [0.66, 0.28] yesterday_closing: 13.28 latest_price: 13.5271
--------
Symbol: AEHR 17 15 correct_ratio: 0.88 tomorrow_pred [0.65, 0.37] yesterday_closing: 10.22 latest_price: 11.015
--------
Symbol: AMTX 13 12 correct_ratio: 0.92 tomorrow_pred [0.66, 0.31] yesterday_closing: 10.63 latest_price: 10.93
--------
Symbol: AERI 11 10 correct_ratio: 0.91 tomorrow_pred [0.62, 0.3] yesterday_closing: 8.34 latest_price: 8.775
--------
Symbol: AGEN 8 6 correct_ratio: 0.75 tomorrow_

#Finding daily prices for a certain stock

In [None]:
stock_name="FB"
test_data = rs.robinhood.stocks.get_stock_historicals([stock_name], interval="day", span="3month")
#dropbox_data = rs.stocks.get_stock_historicals()
#test_data[:10]
print(len(test_data))
print(test_data)

404 Client Error: Not Found for url: https://api.robinhood.com/quotes/historicals/?symbols=FB&interval=day&span=3month&bounds=regular
1
[None]


In [None]:
all_symbols=[]
stock_groups=["sp500","nyse","nasdaq","forbes2000"]
for stg in stock_groups:
  cur_root_dir=os.path.join("stock_market_data",stg,"csv")
  cur_symbols=[v.split(".")[0] for v in os.listdir(cur_root_dir)]
  all_symbols.extend(cur_symbols)
all_symbols=sorted(list(set(all_symbols)))
all_stock_info_items=[]
# cur_symbols=[v.split(".")[0] for v in os.listdir(root_dir)]
print(len(all_symbols),all_symbols[:10])

3569 ['A', 'AAALY', 'AAC', 'AACAY', 'AAL', 'AAME', 'AAOI', 'AAON', 'AAP', 'AAPL']


In [None]:
#check the account and the current stock positions
import requests
# rs.robinhood.

#dir(rs.orders.stocks.urls)
#rs.profiles.load_basic_profile()
#rs.profiles.load_portfolio_profile()
#rs.profiles.load_investment_profile()
#rs.profiles.load_user_profile()
#rs.account.get_all_positions()
positions=[rs.robinhood.account.get_open_stock_positions()]
for p0 in positions[0]:
  stock_url=p0["instrument"]
  average_buy_price=p0["average_buy_price"]
  quantity=p0["quantity"]
  created_at=p0["created_at"]
  res=requests.get(stock_url)
  stock_dict=res.json()
  symbol=stock_dict["symbol"]
  price=float(rs.robinhood.stocks.get_latest_price(symbol)[0])
  print("symbol",symbol,"quantity",quantity)
  print("average_buy_price:", average_buy_price,"current price:",price)
  print("created_at:",created_at)
  #print(p0)
  print("----")

symbol STWD quantity 14.26504800
average_buy_price: 14.0203 current price: 23.58
created_at: 2020-10-29T14:12:53.010228Z
----


In [None]:
rs.robinhood.stocks.find_instrument_data("MSFT")

Found 1 results


[{'bloomberg_unique': 'EQ0010174300001000',
  'country': 'US',
  'day_trade_ratio': '0.2500',
  'default_collar_fraction': '0.05',
  'extended_hours_fractional_tradability': False,
  'fractional_tradability': 'tradable',
  'fundamentals': 'https://api.robinhood.com/fundamentals/MSFT/',
  'id': '50810c35-d215-4866-9758-0ada4ac79ffa',
  'ipo_access_cob_deadline': None,
  'ipo_access_status': None,
  'ipo_access_supports_dsp': False,
  'ipo_roadshow_url': None,
  'ipo_s1_url': None,
  'is_spac': False,
  'is_test': False,
  'list_date': '1987-09-17',
  'maintenance_ratio': '0.2500',
  'margin_initial_ratio': '0.5000',
  'market': 'https://api.robinhood.com/markets/XNAS/',
  'min_tick_size': None,
  'name': 'Microsoft Corporation Common Stock',
  'quote': 'https://api.robinhood.com/quotes/MSFT/',
  'rhs_tradability': 'tradable',
  'simple_name': 'Microsoft',
  'splits': 'https://api.robinhood.com/instruments/50810c35-d215-4866-9758-0ada4ac79ffa/splits/',
  'state': 'active',
  'symbol': 'M

#Dump

In [None]:
  # for fname in sample_files:
  #   t0=time.time()
  #   #print("epoch0",epoch0, "fname",fname)
  #   tmp_path=os.path.join(tmp_model_dir, "model-%s.model"%fname)
  #   if os.path.exists(tmp_path):
  #     checkpoint = torch.load(tmp_path)
  #     rnn.load_state_dict(checkpoint['model_state_dict'])
  #     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  #     print("loaded model for this epoch",tmp_path)
  #     continue  
  #   #
  #   cur_path=os.path.join(root_dir,fname+".csv")
  #   cur_train0,cur_test0=get_norm_close(cur_path,prev_n,next_n,train_ratio=0.8)
  #   if n_train!=None: cur_train0=cur_train0[:n_train]
  #   if n_test!=None: cur_test0=cur_test0[:n_test]


#test filtering NA

In [None]:

def get_pd_col_data(pd_frame0,col_names0):
  all_col_data=[]
  for index0,row_dict0 in pd_frame0.iterrows():
    cur_list=[row_dict0.get(v,0.) for v in col_names0]
    valid_row=True
    for cl in cur_list: 
      if not type(cl) is float: valid_row=False
    if not valid_row: continue
    all_col_data.append(cur_list)
  return all_col_data

root_dir='stock_market_data/sp500/csv'
files=['ABC', 'ADP', 'A', 'ABT', 'ABMD', 'ADI', 'ABBV', 'AAPL', 'ADSK', 'ADM', 'ACN', 'AAP', 'AAL', 'ALGN', 'APH', 'AOS', 'AWK', 'ALLE', 'AME', 'APD', 'ARE', 'AIZ', 'ALB', 'APA', 'ALK', 'AEE', 'AMGN', 'ANTM', 'AEP', 'AON', 'AKAM', 'AXP', 'AMD', 'AMAT', 'AMP', 'ANET', 'AJG', 'AZO', 'ATVI', 'AMZN', 'AMT', 'AVB', 'ALTR', 'AVY', 'CAH', 'CDNS', 'BIO', 'CDE', 'BXP', 'BK', 'BEN', 'C', 'BMRA', 'BAX', 'BLK', 'BF-A', 'BDX', 'BR', 'BSHI', 'CB', 'CAG', 'BIIB', 'BAC', 'BMY', 'CCI', 'BSX', 'CAT', 'BRK-A', 'BBY', 'BA', 'BWA', 'CME', 'CNWT', 'CF', 'CTXS', 'D', 'CTSH', 'CHD', 'CFG', 'DFS', 'CPICQ', 'DG', 'CRM', 'CHRW', 'CLX', 'DGX', 'CPB', 'COTY', 'CHTR', 'COP', 'CNC', 'CNP', 'DE', 'COO', 'CUK', 'CPRT', 'COST', 'CINF', 'CMG', 'CL', 'CTQ', 'CTAS', 'CMI', 'CSCO', 'COWN', 'DAL', 'DTE', 'ENS', 'EQIX', 'DRE', 'DOV', 'DHI', 'EW', 'ES', 'EQR', 'DIS', 'DPZ', 'FANG', 'EXR', 'EMR', 'DLTR', 'EMN', 'FAST', 'DVA', 'EBAY', 'EA', 'DRI', 'EOG', 'EL', 'ESS', 'EIX', 'DXCM', 'EFX', 'F', 'ECL', 'ED', 'GS-PJ', 'GILD', 'GIS', 'FMBM', 'FPLPF', 'GM', 'FBHS', 'HBAN', 'FLS', 'FIS', 'FE', 'FRT', 'FRMC', 'FFIV', 'GWW', 'GRMN', 'GGG', 'FN', 'GOOG', 'GPC', 'FLT', 'FITB', 'FCX', 'FISV', 'GPN', 'FMC', 'FRC', 'HAL', 'FDX', 'FCGN', 'FB', 'GE', 'FTI', 'GD', 'HAS', 'HD', 'INTU', 'IFF', 'IRM', 'ICE', 'HLT', 'IDXX', 'HII', 'ILMN', 'HTLF', 'HPQ', 'HON', 'IBM', 'IPGP', 'HCA', 'HRL', 'IR', 'HSY', 'HOLX', 'ISRG', 'HPE', 'HRB', 'HSIC', 'INTH', 'HFC', 'HBI', 'HUM', 'IP', 'HST', 'IEX', 'HES', 'KSU', 'LNT', 'KRA', 'KHC', 'KR', 'KMB', 'JKHY', 'JNJ', 'IT', 'LEG', 'ITW', 'KSS', 'KEY', 'JNPR', 'LKQ', 'KIM', 'IVZ', 'KO', 'LNC', 'JBHT', 'LDOS', 'KMX', 'LMT', 'K', 'JPM', 'KGNR', 'KACPF', 'JCI', 'LH', 'KEYS', 'LBTYA', 'MSFT', 'MKTX', 'LYB', 'MCO', 'MRO', 'MDLZ', 'MLM', 'LVS', 'LRCX', 'MSCI', 'MOS', 'MRK', 'MET', 'MGM', 'MNST', 'MMC', 'MO', 'MCHP', 'LYV', 'MHK', 'MDT', 'LUV', 'MCK', 'MS-PF', 'MMM', 'MAA', 'MCD', 'MPC', 'MAR', 'LOW', 'MRCR', 'NOV', 'PEG', 'NVRO', 'NTRR', 'MU', 'NTRA', 'PAYX', 'NSC', 'NRG', 'ODFL', 'NTAP', 'PBCT', 'NFLX', 'ORLY', 'OMC', 'NTRS', 'NCTKF', 'NOXL', 'OKE', 'NI', 'NVR', 'NOC', 'O', 'NOW', 'PCAR', 'NEE', 'NLSN', 'NWL', 'MSI', 'NDAQ', 'NMHLY', 'OXY', 'NOK', 'NEOG', 'NCLH', 'RF', 'PSX', 'RE', 'PPG', 'ROK', 'PXD', 'RIBT', 'RCL', 'REGN', 'RMD', 'PKI', 'RL', 'RJF', 'PG', 'QRVO', 'REG', 'PHM', 'PNWRF', 'PKG', 'PNW', 'PLD', 'PVH', 'PM', 'PNR', 'PWR', 'PH', 'RLI', 'PEP', 'PRU', 'PFE', 'RHI', 'ROST', 'TAP', 'TEL', 'SRG', 'SLG', 'RSG', 'SYK', 'SNPS', 'SCHW', 'SHW', 'RXMD', 'SEGXF', 'SWKS', 'SBUX', 'RSNHF', 'SWK', 'SONC', 'ROP', 'STZ-B', 'TCYSF', 'STT', 'SPG', 'SYF', 'T', 'STX', 'SIVB', 'SO', 'ROL', 'TJX', 'SEE', 'SLB', 'SRE', 'VZ', 'UNP', 'TMUS', 'TRAUF', 'V', 'TW', 'VRSK', 'TWTR', 'URI', 'ULTA', 'UPS', 'UDR', 'TSN', 'UAL', 'TSCO', 'TTWO', 'VRSN', 'UA', 'TMO', 'WBA', 'TXN', 'UNM', 'USB', 'TXT', 'VMC', 'WAT', 'UHS', 'UEEC', 'VTR', 'TYL', 'TROW', 'TRV', 'VFC', 'WYNN', 'WSPOF', 'WU', 'YUM', 'XYL', 'WST', 'WRK', 'WEC', 'WM', 'ZTS', 'ZBH', 'XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']

file_i=55
cols=["High","Low","Open","Close"]
for file_i in range(100):
  fname="%s.csv"%files[file_i]
  print(file_i,fname)
  cur_fpath=os.path.join(root_dir,fname)
  pd_df=pd.read_csv(cur_fpath)
  content_2d=get_pd_col_data(pd_df,cols)

  for a in content_2d[:10]:
    print(a)
  print("========")

0 ABC.csv
[2.96875, 2.8125, 2.9375, 2.953125]
[2.96875, 2.84375, 2.90625, 2.84375]
[2.9375, 2.84375, 2.84375, 2.890625]
[2.90625, 2.84375, 2.90625, 2.84375]
[2.90625, 2.84375, 2.84375, 2.875]
[2.90625, 2.84375, 2.84375, 2.859375]
[2.875, 2.84375, 2.84375, 2.859375]
[2.875, 2.84375, 2.84375, 2.875]
[2.953125, 2.84375, 2.84375, 2.953125]
[2.953125, 2.875, 2.890625, 2.9375]
1 ADP.csv
[0.8271409869194031, 0.8085539937019348, 0.0, 0.8178470134735107]
[0.8302389979362488, 0.814749002456665, 0.0, 0.8271409869194031]
[0.8426309823989868, 0.8271409869194031, 0.0, 0.8426309823989868]
[0.8612179756164551, 0.8395329713821411, 0.0, 0.8395329713821411]
[0.848825991153717, 0.8364350199699402, 0.0, 0.8364350199699402]
[0.8426309823989868, 0.8302389979362488, 0.0, 0.8302389979362488]
[0.8426309823989868, 0.8271409869194031, 0.0, 0.8302389979362488]
[0.8426309823989868, 0.8271409869194031, 0.0, 0.8426309823989868]
[0.8302389979362488, 0.8116520047187805, 0.0, 0.8116520047187805]
[0.8426309823989868, 0.8