<a href="https://colab.research.google.com/github/hmghaly/km/blob/main/stock_trading_support.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Start

In [1]:
from google.colab import drive
import os
drive.mount('/content/drive')
cwd='/content/drive/MyDrive/stocks' #directory where we keep the data
os.chdir(cwd)

Mounted at /content/drive


#Network Definition

In [2]:
import torch
from torch import nn
import torch.optim as optim
import random

torch.manual_seed(1)
random.seed(1)

#device = torch.device('cpu')
device = torch.device('cuda')

class RNN(nn.Module):
  def __init__(self, input_size, hidden_size, output_size,num_layers, matching_in_out=False, apply_sigmoid=True, apply_softmax=False, batch_size=1):
    super(RNN, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.num_layers = num_layers
    self.batch_size = batch_size
    self.apply_softmax=apply_softmax
    self.apply_sigmoid=apply_sigmoid
    self.matching_in_out = matching_in_out #length of input vector matches the length of output vector 
    self.lstm = nn.LSTM(input_size, hidden_size,num_layers)
    self.hidden2out = nn.Linear(hidden_size, output_size)
    if self.apply_softmax: self.softmax =nn.Softmax(dim=2)
    if self.apply_sigmoid: self.sigmoid =nn.Sigmoid() 
    
    #self.sigmoid = torch.sigmoid(dim=1)
    self.hidden = self.init_hidden()
  def forward(self, feature_list):
    self.hidden = self.init_hidden() ### check
    feature_list=torch.tensor(feature_list)
    feature_list=feature_list.to(device) #### <<<<<<<<<<<<<<<<< 
    if self.matching_in_out:
      lstm_out, _ = self.lstm( feature_list.view(len( feature_list), 1, -1))
      output_scores = self.hidden2out(lstm_out.view(len( feature_list), -1))
      if self.apply_sigmoid: output_scores=self.sigmoid(output_scores).to(device)
      elif self.apply_softmax: output_scores=self.softmax(output_scores).to(device)
      #output_scores = torch.sigmoid(output_space) #we'll need to check if we need this sigmoid
      return output_scores #output_scores
    else:
      outs=[]
      for i in range(len(feature_list)):
        cur_ft_tensor=feature_list[i]#.view([1,1,self.input_size])
        cur_ft_tensor=cur_ft_tensor.view([1,1,self.input_size])
        lstm_out, self.hidden = self.lstm(cur_ft_tensor, self.hidden)
        outs=self.hidden2out(lstm_out)
        if self.apply_sigmoid: outs = self.sigmoid(outs).to(device) #self.sigmoid =nn.Sigmoid()
        elif self.apply_softmax: outs = self.softmax(outs).to(device)
        
      return outs
  def init_hidden(self):
    #return torch.rand(self.num_layers, self.batch_size, self.hidden_size)
    return (torch.rand(self.num_layers, self.batch_size, self.hidden_size).to(device),
            torch.rand(self.num_layers, self.batch_size, self.hidden_size).to(device))

n_input=1
n_output=3
n_hidden =64#64
n_layers=2
LR=0.01

#rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False).to(device)
rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False).to(device)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all cnn parameters

# actual_out=[1.,0.,1.]
# print(rnn)
# for a in range(10):
#   #rnn.zero_grad()
#   input_list=[-2.,5.2,-1.1,3.2,2,8.9,-7.3]
#   input_tensor=torch.tensor(input_list)
#   actual_out_tensor=torch.tensor(actual_out).to(device)
#   #input_tensor=torch.rand((5, n_input)).to(device)
#   rnn_output = rnn(input_tensor).to(device)
#   print("Input:",input_tensor)
#   #print("Output:", output.shape)
#   #rnn_output_list=rnn_output.ravel().tolist()
#   rnn_output_list=rnn_output.tolist()
#   print(rnn_output_list)
#   #print([round(v,4) for v in rnn_output_list])
#   loss = loss_func(actual_out_tensor.ravel(), rnn_output.ravel()) #calculate the loss, difference between the output and the desired outcome tensors
#   loss.backward()
#   optimizer.step()
#   print("-------")

#Main Functions

In [3]:
import os
import pandas as pd
import numpy as np

#define functions to extract features and process labels
def get_diff_percent(val_list,ref_val0): #ref_val is the present val
  out_vals=[]
  for val0 in val_list:
    try:
      diff0=val0-ref_val0
      precent0=100*(diff0/ref_val0)
      out_vals.append(round(precent0,2))
    except: pass
  return out_vals

class io_cls: #input to output: category >< onehot
  def __init__(self,spacing=2,max_val=10): #for a general purpose, this can be where we define the labels
    self.spacing=spacing
    self.max_val=max_val
    self.all_labels=[]
    for mv_val0 in range(-max_val,max_val+1,spacing):self.all_labels.append(str(mv_val0))
    self.n_labels=len(self.all_labels)
  def one_hot(self,val_list,ref_val): #and this is when we convert from categorical to one hot
    self.diff_list=[]
    self.one_hot_list=[]
    for val0 in val_list:
      diff0=val0-ref_val
      precent0=100*(diff0/ref_val)
      precent0_norm=int(round(precent0/self.spacing)*self.spacing) #int(round(spacing*precent0)/spacing)
      if precent0_norm<=-self.max_val: diff_str=str(-self.max_val)
      elif precent0_norm>=self.max_val: diff_str=str(self.max_val)
      else: diff_str=str(precent0_norm)
      self.diff_list.append(diff_str)
      tmp_one_hot_vals=[0.]*len(self.all_labels)
      if diff_str in self.all_labels: 
        tmp_i=self.all_labels.index(diff_str)
        tmp_one_hot_vals[tmp_i]=1.
      self.one_hot_list.append(tmp_one_hot_vals)
    return self.one_hot_list
  def out2labels(self,rnn_flat_out): #a flat rnn output to split into slices, and get the label weights for each slice - and then from one hot to categorical
    final_list=[]
    n_slices=int(len(rnn_flat_out)/len(self.all_labels))
    for i0 in range(n_slices):
      i1=i0+1
      cur_slice=rnn_flat_out[i0*len(self.all_labels):i1*len(self.all_labels)]
      tmp_list=[]
      for lb0,cs0 in zip(self.all_labels,cur_slice): tmp_list.append((lb0,cs0))
      tmp_list.sort(key=lambda x:-x[-1])
      final_list.append(tmp_list)
    return final_list

#Getting the input
def get_norm_close(fpath,prev_n0=20,next_n0=10,train_ratio=0.75):
  pd_df=pd.read_csv(fpath)
  close_col=pd_df["Close"].fillna(0)
  #close_col=pd_df.dropna(subset=['Close'], how='all', inplace=True)
  #close_col = pd_df[pd_df['Close'].notna()]
  data_len=len(close_col)
  all_data=[]
  for test_i in range(prev_n0,len(close_col)-next_n0):
    prev_items=close_col[test_i-prev_n0:test_i].to_list() #[0,1,2,3,4,5,6,7,8,9] predict the closing today and the following next_n-1 days
    next_items=close_col[test_i:test_i+next_n0].to_list()
    all_data.append((prev_items,next_items))    
  train_size=int(train_ratio*data_len)
  train_data=all_data[:train_size]
  test_data=all_data[train_size:]
  return train_data,test_data


cur_test_list=[12,9,15,10,13,14,12]
cur_ref_val=12
test_out=get_diff_percent(cur_test_list,cur_ref_val)
print(test_out)

[0.0, -25.0, 25.0, -16.67, 8.33, 16.67, 0.0]


#Loading Data

In [None]:
prev_n,next_n=20,10

root_dir='stock_market_data/sp500/csv'
sample_files=["AAPL","GOOG","FB","AMZN","EA","IBM","MSFT","GM","UPS","PG"]
for fname in sample_files:
  cur_fname= fname+".csv"
  cur_fpath=os.path.join(root_dir,cur_fname)
  print(cur_fpath)
  cur_train0,cur_test0=get_norm_close(cur_fpath) #stock_market_data/sp500/csv/AAPL.csv
  print(cur_fname, len(cur_train0),len(cur_test0))

stock_market_data/sp500/csv/AAPL.csv
AAPL.csv 7786 2566
stock_market_data/sp500/csv/GOOG.csv
GOOG.csv 3303 1071
stock_market_data/sp500/csv/FB.csv
FB.csv 1839 583
stock_market_data/sp500/csv/AMZN.csv
AMZN.csv 4672 1528
stock_market_data/sp500/csv/EA.csv
EA.csv 6123 2012
stock_market_data/sp500/csv/IBM.csv
IBM.csv 9861 3257
stock_market_data/sp500/csv/MSFT.csv
MSFT.csv 6792 2234
stock_market_data/sp500/csv/GM.csv
GM.csv 2121 678
stock_market_data/sp500/csv/UPS.csv
UPS.csv 4201 1371
stock_market_data/sp500/csv/PG.csv
PG.csv 9861 3257


In [None]:
train_i=130
for train_i in range(100):
  print(train_i)
  prev_vals,next_vals=cur_train0[train_i]
  prev_vals=[round(v,2) for v in prev_vals]
  next_vals=[round(v,2) for v in next_vals]
  prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
  min_val=min(next_percents)
  min_val_index=next_percents.index(min_val)
  max_val=max(next_percents)
  max_val_index=next_percents.index(max_val)
  rebound_max_val=max(next_percents[min_val_index:])
  rebound_diff=round(rebound_max_val-min_val,2)

  print("prev_vals:", prev_vals)
  print("next_vals:", next_vals)
  #print(prev_percents)
  print("Next percent differences:", next_percents)
  print("min val:",min_val, "@ index:",min_val_index)
  print("max val:",max_val, "@ index:",max_val_index)
  print("max rebound val:",rebound_max_val)
  print("rebound_diff:",rebound_diff)
  print("---------")

#print("min val:",min_next_diff_val, "@ index:",min_val_index, "max val:",max_next_diff_val, "rebound_max_val:",rebound_max_val)

0
prev_vals: [1.72, 1.73, 1.72, 1.75, 1.77, 1.76, 1.75, 1.74, 1.75, 1.75, 1.77, 1.75, 1.76, 1.77, 1.81, 1.76, 1.73, 1.74, 1.73, 1.72]
next_vals: [1.73, 1.72, 1.75, 1.75, 1.76, 1.78, 1.8, 1.79, 1.8, 1.81]
Next percent differences: [0.58, 0.0, 1.74, 1.74, 2.33, 3.49, 4.65, 4.07, 4.65, 5.23]
min val: 0.0 @ index: 1
max val: 5.23 @ index: 9
max rebound val: 5.23
rebound_diff: 5.23
---------
1
prev_vals: [1.73, 1.72, 1.75, 1.77, 1.76, 1.75, 1.74, 1.75, 1.75, 1.77, 1.75, 1.76, 1.77, 1.81, 1.76, 1.73, 1.74, 1.73, 1.72, 1.73]
next_vals: [1.72, 1.75, 1.75, 1.76, 1.78, 1.8, 1.79, 1.8, 1.81, 1.79]
Next percent differences: [-0.58, 1.16, 1.16, 1.73, 2.89, 4.05, 3.47, 4.05, 4.62, 3.47]
min val: -0.58 @ index: 0
max val: 4.62 @ index: 8
max rebound val: 4.62
rebound_diff: 5.2
---------
2
prev_vals: [1.72, 1.75, 1.77, 1.76, 1.75, 1.74, 1.75, 1.75, 1.77, 1.75, 1.76, 1.77, 1.81, 1.76, 1.73, 1.74, 1.73, 1.72, 1.73, 1.72]
next_vals: [1.75, 1.75, 1.76, 1.78, 1.8, 1.79, 1.8, 1.81, 1.79, 1.79]
Next percent 

#Starting Training

In [None]:
import time, math, random
from random import shuffle

model_name="exp6-pred1-combined-stocks"
model_name="exp6-pred1-combined-stocks1-3layer-full"
model_name="exp6-pred1-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-4layer-batches2"
n_input=1
n_output=1
n_output=2 #>5, min <5 & rebound >5
n_hidden =64#64
n_layers=4#3
n_epochs=100
LR=0.0000001
prev_n,next_n=20,10
n_train,n_test=None,None
#n_train,n_test=1000,50
train_batch_size=10000

test_cutoff_val=0.5

torch.manual_seed(1)
random.seed(1)

def extract_labels(next_percents0):
  if next_percents0==[]: return [0.,0.]
  min_val0=min(next_percents0)
  #print("min_val0",min_val0)
  min_val_index=next_percents0.index(min_val0)
  max_val=max(next_percents0[1:])
  max_val_index=next_percents0.index(max_val)
  rebound_max_val=max(next_percents0[min_val_index:])
  rebound_diff=round(rebound_max_val-min_val0,2)
  max_greater_than_5=0.
  found_minus_5=False
  found_rebound_greater_than_5=False
  cur_min_val=None
  for i0, percent_val in enumerate(next_percents0):
    if percent_val<-5 and found_minus_5==False: 
      found_minus_5=True
      if cur_min_val==None or percent_val<cur_min_val: cur_min_val=percent_val
      #print("percent_val",percent_val,"found_minus_5",found_minus_5)
      continue
    if cur_min_val!=None and percent_val-cur_min_val>5: 
      found_rebound_greater_than_5=True
      break
  min_5_rebound_greater_than_5=0.
  if max_val>5: max_greater_than_5=1.
  if found_rebound_greater_than_5: min_5_rebound_greater_than_5=1.
  actual_out0=[max_greater_than_5,min_5_rebound_greater_than_5]
  return actual_out0


root_dir='stock_market_data/sp500/csv'
initial_files=["AAPL","GOOG","FB","AMZN","EA","IBM","MSFT","GM","UPS","PG"]
#cur_path=os.path.join(root_dir,"AAPL.csv")
all_files=[v.split(".")[0] for v in os.listdir(root_dir) if v.endswith(".csv")]
additional_files=[v for v in all_files if not v in initial_files]
sample_files=initial_files+additional_files[:90]

#cur_train0,cur_test0=get_norm_close("stock_market_data/sp500/csv/AAPL.csv",prev_n,next_n) #stock_market_data/sp500/csv/AAPL.csv



#rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False).to(device)
rnn = RNN(n_input, n_hidden, n_output,n_layers,matching_in_out=False).to(device)
loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LR)   # optimize all cnn parameters


model_dir=os.path.join(cwd,"models", model_name) 
tmp_model_dir=os.path.join(cwd,"models", model_name,"tmp") 
if not os.path.exists(tmp_model_dir): os.makedirs(tmp_model_dir)
log_fpath=os.path.join(model_dir,"log.txt")
log_fopen=open(log_fpath,"a")
log_fopen.write(str(rnn)+"\n")

print("loading data")
all_training,all_testing=[],[]
for fname in sample_files:
  cur_path=os.path.join(root_dir,fname+".csv")
  cur_train0,cur_test0=get_norm_close(cur_path,prev_n,next_n,train_ratio=0.8)
  if n_train!=None: cur_train0=cur_train0[:n_train]
  if n_test!=None: cur_test0=cur_test0[:n_test]
  all_training.extend(cur_train0)
  all_testing.extend(cur_test0)
shuffle(all_training)
shuffle(all_testing)
print("all_training", len(all_training),"all_testing",len(all_testing))
n_batches=math.floor(len(all_training)/train_batch_size)
test_batch_size=math.floor(len(all_testing)/n_batches)


for epoch0 in range(n_epochs):
  PATH=os.path.join(model_dir, "model-%s.model"%epoch0)
  if os.path.exists(PATH):
    checkpoint = torch.load(PATH)
    rnn.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print("loaded model for this epoch",PATH)
    for a,b in  checkpoint.items():
      if "loss" in a.lower(): print(a,round(b,6))
    continue  
  print("epoch0",epoch0)
  for batch_i0 in range(n_batches+1):
    t0=time.time()
    pred_count,correct_count=0,0
    batch_i1=batch_i0+1
    cur_train_items=all_training[batch_i0*train_batch_size:batch_i1*train_batch_size]
    cur_test_items=all_testing[batch_i0*test_batch_size:batch_i1*test_batch_size]
    print("batch_i0",batch_i0, "cur_train_items",len(cur_train_items),"cur_test_items",len(cur_test_items))
    tmp_path=os.path.join(tmp_model_dir, "model-batch-%s.model"%batch_i0)
    if os.path.exists(tmp_path):
      checkpoint = torch.load(tmp_path)
      rnn.load_state_dict(checkpoint['model_state_dict'])
      optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
      print("loaded model for this epoch",tmp_path)
      continue  

    total_train_loss,total_test_loss=0,0
    train_counter,test_counter=0,0
    test_pred_counter,test_correct_counter=0,0 #how many test items reach the cutoff val for prediction, how many are correctly predicted
    #for train_i in range(1500):
    for train_i, train_item in enumerate(cur_train_items):
      if train_i%2000==0: print("train_i",train_i)
      #print(train_i)
      prev_vals,next_vals=train_item# cur_train0[train_i]
      prev_vals=[round(v,2) for v in prev_vals]
      next_vals=[round(v,2) for v in next_vals]
      prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
      if prev_percents==[] or next_percents==[]: continue
      actual_out=extract_labels(next_percents)
      # print("cur_actual", cur_actual, next_percents)
      # print("--------")
      # continue
      # min_val=min(next_percents)
      # min_val_index=next_percents.index(min_val)
      # max_val=max(next_percents)
      # max_val_index=next_percents.index(max_val)
      # rebound_max_val=max(next_percents[min_val_index:])
      # rebound_diff=round(rebound_max_val-min_val,2)
      
      # max_gr_5=0.
      # if max_val>5: max_gr_5=1.
      # actual_out=[max_gr_5]
      #prev_percents=[math.log(v) for v in prev_percents] #testing
      #if prev_percents==[]: continue
      input_tensor=torch.tensor(prev_percents)
      actual_out_tensor=torch.tensor(actual_out).to(device)
      rnn_output = rnn(input_tensor).to(device)
      rnn_output_list=rnn_output.tolist()
      loss = loss_func(actual_out_tensor.ravel(), rnn_output.ravel()) #calculate the loss, difference between the output and the desired outcome tensors
      # if epoch0>3 and train_i<50:
      #   print("rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
      #print(loss)
      loss.backward()
      optimizer.step()
      total_train_loss+=loss.item()
      train_counter+=1

    for test_i, test_item in enumerate(cur_test_items):
      #print(train_i)
      if test_i%1000==0: print("test_i",test_i)
      rnn.zero_grad()
      prev_vals,next_vals=test_item# cur_train0[train_i]
      prev_vals=[round(v,2) for v in prev_vals]
      next_vals=[round(v,2) for v in next_vals]
      prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
      if prev_percents==[] or next_percents==[]: continue
      actual_out=extract_labels(next_percents)

      # continue
      # min_val=min(next_percents)
      # min_val_index=next_percents.index(min_val)
      # max_val=max(next_percents)
      # max_val_index=next_percents.index(max_val)
      # rebound_max_val=max(next_percents[min_val_index:])
      # rebound_diff=round(rebound_max_val-min_val,2)
      
      # max_gr_5=0.
      # if max_val>5: max_gr_5=1.
      # actual_out=[max_gr_5]
      input_tensor=torch.tensor(prev_percents)
      actual_out_tensor=torch.tensor(actual_out).to(device)
      rnn_output = rnn(input_tensor).to(device)
      rnn_output_list=rnn_output.ravel().tolist()
      loss = loss_func(actual_out_tensor.ravel(), rnn_output.ravel()) #calculate the loss, difference between the output and the desired outcome tensors
      predicted_increase,predicted_rebound=rnn_output_list
      actual_increase,actual_rebound=actual_out
      if predicted_increase>0.6 or predicted_rebound>0.4:
        pred_count+=1
        if actual_increase>0.5 or actual_rebound>0.5: correct_count+=1
        print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())


      # if test_i<50:
      #   print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
      #print(loss)
      pred_val=sum(rnn_output_list)/len(rnn_output_list)
      if pred_val>=test_cutoff_val and False:
        test_pred_counter+=1
        #if pred_val>0.5
        print("pred:", round(pred_val,2), "actual:", sum(actual_out))
        print("prev_percents",prev_percents)
        print("next_percents",next_percents)
        print("-------------")
      total_test_loss+=loss.item()
      test_counter+=1


    avg_train_loss=round(total_train_loss/train_counter,6)
    avg_test_loss=round(total_test_loss/test_counter,6)
    correct_ratio=0
    if pred_count>0: correct_ratio=round(correct_count/pred_count,2)
    print("pred_count",pred_count,"correct_count",correct_count,"correct_ratio",correct_ratio)
    # print("epoch0",epoch0, fname, "avg_train_loss",avg_train_loss, "avg_test_loss",avg_test_loss)
    # print("-------")
    
    t1=time.time()
    elapsed=round(t1-t0,2) 
    t0=time.time()    
    line="Epoch # %s - Batch: %s -  train loss: %s - test loss: %s - elapsed: %s"%(epoch0, batch_i0, avg_train_loss,avg_test_loss, elapsed)
    #line="Epoch # %s  -  train loss: %s - test loss: %s - elapsed: %s"%(epoch0, avg_train_loss,avg_test_loss, elapsed)
    print(line)
    log_fopen=open(log_fpath,"a")
    log_fopen.write(line+"\n")
    log_fopen.close() 
    cur_checkpoint={
            'epoch': epoch0,
            'n_input': n_input,
            'n_hidden': n_hidden,
            'n_layers': n_layers,
            'n_output': n_output,
            'LR': LR,
            'model_state_dict': rnn.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': avg_train_loss,
            'test_loss': avg_test_loss
            }
    torch.save(cur_checkpoint, tmp_path)
  
  torch.save(cur_checkpoint, PATH)  
  print("model saved")
  for f in os.listdir(tmp_model_dir):
    tmp_fpath=os.path.join(tmp_model_dir,f)
    os.remove(tmp_fpath)
  print("deleted temporary files")
  print("-----------")


loading data
all_training 632849 all_testing 155258
loaded model for this epoch /content/drive/MyDrive/stocks/models/exp6-pred2-combined-stocks1-4layer-batches2/model-0.model
train_loss 0.129435
test_loss 0.130063
loaded model for this epoch /content/drive/MyDrive/stocks/models/exp6-pred2-combined-stocks1-4layer-batches2/model-1.model
train_loss 0.127988
test_loss 0.12422
loaded model for this epoch /content/drive/MyDrive/stocks/models/exp6-pred2-combined-stocks1-4layer-batches2/model-2.model
train_loss 0.127113
test_loss 0.120902
epoch0 3
batch_i0 0 cur_train_items 10000 cur_test_items 2464
loaded model for this epoch /content/drive/MyDrive/stocks/models/exp6-pred2-combined-stocks1-4layer-batches2/tmp/model-batch-0.model
batch_i0 1 cur_train_items 10000 cur_test_items 2464
loaded model for this epoch /content/drive/MyDrive/stocks/models/exp6-pred2-combined-stocks1-4layer-batches2/tmp/model-batch-1.model
batch_i0 2 cur_train_items 10000 cur_test_items 2464
train_i 0




train_i 2000
train_i 4000
train_i 6000
train_i 8000
test_i 0
40 rnn out: [0.6386612057685852, 0.3049921691417694] actual: [1.0, 0.0] loss: 0.11179296672344208
92 rnn out: [0.675680935382843, 0.34003931283950806] actual: [0.0, 1.0] loss: 0.4460464119911194
134 rnn out: [0.6416076421737671, 0.2823859751224518] actual: [0.0, 0.0] loss: 0.24570110440254211
148 rnn out: [0.614881694316864, 0.2910372316837311] actual: [0.0, 0.0] loss: 0.2313910871744156
171 rnn out: [0.6062631607055664, 0.24598395824432373] actual: [1.0, 0.0] loss: 0.10776840150356293
176 rnn out: [0.6710989475250244, 0.3122907876968384] actual: [0.0, 0.0] loss: 0.27394968271255493
281 rnn out: [0.6279201507568359, 0.29323819279670715] actual: [0.0, 0.0] loss: 0.24013617634773254
353 rnn out: [0.6046468019485474, 0.23933196067810059] actual: [1.0, 0.0] loss: 0.10679197311401367
383 rnn out: [0.6083676218986511, 0.2651468515396118] actual: [1.0, 0.0] loss: 0.11183938384056091
418 rnn out: [0.6972544193267822, 0.35021105408668

#Testing on actual data

In [5]:
import torch
e0=2
model_name="exp5-pred1-3-test"
model_name="exp6-pred1-combined-stocks"
model_name="exp6-pred1-combined-stocks1-3layer-full" 
model_name="exp6-pred1-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-3layer-batches1"
model_name="exp6-pred2-combined-stocks1-4layer-batches2"

pred_cutoff_val=0.6
torch.manual_seed(1)
random.seed(1)

model_dir=os.path.join(cwd,"models", model_name) 
PATH=os.path.join(model_dir, "model-%s.model"%e0)
checkpoint = torch.load(PATH)
rnn = RNN(checkpoint["n_input"], checkpoint["n_hidden"] , checkpoint["n_output"] , checkpoint["n_layers"] , matching_in_out=False).to(device)
rnn.load_state_dict(checkpoint['model_state_dict'])
rnn.eval()
root_dir='stock_market_data/sp500/csv'
sample_test_files=['XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']
#sample_test_files=["AAPL","GOOG","FB","AMZN","EA","IBM","MSFT","GM","UPS","PG"]
for fname in sample_test_files:
  print(fname)
  pred_count,correct_count=0,0
  cur_fpath=os.path.join(root_dir,fname+".csv")
  #cur_train0,cur_test0=get_norm_close("stock_market_data/sp500/csv/AAPL.csv") #stock_market_data/sp500/csv/AAPL.csv
  cur_train0,cur_test0=get_norm_close(cur_fpath) #stock_market_data/sp500/csv/AAPL.csv
  for test_i, test_item in enumerate(cur_test0):
    #print(train_i)
    rnn.zero_grad()
    prev_vals,next_vals=test_item# cur_train0[train_i]
    prev_vals=[round(v,2) for v in prev_vals]
    next_vals=[round(v,2) for v in next_vals]
    prev_percents,next_percents=get_diff_percent(prev_vals,prev_vals[-1]),get_diff_percent(next_vals,prev_vals[-1])
    actual_out=extract_labels(next_percents)
    # min_val=min(next_percents)
    # min_val_index=next_percents.index(min_val)
    # max_val=max(next_percents)
    # max_val_index=next_percents.index(max_val)
    # rebound_max_val=max(next_percents[min_val_index:])
    # rebound_diff=round(rebound_max_val-min_val,2)
    
    # max_gr_5=0.
    # if max_val>5: max_gr_5=1.
    # actual_out=[max_gr_5]
    
    input_tensor=torch.tensor(prev_percents)
    actual_out_tensor=torch.tensor(actual_out)
    rnn_output = rnn(input_tensor)
    rnn_output_list=rnn_output.ravel().tolist()
    predicted_increase,predicted_rebound=rnn_output_list
    actual_increase,actual_rebound=actual_out
    # predicted_val=sum(rnn_output_list)
    # actual_val=sum(actual_out)
    # if test_i<50:
    #   print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
    #   #print(loss)
    #   print("--------")
    # continue

    if predicted_increase>pred_cutoff_val or predicted_rebound>0.35:
      pred_count+=1
      if actual_increase>0.5 or actual_rebound>0.5: correct_count+=1
      print(test_i, "rnn out:",rnn_output_list, "actual:", actual_out, "loss:", loss.item())
      #if actual_increase>0.5: correct_count+=1
      # print(fname, "predicted_val:",round(predicted_val,2), "actual_out:",round(actual_val,2))
      # print("next_percents",next_percents)
      # #print(rnn_output_list)
      # #print(sum(rnn_output_list))
      # print("------")
  correct_ratio=0    
  if pred_count>0: correct_ratio=round(correct_count/pred_count,2)
  print(">>>>", fname, "pred_count",pred_count,"correct_count",correct_count,"correct_ratio",correct_ratio)
  print("=========")


XOM




621 rnn out: [0.600794792175293, 0.21132026612758636] actual: [1.0, 0.0] loss: 0.07818968594074249
2771 rnn out: [0.6441137790679932, 0.2492138147354126] actual: [1.0, 0.0] loss: 0.07818968594074249
2777 rnn out: [0.6164774298667908, 0.23347288370132446] actual: [0.0, 0.0] loss: 0.07818968594074249
2778 rnn out: [0.6829203367233276, 0.2958845794200897] actual: [0.0, 0.0] loss: 0.07818968594074249
2779 rnn out: [0.6592207551002502, 0.28471821546554565] actual: [0.0, 0.0] loss: 0.07818968594074249
2780 rnn out: [0.6596035361289978, 0.28418242931365967] actual: [0.0, 0.0] loss: 0.07818968594074249
2781 rnn out: [0.6845536231994629, 0.30300453305244446] actual: [0.0, 1.0] loss: 0.07818968594074249
2782 rnn out: [0.6667603254318237, 0.29412606358528137] actual: [0.0, 1.0] loss: 0.07818968594074249
2783 rnn out: [0.6823538541793823, 0.30344557762145996] actual: [1.0, 1.0] loss: 0.07818968594074249
2784 rnn out: [0.6484724283218384, 0.30321961641311646] actual: [1.0, 1.0] loss: 0.078189685940

In [None]:
train_batch_size=3000
n_batches=math.floor(len(all_training)/train_batch_size)
test_batch_size=math.floor(len(all_testing)/n_batches)
print("train_batch_size",train_batch_size, "test_batch_size",test_batch_size, "n_batches",n_batches)

train_batch_size 3000 test_batch_size 736 n_batches 101


In [None]:
all_files=[v.split(".")[0] for v in os.listdir(root_dir) if v.endswith(".csv")]

In [None]:
all_files[-10:]
#print(test_batch_size)

['XOM', 'XEL', 'WDC', 'WRB', 'WY', 'ZION', 'WHR', 'XLEFF', 'WMB', 'XLNX']

#Dump

In [None]:
  # for fname in sample_files:
  #   t0=time.time()
  #   #print("epoch0",epoch0, "fname",fname)
  #   tmp_path=os.path.join(tmp_model_dir, "model-%s.model"%fname)
  #   if os.path.exists(tmp_path):
  #     checkpoint = torch.load(tmp_path)
  #     rnn.load_state_dict(checkpoint['model_state_dict'])
  #     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  #     print("loaded model for this epoch",tmp_path)
  #     continue  
  #   #
  #   cur_path=os.path.join(root_dir,fname+".csv")
  #   cur_train0,cur_test0=get_norm_close(cur_path,prev_n,next_n,train_ratio=0.8)
  #   if n_train!=None: cur_train0=cur_train0[:n_train]
  #   if n_test!=None: cur_test0=cur_test0[:n_test]
