In [1]:
import torch
import torch.nn as nn 
import torch.utils.data as data
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import numpy as np
import yfinance as yf
import pandas as pd

Data Pre-Processing

In [103]:
df = pd.read_csv('./data/all_stocks_5yr.csv')
#add 2 ratios
df['ratio1']=((df['close']+df['open'])/2-df['low'])/(df['high']-df['low'])
df['ratio2']=(df['close']-df['open'])/(df['high']-df['low'])
df=df[['date', 'open', 'close', 'high', 'low', 'ratio1', 'ratio2', 'volume', 'Name']]
print(df.shape)
df.head(1)

(619040, 9)


Unnamed: 0,date,open,close,high,low,ratio1,ratio2,volume,Name
0,2013-02-08,15.07,14.75,15.12,14.63,0.571429,-0.653061,8407500,AAL


In [106]:
#calculate avg and set up target
print(df['Name'].unique().shape[0])
df['avg']=(df['open']+df['close']+df['high']+df['low'])/4
df['trend_tmr_up']=None
for i in range(len(df)-1):
    if df.iloc[i, 8]==df.iloc[i+1, 8]:
        #tmr avg > today avg
        if df.iloc[i+1, 9] > df.iloc[i, 9]: 
            df.iloc[i, 10]=True
        else:
            df.iloc[i, 10]=False

df=df.dropna()
print(df.trend_tmr_up.isnull().any())
print(len(df))

df.head(1)

505
False
618523


Unnamed: 0,date,open,close,high,low,ratio1,ratio2,volume,Name,avg,trend_tmr_up
0,2013-02-08,15.07,14.75,15.12,14.63,0.571429,-0.653061,8407500,AAL,14.8925,False


Data Transforming & Loading

In [107]:
data_all = df[['open', 'close', 'high', 'low', 'ratio1', 'ratio2', 'volume', 'trend_tmr_up']].to_numpy()
train_size = 396000
valid_size = 99000
test_size = len(df)-train_size-valid_size
scaler = MinMaxScaler(feature_range=(-1, 1))

train_valid_set, test_set = data.random_split(data_all, [train_size+valid_size, test_size])

train_valid_x, train_valid_y = train_valid_set[:][:, 0:-1], train_valid_set[:][:, -1]
test_x, test_y = test_set[:][:, 0:-1], test_set[:][:, -1]

train_valid_x = scaler.fit_transform(train_valid_x)
test_x = scaler.transform(test_x)

train_valid_x = torch.from_numpy(train_valid_x.astype(np.float32)).to(torch.float32)
train_valid_y = torch.from_numpy(train_valid_y.astype(np.int8)).to(torch.int8)
test_x = torch.from_numpy(test_x.astype(np.float32)).to(torch.float32)
test_y = torch.from_numpy(test_y.astype(np.int8)).to(torch.int8)

train_valid_data = TensorDataset(train_valid_x, train_valid_y.reshape(-1,1))
test_data = TensorDataset(test_x, test_y.reshape(-1,1))

train_data, valid_data = data.random_split(train_valid_data, [train_size, valid_size])

print(f'train size is {len(train_data)}')
print(f'valid size is {len(valid_data)}')
print(f'test size is {len(test_data)}')

batch_size = 100
train_load = DataLoader(train_data, batch_size, shuffle=True)
valid_load = DataLoader(valid_data, batch_size, shuffle=True)
test_load = DataLoader(test_data, batch_size, shuffle=True)

train size is 396000
valid size is 99000
test size is 123523


In [108]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MLP

In [109]:
class StockNet(nn.Module):
    def __init__(self, input_size, hidden_size_1, hidden_size_2, output_size):
        super().__init__()
        self.l1 = nn.Linear(input_size, hidden_size_1)
        self.relu1 = nn.ReLU()
        self.l2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.relu2 = nn.ReLU()
        self.l3 = nn.Linear(hidden_size_2, output_size)

    
    def forward(self, x):
        out = self.l1(x)
        out = self.relu1(out)
        out = self.l2(out)
        out = self.relu2(out)
        out = self.l3(out)
        return out

Training

In [112]:
input_size = 7
output_size = 2
hidden_size_1 = 100
hidden_size_2 = 50
learning_rate = 0.001
num_epoch = 2

In [113]:
model = StockNet(input_size, hidden_size_1, hidden_size_2, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epoch):
    for i, (inputs, labels) in enumerate(train_load):
        inputs = inputs.to(device)
        labels = labels.reshape(len(labels))
        #################################################
        labels = labels.type(torch.LongTensor).to(device)
        #################################################
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if (i+1)%1000==0:
            print(f'epoch {epoch+1}/{num_epoch}, loss = {loss.item():.4f}')

epoch 1/2, loss = 0.6947
epoch 1/2, loss = 0.6010
epoch 1/2, loss = 0.6145
epoch 2/2, loss = 0.5300
epoch 2/2, loss = 0.6704
epoch 2/2, loss = 0.6117


Validating

In [114]:
with torch.no_grad():
    n_correct=0
    n_samples=0
    how_many_1=0
    for inputs, labels in valid_load:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)

        _, predictions = torch.max(outputs, 1)
        n_samples += labels.shape[0]
        n_correct += (predictions == labels[:, 0]).sum().item()
        how_many_1 += labels.sum()
    
    acc = 100.0 * n_correct / n_samples
    print(f'valid accuracy = {acc}')
    print(f'data balance ratio is {how_many_1/n_samples}')

valid accuracy = 69.47272727272727
data balance ratio is 0.5326666831970215


Testing

In [115]:
with torch.no_grad():
    n_correct=0
    n_samples=0
    how_many_1=0
    for inputs, labels in test_load:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)

        _, predictions = torch.max(outputs, 1)
        n_samples += labels.shape[0]
        n_correct += (predictions == labels[:, 0]).sum().item()
        how_many_1 += labels.sum()
    
    acc = 100.0 * n_correct / n_samples
    print(f'test accuracy = {acc}')
    print(f'data balance ratio is {how_many_1/n_samples}')

test accuracy = 69.50041692640237
data balance ratio is 0.5330181121826172


Single Stock Prediction

In [116]:
ticker = 'AAPL'
time_frame = '1mo'

In [117]:
aapl = yf.Ticker(ticker)
aapl_df = aapl.history(period=time_frame)
aapl_df.reset_index(inplace=True)
aapl_df.rename(columns={'Date':'date','Open':'open', 'Close':'close','High':'high', 'Low':'low','Volume':'volume'}, inplace=True)
aapl_df['Name']='AAP'
aapl_df.drop(columns='Dividends', inplace=True)
aapl_df.drop(columns='Stock Splits', inplace=True)

aapl_df['ratio1']=((aapl_df['close']+aapl_df['open'])/2-aapl_df['low'])/(aapl_df['high']-aapl_df['low'])
aapl_df['ratio2']=(aapl_df['close']-aapl_df['open'])/(aapl_df['high']-aapl_df['low'])
aapl_df['avg']=(aapl_df['open']+aapl_df['close']+aapl_df['high']+aapl_df['low'])/4
aapl_df['trend_tmr_up']=None

for i in range(len(aapl_df)-1):
    if aapl_df.iloc[i+1, 9] > aapl_df.iloc[i, 9]:
        aapl_df.iloc[i, 10]=True
    else:
        aapl_df.iloc[i, 10]=False

aapl_df.iloc[-1, -1]=3
aapl_df.head(1)

Unnamed: 0,date,open,high,low,close,volume,Name,ratio1,ratio2,avg,trend_tmr_up
0,2022-01-19,169.783686,170.862313,165.728854,166.018478,94815000,AAP,0.423151,-0.733464,168.098333,False


In [118]:
x = aapl_df[['open', 'close', 'high', 'low', 'ratio1', 'ratio2', 'volume']].to_numpy()
y = aapl_df['trend_tmr_up'].to_numpy().astype(np.int8)
x = scaler.transform(x)
x = torch.from_numpy(x.astype(np.float32)).to(torch.float32).to(device)

outputs_aapl = model(x)
_, pred_aapl = torch.max(outputs_aapl, 1)

print(f'Prediction: {pred_aapl.cpu().numpy()}')
print(f'Reality   : {y}')

Prediction: [0 0 0 1 1 0 0 1 1 1 1 0 1 0 1 1 0 0 1 1 1 0 0]
Reality   : [0 0 0 0 1 0 1 1 1 1 0 0 1 1 1 0 0 0 1 1 0 0 3]


In [81]:
def count_one(x):
    count=0
    for i in range(len(x)):
        if x[i]==1:
            count+=1
    return count

In [119]:
print(f'accuracy is {1-count_one(pred_aapl.cpu()+y)/(len(y)-1)}')

accuracy is 0.7272727272727273


Single Day Prediction

In [121]:
sss=aapl_df.loc[aapl_df['date']=='2022-02-16', ['open', 'close', 'high', 'low', 'ratio1', 'ratio2', 'volume']].to_numpy()
sss=scaler.transform(sss)
sss=torch.from_numpy(sss).to(torch.float32).to(device)
model(sss)
_, p = torch.max(model(sss), 1)
sss, p, model(sss)

(tensor([[-0.8333, -0.8330, -0.8339, -0.8342,  0.9766, -0.4021, -0.8021]],
        device='cuda:0'),
 tensor([1], device='cuda:0'),
 tensor([[-0.5018,  0.2410]], device='cuda:0', grad_fn=<AddmmBackward0>))

Something Good

In [122]:
def data_processing_yf (ticker, time_frame):
    aapl = yf.Ticker(ticker)
    aapl_df = aapl.history(period=time_frame)
    aapl_df.reset_index(inplace=True)
    aapl_df.rename(columns={'Date':'date','Open':'open', 'Close':'close','High':'high', 'Low':'low','Volume':'volume'}, inplace=True)
    aapl_df['Name']=ticker
    if 'Dividends' in aapl_df.columns:
        aapl_df.drop(columns='Dividends', inplace=True)
    if 'Stock Splits' in aapl_df.columns:
        aapl_df.drop(columns='Stock Splits', inplace=True)

    aapl_df['ratio1']=((aapl_df['close']+aapl_df['open'])/2-aapl_df['low'])/(aapl_df['high']-aapl_df['low'])
    aapl_df['ratio2']=(aapl_df['close']-aapl_df['open'])/(aapl_df['high']-aapl_df['low'])
    aapl_df['avg']=(aapl_df['open']+aapl_df['close']+aapl_df['high']+aapl_df['low'])/4
    aapl_df['trend_tmr_up']=None

    for i in range(len(aapl_df)-1):
        if aapl_df.iloc[i+1, 9] > aapl_df.iloc[i, 2]:
            aapl_df.iloc[i, 10]=True
        else:
            aapl_df.iloc[i, 10]=False

    aapl_df.iloc[-1, -1]=3
    return aapl_df

In [126]:
def test_acc (aapl_df):
    x = aapl_df[['open', 'close', 'high', 'low', 'ratio1', 'ratio2', 'volume']].to_numpy()
    y = aapl_df['trend_tmr_up'].to_numpy().astype(np.int8)
    x = scaler.transform(x)
    x = torch.from_numpy(x.astype(np.float32)).to(torch.float32).to(device)

    outputs_aapl = model(x)
    _, pred_aapl = torch.max(outputs_aapl, 1)

    #print(f'Prediction: {pred_aapl.cpu().numpy()}')
    #print(f'Reality   : {y}')
    print(f'{aapl_df.Name[0]} accuracy is {1-count_one(pred_aapl.cpu()+y)/(len(y)-1)}')

In [127]:
for i in range (50):
    if df.Name.unique()[i] in ['AET', 'AGN', 'ALXN', 'ANDV', 'APC']:
        continue
    ticker = df.Name.unique()[i] 
    time_frame = '1y'
    test_acc(data_processing_yf(ticker, time_frame))

AAL accuracy is 0.5533596837944664
AAPL accuracy is 0.5770750988142292
AAP accuracy is 0.5849802371541502
ABBV accuracy is 0.5335968379446641
ABC accuracy is 0.541501976284585
ABT accuracy is 0.5968379446640316
ACN accuracy is 0.5612648221343873
ADBE accuracy is 0.45849802371541504
ADI accuracy is 0.5375494071146245
ADM accuracy is 0.5652173913043479
ADP accuracy is 0.5138339920948616
ADSK accuracy is 0.5573122529644269
ADS accuracy is 0.5573122529644269
AEE accuracy is 0.5731225296442688
AEP accuracy is 0.5889328063241106
AES accuracy is 0.6047430830039526
AFL accuracy is 0.5454545454545454
AIG accuracy is 0.5573122529644269
AIV accuracy is 0.5098814229249011
AIZ accuracy is 0.5533596837944664
AJG accuracy is 0.5849802371541502
AKAM accuracy is 0.5494071146245059
ALB accuracy is 0.5612648221343873
ALGN accuracy is 0.4347826086956522
ALK accuracy is 0.6086956521739131
ALLE accuracy is 0.5533596837944664
ALL accuracy is 0.5849802371541502
AMAT accuracy is 0.6047430830039526
AMD accuracy