In [4]:
!pip install yfinance



In [9]:
# Math and data preprocessing libraries
import math
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# For handling dataset
import yfinance as yf
from datetime import date

# For visualization
import seaborn as sns
import matplotlib.pyplot as plt

# For deep learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# For evaluation
from sklearn.metrics \
import root_mean_squared_error, mean_absolute_percentage_error

ModuleNotFoundError: No module named 'yfinance'

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(device)

In [None]:
# Download the GOOG price history for the chosen date window.
start_date = '2020-01-01'
end_date = '2024-12-31'

df = yf.download('GOOG', start=start_date, end=end_date)

# NOTE(review): a failed download appears to come back as an empty frame
# rather than an exception; stop here instead of failing later in an
# unrelated cell.
if df.empty:
    raise RuntimeError(
        f"No price data returned for GOOG between {start_date} and {end_date}"
    )

# Inspect the data
print()
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit an extra "None" line.
df.info()

In [None]:
# Draw every column of df in its own subplot, sharing the x axis.
ncols = 1
# Round the row count UP so that each column gets an axes even when the
# number of columns is not a multiple of ncols.
nrows = math.ceil(df.shape[1] / ncols)
fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                         sharex=True, figsize=(14, 7))
for i, ax in enumerate(fig.axes):
    if i >= df.shape[1]:
        # Spare cell of the grid: no column is left to draw.
        ax.set_visible(False)
        continue
    sns.lineplot(data=df.iloc[:, i], ax=ax)
    ax.tick_params(axis="x", rotation=30,
                   labelsize=10, length=0)
fig.tight_layout()
plt.show()

In [None]:
# Train/test split by position (no shuffling): the first `train_ratio`
# share of the rows is used for training, the remainder for testing.
train_ratio = 0.8
training_data_len = math.ceil(len(df) * train_ratio)

# Keep only the 'Open' column, as a one-column DataFrame
train_data = df[['Open']].iloc[:training_data_len]
test_data = df[['Open']].iloc[training_data_len:]
print(train_data.shape, test_data.shape, sep='\n')

# (1006, 1)
# (251, 1)

In [None]:
# The scaler is fitted on the training split only and then re-used,
# unchanged, to transform the test split.
scaler = MinMaxScaler(feature_range=(0, 1))
train_scaled = scaler.fit_transform(train_data.values)
test_scaled = scaler.transform(test_data.values)

# First five raw training values ...
print(''.join(f'{v:6.3f} ' for v in train_data.values[:5, 0]))
# 66.681 66.995 67.101 69.484 69.193

# ... and the same five values after scaling
print(''.join(f'{v:6.3f} ' for v in train_scaled[:5, 0]))
# 0.144  0.147  0.148  0.172  0.169

In [None]:
sequence_length = 50


def convert_data_into_tensors(data_seq, seq_len=None):
    """Cut a 2-D series into sliding windows and next-step labels.

    Window ``i`` holds rows ``i .. i + seq_len - 1`` of ``data_seq``; its
    label is column 0 of row ``i + seq_len``.

    Args:
        data_seq: array-like of shape (n_rows, n_features).
        seq_len: window length. Defaults to the notebook-level
            ``sequence_length``.

    Returns:
        ``(features, labels)`` as float32 tensors of shape
        ``(n_windows, seq_len, n_features)`` and ``(n_windows,)``, where
        ``n_windows = max(n_rows - seq_len, 0)``.

    Raises:
        ValueError: if ``seq_len`` is smaller than 1.
    """
    if seq_len is None:
        seq_len = sequence_length
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    values = np.asarray(data_seq, dtype=np.float32)
    n_windows = max(len(values) - seq_len, 0)

    if n_windows:
        features = np.stack([values[i:i + seq_len] for i in range(n_windows)])
    else:
        # Too little data for a single window: keep the trailing dimensions
        # so that downstream shape handling still works.
        features = np.empty((0, seq_len) + values.shape[1:], dtype=np.float32)
    # The label of window i is row i + seq_len, i.e. one contiguous slice.
    labels = values[seq_len:seq_len + n_windows, 0]

    return torch.tensor(features), torch.tensor(labels)

# Build the sliding-window tensors for both splits.
X_train, y_train = convert_data_into_tensors(train_scaled)
X_test, y_test = convert_data_into_tensors(test_scaled)
# The cells below refer to the labels as both `Y_*` and `y_*`; bind both
# spellings to the same tensors so that every reference resolves.
Y_train, Y_test = y_train, y_test

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# torch.Size([956, 50, 1])
# torch.Size([956])
# torch.Size([201, 50, 1])
# torch.Size([201])

In [None]:
batch_size = 32


def to_loader(x, y, batch_size, shuffle):
    """Pair two tensors in a TensorDataset and wrap it in a DataLoader.

    Args:
        x: feature tensor.
        y: label tensor, paired with ``x`` by TensorDataset.
        batch_size: number of samples per batch.
        shuffle: passed to DataLoader as its ``shuffle`` argument.

    Returns:
        A DataLoader yielding ``(x_batch, y_batch)`` pairs.
    """
    paired = TensorDataset(x, y)
    return DataLoader(paired, batch_size=batch_size, shuffle=shuffle)

# Only the training batches are shuffled; the test batches keep the order
# of the underlying dataset.
train_loader = to_loader(x=X_train, y=Y_train, batch_size=batch_size, shuffle=True)
test_loader = to_loader(x=X_test, y=Y_test, batch_size=batch_size, shuffle=False)

print(train_loader, test_loader, sep='\n')

# <torch.utils.data.dataloader.DataLoader object at 0x78eacd13fdd0>
# <torch.utils.data.dataloader.DataLoader object at 0x78eacd13fd50>

In [None]:
# Seed the PyTorch and NumPy global random number generators with the
# same value.
manual_seed = 42

torch.random.manual_seed(manual_seed)
np.random.seed(seed=manual_seed)

In [10]:
# Model hyper-parameters; the input width is read from the last axis of X_train.
hidden_size, num_layers = 64, 2
input_size = X_train.shape[-1]

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied at every timestep.

    ``forward`` returns one value per timestep, shape (batch, steps, 1) for
    batch-first input; callers pick the timestep they need.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        """Create the LSTM stack and the 1-unit output projection.

        Args:
            input_size: number of features per timestep.
            hidden_size: width of each LSTM layer.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # (lstm): LSTM(1, 64, num_layers=2, batch_first=True)

        self.linear = nn.Linear(hidden_size, 1)
        # (linear): Linear(in_features=64, out_features=1, bias=True)

    def forward(self, x):
        """Run ``x`` through the LSTM and project every hidden state to one value."""
        # The final (h, c) state tuple is not needed, only the per-step outputs.
        out, _ = self.lstm(x)
        return self.linear(out)
    
# Build the LSTM model and move its parameters to the selected device.
model = LSTMModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
model = model.to(device)
print(model)

In [None]:
def train(model, train_loader, test_loader, num_epochs=10, lr=1e-3):
    """Train ``model`` with Adam on an MSE loss and plot both loss curves.

    Uses the notebook-level ``device`` for tensor placement and ``plt`` for
    the final plot. Per epoch, the reported loss is the mean of the
    per-batch losses.

    Args:
        model: module whose output is indexed as ``[:, -1, 0]`` (last step,
            first output channel) to obtain one prediction per sample.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        test_loader: iterable of ``(inputs, targets)`` evaluation batches.
        num_epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
    """
    train_hist = []
    test_hist = []
    loss_fn = nn.MSELoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        total_train_loss = 0.0
        total_test_loss = 0.0

        model.train()
        for batch_x, batch_y in train_loader:
            # (1) Move input and target tensors to the device (e.g., GPU)
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # (2) Forward pass; keep the prediction at the last timestep
            #     (index -1) of output channel 0.
            pred = model(batch_x)[:, -1, 0]

            # (3) Compute the loss between pred and batch_y
            loss = loss_fn(pred, batch_y)

            # (4) Clear previous gradients to avoid accumulation
            optimizer.zero_grad()

            # (5) Backpropagate to compute gradients
            loss.backward()

            # (6) Update the model parameters
            optimizer.step()

            total_train_loss += loss.item()
        avg_loss = total_train_loss / len(train_loader)
        train_hist.append(avg_loss)

        # evaluate
        model.eval()
        with torch.no_grad():
            for test_x, test_y in test_loader:
                test_x, test_y = test_x.to(device), test_y.to(device)
                test_pred = model(test_x)[:, -1, 0]
                test_loss = loss_fn(test_pred, test_y)
                total_test_loss += test_loss.item()
        avg_test_loss = total_test_loss / len(test_loader)
        test_hist.append(avg_test_loss)

        print(f'Epoch {epoch + 1:2d}/{num_epochs} - Training Loss: {avg_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

    # Plot once, after the last epoch: inside the epoch loop the histories
    # are still shorter than the x axis and plt.plot() raises on the
    # length mismatch.
    x = np.linspace(1, num_epochs, num_epochs)
    plt.plot(x, train_hist, scalex=True, label="Training loss")
    plt.plot(x, test_hist, label="Test loss")
    plt.legend()

# Train the model built above on the single-step loaders.
train(model=model, train_loader=train_loader, test_loader=test_loader)

# Epoch  1/10 - Training Loss: 0.1736, Test Loss: 0.4874
# Epoch  2/10 - Training Loss: 0.0388, Test Loss: 0.0877
# Epoch  3/10 - Training Loss: 0.0078, Test Loss: 0.0072
# Epoch  4/10 - Training Loss: 0.0031, Test Loss: 0.0097
# Epoch  5/10 - Training Loss: 0.0023, Test Loss: 0.0123
# Epoch  6/10 - Training Loss: 0.0020, Test Loss: 0.0188
# Epoch  7/10 - Training Loss: 0.0020, Test Loss: 0.0111
# Epoch  8/10 - Training Loss: 0.0021, Test Loss: 0.0165
# Epoch  9/10 - Training Loss: 0.0018, Test Loss: 0.0225
# Epoch 10/10 - Training Loss: 0.0017, Test Loss: 0.0177
        
            
    

In [None]:
# Forecasting visualization
# NOTE: the matplotlib (plt) code below was typed in as-is and still needs to be reviewed.

def plot_forecasting(model, X_test, y_test, num_forecast_steps=30):
    """Plot one-step-ahead and rolled-out forecasts next to the test data.

    Starting from the window ``X_test[-num_forecast_steps]`` the model is
    called ``2 * num_forecast_steps`` times. During the first half the
    window is advanced with the true values from ``y_test``; during the
    second half it is advanced with the model's own predictions.

    Relies on the notebook-level ``df``, ``test_data``, ``scaler`` and
    ``device``.

    Args:
        model: module whose output is indexed as ``[0, -1, 0]``.
        X_test: window tensor; one window is squeezed to 1-D, so a single
            feature per step is assumed -- TODO confirm for other inputs.
        y_test: per-window targets in the same (scaled) units.
        num_forecast_steps: length of each of the two forecasting phases.
    """
    model.eval()
    input_data = X_test[-num_forecast_steps].cpu().numpy().squeeze()

    forecasted_values = []
    with torch.no_grad():
        for i in range(2 * num_forecast_steps):
            input_tensor = torch.as_tensor(input_data).view(1, -1, 1).to(device)
            predicted = model(input_tensor)[0, -1, 0].item()

            forecasted_values.append(predicted)
            # np.roll returns a new array, so the tensor behind X_test is
            # never written to.
            input_data = np.roll(input_data, shift=-1)

            if i < num_forecast_steps:
                # Known part of the series: feed the true value.
                input_data[-1] = y_test[-num_forecast_steps + i].item()
            else:
                # Beyond the data: feed the prediction back in.
                input_data[-1] = predicted

    last_date = df.index[-1]
    # NOTE(review): these are consecutive calendar days; confirm whether
    # business days would be the better axis for this price series.
    future_dates = pd.date_range(start=last_date + pd.DateOffset(1), periods=num_forecast_steps)
    combined_dates = df.index.append(future_dates)

    plt.rcParams['figure.figsize'] = [14, 4]
    plt.plot(test_data.index[-100:-num_forecast_steps],
             test_data.iloc[-100:-num_forecast_steps],
             label="test_data", color="b")
    plt.plot(test_data.index[-num_forecast_steps:],
             test_data.iloc[-num_forecast_steps:],
             label='actual values', color='green')

    # One column, one row per forecast: the layout used when the scaler
    # was fitted.
    forecasted_cases = scaler.inverse_transform(
        np.asarray(forecasted_values).reshape(-1, 1)
    ).flatten()
    plt.plot(combined_dates[-2 * num_forecast_steps:], forecasted_cases,
             label='forecasted values', color='red')

    plt.xlabel('Time Step')
    plt.ylabel('Value')
    plt.legend()
    plt.title('Time Series Forecasting')
    plt.grid(True)

# Forecast plot for the model trained above.
plot_forecasting(model=model, X_test=X_test, y_test=y_test)
    
    
            

In [None]:
# Test

def test(model, X_test, y_test):
    """Score ``model`` on the test windows.

    The whole of ``X_test`` is sent through the model in one call on the
    notebook-level ``device``; the prediction for each sample is the output
    at ``[:, -1, 0]``.

    Args:
        model: module to evaluate.
        X_test: input windows.
        y_test: targets, one per window.

    Returns:
        ``(rmse, mape)`` computed between ``y_test`` and the predictions.
    """
    model.eval()
    with torch.no_grad():
        last_step = model(X_test.to(device))[:, -1, 0]

    predicted = last_step.cpu().numpy()
    actual = y_test.cpu().numpy()

    return (
        root_mean_squared_error(actual, predicted),
        mean_absolute_percentage_error(actual, predicted),
    )

# Evaluate the trained model on the test windows.
rmse, mape = test(model=model, X_test=X_test, y_test=y_test)

print(f'RMSE: {rmse:.4f}', f'MAPE: {mape:.4f}', sep='\n')

# RMSE: 0.1194
# MAPE: 0.0847
        

In [None]:
#Other Models: CNN, RNN

class Conv1DModel(nn.Module):
    """1-D convolution over the time axis followed by a per-step linear layer.

    ``forward`` swaps axes 1 and 2 before the convolution and swaps them
    back afterwards, so axis 1 of the input is treated as time and axis 2
    as channels.
    """

    def __init__(self, input_size, hidden_size):
        """Create the convolution (kernel size 2, stride 1) and the output layer.

        Args:
            input_size: number of input channels.
            hidden_size: number of convolution output channels.
        """
        super().__init__()
        self.conv1d = nn.Conv1d(input_size, hidden_size, kernel_size=2, stride=1)
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return one value per convolution output step."""
        channels_first = x.permute(0, 2, 1)
        convolved = self.conv1d(channels_first)
        return self.fc(convolved.permute(0, 2, 1))
        
        

In [None]:
# Hyper-parameters for the convolutional model (num_layers is set here but
# not passed to Conv1DModel).
hidden_size, num_layers = 64, 2
input_size = X_train.shape[-1]

model = Conv1DModel(input_size=input_size, hidden_size=hidden_size)
model = model.to(device)

print(model)

In [None]:
# Train the convolutional model on the same loaders.
train(model=model, train_loader=train_loader, test_loader=test_loader)

In [None]:
# Forecast plot for the convolutional model.
plot_forecasting(model=model, X_test=X_test, y_test=y_test)

In [None]:
# Evaluate the convolutional model on the test windows.
rmse, mape = test(model=model, X_test=X_test, y_test=y_test)
print(f'RMSE: {rmse:.4f}', f'MAPE: {mape:.4f}', sep='\n')

In [None]:
class RNNModel(nn.Module):
    """Stacked batch-first RNN followed by a linear layer at every timestep."""

    def __init__(self, input_size, hidden_size, num_layers):
        """Create the recurrent stack and the 1-unit output projection.

        Args:
            input_size: number of features per timestep.
            hidden_size: width of each recurrent layer.
            num_layers: number of stacked recurrent layers.
        """
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Project the RNN output of every timestep to a single value."""
        # Index 0 is the per-step output; the final hidden state is unused.
        hidden_states = self.rnn(x)[0]
        return self.fc(hidden_states)

In [None]:
# Hyper-parameters for the plain RNN model.
hidden_size, num_layers = 64, 2
input_size = X_train.shape[-1]

model = RNNModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
model = model.to(device)

print(model)

In [None]:
# Train the RNN model on the same loaders.
train(model=model, train_loader=train_loader, test_loader=test_loader)

In [None]:
# Forecast plot for the RNN model.
plot_forecasting(model=model, X_test=X_test, y_test=y_test)

In [None]:
# Evaluate the RNN model on the test windows.
rmse, mape = test(model=model, X_test=X_test, y_test=y_test)
print(f'RMSE: {rmse:.4f}', f'MAPE: {mape:.4f}', sep='\n')

# RMSE: 0.0862
# MAPE: 0.0606

In [None]:
#Optional (Encoder-Decoder)
#Data Preprocessing

sequence_length = 50
target_len = 10


def create_enc_dec_sequences(data, seq_len=None, horizon=None):
    """Cut a 2-D series into (input window, multi-step target) pairs.

    Sample ``i`` uses rows ``i .. i + seq_len - 1`` as input and the
    following ``horizon`` rows as target. Every window whose target still
    fits inside ``data`` is produced, including the one that ends on the
    last row.

    Args:
        data: array-like of shape (n_rows, n_features).
        seq_len: input window length; defaults to the notebook-level
            ``sequence_length``.
        horizon: number of target steps; defaults to the notebook-level
            ``target_len``.

    Returns:
        ``(features, labels)`` as float32 tensors of shape
        ``(n_samples, seq_len, n_features)`` and
        ``(n_samples, horizon, n_features)``.

    Raises:
        ValueError: if ``seq_len`` or ``horizon`` is smaller than 1.
    """
    if seq_len is None:
        seq_len = sequence_length
    if horizon is None:
        horizon = target_len
    if seq_len < 1 or horizon < 1:
        raise ValueError("seq_len and horizon must be positive integers")

    values = np.asarray(data, dtype=np.float32)
    # The last valid start is len - seq_len - horizon (inclusive), hence +1.
    n_samples = max(len(values) - seq_len - horizon + 1, 0)

    # Pre-allocating keeps the trailing dimensions even when n_samples is 0.
    features = np.empty((n_samples, seq_len) + values.shape[1:], dtype=np.float32)
    labels = np.empty((n_samples, horizon) + values.shape[1:], dtype=np.float32)
    for i in range(n_samples):
        split = i + seq_len
        features[i] = values[i:split]
        labels[i] = values[split:split + horizon]

    return torch.from_numpy(features), torch.from_numpy(labels)

# The encoder-decoder tensors and loaders carry a trailing underscore, the
# spelling every later cell of this section reads, and therefore do not
# overwrite the single-step X_train / train_loader objects defined earlier.
X_train_, y_train_ = create_enc_dec_sequences(train_scaled)
X_test_, y_test_ = create_enc_dec_sequences(test_scaled)
print(X_train_.shape)
print(y_train_.shape)
print(X_test_.shape)
print(y_test_.shape)

train_loader_ = to_loader(X_train_, y_train_, batch_size, shuffle=True)
test_loader_ = to_loader(X_test_, y_test_, batch_size, shuffle=False)

In [None]:
class EncoderRNN(nn.Module):
    """Batch-first RNN that returns only its final hidden state."""

    def __init__(self, input_size, hidden_size, num_layers):
        """Create the recurrent stack.

        Args:
            input_size: number of features per timestep.
            hidden_size: width of each recurrent layer.
            num_layers: number of stacked recurrent layers.
        """
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        """Encode ``x`` and return the RNN's final hidden state."""
        # The per-step outputs are discarded; only the hidden state is kept.
        _, h = self.rnn(x)
        return h
    
class DecoderRNN(nn.Module):
    """Batch-first RNN plus a linear layer mapping back to ``input_size``."""

    def __init__(self, input_size, hidden_size, num_layers):
        """Create the recurrent stack and the output projection.

        Args:
            input_size: number of features per timestep, for both the
                decoder input and its output.
            hidden_size: width of each recurrent layer.
            num_layers: number of stacked recurrent layers.
        """
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, x, h):
        """Advance the decoder from hidden state ``h`` with input ``x``.

        Args:
            x: decoder input, passed to the RNN together with ``h``.
            h: initial hidden state for this call.

        Returns:
            ``(out, h)``: the projected RNN output and the updated hidden
            state.
        """
        out, h = self.rnn(x, h)
        out = self.fc(out)
        return out, h
    
class RNNRNN(nn.Module):
    """Encoder-decoder forecaster built from EncoderRNN and DecoderRNN.

    The encoder's final hidden state initialises the decoder, which is then
    unrolled one step at a time, each step receiving the previous step's
    output as input.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        """Create an encoder and a decoder with identical dimensions."""
        super().__init__()
        self.encoder = EncoderRNN(input_size, hidden_size, num_layers)
        self.decoder = DecoderRNN(input_size, hidden_size, num_layers)

    def forward(self, source, target_len):
        """Forecast ``target_len`` steps that follow ``source``.

        Args:
            source: input windows, indexed as (batch, step, feature).
            target_len: number of decoder steps to generate.

        Returns:
            The decoder outputs stacked along dimension 1.
        """
        h = self.encoder(source)
        predictions = []
        # The first decoder input is the last observed step, kept 3-D.
        step_input = source[:, -1, :].unsqueeze(1)
        for _ in range(target_len):
            out, h = self.decoder(step_input, h)
            predictions.append(out.squeeze(1))
            # Feed the prediction back in as the next input.
            step_input = out

        return torch.stack(predictions, dim=1)
    
# Build the encoder-decoder model and move it to the selected device.
model_ = RNNRNN(input_size, hidden_size, num_layers).to(device)
print(model_)

# RNNRNN(
#   (encoder): EncoderRNN(
#     (rnn): RNN(1, 64, num_layers=2, batch_first=True)
#   )
#   (decoder): DecoderRNN(
#     (rnn): RNN(1, 64, num_layers=2, batch_first=True)
#     (fc): Linear(in_features=64, out_features=1, bias=True)
#   )
# )

In [None]:
# Train

def train_(model, train_loader, test_loader, num_epochs=10, lr=1e-3):
    """Train a multi-step model with Adam on an MSE loss and plot the losses.

    The model is called as ``model(inputs, n_steps)`` where ``n_steps`` is
    read from dimension 1 of the target batch, for training and evaluation
    alike. Uses the notebook-level ``device`` and ``plt``. Per epoch, the
    reported loss is the mean of the per-batch losses.

    Args:
        model: module taking ``(inputs, n_steps)`` and returning predictions
            with the same shape as the targets.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        test_loader: iterable of ``(inputs, targets)`` evaluation batches.
        num_epochs: number of passes over ``train_loader``.
        lr: Adam learning rate.
    """
    train_hist = []
    test_hist = []
    loss_fn = nn.MSELoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        total_train_loss = 0.0
        total_test_loss = 0.0

        # train
        model.train()
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            pred = model(batch_x, batch_y.shape[1])
            loss = loss_fn(pred, batch_y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        avg_loss = total_train_loss / len(train_loader)
        train_hist.append(avg_loss)

        # evaluate
        model.eval()
        with torch.no_grad():
            for test_x, test_y in test_loader:
                test_x, test_y = test_x.to(device), test_y.to(device)
                # Take the horizon from the batch, as the training loop
                # does, instead of from a notebook-level global.
                test_pred = model(test_x, test_y.shape[1])
                test_loss = loss_fn(test_pred, test_y)
                total_test_loss += test_loss.item()

        avg_test_loss = total_test_loss / len(test_loader)
        test_hist.append(avg_test_loss)

        print(f'Epoch {epoch + 1:2d}/{num_epochs} - Training Loss: {avg_loss:.4f}, Test Loss: {avg_test_loss:.4f}')

    x = np.linspace(1, num_epochs, num_epochs)
    plt.plot(x, train_hist, scalex=True, label="Training loss")
    plt.plot(x, test_hist, label="Test loss")
    plt.legend()

# Train the encoder-decoder model on the multi-step loaders.
train_(model=model_, train_loader=train_loader_, test_loader=test_loader_)
                
        

In [None]:
#Plot Forecasting
def plot_forecasting(model, X_test, y_test, num_forecast_steps=30):
    """Plot the multi-step forecasts of the last test windows.

    For each of the last ``num_forecast_steps`` windows of ``X_test`` (fewer
    if ``X_test`` is shorter) the model forecasts ``target_len`` steps; each
    forecast is drawn as its own red line over the dates it covers.

    This definition replaces the single-step ``plot_forecasting`` defined
    earlier in the notebook. It relies on the notebook-level ``target_len``,
    ``test_data``, ``scaler`` and ``device``.

    Args:
        model: module called as ``model(inputs, target_len)``.
        X_test: input windows.
        y_test: not used; kept so both ``plot_forecasting`` variants share
            one signature.
        num_forecast_steps: maximum number of trailing windows to plot.
    """
    model.eval()
    n_windows = min(num_forecast_steps, len(X_test))

    with torch.no_grad():
        for i in range(1, n_windows + 1):
            input_tensor = X_test[-i].unsqueeze(0).to(device)
            predicted = model(input_tensor, target_len)
            forecasted_values = scaler.inverse_transform(
                predicted.squeeze(0).cpu().numpy().reshape(-1, 1)
            ).flatten()
            # NOTE(review): assumes the targets of X_test[-1] are the last
            # target_len rows of test_data -- confirm against the function
            # that builds the windows.
            stop = -i + 1 if i > 1 else None
            idx = test_data.index[-i - target_len + 1:stop]
            plt.plot(idx, forecasted_values, color="red", alpha=0.6,
                     label="forecasted values" if i == 1 else "")

    # Number of trailing dates touched by at least one forecast
    # (39 for 30 windows of 10 steps).
    forecast_span = n_windows + target_len - 1
    plt.plot(test_data.index[-100:-forecast_span],
             test_data.iloc[-100:-forecast_span],
             label="test_data", color="b")
    plt.plot(test_data.index[-forecast_span:],
             test_data.iloc[-forecast_span:],
             label="actual values", color="green")

    plt.xlabel('Time Step')
    plt.ylabel('Value')
    plt.legend()
    plt.title('Time Series Forecasting')
    plt.grid(True)
    plt.show()

# Forecast plot for the encoder-decoder model.
plot_forecasting(model=model_, X_test=X_test_, y_test=y_test_)

In [None]:
def test_(model, X_test, y_test):
    """Score a multi-step model on the test windows.

    The model is called once on all of ``X_test`` with the notebook-level
    ``target_len``; channel 0 of its output is compared with ``y_test``
    reshaped to ``(-1, target_len)``.

    Args:
        model: module called as ``model(inputs, target_len)``.
        X_test: input windows.
        y_test: multi-step targets.

    Returns:
        ``(rmse, mape)`` between the targets and the predictions.
    """
    model.eval()
    with torch.no_grad():
        forecast = model(X_test.to(device), target_len)[:, :, 0]

    predicted = forecast.cpu().numpy()
    actual = y_test.cpu().numpy().reshape(-1, target_len)

    return (
        root_mean_squared_error(actual, predicted),
        mean_absolute_percentage_error(actual, predicted),
    )

# Evaluate the encoder-decoder model on the multi-step test windows.
rmse, mape = test_(model=model_, X_test=X_test_, y_test=y_test_)
print(f'RMSE: {rmse:.4f}', f'MAPE: {mape:.4f}', sep='\n')

# RMSE: 0.2088
# MAPE: 0.1593