### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [19]:
#pip install torch

In [20]:
#pip install lightning

In [21]:
#pip install pandas

In [22]:
#pip install scikit-learn

In [23]:
import torch # torch will allow us to create tensors.
import torch.nn as nn # torch.nn allows us to create a neural network.
import torch.nn.functional as F # nn.functional give us access to the activation and loss functions.
from torch.optim import Adam # optim contains many optimizers. This time we're using Adam

import lightning as L # lightning has tons of cool tools that make neural networks easier
from torch.utils.data import TensorDataset, DataLoader # these are needed for the training data
from lightning_fabric.utilities.seed import seed_everything
## Set the seed so that, hopefully, everyone will get the same results as me.
#from pytorch_lightning.utilities.seed import seed_everything

In [24]:
## Instead of coding an LSTM by hand, let's see what we can do with PyTorch's nn.LSTM()
class LightningLSTM(L.LightningModule):
    
    def __init__(self): # __init__() is the class constructor function, and we use it to initialize the Weights and Biases.
        
        super().__init__() # initialize an instance of the parent class, LightningModule.

        seed_everything(seed=42)
        
        ## input_size = number of features (or variables) in the data. In our example
        ##              we only have a single feature (value)
        ## hidden_size = this determines the dimension of the output
        ##               in other words, if we set hidden_size=1, then we have 1 output node
        ##               if we set hiddeen_size=50, then we hve 50 output nodes (that can then be 50 input
        ##               nodes to a subsequent fully connected neural network.
        self.lstm = nn.LSTM(input_size=1, hidden_size=1) 
         

    def forward(self, input):
        ## transpose the input vector
        input_trans = input.view(len(input), 1)
        
        lstm_out, temp = self.lstm(input_trans)
        
        ## lstm_out has the short-term memories for all inputs. We make our prediction with the last one
        prediction = lstm_out[-1] 
        return prediction
        
        
    def configure_optimizers(self): # this configures the optimizer we want to use for backpropagation.
        return Adam(self.parameters(), lr=0.1) ## we'll just go ahead and set the learning rate to 0.1

    
    def training_step(self, batch, batch_idx): # take a step during gradient descent.
        input_i, label_i = batch # collect input
        output_i = self.forward(input_i[0]) # run input through the neural network
        loss = (output_i - label_i)**2 ## loss = squared residual
        
        ###################
        ##
        ## Logging the loss and the predicted values so we can evaluate the training
        ##
        ###################
        self.log("train_loss", loss)
        
        if (label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)

        return loss

In [25]:
model = LightningLSTM() # First, make model from the class

## print out the name and value for each parameter
print("Before optimization, the parameters are...")
for name, param in model.named_parameters():
    print(name, param.data)
    
print("\nNow let's compare the observed and predicted values...")
print("Company A: Observed = 0, Predicted =", model(torch.tensor([0., 0.5, 0.25, 1.])).detach())
print("Company B: Observed = 1, Predicted =", model(torch.tensor([1., 0.5, 0.25, 1.])).detach())

Global seed set to 42


Before optimization, the parameters are...
lstm.weight_ih_l0 tensor([[ 0.7645],
        [ 0.8300],
        [-0.2343],
        [ 0.9186]])
lstm.weight_hh_l0 tensor([[-0.2191],
        [ 0.2018],
        [-0.4869],
        [ 0.5873]])
lstm.bias_ih_l0 tensor([ 0.8815, -0.7336,  0.8692,  0.1872])
lstm.bias_hh_l0 tensor([ 0.7388,  0.1354,  0.4822, -0.1412])

Now let's compare the observed and predicted values...
Company A: Observed = 0, Predicted = tensor([0.6675])
Company B: Observed = 1, Predicted = tensor([0.6665])


In [26]:
import pandas as pd
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('ibm.csv', sep=";")

# Convert the DataFrame to a flat list
flat_list = df['ibm'].values.flatten().tolist()
#print(flat_list)

scaled_list = [(x - min(flat_list)) / (max(flat_list) - min(flat_list)) for x in flat_list]

#print(scaled_list)
splitrange = round(len(scaled_list)/2)
train_scaled_list = scaled_list[0:splitrange]
test_scaled_list = scaled_list[splitrange+1:]
scaled_list = train_scaled_list

In [27]:
input_scaled_list = []
result_scaled_list = []

#Cria blocos de input com range de 5 dias
blocos = 20
for i in range(len(scaled_list)):
    if i >= blocos:
        result_scaled_list.append(scaled_list[i])
        input_scaled_list.append(scaled_list[i-blocos:i])

#print(input_scaled_list)
#print(result_scaled_list)

In [28]:
## create the training data for the neural network.
inputs = torch.tensor(input_scaled_list)
labels = torch.tensor(result_scaled_list)

dataset = TensorDataset(inputs, labels) 
dataloader = DataLoader(dataset)

In [29]:
## NOTE: Because we have set Adam's learning rate to 0.1, we will train much, much faster.
## Before, with the hand made LSTM and the default learning rate, 0.001, it took about 5000 epochs to fully train
## the model. Now, with the learning rate set to 0.1, we only need 300 epochs. Now, because we are doing so few epochs,
## we have to tell the trainer add stuff to the log files every 2 steps (or epoch, since we have to rows of training data)
## because the default, updating the log files every 50 steps, will result in a terrible looking graphs. So
import time
trainer = L.Trainer(max_epochs=30, log_every_n_steps=2)

start_time = time.monotonic()
trainer.fit(model, train_dataloaders=dataloader)
end_time = time.monotonic()

duration = end_time - start_time

print(f'Training duration: {duration:.2f} seconds')

print("After optimization, the parameters are...")
for name, param in model.named_parameters():
    print(name, param.data)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name | Type | Params
------------------------------
0 | lstm | LSTM | 16    
------------------------------
16        Trainable params
0         Non-trainable params
16        Total params
0.000     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


Training duration: 1656.76 seconds
After optimization, the parameters are...
lstm.weight_ih_l0 tensor([[17.6984],
        [-7.2458],
        [10.7526],
        [ 8.9974]])
lstm.weight_hh_l0 tensor([[-16.8688],
        [ -3.0060],
        [ -7.5004],
        [ -1.7574]])
lstm.bias_ih_l0 tensor([ 6.2326, -3.0391,  6.1287,  0.4478])
lstm.bias_hh_l0 tensor([ 6.0899, -2.1700,  5.7417,  0.1195])


In [30]:
input_scaled_list_for_test = []
result_scaled_list_for_test = []

#Cria blocos de teste com range de 60 dias
blocos = 60
for i in range(len(test_scaled_list)):
    if i >= blocos:
        input_scaled_list_for_test.append(scaled_list[i-blocos:i])
        result_scaled_list_for_test.append(scaled_list[i])

#print(input_scaled_list_for_test)

In [31]:
print("\nNow let's compare the observed and predicted values...")
#print("Company A: Observed = 0, Predicted =", model(torch.tensor([0.219034455415012, 0.24204144341591535, 0.35408746962902576, 0.23655432389056874, 0.23655432389056874])).detach())


predicted_scaled = []
#for i in range(len(result_scaled_list)):
for i in range(len(input_scaled_list_for_test)):
    predicted_scaled.append(model(torch.tensor(input_scaled_list_for_test[i]).detach()).item())
    #predicted.append(model(torch.tensor(input_scaled_list[i]).detach()).item())
    #model_result = model(torch.tensor(input_scaled_list[i]).detach())
    #print(torch.max(model_result, dim=1))
    
#print (predicted_scaled)


Now let's compare the observed and predicted values...


In [32]:
original_min = min(flat_list)
original_max = max(flat_list)
predicted_real_value = [(x * (original_max - original_min)) + original_min for x in predicted_scaled]
result_real_value = [(x * (original_max - original_min)) + original_min for x in result_scaled_list_for_test]


In [33]:
import numpy as np
np.savetxt("predicted.csv", predicted_real_value, delimiter=",", header = "predicted")
np.savetxt("input_ibov.csv", input_scaled_list_for_test, delimiter=",", header = "ibov")
np.savetxt("result_real_value.csv", result_real_value, delimiter=",", header = "real")

In [None]:
#print(teste)

In [34]:
from sklearn.metrics import r2_score, mean_squared_error

# calculate the R-squared
r_squared = r2_score(result_real_value, predicted_real_value)

# calculate the mean squared error
mse = mean_squared_error(result_real_value, predicted_real_value)

print(f'R-squared: {r_squared:.2f}')
print(f'Mean squared error: {mse:.2f}')

R-squared: -1.77
Mean squared error: 2335.72
