### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [73]:
import numpy as np
import pandas as pd
import random
import os,errno
import sys
from tqdm import trange

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [112]:
df=pd.read_csv('preprocessed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,M_SESSION_UID,M_SESSION_TIME,M_FRAME_IDENTIFIER,TIMESTAMP,M_ZONE_START,M_ZONE_FLAG,M_TRACK_TEMPERATURE,M_TRACK_LENGTH,M_FORECAST_ACCURACY,...,M_WEATHER_FORECAST_SAMPLES_M_WEATHER,M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE,M_TRACK_TEMPERATURE_CHANGE,M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE,M_AIR_TEMPERATURE_CHANGE,M_RAIN_PERCENTAGE,M_WEATHER,M_AI_DIFFICULTY,M_TOTAL_LAPS,Num_Predictions
0,0,1.30021e+19,2803.836,82458,1642362000.0,0.088,0.0,33,4650,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,200.0,1
1,1,1.30021e+19,2803.836,82458,1642362000.0,0.167,0.0,33,4650,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,200.0,1
2,2,1.30021e+19,2803.836,82458,1642362000.0,0.238,0.0,33,4650,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,200.0,1
3,3,1.30021e+19,2803.836,82458,1642362000.0,0.298,0.0,33,4650,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,200.0,1
4,4,1.30021e+19,2803.836,82458,1642362000.0,0.353,0.0,33,4650,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,200.0,1


In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3572282 entries, 0 to 3572281
Data columns (total 30 columns):
 #   Column                                          Dtype  
---  ------                                          -----  
 0   Unnamed: 0                                      int64  
 1   M_SESSION_UID                                   float64
 2   M_SESSION_TIME                                  float64
 3   M_FRAME_IDENTIFIER                              int64  
 4   TIMESTAMP                                       float64
 5   M_ZONE_START                                    float64
 6   M_ZONE_FLAG                                     float64
 7   M_TRACK_TEMPERATURE                             int64  
 8   M_TRACK_LENGTH                                  int64  
 9   M_FORECAST_ACCURACY                             int64  
 10  M_AIR_TEMPERATURE                               int64  
 11  M_NUM_WEATHER_FORECAST_SAMPLES                  int64  
 12  M_TRACK_ID                  

In [95]:
#split input and output for the model
Y=df['M_WEATHER']
Numpred=df['Num_Predictions']
rainpercentage=df['M_RAIN_PERCENTAGE']
X=df.drop(['M_WEATHER', 'M_RAIN_PERCENTAGE', 'Num_Predictions', 'Unnamed: 0'], axis=1)

In [96]:
#!pip install sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [97]:
print(X_train.shape)
print(y_train.shape)

(2679211, 27)
(2679211,)


In [100]:
#convert numpy arrays into tensors
x_torch_train = torch.from_numpy(X_train.to_numpy()).type(torch.Tensor)
x_torch_test = torch.from_numpy(X_test.to_numpy()).type(torch.Tensor)
y_torch_train = torch.from_numpy(y_train.to_numpy()).type(torch.Tensor)
y_torch_test = torch.from_numpy(y_test.to_numpy()).type(torch.Tensor)

In [75]:

class lstm_encoder(nn.Module):
	''' Encodes time-series sequence '''

	def __init__(self, input_size, hidden_size,num_layers):
		'''
        : param input_size:     the number of features in the input X
        : param hidden_size:    the number of features in the hidden state h
        : param num_layers:     number of recurrent layers (i.e., 2 means there are
        :                       2 stacked LSTMs)
        '''

		super(lstm_encoder,self).__init__()
		self.input_size = input_size
		self.hidden_size = hidden_size
		self.num_layers = num_layers

		self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional = False)

	def forward(self,x_input):
		#x_input = input of shape(seq_len, batch_size, input_size)
		'''
        : param x_input:               input of shape (seq_len, # in batch, input_size)
        : return lstm_out, hidden:     lstm_out gives all the hidden states in the sequence;
        :                              hidden gives the hidden state and cell state for the last
        :                              element in the sequence 
        '''

		lstm_out, self.hidden = self.lstm(x_input.view(x_input.shape[0],x_input.shape[1],sefl.input_size))

		return lstm_out, self.hidden

	def init_hidden(self, batch_size):
		'''
        initialize hidden state
        : param batch_size:    x_input.shape[1]
        : return:              zeroed hidden state and cell state 
        '''

		return(torch.zeros(self.num_layers, batch_size, self.hidden_size),
			torch.zeros(self.num_layers,batch_size,self.hidden_size))


In [76]:

class lstm_decoder(nn.Module):	
	''' Decodes hidden state output by encoder '''

	def __init__(self, input_size, hidden_size, num_layers):
		'''
        : param input_size:     the number of features in the input X
        : param hidden_size:    the number of features in the hidden state h
        : param num_layers:     number of recurrent layers (i.e., 2 means there are
        :                       2 stacked LSTMs)
        '''

		super(lstm_decoder, self).__init__()
		self.input_size = input_size
		self.hidden_size = hidden_size
		self.num_layers = num_layers

		self.lstm = nn.LSTM(input_size = input_size, hidden_size= hidden_size, num_layers=num_layers, bidirectional = False)

		self.linear = nn.Linear(hidden_size, input_size)

	def forward(self, x_input, encoder_hidden_states):
		'''        
        : param x_input:                    should be 2D (batch_size, input_size)
        : param encoder_hidden_states:      hidden states
        : return output, hidden:            output gives all the hidden states in the sequence;
        :                                   hidden gives the hidden state and cell state for the last
        :                                   element in the sequence 
 
        '''

		lstm_out, self.hidden = self.lstm(x_input.unsqueeze(0), encoder_hidden_states)
		output = self.linear(lstm_out.squeeze(0))

		return output, self.hidden


In [111]:
class lstm_seq2seq(nn.Module):
	''' train LSTM encoder-decoder and make predictions '''

	def __init__(self, input_size, hidden_size):
		'''
        : param input_size:     the number of expected features in the input X
        : param hidden_size:    the number of features in the hidden state h
        '''


		super(lstm_seq2seq,self).__init__()

		self.input_size = input_size
		self.hidden_size = hidden_size

		self.encoder = lstm_encoder(input_size = input_size, hidden_size = hidden_size)
		self.decoder = lstm_decoder(input_size = input_size, hidden_size = hidden_size)

   
    def train_model(self, input_tensor, target_tensor, n_epochs, target_len, batch_size, training_prediction = 'recursive',
                    teacher_forcing_ratio = 0.5, learning_rate = 0.01, dynamic_tf = False):
    	'''
        train lstm encoder-decoder
        
        : param input_tensor:              input data with shape (seq_len, # in batch, number features); PyTorch tensor    
        : param target_tensor:             target data with shape (seq_len, # in batch, number features); PyTorch tensor
        : param n_epochs:                  number of epochs 
        : param target_len:                number of values to predict 
        : param batch_size:                number of samples per gradient update
        : param training_prediction:       type of prediction to make during training ('recursive', 'teacher_forcing', or
        :                                  'mixed_teacher_forcing'); default is 'recursive'
        : param teacher_forcing_ratio:     float [0, 1) indicating how much teacher forcing to use when
        :                                  training_prediction = 'teacher_forcing.' For each batch in training, we generate a random
        :                                  number. If the random number is less than teacher_forcing_ratio, we use teacher forcing.
        :                                  Otherwise, we predict recursively. If teacher_forcing_ratio = 1, we train only using
        :                                  teacher forcing.
        : param learning_rate:             float >= 0; learning rate
        : param dynamic_tf:                use dynamic teacher forcing (True/False); dynamic teacher forcing
        :                                  reduces the amount of teacher forcing for each epoch
        : return losses:                   array of loss function for each epoch
        '''

    	losses = np.full(n_epochs, np.nan)

    	optimizer = optim.Adam(self.parameters(), lr = learning_rate)
    	criterion = nn.MSELoss()

    	#calculate number of batch iterations
    	n_batches = int(input_tensor.shape[1]/batch_size)

    	with trange(n_epochs) as tr:
    		for it in tr:

    			batch_loss = 0
    			batch_loss_tf = 0
    			batch_loss_no_tf = 0
    			num_tf = 0
    			num_no_tf = 0

    			for b in range(n_batches):
    				#select data
    				input_batch = input_tensor[:,b:b+batch_size,:]
    				target_batch = target_tensor[:,b:b+batch_size,:]

    				#output tensor
    				outputs = torch.zeros(target_len, batch_size, input_batch.shape[2])

    				#initiate hidden state
    				encoder_hidden = self.encoder.init_hidden(batch_size)

    				#zero the gradient
    				optimizer.zero_grad()

    				#encoder_outputs
    				encoder_output, encoder_hidden = self.encoder(input_batch)

    				#decoder outputs
    				decoder_input = input_batch[-1,:,:]#shape:(batch_size,input_size)
    				decoder_hidden = encoder_hidden

    				if training_prediction == 'recursive':

    					for t in range(target_len):
    						decoder_output, decoder_hidden = self.decoder(decoder_input, decoder_hidden)
    						outputs[t] = decoder_output
    						decoder_input = decoder_output

    				
    				if training_prediction == 'teacher_forcing':
    					for t in range(target_len):
    						decoder_output, decoder_hidden = self.decoder(decoder_input,decoder_hidden)
    						outputs[t] = decoder_output

    						if random.random()<teacher_forcing_ratio:#teacher forcing
    							decoder_input = target_batch[t,:,:]

    						else:#recursive
    							decoder_input = decoder_output

    				loss = criterion(outputs, target_batch)
    				batch_loss += loss.item()

    				#backpropogation
    				loss.backward()
    				optimizer.step()

    			#epoch loss
    			batch_loss /= n_batches
    			losses[it] = batch_loss

    			#dynamic teacher forcing
    			if dynamic_tf and teacher_forcing_ratio>0:
    				teacher_forcing_ratio = teacher_forcing_ratio-0.02

    			#progress bar
    			tr.set_postfix(loss ="{0:3f}".format(batch_loss))

    	return losses

    def predict(self, input_tensor, target_len):
    	'''
        : param input_tensor:      input data (seq_len, input_size); PyTorch tensor 
        : param target_len:        number of target values to predict 
        : return np_outputs:       np.array containing predicted values; prediction done recursively 
        '''

    	#encode input tensor
    	input_tensor = input_tensor.unsqueeze(1) #add in batch size of 1
    	encoder_output, encoder_hidden = self.encoder(input_tensor)

    	#initialize tensor for prediction
    	outputs = torch.zeros(target_len, input_tensor.shape[2])

    	#decode input_tensor
    	decoder_input = input_tensor[-1:,:]
    	decoder_hidden = encoder_hidden

    	for t in range(target_len):
    		decoder_output, decoder_hidden = self.decoder(decoder_input,decoder_hidden)
    		outputs[t] = decoder_output.squeeze(0)
    		decoder_input = decoder_output

    	np_outputs = outputs.detach().numpy()

    	return np_outputs

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 20)

In [None]:
device =torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = lstm_encoder_decoder.lstm_seq2seq(input_size = X_train.shape[2], hidden_size = 15)
model.to(device)
x_torch_train.to(device)
_train.to(device)
X_test.to(device)
Y_test.to(device)
loss = model.train_model(X_train, Y_train, n_epochs = , target_len = , batch_size = , training_prediction = 'teacher_forcing', teacher_forcing_ratio = 0.6, learning_rate = 0.01, dynamic_tf = False)
