<a href="https://colab.research.google.com/github/kjmobile/lb/blob/main/Copy_of_Ch10_deeplearning_lab_Part2_A_for_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning


In [None]:
# The following packages must be re-installed at the start of every new Google Colab session.

In [None]:
!pip install --upgrade pip
!pip install torchinfo
!pip install pytorch_lightning
!pip install islp

In [None]:
import numpy as np, pandas as pd
from matplotlib.pyplot import subplots
from sklearn.linear_model import (LinearRegression,LogisticRegression,Lasso)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from sklearn.model_selection import (train_test_split,GridSearchCV)

### Torch-Specific Imports


In [None]:
import torch
from torch import nn
from torch.optim import RMSprop
from torch.utils.data import TensorDataset

In [None]:
from torchmetrics import (MeanAbsoluteError,R2Score)
from torchinfo import summary

In [None]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger

In [None]:
from pytorch_lightning import seed_everything
# Fix the random seed (0) for the whole session; workers=True is forwarded to seed_everything.
seed_everything(0, workers=True)
# Ask torch for deterministic algorithms; warn_only=True makes operations without
# a deterministic implementation emit a warning instead of raising.
torch.use_deterministic_algorithms(True, warn_only=True)

In [None]:
from torchvision.io import read_image
from torchvision.datasets import MNIST, CIFAR100
from torchvision.models import (resnet50, ResNet50_Weights)
from torchvision.transforms import (Resize, Normalize, CenterCrop,ToTensor)

In [None]:
from ISLP.torch import (SimpleDataModule, SimpleModule, ErrorTracker, rec_num_workers)
from ISLP.torch.imdb import (load_lookup, load_tensor, load_sparse, load_sequential)
from glob import glob
import json

## IMDB Document Classification


#### Binary Sentiment Classification Model

In [None]:
# Load the sequential IMDB datasets and preview the first 12 non-zero
# entries (word indices) of the first training review.
imdb_seq_train, imdb_seq_test = load_sequential(root='data/IMDB')
padded_sample = np.asarray(imdb_seq_train.tensors[0][0])
sample_review = padded_sample[np.nonzero(padded_sample > 0)][:12]
sample_review


#### dataset shape

In [None]:
# Report the size of the feature tensor of each sequential split.
print("Train dataset shape:", imdb_seq_train.tensors[0].size())
print("Test dataset shape:", imdb_seq_test.tensors[0].size())



In [None]:
# Decode the sampled word indices back into text via the index -> word lookup.
lookup = load_lookup(root='data/IMDB')
' '.join([lookup[word_id] for word_id in sample_review])

#### check index-word mapper

In [None]:
list(lookup.items())[:10] # vocabulary dict (index -> word); per the original note, indices follow the frequency order of the words in the corpus  #Q1

#### View the label of the first review

In [None]:
# tensors[1] holds the labels; show the label of the first training review.
imdb_seq_train.tensors[1][0]


In [None]:
first_review = imdb_seq_train.tensors[0][0]   # padded sequence of word indices for the first review  #Q2
first_label = imdb_seq_train.tensors[1][0]    # label of the first review (0: negative, 1: positive) --> so supervised learning

# In the sequential dataset the word indices are the tensor's *values* (0 marks
# padding), so keep the non-zero values themselves. torch.nonzero() would return
# the positions of those values inside the padded sequence, which are not valid
# keys for the index -> word lookup. Boolean masking also always yields a 1-D
# tensor, so iterating over the result is safe even for a one-word review.
active_word_indices = first_review[first_review != 0]

print("Label of the first review (0=negative, 1=positive):", first_label.item())
print("Active word indices in the review:", active_word_indices.tolist())


#### Convert word indices to actual words using lookup

In [None]:
# Translate every active word index into its vocabulary word.
lookup = load_lookup(root='data/IMDB')
active_words = [lookup[word_id] for word_id in active_word_indices.tolist()]

print("Words appearing in the first review:")
print(active_words)

In [None]:
# Shape of a single review in the sequential dataset.
# NOTE(review): the trailing note mentions 3 extra special tokens; that presumably
# describes the vocabulary size rather than this sequence length — TODO confirm.
imdb_seq_train.tensors[0][0].shape # original note: 3 additional entries for special tokens — <PAD>, <START>, <UNK> (unknown word)  #Q3

In [None]:
# Show what the three lowest vocabulary indices map to.
for _caption, _key in (("Index 0:", 0), ("Index 1:", 1), ("Index 2:", 2)):
    print(_caption, lookup[_key])

In [None]:
# Build the data module for the feed-forward model from the tensors returned by
# load_tensor: 2000 observations are held out for validation, batches of 512.
max_num_workers = 10
imdb_train, imdb_test = load_tensor(root='data/IMDB')
imdb_dm = SimpleDataModule(
    imdb_train,
    imdb_test,
    batch_size=512,
    num_workers=min(6, max_num_workers),
    validation=2000,
)


In [None]:
class IMDBModel(nn.Module):   #Q4
    """Feed-forward network with two 16-unit ReLU hidden layers and one output unit.

    Parameters
    ----------
    input_size : int
        Number of input features per observation.
    """

    def __init__(self, input_size):
        super().__init__()
        self.dense1 = nn.Linear(input_size, 16)
        self.activation = nn.ReLU()
        self.dense2 = nn.Linear(16, 16)
        self.output = nn.Linear(16, 1)

    def forward(self, x):
        """Return the raw output-layer value for each row of ``x`` as a 1-D tensor."""
        hidden = self.activation(self.dense1(x))
        hidden = self.activation(self.dense2(hidden))
        # The output layer yields one column; flatten it to one value per row.
        return torch.flatten(self.output(hidden))


In [None]:
# Size the network from the number of feature columns in the test tensor and
# print a layer-by-layer summary.
imdb_model = IMDBModel(imdb_test.tensors[0].shape[1])
summary(
    imdb_model,
    input_size=imdb_test.tensors[0].shape,
    col_names=['input_size', 'output_size', 'num_params'],
)


In [None]:
# RMSprop optimizer (learning rate 0.001) wrapped into a binary-classification module.
imdb_optimizer = RMSprop(imdb_model.parameters(), lr=0.001)
imdb_module = SimpleModule.binary_classification(imdb_model, optimizer=imdb_optimizer)


In [None]:
# Train the feed-forward IMDB model, logging metrics to logs/IMDB as CSV.
imdb_logger = CSVLogger('logs', name='IMDB')
imdb_trainer = Trainer(
    deterministic=False,  # changed to False for latest cuda's default setting
    max_epochs=10,        # reduced 30 to 10 (note: accuracy remains similar but test loss doubled)
    logger=imdb_logger,
    enable_progress_bar=True,
    callbacks=[ErrorTracker()],
    log_every_n_steps=10,
)
imdb_trainer.fit(imdb_module, datamodule=imdb_dm)

In [None]:
# Evaluate the trained module on the data module's test split and display the
# result (a later cell reads test_results[0]['test_accuracy']).
test_results = imdb_trainer.test(imdb_module, datamodule=imdb_dm)
test_results

### Comparison to Lasso


In [None]:
# Sparse train/validation/test splits; 2000 records (from 25,000) are separated
# out as the validation set.
(X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test) = load_sparse(
    root='data/IMDB',
    validation=2000,
    random_state=0,
)


In [None]:
# Largest penalty on the grid: the maximum absolute value of
# X_train.T applied to the mean-centered response.
lam_max = np.abs(X_train.T * (Y_train - Y_train.mean())).max()
# 50 penalty values, evenly spaced on the log scale from lam_max down to 1e-4 * lam_max.
lam_val = lam_max * np.exp(np.linspace(np.log(1),
                                       np.log(1e-4), 50))


In [None]:
# L1-penalized logistic regression, initialised at the strongest penalty (C = 1 / lam_max).
# NOTE(review): scikit-learn documents warm_start as having no effect with the
# liblinear solver — confirm whether it is still wanted here.
logit = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=1/lam_max,
    fit_intercept=True,
    warm_start=True,
)


In [None]:
# Refit the model for every penalty value in lam_val and record the fitted
# coefficients and intercept of each fit.
coefs, intercepts = [], []

for lam in lam_val:
    logit.set_params(C=1/lam)
    logit.fit(X_train, Y_train)
    coefs.append(logit.coef_.copy())
    intercepts.append(logit.intercept_)



In [None]:
# Stack the per-penalty results into arrays and drop the singleton axes.
coefs = np.asarray(coefs).squeeze()
intercepts = np.asarray(intercepts).squeeze()


In [None]:
from sklearn.metrics import accuracy_score, log_loss

# NOTE(review): `logit` holds whichever fit ran last — presumably the final
# (smallest) penalty of the path loop, not a validation-selected one; confirm intent.
y_pred = logit.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_pred)
test_loss = log_loss(Y_test, logit.predict_proba(X_test))

print(f"Test accuracy (Lasso): {test_accuracy:.4f}")
print(f"Test loss (Lasso): {test_loss:.4f}")


In [None]:
%%capture
# Left panel: accuracy of the penalized logistic fits along the penalty grid for
# the training, validation and test splits. The right panel (axes[1]) is created
# here but not drawn on in this cell.
fig, axes = subplots(1, 2, figsize=(12, 4), sharey=True)
for ((X_, Y_),
     data_,
     color) in zip([(X_train, Y_train),
                    (X_valid, Y_valid),
                    (X_test, Y_test)],
                    ['Training', 'Validation', 'Test'],
                    ['black', 'red', 'blue']):
    # Linear predictor of every fit at once — presumably one column per entry
    # of lam_val, given how coefs/intercepts were collected.
    linpred_ = X_ * coefs.T + intercepts[None,:]
    # Predict the positive class wherever the linear predictor is positive.
    label_ = np.array(linpred_ > 0)
    # Fraction of correct predictions for each column of label_.
    accuracy_ = np.array([np.mean(Y_ == l) for l in label_.T])
    # x-axis: -log of the penalty divided by the number of training rows.
    axes[0].plot(-np.log(lam_val / X_train.shape[0]),
                 accuracy_,
                 '.--',
                 color=color,
                 markersize=13,
                 linewidth=2,
                 label=data_)
axes[0].legend()
axes[0].set_xlabel(r'$-\log(\lambda)$', fontsize=20)
axes[0].set_ylabel('Accuracy', fontsize=20)



In [None]:
def summary_plot(results,
                 ax,
                 col='loss',
                 valid_legend='Validation',
                 training_legend='Training',
                 ylabel='Loss',
                 fontsize=20):
    """Plot the training and validation curves of one logged metric against epoch.

    Parameters
    ----------
    results : object with a pandas-style ``plot`` method
        Logged metrics; must provide the columns ``epoch``,
        ``train_{col}_epoch`` and ``valid_{col}``.
    ax : matplotlib-style axes
        Axes that both curves are drawn on.
    col : str
        Metric name used to build the two column names.
    valid_legend, training_legend : str
        Legend entries for the validation and training curves.
    ylabel : str
        Label for the y-axis.
    fontsize : int
        Font size applied to both axis labels (previously accepted but ignored).

    Returns
    -------
    The ``ax`` that was passed in.
    """
    # (column to plot, line color, legend label): training first, then validation.
    curves = ((f'train_{col}_epoch', 'black', training_legend),
              (f'valid_{col}', 'red', valid_legend))
    for column, color, label in curves:
        results.plot(x='epoch',
                     y=column,
                     label=label,
                     marker='*',
                     color=color,
                     ax=ax)
    ax.set_xlabel('Epoch', fontsize=fontsize)
    ax.set_ylabel(ylabel, fontsize=fontsize)
    return ax

In [None]:
# Right panel: per-epoch training/validation accuracy of the neural network,
# with its test accuracy drawn as a horizontal dashed line.
imdb_results = pd.read_csv(imdb_logger.experiment.metrics_file_path)
summary_plot(imdb_results,
             axes[1],
             col='accuracy',
             ylabel='Accuracy')
# Take the tick range from the epochs that were actually logged. The previous
# hard-coded 0..30 range no longer matched max_epochs and stretched the axis
# well past the data. np.unique removes duplicate ticks for very short runs.
axes[1].set_xticks(np.unique(
    np.linspace(0, imdb_results['epoch'].max() + 1, 6).astype(int)))
axes[1].set_ylabel('Accuracy', fontsize=20)
axes[1].set_xlabel('Epoch', fontsize=20)
axes[1].set_ylim([0.5, 1]);
axes[1].axhline(test_results[0]['test_accuracy'],
                color='blue',
                linestyle='--',
                linewidth=3)
fig

In [None]:
# Drop the names of objects that are no longer needed.
del imdb_model, imdb_trainer, imdb_logger, imdb_dm, imdb_train, imdb_test

## Recurrent Neural Networks


### LSTM sentiment prediction on the IMDb movie-review data (see ISLP, Section 10.5.1)


In [None]:
# Data module for the sequential IMDB data: 2000 observations held out for
# validation, batches of 300.
max_num_workers = 10
imdb_seq_dm = SimpleDataModule(
    imdb_seq_train,
    imdb_seq_test,
    batch_size=300,
    num_workers=min(6, max_num_workers),
    validation=2000,
)


In [None]:
class LSTMModel(nn.Module):
    """Embedding layer, one LSTM layer, and a linear read-out of the last time step.

    Parameters
    ----------
    input_size : int
        Number of rows of the embedding table (one per distinct input index).
    """

    def __init__(self, input_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, 32)
        self.lstm = nn.LSTM(input_size=32, hidden_size=32, batch_first=True)
        self.dense = nn.Linear(32, 1)

    def forward(self, x):
        """Return a 1-D tensor holding one output value per sequence in ``x``."""
        embedded = self.embedding(x)
        outputs, _state = self.lstm(embedded)
        # Only the LSTM output at the final position of each sequence is used.
        last_step = outputs[:, -1]
        return torch.flatten(self.dense(last_step))


In [None]:
# The embedding table size is taken from the column count of the sparse test
# matrix (presumably the vocabulary size — TODO confirm). The summary is traced
# on the first 10 training sequences.
lstm_model = LSTMModel(X_test.shape[-1])
summary(
    lstm_model,
    input_data=imdb_seq_train.tensors[0][:10],
    col_names=['input_size', 'output_size', 'num_params'],
)


In [None]:
# CSV logger (logs/IMDB_LSTM) and binary-classification wrapper for the LSTM;
# no optimizer is passed, so SimpleModule's default is used.
lstm_logger = CSVLogger('logs', name='IMDB_LSTM')
lstm_module = SimpleModule.binary_classification(lstm_model)


In [None]:
# Train the LSTM sentiment model.
lstm_trainer = Trainer(
    deterministic=False,  # for cuda setting
    max_epochs=10,        # reduced the max epoch from 20 to 10 for testing (accuracy similar, loss is even better)
    logger=lstm_logger,
    enable_progress_bar=True,
    callbacks=[ErrorTracker()],
)
lstm_module.train()  # to meet cuda's default
lstm_trainer.fit(lstm_module, datamodule=imdb_seq_dm)


In [None]:
# Evaluate the trained LSTM module on the data module's test split.
lstm_trainer.test(lstm_module, datamodule=imdb_seq_dm)

In [None]:
# Per-epoch training/validation accuracy of the LSTM model.
lstm_results = pd.read_csv(lstm_logger.experiment.metrics_file_path)
fig, ax = subplots(1, 1, figsize=(6, 6))
summary_plot(lstm_results,
             ax,
             col='accuracy',
             ylabel='Accuracy')
# Take the tick range from the epochs that were actually logged. The previous
# hard-coded 0..20 range no longer matched max_epochs and stretched the axis
# past the data. np.unique removes duplicate ticks for very short runs.
ax.set_xticks(np.unique(
    np.linspace(0, lstm_results['epoch'].max() + 1, 6).astype(int)))
ax.set_ylabel('Accuracy')
ax.set_ylim([0.5, 1])


In [None]:
# Drop the names of the LSTM objects and the sequential IMDB data.
del lstm_model, lstm_trainer, lstm_logger, imdb_seq_dm, imdb_seq_train, imdb_seq_test


### Time Series Prediction


In [None]:
# Load the NYSE data and standardize (zero mean, unit variance) the three series.
NYSE = load_data('NYSE')
cols = ['DJ_return', 'log_volume', 'log_volatility']
X = pd.DataFrame(
    StandardScaler(with_mean=True, with_std=True).fit_transform(NYSE[cols]),
    index=NYSE.index,
    columns=NYSE[cols].columns,
)


In [None]:
# Append lag-1..lag-5 copies of every series (named "<series>_<lag>"), then the
# training indicator, and drop the rows left incomplete by the shifting.
for lag in range(1, 6):
    for col in cols:
        newcol = np.full(X.shape[0], np.nan)
        newcol[lag:] = X[col].values[:-lag]   # value observed `lag` rows earlier
        X.insert(X.shape[1], "{0}_{1}".format(col, lag), newcol)
X.insert(X.shape[1], 'train', NYSE['train'])
X = X.dropna()


In [None]:
# Response is the standardized log_volume; `train` is used below as a row mask.
Y, train = X['log_volume'], X['train']
# Keep only the lagged predictors: drop the indicator and the three unlagged series.
X = X.drop(columns=['train'] + cols)
X.columns


In [None]:
# Linear autoregression on the lagged features: fit on the rows selected by
# `train`, score on the remaining rows.
M = LinearRegression().fit(X[train], Y[train])
M.score(X[~train], Y[~train])

In [None]:
# Lagged features plus one dummy column per day of the week; rows with missing
# values after the column-wise concat are dropped.
X_day = pd.concat(
    [X, pd.get_dummies(NYSE['day_of_week'])],
    axis=1,
).dropna()

In [None]:
# Refit the same LinearRegression object with the day-of-week dummies added and
# score it on the rows where `train` is False.
M.fit(X_day[train], Y[train])
M.score(X_day[~train], Y[~train])

In [None]:
# Reorder the columns from the largest lag (5) down to lag 1, keeping the three
# series together within each lag.
ordered_cols = ['{0}_{1}'.format(col, lag)
                for lag in range(5, 0, -1)
                for col in cols]
X = X.reindex(columns=ordered_cols)
X.columns


In [None]:
# Fold each row of 15 lag features into a 5 x 3 block (lags x series).
X_rnn = X.to_numpy().reshape(-1, 5, 3)
X_rnn.shape

In [None]:
class NYSEModel(nn.Module):
    """RNN with 3 inputs and 12 hidden units, then dropout (0.1) and a linear read-out."""

    def __init__(self):
        super().__init__()
        self.rnn = nn.RNN(3, 12, batch_first=True)
        self.dense = nn.Linear(12, 1)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return a 1-D tensor holding one prediction per sequence in ``x``."""
        outputs, _h_n = self.rnn(x)
        # Only the RNN output at the final time step feeds the read-out layer.
        final_step = self.dropout(outputs[:, -1])
        return torch.flatten(self.dense(final_step))


nyse_model = NYSEModel()

In [None]:
# Build one TensorDataset per mask: first the rows selected by `train`, then the
# remaining rows. Features and response are converted to float32 tensors.
datasets = []
for mask in [train, ~train]:
    X_rnn_t = torch.tensor(X_rnn[mask].astype(np.float32))
    Y_t = torch.tensor(Y.values[mask].astype(np.float32))
    datasets.append(TensorDataset(X_rnn_t, Y_t))
# After the loop X_rnn_t / Y_t still refer to the tensors of the second (~train) mask.
nyse_train, nyse_test = datasets


In [None]:
# Layer-by-layer summary of the RNN, traced on the X_rnn_t tensor.
summary(
    nyse_model,
    input_data=X_rnn_t,
    col_names=['input_size', 'output_size', 'num_params'],
)


In [None]:
# Data module for the RNN; the test dataset is also passed as the validation set.
max_num_workers = 10
nyse_dm = SimpleDataModule(
    nyse_train,
    nyse_test,
    batch_size=64,
    num_workers=min(4, max_num_workers),
    validation=nyse_test,
)

In [None]:
# Sanity check: run the model on the first three training batches and print the
# target and prediction sizes side by side.
for idx, (x, y) in enumerate(nyse_dm.train_dataloader()):
    out = nyse_model(x)
    print(y.size(), out.size())
    # Stop after the third batch (idx 0, 1, 2).
    if idx >= 2:
        break


In [None]:
# RMSprop optimizer (learning rate 0.001) and a regression module that also tracks R^2.
nyse_optimizer = RMSprop(nyse_model.parameters(), lr=0.001)
nyse_module = SimpleModule.regression(
    nyse_model,
    optimizer=nyse_optimizer,
    metrics={'r2': R2Score()},
)


In [None]:
# Train the RNN, then evaluate it on the test split.
nyse_trainer = Trainer(
    deterministic=False,    # changed to False; results will not be fixed
    max_epochs=20,          # reduced max epoch from 200 to 20 for a stable testing on cpu
    enable_progress_bar=True,
    gradient_clip_val=1.0,  # added to avoid exploding gradient
    callbacks=[ErrorTracker()],
)
nyse_module.train()
nyse_trainer.fit(nyse_module, datamodule=nyse_dm)
nyse_trainer.test(nyse_module, datamodule=nyse_dm)

In [None]:
# Same construction as for the RNN data, using the flat X_day features (lags plus
# day-of-week dummies): one float32 TensorDataset per mask, `train` rows first.
datasets = []
for mask in [train, ~train]:
    X_day_t = torch.tensor(
                   np.asarray(X_day[mask]).astype(np.float32))
    Y_t = torch.tensor(np.asarray(Y[mask]).astype(np.float32))
    datasets.append(TensorDataset(X_day_t, Y_t))
day_train, day_test = datasets

In [None]:
# Data module for the non-linear AR model; the test dataset is also passed as
# the validation set.
day_dm = SimpleDataModule(
    day_train,
    day_test,
    batch_size=64,
    num_workers=min(4, max_num_workers),
    validation=day_test,
)


In [None]:
class NonLinearARModel(nn.Module):
    """One-hidden-layer feed-forward regressor on flattened inputs.

    Parameters
    ----------
    input_size : int, default 20
        Number of features per observation after flattening. The default
        matches the layer size that was previously hard-coded.
    hidden_size : int, default 32
        Width of the ReLU hidden layer.
    dropout : float, default 0.5
        Dropout probability applied after the hidden layer.
    """

    def __init__(self, input_size=20, hidden_size=32, dropout=0.5):
        super().__init__()
        # Layer order (and thus the state_dict keys) is unchanged from the
        # original hard-coded network.
        self._forward = nn.Sequential(nn.Flatten(),
                                      nn.Linear(input_size, hidden_size),
                                      nn.ReLU(),
                                      nn.Dropout(dropout),
                                      nn.Linear(hidden_size, 1))

    def forward(self, x):
        """Return a 1-D tensor holding one prediction per row of ``x``."""
        return torch.flatten(self._forward(x))


In [None]:
# Non-linear AR model with an RMSprop optimizer (learning rate 0.001) and R^2 tracking.
nl_model = NonLinearARModel()
nl_optimizer = RMSprop(nl_model.parameters(), lr=0.001)
nl_module = SimpleModule.regression(
    nl_model,
    optimizer=nl_optimizer,
    metrics={'r2': R2Score()},
)


In [None]:
# Train the non-linear AR model, then evaluate it on the test split.
nl_trainer = Trainer(
    deterministic=False,  # changed
    max_epochs=10,        # max epoch reduced from 20 to 10
    enable_progress_bar=True,
    callbacks=[ErrorTracker()],
)
nl_trainer.fit(nl_module, datamodule=day_dm)
nl_trainer.test(nl_module, datamodule=day_dm)