In [None]:
pip install datasets



In [None]:
!pip install transformers



In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os
import re
import numpy as np
from keras.layers import Input
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Embedding, Bidirectional, GRU
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from copy import deepcopy

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Locations of the three CSV splits on the mounted drive.
TRAIN_PATH = '/content/drive/MyDrive/train.csv'
VAL_PATH = '/content/drive/MyDrive/val.csv'
TEST_PATH = '/content/drive/MyDrive/test.csv'

# Columns kept from every split, in this exact order. The order matters:
# label construction later in the notebook reads columns positionally
# (everything from index 2 onward).
SELECTED_COLUMNS = [
    'id', 'cmt',
    'price', 'service', 'ship', 'performance', 'genuine',
    'configuration', 'access', 'other', 'model',
]


def _load_split(path):
    """Read one CSV split and keep only SELECTED_COLUMNS, in that order."""
    return pd.read_csv(path)[SELECTED_COLUMNS]


train_df = _load_split(TRAIN_PATH)
val_df = _load_split(VAL_PATH)
test_df = _load_split(TEST_PATH)

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, then group the three
# splits under the keys 'train', 'val' and 'test'.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

raw_datasets = DatasetDict(
    {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset,
    }
)



In [None]:
def make_outputs(df):
    """
    Build concatenated one-hot sentiment labels for every row of ``df``.

    Every column from index 2 onward is read as a sentiment value and
    expanded into three slots:

        0  -> [1, 0, 0]  (None)
        1  -> [0, 1, 0]  (Pos)
        -1 -> [0, 0, 1]  (Neg)

    The slots of all columns are concatenated in column order.

    Args:
        df: DataFrame whose first two columns are skipped and whose remaining
            columns hold numeric sentiment values.

    Returns:
        ``uint8`` array of shape ``(len(df), 3 * number_of_sentiment_columns)``.

    Raises:
        ValueError: if a sentiment cell is not 0, 1 or -1 (NaN included), or
            cannot be converted to a number. Skipping such a cell would
            shorten that row and shift every following label out of position.
    """
    sentiments = df.iloc[:, 2:].to_numpy(dtype=float)

    # Slot order within each group of three: None, Pos, Neg.
    class_values = np.array([0.0, 1.0, -1.0])

    # (rows, sentiment columns, 3): True where the cell equals that class value.
    one_hot = sentiments[:, :, np.newaxis] == class_values

    unmatched = ~one_hot.any(axis=-1)
    if unmatched.any():
        bad_row, bad_col = np.argwhere(unmatched)[0]
        raise ValueError(
            "Unexpected sentiment value %r at row %d, column %r; expected 0, 1 or -1."
            % (sentiments[bad_row, bad_col], bad_row, df.columns[2 + bad_col])
        )

    n_rows, n_aspects, n_classes = one_hot.shape
    return one_hot.reshape(n_rows, n_aspects * n_classes).astype('uint8')

# One-hot label matrices for the three splits (same call order as before:
# train, test, val).
y_train, y_test, y_val = map(make_outputs, (train_df, test_df, val_df))

In [None]:
from transformers import AutoTokenizer, TFAutoModel, AutoModel

# Name of the pretrained checkpoint whose tokenizer is loaded below.
PRETRAINED_MODEL = 'vinai/phobert-base'
# Tokenizer used by tokenize_function to encode the 'cmt' text.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

def tokenize_function(dataset):
    """
    Tokenize the 'cmt' entries of ``dataset`` with the notebook-level tokenizer.

    Every sequence is padded and truncated to ``tokenizer.model_max_length``.
    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        dataset['cmt'],
        max_length=tokenizer.model_max_length,
        padding='max_length',
        truncation=True,
    )

# Apply tokenize_function to every split of raw_datasets.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized training split (its columns and row count).
tokenized_datasets['train']

Dataset({
    features: ['id', 'cmt', 'price', 'service', 'ship', 'performance', 'genuine', 'configuration', 'access', 'other', 'model', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1725
})

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)


In [None]:
def get_encoder(config):
    """
    Build the two-layer feed-forward encoder described by ``config``.

    Reads ``model.x_dim`` (input width), ``model.hid_dim`` (hidden width),
    ``model.phi_dim`` (output width) and ``train.dropout_prob``.

    Layer order: Linear -> Dropout -> ReLU -> Linear -> Dropout.

    Returns:
        An ``nn.Sequential`` containing those five modules.
    """
    in_dim = config['model.x_dim']
    out_dim = config['model.phi_dim']
    hidden_dim = config['model.hid_dim']
    p_drop = config['train.dropout_prob']

    layers = [
        nn.Linear(in_dim, hidden_dim),
        nn.Dropout(p=p_drop),
        nn.ReLU(),
        nn.Linear(hidden_dim, out_dim),
        nn.Dropout(p=p_drop),
    ]
    return nn.Sequential(*layers)

In [None]:
class ProtoCL(nn.Module):
    """
    Classifier that scores encoder features against per-class Gaussian
    statistics and Dirichlet class weights.

    Implementation based off of PCOC from:
    @article{harrison2019continuous,
    title={Continuous meta-learning without tasks},
    author={Harrison, James and Sharma, Apoorva and Finn, Chelsea and Pavone, Marco},
    journal={arXiv preprint arXiv:1912.08866},
    year={2019}
    }

    Per-class state, all stored as ``nn.Parameter``:
        Q:                    (y_dim, cov_dim, phi_dim // cov_dim); the class
                              mean is recovered as ``Q / Linv``.
        logLinv:              (y_dim, cov_dim); log of ``Linv``.
        log_dirichlet_priors: (y_dim,); log of the Dirichlet weights.
        logSigEps:            log of the noise term added to ``1 / Linv``.
    """
    def __init__(self, config):
        """
        Build the encoder, the per-class parameters and the optimizer.

        Config keys read here: ``model.x_dim``, ``model.phi_dim``,
        ``model.y_dim``, ``model.sigma_eps``, ``model.Linv_init``,
        ``model.dirichlet_scale``, ``train.learnable_noise``,
        ``train.learnable_dirichlet``, ``train.learning_rate`` and
        ``train.weight_decay``. ``get_encoder`` reads further keys from the
        same config.

        Raises:
            ValueError: if ``cov_dim`` does not evenly divide ``phi_dim``.
        """
        super().__init__()

        self.config = deepcopy(config)
        self.x_dim = config['model.x_dim']
        self.phi_dim = config['model.phi_dim']
        self.y_dim = config['model.y_dim']

        # NOTE(review): eval() executes arbitrary code. This is only acceptable
        # while the config is written by hand inside the notebook; consider
        # ast.literal_eval if the config ever comes from an external source.
        sigma_eps_value = np.asarray(eval(config['model.sigma_eps']))
        # Broadcast the configured noise to one row per class.
        self.sigma_eps = np.zeros([self.y_dim, 1]) + sigma_eps_value
        self.cov_dim = self.sigma_eps.shape[-1] # 1
        print("Using %d parameters in covariance:" % self.cov_dim)
        if self.phi_dim % self.cov_dim != 0:
            raise ValueError("cov_dim must evenly divide phi_dim")

        self.logSigEps = nn.Parameter(
            torch.from_numpy(np.log(self.sigma_eps)),
            requires_grad=self.config['train.learnable_noise'],
        )

        Linv_offset = config['model.Linv_init']
        dir_scale = config['model.dirichlet_scale']

        self.Q = nn.Parameter(
            torch.randn(self.y_dim, self.cov_dim, self.phi_dim // self.cov_dim)
        )
        self.logLinv = nn.Parameter(
            torch.randn(self.y_dim, self.cov_dim) + Linv_offset
        )

        self.log_dirichlet_priors = nn.Parameter(
            dir_scale * torch.ones(self.y_dim),
            requires_grad=config['train.learnable_dirichlet'],
        )

        self.normal_nll_const = self.phi_dim * np.log(2 * np.pi)

        self.encoder = get_encoder(config).to(device)

        # The optimizer always covers the encoder; the noise and Dirichlet
        # parameters are added only when flagged learnable. Q and logLinv are
        # not handed to the optimizer here.
        params = [
            {'params': self.encoder.parameters()},
        ]

        if self.config['train.learnable_noise']:
            params.append({'params': [self.logSigEps]})

        if self.config['train.learnable_dirichlet']:
            params.append({'params': [self.log_dirichlet_priors]})

        self.optimizer = optim.Adam(
            params,
            lr=config['train.learning_rate'],
            weight_decay=config['train.weight_decay'],
        )

        # Halves the learning rate on every scheduler step.
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=1,
            gamma=0.5,
        )

    @property
    def invSigEps(self):
        """exp(-logSigEps): the inverse of the noise term."""
        return torch.exp(-self.logSigEps)  # .repeat(self.y_dim,1)

    @property
    def SigEps(self):
        """exp(logSigEps): the noise term. This is a property, not a method."""
        return torch.exp(self.logSigEps)  # .repeat(self.y_dim,1)

    def prior_params(self):
        """Return the current ``(Q, Linv, dir_weights)`` with the logs undone."""
        Q0 = self.Q
        Linv0 = torch.exp(self.logLinv)
        dir_weights = torch.exp(self.log_dirichlet_priors)
        return Q0, Linv0, dir_weights

    def set_params(self, params):
        """
        Replace the stored per-class parameters.

        ``params`` is ``(Q, Linv, dir_weights)`` in natural (non-log) space;
        ``Linv`` and ``dir_weights`` are stored as their logs. Each value is
        wrapped in a fresh ``nn.Parameter``.
        """
        self.Q = nn.Parameter(params[0])
        self.logLinv = nn.Parameter(torch.log(params[1]))
        self.log_dirichlet_priors = nn.Parameter(
            torch.log(params[2]),
            requires_grad=self.config['train.learnable_dirichlet'],
        )

    def recursive_update(self, phi, y, params):
        """
            inputs: phi: features, broadcastable against Q
                    y:   shape (..., y_dim )
                    params: tuple of Q, Linv, dir_weights
                        Q: shape (..., y_dim, cov_dim, k)
                        Linv: shape (..., y_dim, cov_dim)
                        dir_weights: shape (..., y_dim)
            output: tuple (Q, Linv, dir_weights) after adding the (phi, y) evidence
        """
        Q, Linv, dir_weights = params

        # zeros out entries all except class y
        invSigEps_masked = self.invSigEps * y.squeeze(1).unsqueeze(-1)

        Q = Q + invSigEps_masked.unsqueeze(-1) * phi
        Linv = Linv + invSigEps_masked
        dir_weights = dir_weights + y
        return (Q, Linv, dir_weights)

    def _encode(self, x):
        """Run the encoder on ``x``, flattening extra leading batch dims when ``x`` has more than 4 dims."""
        x_shape = list(x.shape)
        has_extra_batch_dims = len(x_shape) > 4  # more than one batch dim

        if has_extra_batch_dims:
            x = x.reshape([-1] + x_shape[-3:])

        phi = self.encoder(x)  # (..., phi_dim)
        if has_extra_batch_dims:
            phi = phi.reshape(x_shape[:-3] + [self.phi_dim])
        return phi

    def _joint_log_prob(self, phi, posterior_params):
        """
        Score features against every class.

        Returns ``(logp, phi_reshaped)``: ``logp`` is log p(x, y) for all y,
        and ``phi_reshaped`` is ``phi`` reshaped to (..., 1, cov_dim, k) so it
        broadcasts against the class axis of Q.
        """
        Q, Linv, dir_weights = posterior_params
        mu = Q / Linv.unsqueeze(-1)  # (..., y_dim, cov_dim, k)
        pred_cov = 1. / Linv + self.SigEps  # (..., y_dim, cov_dim)

        phi_reshaped = phi.reshape(*(list(phi.shape)[:-1] + [1, self.cov_dim, -1]))  # (..., 1, cov_dim, k)

        err = phi_reshaped - mu  # (..., y_dim, cov_dim, k)

        nll_quadform = (err ** 2 / pred_cov.unsqueeze(-1)).sum(-1).sum(-1)
        nll_logdet = (self.phi_dim / self.cov_dim) * torch.log(pred_cov).sum(-1)  # sum of log of diagonal entries

        logp = -0.5 * (nll_quadform + nll_logdet + self.normal_nll_const)  # log p(x | y)

        # multiply by p(y) to get p(x, y)
        logp = logp + torch.log(dir_weights / dir_weights.sum(-1, keepdim=True))
        return logp, phi_reshaped

    def log_predictive_prob(self, x, y, posterior_params):
        """
            input:  x: shape (..., x_dim)
                    y: shape (..., y_dim)
                    posterior_params: tuple of Q, Linv, dir_weights:
                        Q: shape (..., y_dim, cov_dim, k)
                        Linv: shape (..., y_dim, cov_dim)
                        dir_weights: shape (..., y_dim)
            output: logp: log p(y, x | posterior_params) (..., y_dim)
                    updated_params: updated posterior params after factoring in (x,y) pair

            Side effect: the updated params are also written back onto the
            module through set_params.
        """
        phi = self._encode(x)
        logp, phi_reshaped = self._joint_log_prob(phi, posterior_params)

        # The recursive update is kept out of the autograd graph.
        posterior_params = [p.detach() for p in posterior_params]
        updated_params = self.recursive_update(phi_reshaped, y, posterior_params)
        updated_params = [p.detach() for p in updated_params]
        self.set_params(updated_params)

        return logp, updated_params


    def nll(self, log_pi):
        """
            log_pi: shape(batch_size x t x ...)
            returns: negative log-sum-exp of log_pi over dim 1
        """
        return -torch.logsumexp(log_pi, dim=1)

    def log_posterior(self, x_mat, y_mat):
        """
        Takes in x,y batches; recursively compute posteriors
        Inputs:
        - x_mat; shape = batch size x x_dim
        - y_mat; shape = batch size x y_dim
        Returns:
        - (updated_posterior_params, nll) where nll is the negative
          log-softmax of the joint log-probabilities over the class axis
        """

        # define initial params and append to list
        # we add a batch dimension if its not already there
        prior_params = list(self.prior_params())
        prior_params[0] = prior_params[0][None, ...] if len(prior_params[0].shape) == 3 else prior_params[0]
        prior_params[1] = prior_params[1][None, ...] if len(prior_params[1].shape) == 2 else prior_params[1]
        prior_params[2] = prior_params[2][None, ...] if len(prior_params[2].shape) == 1 else prior_params[2]

        # if classification, log_pi == p(y,x|eta) for all y (batchsize, y_dim)
        log_p, updated_posterior_params = self.log_predictive_prob(
            x_mat,
            y_mat,
            prior_params,
        )

        # normalize to get p(y | x) # (batchsize, y_dim)
        nll = -nn.functional.log_softmax(log_p.squeeze(1), dim=-1) # (..., y_dim)

        return updated_posterior_params, nll

    def forward(self, x, posterior_params):
        """
            input: x, posterior params (Q, Linv, dir_weights)
            output: log p(x, y | posterior_params) for all y

            Unlike log_predictive_prob, this does not update any parameters.
        """
        phi = self._encode(x)
        logp, _ = self._joint_log_prob(phi, posterior_params)
        return logp


In [None]:
def to_tensorflow_format(tokenized_dataset):
    """
    Drop every column except the last three and switch the dataset to the
    'tensorflow' output format.

    Relies on the three columns to keep being the final three entries of
    ``tokenized_dataset.features``.
    """
    column_names = list(tokenized_dataset.features)
    columns_to_drop = column_names[:-3]
    trimmed = tokenized_dataset.remove_columns(columns_to_drop)
    return trimmed.with_format('tensorflow')

from tensorflow.data import Dataset
def preprocess_tokenized_dataset(tokenized_dataset, tokenizer, labels, batch_size, shuffle=False,repeat=True):
    """
    Turn a tokenized dataset plus a label array into a batched ``tf.data`` pipeline.

    Args:
        tokenized_dataset: dataset accepted by ``to_tensorflow_format``.
        tokenizer: its ``model_input_names`` selects the feature columns.
        labels: array with one entry per example; reshaped to 2-D.
        batch_size: number of examples per batch.
        shuffle: shuffle examples with a buffer covering the whole dataset.
        repeat: when true, iterate the dataset 10 times.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features_dict, labels)`` batches.
    """
    tf_dataset = to_tensorflow_format(tokenized_dataset)
    features = {name: tf_dataset[name] for name in tokenizer.model_input_names}
    labels = labels.reshape(len(labels), -1)

    # Referenced through `tf.data` so this function does not depend on which
    # class the bare name `Dataset` is bound to in the notebook.
    tf_dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    num_examples = len(tf_dataset)

    # Cache the un-shuffled examples. Caching after shuffle/batch would store
    # the first shuffled order and replay it on every later pass.
    tf_dataset = tf_dataset.cache()

    if shuffle:
        tf_dataset = tf_dataset.shuffle(buffer_size=num_examples)
    tf_dataset = tf_dataset.batch(batch_size)

    if repeat:
        tf_dataset = tf_dataset.repeat(10)

    # Prefetch is kept as the final stage of the pipeline.
    return tf_dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

# Build the three tf.data pipelines with a batch size of 20. Only the training
# split is shuffled and left with the default repeat behaviour.
train_tf_dataset = preprocess_tokenized_dataset(
    tokenized_datasets['train'], tokenizer, y_train, batch_size=20, shuffle=True,
)
val_tf_dataset = preprocess_tokenized_dataset(
    tokenized_datasets['val'], tokenizer, y_val, batch_size=20, repeat=False,
)
test_tf_dataset = preprocess_tokenized_dataset(
    tokenized_datasets['test'], tokenizer, y_test, batch_size=20, repeat=False,
)


In [None]:
# NOTE(review): `config` is first assigned in a later cell of this notebook, so
# on a fresh kernel (Restart & Run All) this line raises NameError. Move this
# cell below the one that defines `config`.
proto_cl_model = ProtoCL(config)
# NOTE(review): neither `proto_cl_model` nor `criterion` is used by any cell
# visible in this notebook — confirm whether they are still needed.
criterion = torch.nn.NLLLoss()

Using 1 parameters in covariance:


In [None]:
# Hyper-parameters read by get_encoder and ProtoCL.__init__.
config = {
    # Input width of the encoder's first Linear layer.
    'model.x_dim': 256,
    # Output width of the encoder.
    'model.phi_dim': 128,
    # Number of classes: leading dimension of Q, logLinv and the Dirichlet priors.
    # NOTE(review): the label batches printed later in this notebook have 27
    # columns, not 3 — confirm how the labels are meant to map onto y_dim.
    'model.y_dim': 3,
    # Hidden width of the encoder.
    'model.hid_dim': 256,
    # Kept as a string because ProtoCL.__init__ passes it through eval().
    'model.sigma_eps': '0.01',
    # Offset added to the random initial value of logLinv.
    'model.Linv_init': 0.1,
    # Initial value of every entry of log_dirichlet_priors.
    'model.dirichlet_scale': 0.1,
    # Whether logSigEps requires gradients and is handed to the optimizer.
    'train.learnable_noise': True,
    # Whether log_dirichlet_priors requires gradients and is handed to the optimizer.
    'train.learnable_dirichlet': True,
    # Adam weight decay used by ProtoCL's own optimizer.
    'train.weight_decay': 0.0001,
    # Dropout probability of both Dropout layers in the encoder.
    'train.dropout_prob': 0.5,
    # Adam learning rate used by ProtoCL's own optimizer.
    'train.learning_rate': 0.003
}

# Instantiating ProtoCL prints the number of covariance parameters.
model = ProtoCL(config)

Using 1 parameters in covariance:


In [None]:
# NOTE(review): ProtoCL defines no compile() of its own, so this resolves to
# whatever nn.Module provides in the installed torch version (and raises
# AttributeError where it provides none). It looks like a Keras leftover —
# confirm it is intended.
model.compile()

In [None]:
# float32 torch copies of the one-hot label matrices.
y_train_tensor, y_val_tensor, y_test_tensor = (
    torch.tensor(label_matrix, dtype=torch.float32)
    for label_matrix in (y_train, y_val, y_test)
)

In [None]:
# Token-id tensors for the train and validation splits.
x_train_tensor, x_val_tensor = (
    torch.tensor(tokenized_datasets[split_name]["input_ids"])
    for split_name in ("train", "val")
)

In [None]:
# NOTE(review): this cell raised a RuntimeError the last time it was run (see
# its recorded output). x_train_tensor holds the tokenizer's input_ids, and the
# batches printed by the training-loop cell are 256 wide, whereas
# nn.Linear(768, 256) expects 768 input features. The ids are presumably
# integers as well, which a Linear layer would also reject — TODO confirm.
# The cell is also not idempotent: it overwrites x_train_tensor with its own
# output, so re-running it feeds the projection back into itself.
dense_layer = nn.Linear(768,256)
x_train_tensor = dense_layer(x_train_tensor)
RuntimeError: ignored

In [None]:
# TensorDataset is not imported by any earlier cell, so import it here to keep
# this cell runnable on a fresh kernel.
from torch.utils.data import TensorDataset

# Pair every input row with its label row. These names previously held the
# Hugging Face datasets and are rebound here.
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

In [None]:
from torch.utils.data import DataLoader
# Batches of 32; only the training loader shuffles.
train_dataloader, val_dataloader = (
    DataLoader(split, batch_size=32, shuffle=should_shuffle)
    for split, should_shuffle in ((train_dataset, True), (val_dataset, False))
)

In [None]:
# Debug pass over the training data: it only prints the shape of every batch.
# The actual update step is still commented out below.
num_epochs = 1
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    for batch in train_dataloader:
        x_batch = batch[0].float()
        y_batch = batch[1]
        print((x_batch.shape))
        print((y_batch.shape))
        #posterior_params , nll = model.log_posterior(x_batch, y_batch)
        # model.optimizer.zero_grad()
        # nll.backward()
        # model.optimizer.step()

torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size([32, 256])
torch.Size([32, 27])
torch.Size(