In [None]:
# Mount Google Drive inside the Colab runtime so files stored on Drive can be
# reached through the regular filesystem under the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%%capture
import pickle
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset
!pip install transformers
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from torch.optim import SGD, Adam
from torch.utils.data import DataLoader


from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn import metrics

import time

In [None]:
def check_gpu():
    """Pick the torch device that computations should run on.

    Returns:
        torch.device: the ``cuda`` device when ``torch.cuda.is_available()``
        reports a usable GPU, otherwise the ``cpu`` device.
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

## Embed Dataset

In [None]:
class Embed():
    """Embed text with a pretrained transformer, one mean-pooled vector per document.

    ``text`` is either

    * a pandas DataFrame whose first column holds the documents and whose
      second column holds their labels (dataset use-case), or
    * a single string (prediction use-case), embedded as ONE document.

    Attributes:
        embedded_docs: list of CPU tensors, one per document that was embedded
            successfully (token representations averaged over the sequence).
        labels: labels aligned one-to-one with ``embedded_docs`` for DataFrame
            input; ``None`` for string input.
        df: DataFrame input only. Two columns named after the input's first
            two columns, holding the embeddings and the aligned labels.
    """

    def __init__(self, text, device, transformer, max_length=350, log_every=1000):
        """Tokenize, encode and pool every document in ``text``.

        Args:
            text: DataFrame (documents in column 0, labels in column 1) or a str.
            device: torch device the model and inputs are moved to.
            transformer: model name/path handed to ``AutoTokenizer`` / ``AutoModel``.
            max_length: documents are truncated to this many tokens.
            log_every: print the running document count every ``log_every``
                documents; ``0`` or ``None`` disables progress output.
        """
        # text is dataframe for dataset vs string for predicting
        assert isinstance(text, (str, pd.DataFrame)), \
            "text must be a str or a pandas DataFrame"
        if isinstance(text, str):
            # Wrap the string in a list: iterating a bare str would embed it
            # one character at a time.
            docs = [text]
            all_labels = None
        else:
            x, y = text.columns[0], text.columns[1]
            docs = text[x].values  # documents column as an array
            all_labels = text[y].values  # label column as an array

        self.embedded_docs = []  # pooled representation of each document
        self.labels = None
        kept_positions = []  # positions in `docs` that embedded without error

        # get tokenizer and embedding model
        tokenizer = AutoTokenizer.from_pretrained(transformer, do_lower_case=True)
        model = AutoModel.from_pretrained(transformer).to(device)
        model.eval()  # inference mode (disables dropout)

        with torch.no_grad():
            for position, sent in enumerate(docs):
                if log_every and (position + 1) % log_every == 0:
                    print(position + 1)
                try:
                    # 1. tokenize the document (special tokens added, no
                    #    padding, truncated to max_length)
                    encoded_dict = tokenizer(
                        sent,
                        add_special_tokens=True,
                        max_length=max_length,
                        padding=False,
                        truncation=True,
                        return_tensors='pt',
                    )
                    # 2. pass the token ids through the model
                    input_doc = encoded_dict['input_ids'].to(device)
                    outputs = model(input_doc)
                    # 3. mean-pool over the token axis of the first model
                    #    output (the last hidden state for Hugging Face
                    #    encoder models) and move the result to the CPU
                    pooled = torch.mean(outputs[0][0], 0).cpu()
                except Exception as err:
                    # Skip documents that cannot be embedded. Their labels are
                    # dropped below so embeddings and labels stay aligned.
                    print("Error! Sentence: {} Number: {}".format(sent, position))
                    print("  -> skipped: {!r}".format(err))
                    continue
                # 4. keep the representation and remember which row it came from
                self.embedded_docs.append(pooled)
                kept_positions.append(position)

        if all_labels is not None:
            # Select only the labels of the rows that were embedded.
            self.labels = all_labels[np.asarray(kept_positions, dtype=np.intp)]
            self.df = pd.DataFrame({x: self.embedded_docs, y: self.labels})

    def __len__(self):
        """Number of successfully embedded documents."""
        return len(self.embedded_docs)

    def __getitem__(self, index: int) -> object:
        """Return ``(embedding, label)`` for item ``index``; the label is a long tensor.

        Raises:
            AttributeError: if the instance was built from a plain string and
                therefore carries no labels.
        """
        labels = getattr(self, 'labels', None)
        if labels is None:
            raise AttributeError("this Embed instance has no labels (built from a plain string)")
        return self.embedded_docs[index], torch.tensor(labels[index], dtype=torch.long)

In [None]:
# Read the dataset from the mounted Drive folder. Per the output shown below,
# the CSV has a `sentence` column and a `tag` column.
DATA_CSV = "/content/drive/My Drive/CIRVR/Dialogue Act Classification/out.csv"

# Drop every row that has a missing value. `axis` is passed by keyword: the
# positional form `dropna(0)` was deprecated in pandas 1.x and is rejected by
# pandas 2.x, where all `dropna` arguments are keyword-only.
df = pd.read_csv(DATA_CSV).dropna(axis=0)

# Sanity check: first rows, shape, class balance, and confirmation that no NaNs remain.
df.head(), df.shape, df['tag'].value_counts(), df.isna().any()

(                                            sentence tag
 0    um i do not really think that it is uh too many  sv
 1  i think it is a severe invasion of somebody's ...  sv
 2  i really do not see that that is a very very v...  sv
 3  and i personally i do not think i would work f...  sv
 4                                               yeah  aa,
 (126631, 2),
 sd        75140
 sv        26422
 aa        11133
 qy         4725
 ny         3034
 qw         1979
 nn         1377
 ad          746
 qo          656
 ar          345
 ng          302
 no          285
 fp          225
 aap_am      105
 fa           79
 ft           78
 Name: tag, dtype: int64,
 sentence    False
 tag         False
 dtype: bool)

In [None]:
# map labels to integers ––> REPLACE WITH OUR WORK
#
# Every distinct tag is assigned its position in the sorted list of tags, and
# the label column (second column of `df`) is overwritten with those ids.
label_col = df.columns[1]
labels = df[label_col].values.ravel().tolist()  # original tags, one per row

# Keep the tag -> id table and its inverse under their own names so predicted
# ids can be decoded back into tags later. (The table used to be stored in
# `map_labels` and was immediately overwritten by the encoded list.)
# Note: re-running this cell after the column is already encoded rebuilds the
# tables from the integer ids, not from the original tags.
label_to_id = {tag: idx for idx, tag in enumerate(sorted(set(labels)))}
id_to_label = {idx: tag for tag, idx in label_to_id.items()}

map_labels = [label_to_id[tag] for tag in labels]  # encoded labels, one per row
df[label_col] = map_labels
df.head()

Unnamed: 0,sentence,tag
0,um i do not really think that it is uh too many,15
1,i think it is a severe invasion of somebody's ...,15
2,i really do not see that that is a very very v...,15
3,and i personally i do not think i would work f...,15
4,yeah,0


In [None]:
# split dataset
MODEL_NAME = 'bert-large-uncased'
label_col = df.columns[1]

new = {}
# split train / val: half of the rows each, stratified on the label column,
# with a fixed seed so the split is reproducible
new['full_train'], new['val'] = train_test_split(df, train_size=.5, random_state=42,
                                                 stratify=df[label_col])
# Report the split sizes explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed.
print(new['full_train'].shape, new['val'].shape)

device = check_gpu()

# Embed both splits. Each Embed(...) call loads the tokenizer and the model
# and runs every sentence of the split through it, so this cell is expensive.
dataset_bert = {'val': Embed(new['val'], device, MODEL_NAME),
                'train': Embed(new['full_train'], device, MODEL_NAME)}

In [None]:
import pickle

# Bundle the embedded splits under a single key and write them to disk so they
# can be loaded back from the pickle file instead of being recomputed.
saved_map = dict(dataset_bert=dataset_bert)

with open("./bert_large_embeddings.pickle", 'wb') as out_file:
    pickle.dump(saved_map, out_file)

## Load BERT Embeddings

In [None]:
import numpy as np
import pickle

# Load dataset
# NOTE: the pickle stores `Embed` instances, so the `Embed` class must already
# be defined in this session. Unpickling can execute arbitrary code; only load
# files you created yourself.
EMBEDDINGS_FILE = "./bert_large_embeddings.pickle"

with open(EMBEDDINGS_FILE, 'rb') as in_file:
    saved_map = pickle.load(in_file)

dataset_bert = saved_map['dataset_bert']

In [None]:
# Pull features and labels out of each embedded split. `.df.values` is a 2-D
# array: column 0 holds the per-sentence embedding tensors, column 1 the labels.
val_table = dataset_bert['val'].df.values
train_table = dataset_bert['train'].df.values

# Labels are kept as (n, 1) column vectors at this stage.
val_x, val_y = val_table[:, 0], val_table[:, 1].reshape(-1, 1)
train_x, train_y = train_table[:, 0], train_table[:, 1].reshape(-1, 1)

# Convert every tensor to NumPy and stack them into one 2-D matrix per split.
train_x = np.array([vec.numpy() for vec in train_x])
val_x = np.array([vec.numpy() for vec in val_x])

In [None]:
# Inspect the stacked training embeddings (NumPy abbreviates large arrays when displaying them).
train_x

array([[-0.23098074,  0.13367033, -0.80111426, ..., -0.5852024 ,
        -0.2854892 , -0.01072676],
       [-0.18599667, -0.01940358, -0.64128816, ..., -0.242809  ,
         0.26364633, -0.04832397],
       [-0.16606979, -0.22590105,  0.01791661, ..., -0.21526839,
         0.10275817,  0.09724989],
       ...,
       [-0.30446026, -0.36025745, -0.5469742 , ..., -0.70823807,
         0.03368468,  0.32480893],
       [-0.0328631 , -0.17845334, -0.53364414, ..., -0.89846003,
        -0.18791944,  0.28966963],
       [-0.21454969, -0.0991615 , -0.3672959 , ..., -0.32371002,
         0.12710686,  0.09192964]], dtype=float32)

In [None]:
# Flatten the (n, 1) label columns into 1-D integer vectors.
# ravel() + astype(int) yields the same values as converting each row with
# int(y[0]), and the cell can now be re-run safely: once the arrays are 1-D,
# indexing their scalar elements with [0] would raise.
val_y = np.asarray(val_y).ravel().astype(int)
train_y = np.asarray(train_y).ravel().astype(int)