In [32]:
import pandas as pd
import numpy as np
import torch
from nltk.tokenize import sent_tokenize
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split, StratifiedKFold

In [30]:
#Initialize
pretrained_weights = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
model = BertModel.from_pretrained(pretrained_weights)

In [62]:
def process_document(row, tokenizer, model):
    max_len = 150
    sentences = np.array(sent_tokenize(row['Incident Description']))
    tokenized = [tokenizer.encode(sentence, add_special_tokens=True, 
                                  max_length = max_len, 
                                  truncation_strategy='longest_first') for sentence in sentences]
    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    features = np.mean(np.last_hidden_states[0][:,0,:].numpy(), axis=0).reshape((1, 768))
    label = np.array(row['labels']).reshape((1, 1))
    return(np.concatenate([label, features], axis=1))

def preprocess_batch(range, tokenizer, model):
    start_row = range[0]
    chunk_size = (range[1] - range[0]) + 1
    raw_data = pd.read_csv('labeled_examples.csv', skiprows=start_row, nrows=chunk_size, header=None, names = ['Incident Description', 'labels'])
    raw_data = raw_data.fillna('')
    embeddings = raw_data.apply(process_document, axis=1)
    data_chunk = np.concatenate(embeddings, axis=0)
    np.save('embedding_chunks/chunk_'+str(range[0])+'-'+str(range[1])+'.npy', data_chunk)
    

In [24]:
steps = np.linspace(start=0, stop = 41543, num=1000, dtype=int)
[preprocess_batch((steps[i], steps[i+1]-1), tokenizer, model) for i in range(len(steps) - 1)]

torch.Size([104, 500])
torch.Size([104, 500])


KeyError: 'Incident Description'