In [None]:
from transformers import BertTokenizer
import numpy as np
import pandas as pd
import torch
from torch import nn
import matplotlib.pyplot as plt

# Custom imports
import bert_nbme

In [None]:
# --- Global settings ---
CONFIG = 'bert-base-uncased'  # pretrained checkpoint name handed to BertTokenizer.from_pretrained
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'DEVICE: {DEVICE}')

# --- Load the raw CSV tables (same order as before: notes, train, features) ---
notes_df, train_df, features_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/patient_notes.csv', 'data/train.csv', 'data/features.csv')
)

In [None]:
# Add a sentinel 'NONE' feature (feature_num == -1) exactly once, so that
# re-running this cell does not keep appending duplicate NONE rows.
if not (features_df['feature_num'] == -1).any():
    none_row = pd.DataFrame(
        {'feature_num': [-1], 'case_num': [-1], 'feature_text': ['NONE']},
        index=[len(features_df)],
    )
    features_df = pd.concat((features_df, none_row))  # Add NONE value as a feature
features_df['feature_index'] = range(len(features_df))

# APPEND AND CLEAN DATA
# .copy() makes `data` an independent frame: assigning a column to a filtered
# slice of train_df is the chained-assignment pattern behind SettingWithCopyWarning.
data = train_df[train_df['annotation'] != '[]'].copy()  # Drop blank annotations ('[]')

# Strip the list punctuation ([, ], ') from the raw annotation string and split it on spaces.
# The translation table is built once instead of once per row.
_annotation_strip = str.maketrans('', '', '[]\'')
data['annotation'] = [raw.translate(_annotation_strip).split(' ') for raw in data['annotation']]

data = data.merge(features_df[['feature_num', 'feature_text', 'feature_index']], on='feature_num')  # Add features
data = data.merge(notes_df[['pn_num', 'pn_history']], on='pn_num')  # Add notes
data = data.dropna().reset_index(drop=True)  # Drop and reindex any leftover trouble-makers

In [None]:
# Show the first patient note in the merged frame
data.pn_history.iloc[0]

In [None]:
# Run every patient note through the pretrained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(CONFIG)
encoded_word_lists = list(map(tokenizer.encode, data['pn_history']))

In [None]:
def substring_loc(input_str, substring):
    """Locate the first occurrence of ``substring`` inside ``input_str``.

    Args:
        input_str: The string to search in.
        substring: The text to look for.

    Returns:
        A ``(start_ind, end_ind)`` tuple such that
        ``input_str[start_ind:end_ind] == substring``, or ``(-1, -1)`` when
        ``substring`` does not occur in ``input_str``.
    """
    start_ind = input_str.find(substring)
    if start_ind == -1:
        # str.find signals "not found" with -1. Without this guard the end index
        # would be len(substring) - 1, which looks like a valid span but is not.
        return -1, -1
    return start_ind, start_ind + len(substring)
    

# Quick manual check of the find-then-slice approach on a toy sentence
test, sub = 'ape is my middle name', 'is my middle'
i = test.find(sub)
test[i:i + len(sub)]

In [None]:
# Only the last expression of a cell is rendered, so this column lookup is not displayed
data['location']
# Character slice of the first note; NOTE(review): 696:724 presumably comes from one of
# the `location` entries -- confirm where these bounds were taken from
data['pn_history'].iloc[0][696:724]

In [None]:
# Longest note length in characters (not rendered: only the last expression of a cell is shown)
max(len(note) for note in data.pn_history)
# Notes longer than 949 characters
is_long_note = data.pn_history.str.len() > 949
data.pn_history[is_long_note]