In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import inflect
from tqdm import tqdm
from pytorch_pretrained_bert.tokenization import BertTokenizer

pd.set_option('display.max_rows', 500)

In [2]:
!pip install --upgrade pandas==1.3.2

Requirement already up-to-date: pandas==1.3.2 in /usr/local/share/anaconda3/lib/python3.7/site-packages (1.3.2)


In [3]:
def read_training(experiment_number, data_dir="/data/users/linh/USF_Practicum/glioma"):
    """Load the pickled training split for one experiment.

    Parameters
    ----------
    experiment_number : int or str
        Suffix identifying the split. The file read is
        ``<data_dir>/glioma_train_180_<experiment_number>.pkl``.
    data_dir : str, optional
        Directory containing the split pickles. The default is the location
        that used to be hard-coded, so existing one-argument calls read the
        same file as before.

    Returns
    -------
    object
        Whatever ``pd.read_pickle`` deserialises from the file.
    """
    train_path = "{}/glioma_train_180_{}.pkl".format(data_dir, experiment_number)
    # NOTE: unpickling can execute arbitrary code -- only read trusted files.
    return pd.read_pickle(train_path)

In [4]:
# 1
def text_replace(x):
    """Strip markup noise from one note and spell out standalone integers.

    The literal patterns below are lower-case, so the caller is expected to
    lower-case the text first (``text_clean`` does this before calling).

    Steps, in order:
      * delete every ``*``, every ``//`` and every backslash;
      * delete list numbering of the form ``<digits>.`` followed by two spaces;
      * expand ``dr.`` to ``doctor`` and ``m.d.`` to ``md``;
      * delete the ``admission date:`` / ``discharge date:`` headers;
      * delete ``--``, ``__`` and ``==`` runs;
      * replace each whole-word digit run with its English words via ``inflect``.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings keep the regex escapes out of Python's own escape handling
    # (non-raw '\*' and '\.' are invalid string escapes and trigger warnings).
    y = re.sub(r'\*', '', x)
    y = re.sub(r'//', '', y)
    y = re.sub(r'\\', '', y)
    # Remove "1.  ", "2.  " (two spaces) since the segmenter splits on them;
    # "1.2" has no trailing spaces and is left for the number step below.
    y = re.sub(r'[0-9]+\.  ', '', y)
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    y = re.sub('admission date:', '', y)
    y = re.sub('discharge date:', '', y)
    y = re.sub('--|__|==', '', y)

    def _spell_out(match):
        # Create the inflect engine on first use and keep it on the function,
        # instead of constructing a fresh engine for every matched number.
        engine = getattr(text_replace, '_engine', None)
        if engine is None:
            engine = text_replace._engine = inflect.engine()
        return engine.number_to_words(match.group())

    # \b...\b restricts the match to whole-word digit runs.
    # NOTE(review): each side of a decimal point is a separate match, so "1.2"
    # becomes "one.two" -- confirm that this is the intended handling.
    return re.sub(r"\b\d+\b", _spell_out, y)

def text_clean(df):
    """Return a copy of ``df`` whose ``'text'`` column has been normalised.

    Missing values become a single space, newlines and carriage returns are
    turned into spaces, surrounding whitespace is stripped, the text is
    lower-cased and finally passed through ``text_replace``. The input frame
    itself is not modified.
    """
    df_cleaned = df.copy()

    notes = df_cleaned['text'].fillna(' ')
    # Flatten line breaks so each note becomes a single line.
    for line_break in ('\n', '\r'):
        notes = notes.str.replace(line_break, ' ')
    notes = notes.apply(str.strip).str.lower()

    df_cleaned['text'] = notes.apply(text_replace)
    return df_cleaned

In [5]:
# 1
def trunk510(df, window=510, stride=400):
    """Split each row's token list into overlapping fixed-size windows.

    For a row whose ``'len'`` value is ``L``, ``ceil(L / stride)`` windows are
    produced; window ``j`` is ``Token[j*stride : j*stride + window]``. With the
    defaults, consecutive windows therefore overlap by 110 tokens, and a row
    with ``len == 0`` contributes no windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'Token'`` (a sliceable token sequence),
        ``'len'`` (its length) and ``'ptId'`` (identifier copied to the output).
        Rows are read by position, so any index is accepted.
    window : int, optional
        Maximum number of tokens per window (default 510 -- presumably chosen to
        leave room for two special tokens in a 512-token model input; confirm).
    stride : int, optional
        Offset between the starts of consecutive windows (default 400).

    Returns
    -------
    pandas.DataFrame
        Columns ``'ID'`` and ``'Token_trunc'``, one row per window, in input
        order. ``'ID'`` keeps the values of ``'ptId'`` as they are (they are no
        longer upcast to float by repeated ``append`` calls).

    Raises
    ------
    ValueError
        If ``window`` or ``stride`` is not positive.
    """
    if window <= 0 or stride <= 0:
        raise ValueError("window and stride must be positive")

    # Collect plain records and build the frame once: DataFrame.append copied
    # the whole frame on every call and no longer exists in pandas >= 2.0.
    records = []
    for tokens, length, pt_id in zip(df['Token'], df['len'], df['ptId']):
        n_windows = int(np.ceil(length / stride))
        for j in range(n_windows):
            start = j * stride
            records.append({'ID': pt_id, 'Token_trunc': tokens[start:start + window]})

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=['ID', 'Token_trunc'])

In [109]:
# Tokenise every training split (experiments 0-20) and save the windowed tokens.
# The tokenizer does not depend on the split, so it is loaded once up front
# instead of once per iteration.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for experiment_number in tqdm(range(21)):
    data = read_training(experiment_number)
    df = pd.DataFrame(data)
    # Keep the frame's index as an explicit column; trunk510 copies it to 'ID'.
    df['ptId'] = df.index
    df_cleaned = text_clean(df)
    df_cleaned['Token'] = df_cleaned['text'].apply(tokenizer.tokenize)
    df_cleaned['len'] = df_cleaned['Token'].apply(len)
    df_trunked = trunk510(df_cleaned) # no label needed
    df_trunked.to_pickle('/data/users/linh/USF_Practicum/glioma_tokenized/glioma_train_180_' + str(experiment_number) + '_tokens' + '.pkl')

100%|██████████| 21/21 [41:56<00:00, 119.85s/it]


In [110]:
# Read one tokenised training split (experiment 8) back from disk and coerce
# every ID to a Python int before displaying the frame.
df_train_v = (
    pd.read_pickle('/data/users/linh/USF_Practicum/glioma_tokenized/glioma_train_180_8_tokens.pkl')
    .assign(ID=lambda frame: frame['ID'].map(int))
)
df_train_v

Unnamed: 0,ID,Token_trunc
0,0,"[b, "", thank, -, you, for, referring, to, the,..."
1,0,"[mouth, daily, ., col, ##chi, ##cine, zero, .,..."
2,0,"[:, negative, for, chest, pain, ,, pal, ##pit,..."
3,0,"[with, g, ##lio, ##bla, ##sto, ##ma, ., there,..."
4,0,"[or, loss, of, consciousness, ., he, does, thi..."
...,...,...
21633,377,"[tesla, ., contrast, media, :, intra, ##ven, #..."
21634,377,"[., maintenance, phase, :, administered, at, o..."
21635,377,"[all, patients, receiving, con, ##com, ##itan,..."
21636,377,"[radio, ##therapy, given, at, /, day, five, da..."


# 6. Test set

In [7]:
def read_testing(experiment_number, data_dir="/data/users/linh/USF_Practicum/glioma"):
    """Load the pickled test split for one experiment.

    Parameters
    ----------
    experiment_number : int or str
        Suffix identifying the split. The file read is
        ``<data_dir>/glioma_test_180_<experiment_number>.pkl``.
    data_dir : str, optional
        Directory containing the split pickles. The default is the location
        that used to be hard-coded, so existing one-argument calls read the
        same file as before.

    Returns
    -------
    object
        Whatever ``pd.read_pickle`` deserialises from the file.
    """
    test_path = "{}/glioma_test_180_{}.pkl".format(data_dir, experiment_number)
    # NOTE: unpickling can execute arbitrary code -- only read trusted files.
    return pd.read_pickle(test_path)

In [112]:
# Tokenise every test split (experiments 0-20) and save the windowed tokens.
# The tokenizer does not depend on the split, so it is loaded once up front
# instead of once per iteration.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for experiment_number in tqdm(range(21)):
    data_test = read_testing(experiment_number)
    df_test = pd.DataFrame(data_test)
    # Keep the frame's index as an explicit column; trunk510 copies it to 'ID'.
    df_test['ptId'] = df_test.index
    df_test_cleaned = text_clean(df_test)
    df_test_cleaned['Token'] = df_test_cleaned['text'].apply(tokenizer.tokenize)
    df_test_cleaned['len'] = df_test_cleaned['Token'].apply(len)
    df_test_trunked = trunk510(df_test_cleaned) # no label needed
    df_test_trunked.to_pickle('/data/users/linh/USF_Practicum/glioma_tokenized/glioma_test_180_' + str(experiment_number) + '_tokens' + '.pkl')

100%|██████████| 21/21 [10:41<00:00, 30.55s/it]


In [113]:
# Read one tokenised test split (experiment 5) back from disk and coerce every
# ID to a Python int before displaying the frame.
df_test_v = (
    pd.read_pickle('/data/users/linh/USF_Practicum/glioma_tokenized/glioma_test_180_5_tokens.pkl')
    .assign(ID=lambda frame: frame['ID'].map(int))
)
df_test_v

Unnamed: 0,ID,Token_trunc
0,0,"[b, "", thank, -, you, for, referring, to, the,..."
1,0,"[mouth, daily, ., col, ##chi, ##cine, zero, .,..."
2,0,"[:, negative, for, chest, pain, ,, pal, ##pit,..."
3,0,"[with, g, ##lio, ##bla, ##sto, ##ma, ., there,..."
4,0,"[or, loss, of, consciousness, ., he, does, thi..."
...,...,...
6204,96,"[tesla, ., contrast, media, :, intra, ##ven, #..."
6205,96,"[., maintenance, phase, :, administered, at, o..."
6206,96,"[all, patients, receiving, con, ##com, ##itan,..."
6207,96,"[radio, ##therapy, given, at, /, day, five, da..."
