In [316]:
import sys
sys.path.append('../')
import datasets
from dataproc import extract_wvs
from dataproc import get_discharge_summaries
from dataproc import group_and_sort
from dataproc import filter_patients_and_labels
from dataproc import concat_and_split_disch
from dataproc import build_vocab
from dataproc import sort_by_length
from dataproc import vocab_index_descriptions
from dataproc import word_embeddings
from constants import DISCH_DIR, DATA_DIR

from nltk.tokenize import RegexpTokenizer

import numpy as np
import pandas as pd

from tqdm import tqdm

from collections import Counter
import csv
from datetime import datetime
import operator
import re

In [395]:
df = pd.read_csv('/project/ucb/nlp/ucb_data.csv')

In [396]:
df

Unnamed: 0,person_id,note_text,visit_occurrence_id,codes
0,7119,burning with urination for two days . also fel...,68384,535.00;250.00;345.90
1,7249,tikx is a lady who carries the diagnosis of lu...,70091,719.45
2,3786,rnbc is a pleasant 69yo pt seen for f/u after ...,35337,40.19;599.0;244.9;401.9;995.29;453.9
3,6420,darkening and thickening of right lateral 1st ...,60839,27.24;30.29;38.04;715.16;333.94;380.4;357.4;27...
4,388,available information aub chart xxxxx availabl...,3012,34.0;40.11;48.6;486;401.1;747.60;340
5,356,the patient was followed by wfqj rxco . he app...,2674,333.94;345.80;356.4;438.84
6,3782,jhp doing well with current meds . wrnm has up...,35224,42.54;244.8;425.4;345.90
7,4174,patient notes feeling well without any specifi...,39660,244.8;345.90
8,3444,patient is a very pleasant 37-year-old caucasi...,32110,345.90;295.90;453.40
9,7855,"feels great and safe on keppra , with no seizu...",8307115,V58.69;V79.0;780.99;799.81


# Preprocess text

In [397]:
pattern = re.compile(".*\d.*")

p2 = re.compile(r'\w+')

In [398]:
def preprocess(text):
    text = text.replace('-', ' ')
    tokens = []
    for t in text.split():
        if pattern.match(t):
            try:
                date = datetime.strptime(t, '%m/%d/%Y')
                tokens.append('**DATE**')
            except:
                tokens.append('**NUMERIC**')
        else:
            #remove periods from the middle of words
            if len(t) > 1:
                for tok in p2.findall(t):
                    tokens.append(tok)
            else:
                tokens.append(t)
    if len(tokens) < 1:
        return np.nan
    return ' '.join(tokens)

In [399]:
df['text'] = df.apply((lambda row: preprocess(row['note_text'])), axis=1)

In [400]:
df = df.dropna()

In [401]:
df['length'] = df.apply(lambda row: len(str(row['text']).split()), axis=1)
df = df[df['length'] > 10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [402]:
df.to_csv('/project/ucb/nlp/ucb_data_proc.csv', columns=['person_id', 'visit_occurrence_id', 'text', 'codes', 'length'],\
          header=['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'], index=False)

# Split into train/dev/test

In [424]:
trf = open('/project/ucb/nlp/ucb_train.csv', 'w')
dvf = open('/project/ucb/nlp/ucb_dev.csv', 'w')
tef = open('/project/ucb/nlp/ucb_test.csv', 'w')
trw = csv.writer(trf)
dvw = csv.writer(dvf)
tew = csv.writer(tef)

In [425]:
trw.writerow(['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'])
dvw.writerow(['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'])
tew.writerow(['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'])

In [426]:
df = pd.read_csv('/project/ucb/nlp/ucb_data_proc.csv')

In [427]:
subjs = df['SUBJECT_ID'].unique()

In [428]:
hadms = df['HADM_ID'].unique()

In [429]:
n_subj = len(subjs)
n_hadm = len(hadms)
n_subj, n_hadm

(2948, 5304)

In [430]:
split = [int(round(0.8*n_subj)), int(round(0.9*n_subj)), n_subj]
split

[2358, 2653, 2948]

In [431]:
train = set(subjs[:split[0]])
dev = set(subjs[split[0]:split[1]])
test = set(subjs[split[1]:split[2]])

In [432]:
dev

{333,
 381,
 403,
 488,
 556,
 625,
 683,
 837,
 856,
 857,
 937,
 954,
 985,
 1024,
 1043,
 1067,
 1070,
 1076,
 1083,
 1100,
 1161,
 1174,
 1184,
 1185,
 1211,
 1253,
 1278,
 1391,
 1414,
 1421,
 1425,
 1433,
 1439,
 1472,
 1489,
 1524,
 1532,
 1558,
 1591,
 1609,
 1661,
 1674,
 1703,
 1721,
 1806,
 1808,
 1844,
 1866,
 1928,
 1931,
 1955,
 1965,
 1972,
 2008,
 2017,
 2047,
 2080,
 2101,
 2104,
 2106,
 2113,
 2253,
 2331,
 2332,
 2345,
 2364,
 2410,
 2507,
 2569,
 2624,
 2642,
 2647,
 2671,
 2672,
 2688,
 2812,
 2819,
 2822,
 2837,
 2838,
 2873,
 2880,
 2897,
 2935,
 3052,
 3067,
 3073,
 3077,
 3132,
 3185,
 3235,
 3248,
 3264,
 3279,
 3303,
 3324,
 3330,
 3343,
 3348,
 3358,
 3360,
 3361,
 3393,
 3400,
 3427,
 3549,
 3550,
 3593,
 3708,
 3718,
 3733,
 3743,
 3798,
 3801,
 3927,
 4020,
 4026,
 4052,
 4062,
 4072,
 4108,
 4177,
 4181,
 4200,
 4205,
 4224,
 4273,
 4284,
 4296,
 4403,
 4405,
 4414,
 4419,
 4420,
 4422,
 4462,
 4485,
 4535,
 4544,
 4546,
 4613,
 4622,
 4623,
 4647,
 4689

In [433]:
with open('/project/ucb/nlp/ucb_data_proc.csv', 'r') as f:
    r = csv.reader(f)
    next(r)
    for row in r:
        subj_id = int(row[0])
        if subj_id in train:
            trw.writerow(row)
        elif subj_id in dev:
            dvw.writerow(row)
        else:
            tew.writerow(row)
trf.close()
dvf.close()
tef.close()

# Make vocab from train set

In [434]:
df = pd.read_csv('/project/ucb/nlp/ucb_train.csv')

In [435]:
df

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,LABELS,length
0,7119,68384,burning with urination for two days . also fel...,535.00;250.00;345.90,38
1,7249,70091,tikx is a lady who carries the diagnosis of lu...,719.45,65
2,3786,35337,rnbc is a pleasant **NUMERIC** pt seen for f u...,40.19;599.0;244.9;401.9;995.29;453.9,89
3,6420,60839,darkening and thickening of right lateral **NU...,27.24;30.29;38.04;715.16;333.94;380.4;357.4;27...,26
4,388,3012,available information aub chart xxxxx availabl...,34.0;40.11;48.6;486;401.1;747.60;340,356
5,356,2674,the patient was followed by wfqj rxco . he app...,333.94;345.80;356.4;438.84,76
6,3782,35224,jhp doing well with current meds . wrnm has up...,42.54;244.8;425.4;345.90,65
7,4174,39660,patient notes feeling well without any specifi...,244.8;345.90,47
8,3444,32110,patient is a very pleasant **NUMERIC** year ol...,345.90;295.90;453.40,80
9,7855,8307115,"feels great and safe on keppra , with no seizu...",V58.69;V79.0;780.99;799.81,53


In [436]:
code_set = set()
vocab = set()

In [437]:
for row in tqdm(df.itertuples()):
    if type(row[-3]) != str:
        print(row)
    text = row[-3].split()
    codes = row[-2].split(';')
    code_set = code_set.union(set(codes))
    vocab = vocab.union(set(text))

4650it [00:03, 1341.52it/s]


In [438]:
len(code_set)

2235

In [439]:
len(vocab)

20415

In [440]:
vocab

{'qqja',
 'adviced',
 'localizes',
 'foul',
 'four',
 'qqjr',
 'woods',
 'clotted',
 'hanging',
 'localized',
 'increase',
 'zaqr',
 'coarctation',
 'haggenstein',
 'demoted',
 'dialntin',
 'badreldin',
 'naturopathic',
 'spbn',
 'trileptal',
 'stabbed',
 'anterolisthesis',
 'disturb',
 'pivw',
 'probiotic',
 'wednesday',
 'rfcr',
 'persisted',
 'rfci',
 'stereotypical',
 'medicartions',
 'commented',
 'tumer',
 'bill',
 'peduncle',
 'rll',
 'ffot',
 'thalassemia',
 'clavicular',
 'sustaining',
 'scraped',
 'errors',
 'relieving',
 'tiered',
 'cooking',
 'rle',
 'aggression',
 'resilient',
 'fub',
 'hallucinating',
 'shocks',
 'pyvb',
 'fun',
 'pericardial',
 'diagnostic',
 'sufered',
 'natured',
 'incoherant',
 'kids',
 'k',
 'etiology',
 'neruo',
 'neurologist',
 'relpax',
 'appropriately',
 'kyphoplasty',
 'replace',
 'cocaine',
 'khajavi',
 'dne',
 'pvfc',
 'dna',
 'superficially',
 'insecurity',
 'dno',
 'dni',
 'dnj',
 'extremtites',
 'meloxicam',
 'therefore',
 'dnr',
 'strike',

In [441]:
with open('/project/ucb/nlp/vocab.csv', 'w') as of:
    for word in vocab:
        of.write(word + '\n')

In [442]:
for splt in ['train', 'dev', 'test']:
    filename = '/project/ucb/nlp/ucb_%s.csv' % splt
    df = pd.read_csv(filename)
    #df['length'] = df.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('/project/ucb/nlp/%s.csv' % splt, index=False)

In [443]:
df = pd.read_csv('/project/ucb/nlp/train.csv')

In [444]:
df['length'].describe()

count    4650.000000
mean      124.199140
std       178.429133
min        11.000000
25%        40.000000
50%        73.000000
75%       137.000000
max      3025.000000
Name: length, dtype: float64

In [445]:
df['num_codes'] = df.apply(lambda row: len(str(row['LABELS']).split(';')), axis=1)

In [446]:
df['num_codes'].describe()

count    4650.000000
mean        3.722796
std         3.265273
min         1.000000
25%         2.000000
50%         3.000000
75%         5.000000
max        72.000000
Name: num_codes, dtype: float64

# Pretrain word embeddings

In [463]:
reload(word_embeddings)
w2v_file = word_embeddings.word_embeddings('full', 'processed', '/project/ucb/nlp/ucb_data_proc.csv', 0, 20)

building word2vec vocab on processed data...
training...
writing embeddings to /project/ucb/nlp/processed_full.w2v


In [455]:
import gensim

In [464]:
model = gensim.models.Word2Vec.load('/project/ucb/nlp/processed_full.w2v')

In [496]:
model.wv.most_similar(positive=['discharged'])

[('released', 0.7766711711883545),
 ('transferred', 0.746239423751831),
 ('admitted', 0.7170929312705994),
 ('sent', 0.6884703636169434),
 ('rehab', 0.666911780834198),
 ('placed', 0.6370691061019897),
 ('rehabilitation', 0.6314958930015564),
 ('switched', 0.6283079385757446),
 ('loaded', 0.6237520575523376),
 ('readmitted', 0.6231414079666138)]

In [500]:
reload(extract_wvs)
extract_wvs.gensim_to_embeddings('/project/ucb/nlp/processed_full.w2v', '/project/ucb/nlp/vocab.csv', 'full')

20415it [00:28, 712.19it/s]
