# Preprocessing

In [24]:
import numpy as np
import pandas as pd
import re, nltk, string
from sklearn.model_selection import train_test_split
import ggplot, datetime, time
import matplotlib
from collections import Counter
from matplotlib import pyplot as plt
# 'ggplot' here is the name of a matplotlib style sheet; this call does not use the `ggplot` package
matplotlib.style.use('ggplot')
%matplotlib inline

## Select data for the top ICD codes

After some tests, we found that it is preferable to look for the most frequent codes before converting the ICD9 column to a list. In the spirit of not modifying the encoding too early, we also keep the ICD9 column as a string.

In [25]:
# Load the discharge notes; column labels are supplied explicitly through `names`
df = pd.read_csv(
    '../data/disch_notes_all_icd9.csv',
    names=['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT'],
)

In [26]:
# Sanity check: (rows, columns) of the freshly loaded frame
df.shape

(52696, 5)

In [27]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,100001,58526,2117-09-17 00:00:00,25013 3371 5849 5780 25063 5363 4580 25043 403...,Admission Date: [**2117-9-11**] ...
1,100003,54610,2150-04-21 00:00:00,53100 2851 07054 5715 45621 53789 4019 53550 7823,Admission Date: [**2150-4-17**] ...
2,100006,9895,2108-04-17 00:00:00,49320 51881 486 20300 2761 7850 3090,Admission Date: [**2108-4-6**] Discharg...
3,100007,23018,2145-04-07 00:00:00,56081 5570 9973 486 4019,Admission Date: [**2145-3-31**] ...
4,100009,533,2162-05-21 00:00:00,41401 99604 4142 25000 27800 4148 4111 2859 40...,Admission Date: [**2162-5-16**] ...


In [28]:
def find_top_codes(df, col_name, n):
    """Find the `n` most frequent codes in a column of code strings.

    Every cell of ``df[col_name]`` is treated as a whitespace-separated
    list of codes. All cells are concatenated and the individual codes are
    counted over the whole column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the codes column.
    col_name : str
        Name of the column whose cells are strings of codes.
    n : int
        Number of codes to return.

    Returns
    -------
    list of str
        The codes ordered from most to least frequent. They are returned as
        strings to make sure codes are treated as classes down the line.
    """
    string_total = df[col_name].str.cat(sep=' ')
    # split() without an argument collapses runs of whitespace, so doubled,
    # leading or trailing spaces (or an empty column) can never yield an
    # empty-string "code" that would then be counted among the top codes.
    counter_total = Counter(string_total.split())
    return [word for word, _ in counter_total.most_common(n)]

In [29]:
def select_codes_in_string(string, top_codes):
    """Create a string of the codes which are both in the original string
    and in the top codes list.

    Codes are compared as whole whitespace-separated tokens, so a top code
    such as '486' is not reported merely because it is a substring of a
    longer code such as '4860'.

    Parameters
    ----------
    string : str
        Whitespace-separated codes. The parameter name shadows the `string`
        module inside this function; it is kept for caller compatibility.
    top_codes : iterable of str
        Codes to keep.

    Returns
    -------
    str
        The retained codes joined by single spaces, in the order in which
        they appear in `top_codes`; '' when none is present or when
        `string` is not a str (e.g. a missing value).
    """
    # A non-string cell (such as NaN for a missing value) carries no codes
    if not isinstance(string, str):
        return ''
    # Exact-token membership instead of substring search
    present = set(string.split())
    return ' '.join(code for code in top_codes if code in present)

In [30]:
def filter_top_codes(df, col_name, n, filter_empty = True):
    """Restrict a codes column to the `n` most frequent codes.

    Works on a copy of `df`: every cell of `col_name` is replaced by the
    string of top codes it contains. When `filter_empty` is True, the rows
    whose cell ends up as the empty string are dropped.

    Returns a tuple ``(filtered_dataframe, top_codes)``.

    Note: we may actually want to keep even the empty lines
    """
    top_codes = find_top_codes(df, col_name, n)

    filtered = df.copy()
    # Series.apply forwards `args` as extra positional arguments after the cell value
    filtered[col_name] = filtered[col_name].apply(select_codes_in_string, args=(top_codes,))

    if not filter_empty:
        return filtered, top_codes

    has_codes = filtered[col_name] != ''
    return filtered.loc[has_codes], top_codes

In [31]:
# Keep only the 20 most frequent ICD9 codes and drop the rows left without any code.
# NOTE(review): this rebinds `df`, so re-running the cell recomputes the top codes
# on the already-filtered frame rather than on the raw data.
df, top_codes = filter_top_codes(df, 'ICD9', n=20, filter_empty=True)

In [32]:
# Display the list of selected top codes
top_codes

['4019',
 '4280',
 '42731',
 '41401',
 '5849',
 '25000',
 '2724',
 '51881',
 '5990',
 '53081',
 '2720',
 '2859',
 '2449',
 '486',
 '2851',
 '2762',
 '496',
 '99592',
 '5070',
 '0389']

In [33]:
# (rows, columns) remaining after the top-code filter
df.shape

(43992, 5)

In [34]:
# Preview the filtered frame; the ICD9 column now holds only top codes
df.head()

Unnamed: 0,HADM_ID,SUBJECT_ID,DATE,ICD9,TEXT
0,100001,58526,2117-09-17 00:00:00,5849,Admission Date: [**2117-9-11**] ...
1,100003,54610,2150-04-21 00:00:00,4019 2851,Admission Date: [**2150-4-17**] ...
2,100006,9895,2108-04-17 00:00:00,51881 486,Admission Date: [**2108-4-6**] Discharg...
3,100007,23018,2145-04-07 00:00:00,4019 486,Admission Date: [**2145-3-31**] ...
4,100009,533,2162-05-21 00:00:00,4019 41401 25000 2720 2859,Admission Date: [**2162-5-16**] ...


## "Splitting" the dataframe

## Vectorizing the text

Resources:   
- https://code.google.com/archive/p/word2vec/downloads # Cannot be downloaded   
- https://github.com/3Top/word2vec-api   
- https://keras.io/layers/embeddings/   
- https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html