In [24]:
import pandas as pd
import numpy as np
import os
import pickle

from string import digits, punctuation
import re
import spacy

import xml.etree.ElementTree as ET

from nltk.corpus import stopwords 
from nltk.corpus import wordnet 
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

In [25]:
# Write tokenizer as a class
class Tokenization(object):
    """tokenize each word and for each tocken:
    1. excludes digits and punctuations, 
    2. excludes token that has '-year', 
    3. excludes stop words,
    4. makes the token stemmed,
    and return a string with all tokens separated by a space"""
    from string import digits, punctuation
    import spacy
    from nltk.tokenize import word_tokenize
    #import en_core_web_sm
    import dill
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    from nltk.stem import PorterStemmer

    def __init__(self, #nlp=word_tokenize()#spacy.load('en_core_web_sm'), 
                 stop=list(ENGLISH_STOP_WORDS)+['year', 'month', 'old'], 
                 stemmer=PorterStemmer()):
        #self.nlp = nlp
        self.stop = stop
        self.stemmer = stemmer

    def fit(self, x, y=None):
        return self
        
    def transform(self, x):
        def words2tokens(sentence, stop=self.stop, stemmer=self.stemmer):
            l = []    
            tokens = word_tokenize(sentence)
            for token in tokens:                
                t = token.lower()
                if (t in punctuation) or t.isdigit():
                    continue
                if ('-year' in t) or ('-month' in t) or ('-day' in t):
                    continue
                if ('/' in t) or ('=' in t) or (t in stop):
                    continue
                l.append(stemmer.stem(t))
            return ' '.join(l)

        series = x.apply(lambda s: words2tokens(s, stop=self.stop, stemmer=self.stemmer))    
        return series

In [26]:
# Write matrix converter as a class
class MatrixConverter(object):
    """Converter a tfidf matrix to a df"""
    def __init__(self):
        pass
    def fit(self, x, y=None):
        return self    
    def transform(self, x):
        return pd.DataFrame(x.todense())

In [46]:
X_test = pd.read_csv('X_test.csv', header=None, squeeze=True)
#X_test = X_test.iloc[:,0]
Y_test = pd.read_csv('Y_test.csv')

In [47]:
X_test.head()

0    Urinary tract infection in a 2-year, 8-month -...
1    Hydronephrosis. Followup. Interval growth in b...
2              Recurrent UTI. Normal renal ultrasound.
3    3-year - old girl with urinary tract infection...
4    Cough for one week. Lungs clear and heart normal.
Name: 0, dtype: object

In [48]:
Y_test.head()

Unnamed: 0,ICD9_785.6,ICD9_599.0,ICD9_789.00,ICD9_V67.09,ICD9_462,ICD9_786.50,ICD9_593.5,ICD9_V13.09,ICD9_788.41,ICD9_787.03,...,ICD9_596.54,ICD9_753.21,ICD9_789.09,ICD9_786.59,ICD9_795.5,ICD9_783.0,ICD9_753.3,ICD9_596.8,ICD9_486,ICD9_780.6
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
model = pickle.load(open("model_pipe.pkl", "rb"))
predict_test = model.predict(X_test)
classification = classification_report(Y_test, predict_test)

print('test micro f1 score:', f1_score(Y_test, predict_test, average='micro'))
print('test classification report:')
print(classification)

test micro f1 score: 0.732177263969
test classification report:
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.48      0.82      0.61        17
          2       0.00      0.00      0.00         1
          3       0.00      0.00      0.00         0
          4       0.00      0.00      0.00         1
          5       0.86      0.86      0.86         7
          6       0.00      0.00      0.00         3
          7       0.00      0.00      0.00         1
          8       0.00      0.00      0.00         0
          9       0.00      0.00      0.00         2
         10       0.00      0.00      0.00         3
         11       0.00      0.00      0.00         0
         12       0.78      0.78      0.78         9
         13       0.00      0.00      0.00         0
         14       1.00      1.00      1.00         1
         15       0.86      0.86      0.86         7
         16       0.00      0.00  

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [51]:
sample = pd.Series(X_test[0])
sample

0    Urinary tract infection in a 2-year, 8-month -...
dtype: object

In [53]:
icd_codes = pickle.load(open("icd_codes.pkl", "rb"))
icd_codes
#model.predict(sample)

['785.6',
 '599.0',
 '789.00',
 'V67.09',
 '462',
 '786.50',
 '593.5',
 'V13.09',
 '788.41',
 '787.03',
 '277.00',
 '511.9',
 '786.07',
 '753.0',
 '518.0',
 '788.30',
 '791.0',
 '593.1',
 '591',
 '759.89',
 '758.6',
 '592.0',
 '786.05',
 '279.12',
 '786.2',
 '741.90',
 '786.09',
 '593.70',
 'V13.02',
 '599.7',
 'V42.0',
 '079.99',
 '493.90',
 'V72.5',
 '593.89',
 '596.54',
 '753.21',
 '789.09',
 '786.59',
 '795.5',
 '783.0',
 '753.3',
 '596.8',
 '486',
 '780.6']

In [61]:
results = model.predict(sample)
[[icd_codes[i] for i, r in enumerate(result) if r == 1] for result in results]

[['599.0']]