In [1]:
import json
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from gensim.parsing import preprocessing
from gensim.parsing.preprocessing import strip_tags, strip_punctuation,strip_numeric,remove_stopwords, stem_text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import itertools
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from string import ascii_uppercase

import gensim
import logging

import re

In [2]:
def remove_special_char(txt):
    """Delete every character of ``txt`` that is not on the allow-list.

    Kept characters: ASCII letters, digits, the space, and the punctuation
    marks ``: , _ / ; .``. Everything else (including newlines) is removed.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9 :,_/;.]')
    return disallowed.sub(r'', txt)

In [3]:
def remove_empty(df_filter, filter_name):
    """Return a new frame without the rows that have no ``filter_name`` value.

    Missing values in *every* column are first replaced by the string
    ``'N/A'``; rows whose ``filter_name`` entry equals ``'N/A'`` are then
    dropped. The frame passed in is left untouched.
    """
    filled = df_filter.copy().fillna('N/A')
    has_value = filled[filter_name] != 'N/A'
    return filled[has_value]

In [4]:
def remove_unreadable(txt):
    """Swap each underscore-delimited alphanumeric token for a newline.

    A token is ``_``, one or more ASCII letters/digits, then ``_``.
    """
    placeholder = re.compile(r'_[a-zA-Z0-9]+_')
    return placeholder.sub(r'\n', txt)

In [5]:
def format_str(txt):
    """Normalise whitespace in ``txt`` and mark paragraph breaks with ``;``.

    Steps, in order:
      1. carriage returns become spaces and the ends are trimmed;
      2. a run of two or more line breaks (with any whitespace around them)
         becomes a single ``;``;
      3. any remaining line break becomes a space;
      4. every run of whitespace collapses to one space and the ends are
         trimmed again.

    Parameters
    ----------
    txt : str
        Raw text.

    Returns
    -------
    str
        The single-line, whitespace-normalised text.
    """
    return_val = txt.replace('\r', ' ').strip()
    return_val = re.sub(r'(\s*\n\s*){2,}', r';', return_val)
    # This step used str.replace with the regex text '(\n)+', which only
    # matched those literal characters; a real regex substitution is intended.
    return_val = re.sub(r'\n+', ' ', return_val)
    return_val = re.sub(r'(\s)+', r' ', return_val)
    return return_val.strip()

In [6]:
def preprocess_radi_txt(df_filter, filter_str):
    """Clean the free-text column ``filter_str`` of ``df_filter``.

    Rows without a value in that column are dropped first. The text is then
    passed through ``remove_unreadable``, ``format_str`` and
    ``remove_special_char`` in that order, and ``remove_empty`` is applied
    again after each step. A new frame is returned; the input is not modified.
    """
    cleaned = remove_empty(df_filter.copy(), filter_str)

    cleaning_steps = (
        remove_unreadable,    # placeholder tokens -> line breaks
        format_str,           # whitespace / paragraph-break normalisation
        remove_special_char,  # drop characters outside the allow-list
    )
    for step in cleaning_steps:
        cleaned[filter_str] = cleaned[filter_str].apply(step)
        cleaned = remove_empty(cleaned, filter_str)

    return cleaned

In [7]:
def read_df_fr_path(file_path):
    """Load the Excel workbook at ``file_path`` and clean its report text.

    The 'Radiology text' column is run through ``preprocess_radi_txt``, which
    also drops rows that have no text; the cleaned frame is returned.
    """
    raw_reports = pd.read_excel(file_path)
    return preprocess_radi_txt(raw_reports, 'Radiology text')

In [8]:
def clean_txt(txt):
    """Normalise ``txt`` into a space-separated string of stemmed tokens.

    The text is lower-cased, stripped of markup tags and punctuation, has
    stop words removed and is stemmed (gensim preprocessing filters).

    Parameters
    ----------
    txt : str
        Text to normalise.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``'N/A'`` when no
        token survives. A string (rather than a token list) is returned
        because the column is later fed to ``CountVectorizer``, which expects
        one string per document.
    """
    # Lower-casing is the first filter, so the input needs no separate
    # ``.lower()`` call before preprocessing.
    filters = [lambda x: x.lower(), strip_tags, strip_punctuation, remove_stopwords, stem_text]
    words = preprocessing.preprocess_string(txt, filters)
    if not words:
        return 'N/A'
    return ' '.join(words)

In [9]:
def cpt_ext(txt):
    """Extract the procedure / exam description from a formatted report.

    The report is split into ``;``-separated sections and searched in two
    passes:

    1. Labelled field: the first section containing a ``:``-separated field
       equal to ``procedure`` or ``exam`` (case-insensitive, surrounding
       whitespace ignored) wins, and the field that follows the label is
       returned.
    2. ``REPORT`` header: for a section whose first space-separated word is
       ``REPORT``, the rest of that section is used, or the next section when
       the rest is empty. If several sections qualify, the last one wins.

    Parameters
    ----------
    txt : str
        Report text, normally already cleaned by ``preprocess_radi_txt``.

    Returns
    -------
    str
        The stripped description; ``''`` when neither pass finds anything;
        ``'N/A'`` when ``txt`` is not a string, when a label has no value
        after it, or when an empty ``REPORT`` header is the last section.
    """
    if not isinstance(txt, str):
        return 'N/A'

    sections = txt.split(';')

    # Pass 1: "Procedure: ..." / "Exam: ..." labelled fields.
    for section in sections:
        fields = section.split(':')
        for pos, field in enumerate(fields):
            if field.strip().lower() in ('procedure', 'exam'):
                if pos + 1 >= len(fields):
                    # Label without a value.
                    return 'N/A'
                # Take the field after the label, wherever the label sits.
                return fields[pos + 1].strip()

    # Pass 2: "REPORT <description>" or "REPORT; <description>".
    new_txt = ""
    for idx, section in enumerate(sections):
        tokens = section.strip().split(" ")
        if tokens[0].strip() != 'REPORT':
            continue
        new_txt = " ".join(tokens[1:]).strip()
        if new_txt == "":
            if idx + 1 >= len(sections):
                # Header with nothing after it anywhere in the report.
                return 'N/A'
            new_txt = sections[idx + 1].strip()
    return new_txt

In [10]:
# Build and clean the CPT_text column.
def df_clean_CPT(df_filter):
    """Derive a cleaned 'CPT_text' column from 'Radiology text'.

    ``cpt_ext`` is applied to every report and rows it marks as 'N/A'
    (extraction failed) are dropped; the extracted text is then normalised
    with ``clean_txt`` and rows it marks as 'N/A' are dropped as well.
    The input frame is not modified.
    """
    extracted = df_filter.assign(CPT_text=df_filter['Radiology text'].apply(cpt_ext))
    extracted = remove_empty(extracted, 'CPT_text')

    normalised = extracted.assign(CPT_text=extracted['CPT_text'].apply(clean_txt))
    return remove_empty(normalised, 'CPT_text')

In [11]:
def load_data(filepath):
    """Read the workbook at ``filepath`` and return the fully cleaned frame.

    Combines ``read_df_fr_path`` (load + report-text cleaning) with
    ``df_clean_CPT`` (CPT_text extraction and cleaning).
    """
    reports = read_df_fr_path(filepath)
    return df_clean_CPT(reports)

In [28]:
# Load the raw test workbook; the following cells step through the cleaning
# functions on it one at a time.
data_df = pd.read_excel('../../data/test.xlsx')

In [29]:
# Clean the 'Radiology text' column (rows without text are dropped).
radiology_text = preprocess_radi_txt(data_df, 'Radiology text')

In [30]:
# Extract the procedure/exam description from each cleaned report.
radiology_text['CPT_text'] = radiology_text['Radiology text'].apply(cpt_ext)

In [31]:
# Normalise the extracted text with clean_txt, then drop the rows it marked
# as 'N/A' (no usable tokens).
radiology_text['CPT_text'] = radiology_text['CPT_text'].apply(clean_txt)
radiology_text = remove_empty(radiology_text, 'CPT_text')

In [32]:
# Same filter as the last line of the previous cell; when run right after it,
# this second pass has nothing left to drop.
radiology_text = remove_empty(radiology_text, 'CPT_text')

In [33]:
# Collect the cleaned report text and its extracted CPT text under new
# column names (a plain dict at this point).
new_df = {'radiology_text':radiology_text['Radiology text'], 'cpt_ext':radiology_text['CPT_text']}

In [34]:
# Turn the column mapping into a DataFrame (rebinds new_df).
new_df = pd.DataFrame(new_df)

In [35]:
# Export next to the other project data through a relative path so the
# notebook runs on any checkout rather than on one user's machine only.
new_df.to_csv("../../data/renew_text.csv")

In [44]:
# Sample tokens (some containing digits) for the regex experiment below.
words = ['2atoha', 's1so9', 'lkoc', 'proin']

In [45]:
# NOTE(review): nothing in the visible cells appends to this list.
new_words = []

In [48]:
# Experiment: blank out digit-containing runs in each sample word and report
# which words are reduced to a single space.
for word in words:
    word_val = word
    # Pass 1: digits followed by optional letters -> one space.
    word_val = re.sub(r'(\d+[a-zA-Z]*)',r' ', word_val)
    # Pass 2: optional letters, digits, optional letters -> one space.
    word_val = re.sub(r'([a-zA-Z]*\d+[a-zA-Z]*)',r' ', word_val)
    # Pass 3: optional letters followed by digits -> one space.
    word_val = re.sub(r'([a-zA-Z]*\d+)',r' ', word_val)
    # Only a result of exactly one space counts as "empty".
    if word_val ==' ':
        print("empty")
    # The original word is printed, not the substituted value.
    print(word)

empty
2atoha
s1so9
lkoc
proin


In [13]:
# Load and clean both splits with the same pipeline (read -> report-text
# cleaning -> CPT_text extraction and cleaning).
train_df = load_data('../../data/train.xlsx')
test_df = load_data('../../data/test.xlsx')

In [14]:
# Shared feature extractors: uni-/bi-gram counts (English stop words removed,
# at most 400 features) re-weighted by TF-IDF.
vect = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=400)
tfidf = TfidfTransformer()


def X_vectorization(df, fit=None):
    """Turn an iterable of documents into a TF-IDF feature matrix.

    Parameters
    ----------
    df : iterable of str
        Documents to vectorise (e.g. a text column of a DataFrame).
    fit : bool or None, default None
        ``True``  - learn the vocabulary and IDF weights from ``df``, then
        transform it.
        ``False`` - only transform ``df`` with the already-fitted extractors.
        ``None``  - fit if ``vect`` has not been fitted yet, otherwise only
        transform. The first call (training data) therefore defines the
        feature space and later calls (test data) re-use it, instead of
        refitting on every input and producing incomparable columns.
        Re-run this cell, or pass ``fit=True``, to refit.

    Returns
    -------
    scipy.sparse matrix
        TF-IDF weighted document-term matrix.
    """
    if fit is None:
        # A fitted CountVectorizer exposes its learned ``vocabulary_``.
        fit = not hasattr(vect, 'vocabulary_')
    if fit:
        counts = vect.fit_transform(df)
        return tfidf.fit_transform(counts)
    counts = vect.transform(df)
    return tfidf.transform(counts)

In [15]:
# Learn the vocabulary and IDF weights from the training split only.
X_train = X_vectorization(train_df['CPT_text'])
y_train = train_df['cpt_label']
# Transform (never refit) the test split with the fitted extractors so that
# its columns match the training features and no test information leaks in.
X_test = tfidf.transform(vect.transform(test_df['CPT_text']))
y_test = test_df['cpt_label']

In [16]:
def log_reg():
    """Fit a class-balanced logistic regression and print its test report.

    Uses the notebook globals ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for the classification report.
    """
    print(" ")
    print("logistic regression")
    clf = LogisticRegression(class_weight='balanced', solver='newton-cg', random_state=0)
    pred = clf.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, pred))

In [None]:
def ran_for():
    """Fit a class-balanced random forest and print its test report.

    Uses the notebook globals ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for the classification report.
    """
    print(" ")
    print("random forest")
    forest = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=0)
    pred = forest.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, pred))

In [None]:
def xgboost_test():
    """Fit an XGBoost classifier and print its test report.

    Uses the notebook globals ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for the classification report.
    """
    print(" ")
    print("xgboost")
    booster = XGBClassifier(max_depth=3, n_estimators=100, n_jobs=-1, random_state=0)
    pred = booster.fit(X_train, y_train).predict(X_test)
    print(classification_report(y_test, pred))


In [None]:
# Train each of the three classifiers in turn and print its report.
log_reg()
ran_for()
xgboost_test()