https://www.kaggle.com/code/danielcwq/predict-medical-specialty-fastai

## Data Preprocessing and Cleaning

In [63]:
import pandas as pd

# Load data and remove null rows
mtsamples = pd.read_csv('mtsamples.csv',index_col=0)
mtsamples = mtsamples[mtsamples['transcription'].notnull()]
mtsamples = mtsamples[mtsamples['medical_specialty'].notnull()]

mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' SOAP / Chart / Progress Notes'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Office Notes'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' IME-QME-Work Comp etc.'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Discharge Summary'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Emergency Room Reports'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Letters'].index)

mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Autopsy'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Consult - History and Phy.'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' General Medicine'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Lab Medicine - Pathology'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Pain Management'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Physical Medicine - Rehab'].index)
mtsamples = mtsamples.drop(mtsamples[mtsamples['medical_specialty']==' Hospice - Palliative Care'].index)

# Creating lists of labels and transcriptions
speciality = list(mtsamples['medical_specialty'])
transcriptions = list(mtsamples['transcription'])

In [64]:
# pre process transcriptions
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim.parsing.preprocessing import remove_stopwords

snow_stemmer = SnowballStemmer(language='english')
def snow_stem(trx):
    stem = [snow_stemmer.stem(word) for word in trx]
    return stem

# Text preprocessing and cleaning
trx_cleaned = []
for i in range(len(transcriptions)):
    trx = transcriptions[i]
    trx = trx.lower()
    trx = trx.replace("."," ")
    trx = trx.replace(","," ")
    trx = trx.replace(":","")
    trx = trx.replace("-"," ")
    trx = re.sub(r'[0-9]+','',trx)
    trx = trx.replace("  "," ")
    trx = trx.replace("  "," ")
    
    trx = word_tokenize(trx)
    trx = snow_stem(trx)
    trx = ' '.join([t for t in trx])
    trx = remove_stopwords(trx)
    
    trx_cleaned.append(trx)

## Featurization and Train-Test Split

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Generating features using TF-IDF 
X = trx_cleaned
Y = speciality
tfidf_vec = TfidfVectorizer(stop_words='english')
X = tfidf_vec.fit_transform(X)

# Splitting data into train and test sets
X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,train_size=0.666,random_state=500)

## Random Forest

In [48]:
#### Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

# Initializing and fitting classifier with train data
classifier = RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=42)
classifier.fit(X_train,Y_train)

# Predicting the test set results
y_pred = classifier.predict(X_validation)

# Performance Evaluation
y_val = np.array(Y_validation)
print(classification_report(y_val,y_pred))

                             precision    recall  f1-score   support

       Allergy / Immunology       0.00      0.00      0.00         0
                    Autopsy       1.00      0.25      0.40         4
                 Bariatrics       0.00      0.00      0.00        10
 Cardiovascular / Pulmonary       0.13      0.11      0.12       119
               Chiropractic       0.00      0.00      0.00         7
 Consult - History and Phy.       0.18      0.32      0.23       176
 Cosmetic / Plastic Surgery       0.00      0.00      0.00         7
                  Dentistry       0.00      0.00      0.00        10
                Dermatology       0.00      0.00      0.00        11
       Diets and Nutritions       0.00      0.00      0.00         5
       ENT - Otolaryngology       0.00      0.00      0.00        25
              Endocrinology       0.00      0.00      0.00         7
           Gastroenterology       0.07      0.06      0.06        67
           General Medicine      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression

In [71]:
### Logistic Regression
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

X_train,X_validation,Y_train,Y_validation = train_test_split(X,Y,test_size=0.4,random_state=500,shuffle=True)

model = LogisticRegression(multi_class='multinomial',solver='lbfgs',class_weight='balanced',random_state=500)
model.fit(X_train,Y_train)
ypred = model.predict(X_validation)

y = np.array(Y_validation)
print(classification_report(y,ypred))

                             precision    recall  f1-score   support

       Allergy / Immunology       0.00      0.00      0.00         4
                 Bariatrics       0.56      0.62      0.59         8
 Cardiovascular / Pulmonary       0.53      0.66      0.59       148
               Chiropractic       0.27      0.57      0.36         7
 Cosmetic / Plastic Surgery       0.22      0.50      0.31         8
                  Dentistry       0.57      0.89      0.70         9
                Dermatology       0.40      0.75      0.52         8
       Diets and Nutritions       1.00      1.00      1.00         5
       ENT - Otolaryngology       0.51      0.80      0.62        41
              Endocrinology       0.36      0.71      0.48         7
           Gastroenterology       0.47      0.65      0.54        99
      Hematology - Oncology       0.26      0.37      0.30        38
                 Nephrology       0.43      0.74      0.55        39
                  Neurology      

## SVM

In [72]:
### SVM
X = trx_cleaned
Y = speciality
tfidf_vec = TfidfVectorizer(stop_words='english')
X = tfidf_vec.fit_transform(X)
X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.2,random_state=500)

from sklearn.svm import SVC
classifier = SVC(kernel='rbf',random_state=42)
classifier.fit(X_train,Y_train)
ypred = classifier.predict(X_validation)

from sklearn.metrics import classification_report
y = np.array(Y_validation)
print(classification_report(y,ypred))

                             precision    recall  f1-score   support

       Allergy / Immunology       0.00      0.00      0.00         1
                 Bariatrics       0.00      0.00      0.00         5
 Cardiovascular / Pulmonary       0.27      0.38      0.31        66
               Chiropractic       0.00      0.00      0.00         2
 Cosmetic / Plastic Surgery       0.00      0.00      0.00         4
                  Dentistry       1.00      0.17      0.29         6
                Dermatology       0.00      0.00      0.00         6
       ENT - Otolaryngology       0.25      0.21      0.23        19
              Endocrinology       0.00      0.00      0.00         4
           Gastroenterology       0.20      0.17      0.18        47
      Hematology - Oncology       0.33      0.08      0.13        25
                 Nephrology       0.50      0.09      0.15        23
                  Neurology       0.40      0.39      0.40        46
               Neurosurgery      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
