In [33]:
# Import necessary libraries
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing  import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import optuna
np.random.seed(13)

warnings.filterwarnings("ignore")

In [34]:
# Import validated data
data = pd.read_csv("data/data_validated.csv")
medical_data = pd.read_csv("data/data_medic_validated.csv")

In [35]:
data.loc[data["feedback"].notnull()]

Unnamed: 0,username,comments,labels,feedback
34,BBG1308,High speed scanners make quick work of this.\n...,veterinarian,"should be ""other"" this person sounds like a ve..."
56,Averycooldood,"I hope this is not rude to VTs, but as a VA I’...",veterinarian,"should be ""other"" VA means vetinarian assistant"


In [36]:
# change reviewed columns
data.loc[data[data["feedback"].notnull()].index,"labels"] = "other"

In [37]:
data.loc[data["feedback"].notnull()]

Unnamed: 0,username,comments,labels,feedback
34,BBG1308,High speed scanners make quick work of this.\n...,other,"should be ""other"" this person sounds like a ve..."
56,Averycooldood,"I hope this is not rude to VTs, but as a VA I’...",other,"should be ""other"" VA means vetinarian assistant"


In [38]:
data["labels"].value_counts()

labels
other             240
veterinarian       56
medical doctor      4
Name: count, dtype: int64

In [39]:
# drop unnecessary samples with other labels
rows_to_drop = data.loc[data["labels"] == "other"].sample(200).index
data.drop(index=rows_to_drop, inplace=True)
data.reset_index(drop=True, inplace=True)

In [40]:
data.head()

Unnamed: 0,username,comments,labels,feedback
0,HiddenJindo,Hello! Veterinary medicine is an incredibly re...,veterinarian,
1,rotten-cheese-ball,I’ve taken a nationally administered exam befo...,other,
2,Im-just-guessing,"Removing dewclaws? No I agree with you, realis...",veterinarian,
3,bilbany12,I'm a dentist and I respect vets more than hum...,other,
4,robino358,My only concern with that would be if any inte...,veterinarian,


In [41]:
medical_data.head(10)

Unnamed: 0,username,comments,labels,feedback
0,No_Cellist_746,Hi! I am in my first year out of school and yo...,medical doctor,this is the correct label
1,raygunlock,This behavior is so unacceptable. Please don’t...,medical doctor,this is the correct label
2,heretoroastmk,Your salary feels on par for my classmates goi...,medical doctor,this is the correct label
3,ReindeerVarious3024,I’m a human doctor who has fostered many a dog...,medical doctor,this is the correct label
4,Kirsten,My sister is a veterinarian and I am a physici...,medical doctor,this is the correct label
5,Prestigious_Union_50,I'm not excusing it ...but it's often a strain...,medical doctor,this is the correct label
6,dedcrypticginger,It does include a LOT of on call cases/hours/w...,medical doctor,this is the correct label
7,Amythyst72,"Female, medical oncologist in CA. HCOL area. B...",medical doctor,this is the correct label
8,DrZefe,Best job in the world IMO. But it’s not for ev...,medical doctor,this is the correct label
9,unbuhhlievable,Do not let people gaslight you into thinking y...,medical doctor,"im unsure on this one. ""tech"" is usually for ""..."


In [42]:
# Drop last row because of uncertainty
medical_data.drop(index=[9],inplace=True)

In [43]:
medical_data.head(10)

Unnamed: 0,username,comments,labels,feedback
0,No_Cellist_746,Hi! I am in my first year out of school and yo...,medical doctor,this is the correct label
1,raygunlock,This behavior is so unacceptable. Please don’t...,medical doctor,this is the correct label
2,heretoroastmk,Your salary feels on par for my classmates goi...,medical doctor,this is the correct label
3,ReindeerVarious3024,I’m a human doctor who has fostered many a dog...,medical doctor,this is the correct label
4,Kirsten,My sister is a veterinarian and I am a physici...,medical doctor,this is the correct label
5,Prestigious_Union_50,I'm not excusing it ...but it's often a strain...,medical doctor,this is the correct label
6,dedcrypticginger,It does include a LOT of on call cases/hours/w...,medical doctor,this is the correct label
7,Amythyst72,"Female, medical oncologist in CA. HCOL area. B...",medical doctor,this is the correct label
8,DrZefe,Best job in the world IMO. But it’s not for ev...,medical doctor,this is the correct label


In [44]:
data.head()

Unnamed: 0,username,comments,labels,feedback
0,HiddenJindo,Hello! Veterinary medicine is an incredibly re...,veterinarian,
1,rotten-cheese-ball,I’ve taken a nationally administered exam befo...,other,
2,Im-just-guessing,"Removing dewclaws? No I agree with you, realis...",veterinarian,
3,bilbany12,I'm a dentist and I respect vets more than hum...,other,
4,robino358,My only concern with that would be if any inte...,veterinarian,


In [45]:
medical_data.head()

Unnamed: 0,username,comments,labels,feedback
0,No_Cellist_746,Hi! I am in my first year out of school and yo...,medical doctor,this is the correct label
1,raygunlock,This behavior is so unacceptable. Please don’t...,medical doctor,this is the correct label
2,heretoroastmk,Your salary feels on par for my classmates goi...,medical doctor,this is the correct label
3,ReindeerVarious3024,I’m a human doctor who has fostered many a dog...,medical doctor,this is the correct label
4,Kirsten,My sister is a veterinarian and I am a physici...,medical doctor,this is the correct label


In [46]:
# Drop unnecessary columns
columns_to_drop = ["feedback","username"]

data.drop(columns=columns_to_drop,inplace=True)
medical_data.drop(columns=columns_to_drop,inplace=True)

In [47]:
# Merge data with sourced medical data
data = pd.concat([data,medical_data],ignore_index=True)

In [48]:
# Saved training data
data.to_csv("data/training_data.csv",index=False)

In [49]:
data["labels"].value_counts()

labels
veterinarian      56
other             40
medical doctor    13
Name: count, dtype: int64

In [50]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data["labels"])

In [51]:
bow_encoder = CountVectorizer(
    stop_words="english", dtype="float32"
)
X = bow_encoder.fit_transform(data["comments"])

In [53]:
# Get quick baselines for the models
model = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(random_state=42),
    "LGBM": LGBMClassifier(random_state=42,verbose= -1),
    "XGB": XGBClassifier(random_state=42),
}
loocv = LeaveOneOut()
for name, clf in model.items():
    scores = cross_val_score(clf,X,y,cv=loocv,scoring="accuracy")
    print(name)
    print(f"LOOCV Accuracy: {scores.mean()}")
    print("--------")

LR
LOOCV Accuracy: 0.6788990825688074
--------
DT
LOOCV Accuracy: 0.7798165137614679
--------
RF
LOOCV Accuracy: 0.6330275229357798
--------
LGBM
LOOCV Accuracy: 0.6330275229357798
--------
XGB
LOOCV Accuracy: 0.7339449541284404
--------


In [54]:
# Finetune best performing model, Decision Tree with optuna
def decision_tree_trial(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 5),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy","log_loss"])
    }
    clf = DecisionTreeClassifier(**params)
    score = cross_val_score(clf,X,y,cv=loocv,scoring="accuracy").mean()
    return score

study = optuna.create_study(direction="maximize")
study.optimize(decision_tree_trial, n_trials=100)

[I 2024-05-16 22:51:13,534] A new study created in memory with name: no-name-6774e716-2ac3-46a6-ab3e-10b46c3c1c1a
[I 2024-05-16 22:51:13,780] Trial 0 finished with value: 0.6880733944954128 and parameters: {'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 4, 'criterion': 'entropy'}. Best is trial 0 with value: 0.6880733944954128.
[I 2024-05-16 22:51:14,101] Trial 1 finished with value: 0.7614678899082569 and parameters: {'max_depth': 8, 'min_samples_split': 8, 'min_samples_leaf': 3, 'criterion': 'log_loss'}. Best is trial 1 with value: 0.7614678899082569.
[I 2024-05-16 22:51:14,307] Trial 2 finished with value: 0.6422018348623854 and parameters: {'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 3, 'criterion': 'gini'}. Best is trial 1 with value: 0.7614678899082569.
[I 2024-05-16 22:51:14,645] Trial 3 finished with value: 0.7247706422018348 and parameters: {'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}. Best is trial 1 with

In [56]:
# Get best performing parameters
trial = study.best_trial

print(f"Best Parameters: {trial.params}")
print(f"Best Score: {trial.value}")

Best Parameters: {'max_depth': 11, 'min_samples_split': 10, 'min_samples_leaf': 1, 'criterion': 'log_loss'}
Best Score: 0.8440366972477065
