In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

#### Import the data
df = pd.read_csv('../data/processed/diag_translated.csv')

# Class balance before filtering. Printed explicitly: a bare expression in the
# middle of a cell is evaluated but never displayed.
print(df['diag_simple'].value_counts())

# Remove CFTD and unclear diagnosis: CFTD is folded into UNCLEAR, then every
# UNCLEAR row is dropped.
df['diag_simple'] = df['diag_simple'].replace('CFTD', 'UNCLEAR')
df = df[df['diag_simple'] != 'UNCLEAR']

# Labels are extracted only AFTER filtering so Y always has exactly one entry
# per remaining row of df (taking them before the filter left a stale,
# longer array around). Features are built from this filtered frame as well.
Y = df['diag_simple'].values

# Shared splitter so every model below is scored on the same 10 stratified folds.
cv_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Class balance after filtering (displayed as the cell output).
df['diag_simple'].value_counts()

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instruction prepended to every text before encoding.
CLASSIFICATION_INSTRUCTION = "Represent the medicine document for classification: "

# `embed_documents` prefixes texts with `embed_instruction`; `query_instruction`
# is only used by `embed_query`. Setting only `query_instruction` therefore left
# the documents embedded with the library's default instruction. Both are set
# so document and query embeddings stay consistent.
embeddings = HuggingFaceInstructEmbeddings(
    embed_instruction=CLASSIFICATION_INSTRUCTION,
    query_instruction=CLASSIFICATION_INSTRUCTION,
)

# One embedding per report, in the row order of the filtered frame.
text_reports = df["raw_text"].to_list()
embeddings_results = embeddings.embed_documents(text_reports)
X_instructor = np.array(embeddings_results)

# Labels come from the same frame as the texts so rows stay aligned.
Y = df['diag_simple'].values
assert X_instructor.shape[0] == Y.shape[0], "features and labels must have the same number of rows"

print(X_instructor.shape)
print(Y.shape)

In [None]:
# Baseline: a DummyClassifier with strategy='prior', scored with the same CV
# splitter as the real models so the numbers are directly comparable.
clf_dummy = DummyClassifier(strategy='prior')
cv_scores_dummy = cross_val_score(estimator=clf_dummy, X=X_instructor, y=Y, cv=cv_fold)

# Per-fold scores followed by their mean and standard deviation.
print(
    "Dummy Classifier Results:",
    f"All CV Scores: {cv_scores_dummy}",
    f"Mean CV Score:  {cv_scores_dummy.mean()}",
    f"Standard Deviation CV Score: {cv_scores_dummy.std()}",
    sep="\n",
)

In [None]:
# Logistic regression on the Instructor embeddings, scored on the shared CV folds.
# max_iter is raised well above the default to give the solver room to converge.
clf = LogisticRegression(max_iter=3000)
cv_scores = cross_val_score(clf, X_instructor, Y, cv=cv_fold)

# The header names the embedding actually used (X_instructor); it previously
# said "OpenAI", which mislabelled these results.
print("Results with Logistic Regression and Instructor Embeddings on English Translated Reports:")
print(f"All CV Scores: {cv_scores}")
print(f"Mean CV Score:  {np.mean(cv_scores)}")
print(f"Standard Deviation CV Score: {np.std(cv_scores)}")

In [None]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

class DummyEstimator(BaseEstimator):
    """Placeholder estimator for the 'clf' pipeline step.

    It learns nothing; it only exists so a Pipeline can be constructed before
    a search space swaps real classifiers into the step. The methods accept
    the usual (X, y) arguments so that calling the pipeline directly does not
    raise a TypeError.
    """

    def fit(self, X=None, y=None):
        """No-op fit; returns self, following the scikit-learn convention."""
        return self

    def score(self, X=None, y=None):
        """No-op score; returns None because the placeholder has no model."""
        return None


# Create a pipeline with a single swappable 'clf' step.
pipe = Pipeline([('clf', DummyEstimator())]) # Placeholder Estimator

# Candidate learning algorithms and their hyperparameters.
# Every estimator that uses randomness is given a fixed random_state so the
# comparison is reproducible from run to run (the CV splitter is already seeded).
search_space = [{'clf': [LogisticRegression()],
                 'clf__max_iter': [1500]},
                {'clf': [GaussianNB()],},
                {'clf': [MLPClassifier(random_state=42)],
                 'clf__max_iter': [500]},
                {'clf': [KNeighborsClassifier()],},
                {'clf': [SVC()],},
                {'clf': [GaussianProcessClassifier(random_state=42)],},
                {'clf': [HistGradientBoostingClassifier(random_state=42)],},
                {'clf': [DecisionTreeClassifier(random_state=42)],},
                {'clf': [RandomForestClassifier(random_state=42)],},
                {'clf': [AdaBoostClassifier(random_state=42)],},
                ]

# Create grid search: each candidate is scored by accuracy on the shared folds.
gs = GridSearchCV(pipe, search_space, scoring="accuracy", cv=cv_fold)
gs.fit(X_instructor, Y)

# One row per candidate with per-fold and aggregate scores.
df_cv_search = pd.DataFrame(gs.cv_results_)
# df_cv_search.to_csv('data/nlmyo/processed/report_translated_embed_cohere_gridsearch.csv')
df_cv_search

In [None]:
# Let's choose and optimize a random forest
# Exhaustive search over tree count, tree depth and the split/leaf size limits.
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Seeded forest, tuned for accuracy on the shared CV folds.
cls = RandomForestClassifier(random_state=42)
gs_rf = GridSearchCV(estimator=cls, param_grid=param_grid, scoring="accuracy", cv=cv_fold)
gs_rf.fit(X_instructor, Y)

# Full search results, one row per parameter combination.
df_cv_search_rf = pd.DataFrame(gs_rf.cv_results_)
df_cv_search_rf

In [None]:
# Print the best parameters and score
print("Best parameters:", gs_rf.best_params_)
print("Best score:", gs_rf.best_score_)

# Rebuild the winning configuration with the same seed and fit it on all data.
best_rf = RandomForestClassifier(**gs_rf.best_params_, random_state=42)
best_rf.fit(X_instructor, Y)

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

# Use cross_val_predict to get out-of-fold predicted labels and probabilities.
y_pred = cross_val_predict(best_rf, X_instructor, Y, cv=cv_fold)
y_probas = cross_val_predict(best_rf, X_instructor, Y, cv=cv_fold, method='predict_proba')

# Compute classification report.
# target_names is deliberately NOT passed: it is matched positionally against
# the sorted labels, and set(Y) has arbitrary order, which attached metrics to
# the wrong class names. Without it the report is keyed by the actual labels.
report = classification_report(Y, y_pred, output_dict=True)


In [None]:
# Save the trained model to disk & WandB
import joblib
from pathlib import Path

MODEL_PATH = '../models/instructor_model.joblib'
# Create the target directory if needed so the dump does not fail on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_rf, MODEL_PATH)

# experiment tracking
import wandb

run = wandb.init(project='myo-text-classify',
                 config={"embedding": "instructor", "doc_lang": "fr"})
best_estimator = gs_rf.best_estimator_
config = wandb.config
best_params = gs_rf.best_params_
best_score = gs_rf.best_score_
# Standard deviation of the fold scores for the winning parameter combination.
best_std = gs_rf.cv_results_['std_test_score'][gs_rf.best_index_]


wandb.log({'Classification Report': report,
           'Best Params': best_params,
           'Best Score': best_score,
           'CV Std Devs': best_std},)

# Class names in the estimator's own (sorted) order. list(set(Y)) has arbitrary
# order, which can differ between runs and mislabel the plots.
class_labels = best_estimator.classes_.tolist()

# NOTE(review): the same matrix is passed as both train and test data here,
# while y_pred / y_probas are the out-of-fold predictions — confirm this is
# the intended input for the wandb plots.
wandb.sklearn.plot_classifier(best_estimator, X_instructor, X_instructor, Y, Y, y_pred, y_probas, labels=class_labels,
                                                         model_name='instructor_model', feature_names=None)
# Create artifact for best model
model_artifact = wandb.Artifact('instructor_model', type='model')
# Add the serialized model file to the artifact
model_artifact.add_file(MODEL_PATH)
# Log artifact to WandB
wandb.run.log_artifact(model_artifact)
wandb.finish()