In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

#### Import the data
df = pd.read_csv('../data/text_dataset.csv')

# Class balance before filtering. Printed explicitly because a notebook only
# auto-displays the last expression of a cell.
print(df['diag'].value_counts())

# Drop the rows with unclear diagnosis.
# NOTE(review): the original comments also mentioned removing "CFTD" and
# filtering an X array by the df index; only 'UNCLEAR' rows are dropped here
# and no X array exists yet at this point — confirm nothing else is expected.
df = df[df['diag'] != 'UNCLEAR']

# Extract the labels only after filtering so Y stays row-aligned with df
# (taking it before the filter left it holding the dropped rows as well).
Y = df['diag'].values

# Shared, seeded splitter reused by every cross-validation call below.
cv_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Class balance after filtering (displayed as the cell output).
df['diag'].value_counts()

In [None]:
from langchain.embeddings import HuggingFaceInstructEmbeddings
# Task instruction prepended to every text by the INSTRUCTOR embedding model.
classification_instruction = "Represent the medicine document for classification: "

# embed_documents() builds its inputs from `embed_instruction`;
# `query_instruction` is only used by embed_query(). Setting only
# `query_instruction` left the documents embedded with the library's default
# instruction, so both are set here.
embeddings = HuggingFaceInstructEmbeddings(
    embed_instruction=classification_instruction,
    query_instruction=classification_instruction,
)

# Embed every report text and stack the vectors into a 2-D feature matrix.
text_reports = df["text"].to_list()
embeddings_results = embeddings.embed_documents(text_reports)
X_instructor = np.array(embeddings_results)

# Labels taken from the same (filtered) frame as the texts.
Y = df['diag'].values

# Guard against features and labels drifting out of alignment.
assert X_instructor.shape[0] == Y.shape[0], "X_instructor and Y must have one row per report"

print(X_instructor.shape)
print(Y.shape)

In [None]:
# Baseline: a classifier using the 'prior' strategy, scored with the shared
# CV splitter so that later models have a reference point.
clf_dummy = DummyClassifier(strategy='prior')
cv_scores_dummy = cross_val_score(clf_dummy, X_instructor, Y, cv=cv_fold)

# Report per-fold scores together with their mean and spread.
print(
    "Dummy Classifier Results:",
    f"All CV Scores: {cv_scores_dummy}",
    f"Mean CV Score:  {np.mean(cv_scores_dummy)}",
    f"Standard Deviation CV Score: {np.std(cv_scores_dummy)}",
    sep="\n",
)

In [None]:
from sklearn.neural_network import MLPClassifier
# Quick cross-validated check of an MLP on the instructor embeddings.
# Seeded (like the tuned MLP later in the notebook) so reruns give the same
# scores.
clf = MLPClassifier(max_iter=1000, random_state=42)
cv_scores = cross_val_score(clf, X_instructor, Y, cv=cv_fold)

# The header names the model actually evaluated (it previously said
# "Logistic Regression" although an MLPClassifier is fitted).
print("Results with MLPClassifier and Instructor Embeddings in French Reports:")
print(f"All CV Scores: {cv_scores}")
print(f"Mean CV Score:  {np.mean(cv_scores)}")
print(f"Standard Deviation CV Score: {np.std(cv_scores)}")

In [None]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

class DummyEstimator(BaseEstimator):
    """Placeholder estimator used only to give the pipeline a 'clf' step.

    The grid search swaps this step for a real classifier, so the methods do
    nothing. They accept the usual ``(X, y)`` arguments so that fitting or
    scoring the placeholder does not fail with a ``TypeError``; the arguments
    default to ``None`` so existing zero-argument calls keep working.
    """

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``, as estimator ``fit`` methods do."""
        return self

    def score(self, X=None, y=None):
        """Do nothing; returns ``None`` because the placeholder has no score."""
        return None


# Single-step pipeline; the 'clf' step is a placeholder that the grid search
# replaces with each candidate classifier in turn.
pipe = Pipeline(steps=[('clf', DummyEstimator())])

# Candidate algorithms paired with the extra hyperparameters to try for each
# one (an empty dict means "library defaults only"). Order is kept as listed.
clf_candidates = [
    (LogisticRegression(), {'clf__max_iter': [1500]}),
    (GaussianNB(), {}),
    (MLPClassifier(), {'clf__max_iter': [2500]}),
    (KNeighborsClassifier(), {}),
    (SVC(), {}),
    (GaussianProcessClassifier(), {}),
    (HistGradientBoostingClassifier(), {}),
    (DecisionTreeClassifier(), {}),
    (RandomForestClassifier(), {}),
    (AdaBoostClassifier(), {}),
]

# One parameter grid per candidate, in the list-of-dicts form GridSearchCV accepts.
search_space = [{'clf': [estimator], **extra} for estimator, extra in clf_candidates]

# Compare all candidates by accuracy using the shared CV splitter.
gs = GridSearchCV(estimator=pipe, param_grid=search_space, scoring="accuracy", cv=cv_fold)
gs.fit(X_instructor, Y)

# Tabulate the per-candidate CV results.
# Optional export (disabled):
# df_cv_search.to_csv('data/nlmyo/processed/report_translated_embed_cohere_gridsearch.csv')
df_cv_search = pd.DataFrame(gs.cv_results_)
df_cv_search

In [None]:
import joblib
# Let's choose and optimize a MLPC
# Hyperparameter grid explored for the MLP classifier.
param_grid = {
    'hidden_layer_sizes': [(400,), (200,), (100,100), (200,200)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'learning_rate_init': [0.001, 0.01],
    'max_iter': [800, 1500, 2500],
}

# Create grid search (seeded estimator, shared CV splitter)
cls = MLPClassifier(random_state=42)
gs_mlpc = GridSearchCV(cls, param_grid, scoring="accuracy", cv=cv_fold, verbose=1)
gs_mlpc.fit(X_instructor, Y)
best_mlpc = gs_mlpc.best_estimator_

# Results come from `gs_mlpc`; the previous code referenced an undefined
# `gs_rf`, which raises NameError on a fresh kernel.
df_cv_search_mlpc = pd.DataFrame(gs_mlpc.cv_results_)
# Legacy name kept as an alias in case other cells still refer to it.
df_cv_search_rf = df_cv_search_mlpc

# Print the best parameters and score
print("Best parameters:", gs_mlpc.best_params_)
print("Best score:", gs_mlpc.best_score_)

# Persist the refitted best estimator for the evaluation/logging cell.
joblib.dump(best_mlpc, '../models/instructor_model.joblib')

In [None]:
# Save the trained model to disk & WandB
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
import joblib

# experiment tracking
import wandb
# Reload the persisted estimator so this cell can run without redoing the search.
model_path = '../models/instructor_model.joblib'
best_mlpc = joblib.load(model_path)

# Cross-validated predictions with the shared splitter: once as hard labels,
# once as class probabilities.
y_pred = cross_val_predict(best_mlpc, X_instructor, Y, cv=cv_fold)
y_probas = cross_val_predict(
    best_mlpc, X_instructor, Y, cv=cv_fold, method='predict_proba'
)

# Per-class precision/recall/F1 as a nested dict (loggable to WandB).
report = classification_report(
    Y, y_pred, target_names=best_mlpc.classes_, output_dict=True
)

# Start the tracking run with metadata describing this experiment.
run = wandb.init(
    project='myo-text-classify',
    config={
        "embedding": "instructor",
        "doc_lang": "fr",
        "corpus": "complete_1704023_190reports",
        "model": "MLPClassifier",
    },
)
config = wandb.config

# Only the classification report is logged; grid-search parameters/scores and
# a confusion matrix were previously sketched here but are not logged.
wandb.log({'Classification Report': report})

# Standard WandB classifier plots. The same arrays are passed for both the
# train and test arguments, together with the cross-validated predictions.
wandb.sklearn.plot_classifier(
    best_mlpc,
    X_instructor,
    X_instructor,
    Y,
    Y,
    y_pred,
    y_probas,
    labels=best_mlpc.classes_,
    model_name='instructor_model',
    feature_names=None,
)

# Upload the saved model file as a versioned artifact, then close the run.
model_artifact = wandb.Artifact('instructor_model', type='model')
model_artifact.add_file('../models/instructor_model.joblib')
wandb.run.log_artifact(model_artifact)
wandb.finish()