In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load the train and test data
train_df = pd.read_csv("../data/kaggle/train_numeric.csv")
test_df  = pd.read_csv("../data/kaggle/test_numeric.csv")

In [2]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 688 entries, 0 to 687
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Group                     688 non-null    int64  
 1   Sex                       688 non-null    int64  
 2   Age                       688 non-null    int64  
 3   Patients number per hour  688 non-null    int64  
 4   Arrival mode              688 non-null    int64  
 5   Injury                    688 non-null    int64  
 6   Chief_complain            688 non-null    object 
 7   Mental                    688 non-null    int64  
 8   Pain                      688 non-null    int64  
 9   NRS_pain                  370 non-null    float64
 10  SBP                       670 non-null    float64
 11  DBP                       670 non-null    float64
 12  HR                        672 non-null    float64
 13  RR                        675 non-null    float64
 14  BT        

In [16]:
train_df = train_df.drop(columns=['Chief_complain','Diagnosis in ED','KTAS_RN'])
test_df = test_df.drop(columns=['Chief_complain','Diagnosis in ED','KTAS_RN'])

In [None]:
# List of vital signs columns
vital_signs_cols = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain']

# Ensure vital signs are numeric and handle missing values
data[vital_signs_cols] = data[vital_signs_cols].apply(pd.to_numeric, errors='coerce')

# Normalize vital signs using Min-Max scaling
scaler = MinMaxScaler()
vital_signs_normalized = scaler.fit_transform(data[vital_signs_cols])

In [17]:
# Fill all NAs with the mean

numeric_cols = train_df.select_dtypes(include=["number"]).columns
for col in numeric_cols:
    train_df[col].fillna(train_df[col].mean(), inplace=True)
    test_df[col].fillna(test_df[col].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df[col].fillna(train_df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_df[col].fillna(test_df[col].mean(), inplace=True)


In [20]:
# Optionally, drop rows with missing target values (if any)
train_df = train_df.dropna(subset=["KTAS_expert",])
test_df  = test_df.dropna(subset=["KTAS_expert", ])

# Define the target column and feature set.
# Here we assume that all columns except 'KTAS_expert' are features.
target_col = "KTAS_expert"
features = [col for col in train_df.columns if col != target_col]

X_train = train_df[features]
y_train = train_df[target_col]

X_test = test_df[features]
y_test = test_df[target_col]

# (Optional) Convert all features to numeric in case they are not.
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test  = X_test.apply(pd.to_numeric, errors='coerce')

# (Optional) Fill any remaining missing values with the median of each column.
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)

# Build and train the Logistic Regression model.
# For multiclass problems, LogisticRegression can use multinomial mode.
logreg = LogisticRegression(max_iter=1000, multi_class='auto')
logreg.fit(X_train, y_train)

# Make predictions on the test set.
preds_lr = logreg.predict(X_test)

# Print the results.
print("=== Logistic Regression Predictions ===")
print("\nAccuracy:", accuracy_score(y_test, preds_lr))
print("\nClassification Report:\n", classification_report(y_test, preds_lr))


=== Logistic Regression Predictions ===

Accuracy: 0.4835924006908463

Classification Report:
               precision    recall  f1-score   support

           1       0.71      0.42      0.53        12
           2       0.37      0.17      0.23       148
           3       0.47      0.71      0.57       217
           4       0.56      0.52      0.54       183
           5       0.00      0.00      0.00        19

    accuracy                           0.48       579
   macro avg       0.42      0.36      0.37       579
weighted avg       0.46      0.48      0.45       579



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
y_train = y_train - 1

In [27]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

# Build and train the XGBoost model.
# Setting 'use_label_encoder' to False to avoid warnings and specifying an evaluation metric.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)

# Make predictions on the test set.
preds_xgb = xgb_model.predict(X_test)

# Print the results.
print("=== XGBoost Predictions ===")
print("\nAccuracy:", accuracy_score(y_test, preds_xgb))
print("\nClassification Report:\n", classification_report(y_test, preds_xgb))


=== XGBoost Predictions ===

Accuracy: 0.2970639032815199

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.04      0.17      0.07        12
           2       0.27      0.28      0.28       148
           3       0.34      0.59      0.43       217
           4       0.25      0.01      0.01       183
           5       0.00      0.00      0.00        19

    accuracy                           0.30       579
   macro avg       0.15      0.17      0.13       579
weighted avg       0.28      0.30      0.24       579



Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### BioBERT + MLP

In [7]:
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
train_df = pd.read_csv("../data/kaggle/train_numeric.csv")
test_df  = pd.read_csv("../data/kaggle/test_numeric.csv")
train_df = train_df.drop(columns=['Chief_complain','Diagnosis in ED'])
test_df = test_df.drop(columns=['Chief_complain','Diagnosis in ED'])

In [9]:

# Load the pre-trained SentenceTransformer model for clinical text
# This model is fine-tuned for biomedical and clinical text embeddings
# model_name = 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb'
# model = SentenceTransformer(model_name)

from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
