In [None]:
# This gets the RAPIDS-Colab install files and checks your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 438, done.[K
remote: Counting objects: 100% (169/169), done.[K
remote: Compressing objects: 100% (115/115), done.[K
remote: Total 438 (delta 121), reused 57 (delta 54), pack-reused 269[K
Receiving objects: 100% (438/438), 118.31 KiB | 5.14 MiB/s, done.
Resolving deltas: 100% (223/223), done.
***********************************************************************
We will now install RAPIDS via pip! 
Please stand by, should be quick...
***********************************************************************

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu12==23.12.*
  Downloading https://pypi.nvidia.com/cudf-cu12/cudf_cu12-23.12.1-cp310-cp310-manylinux_2_28_x86_64.whl (511.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 511.6/511.6 MB 1.9 MB/s eta 0:00:00
Collecting cuml-cu12==23.12.*
  Downloading https://pypi.nvidia.com/cuml-cu12/cuml_cu12-23.12.0-cp310-cp310-manylinux_2_17_

In [None]:
# Mount Google Drive into the Colab filesystem so files stored on Drive can be
# read and written through ordinary paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Drive folder that holds the input data; train.csv and test.csv are read from it.
folder_path = '/content/drive/My Drive/Colab Notebooks/4_Influencing_Signal/data-hrv-kaggle'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import cudf
import numpy as np
import pandas as pd
# import pickle

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC

#from cuml.ensemble import RandomForestClassifier
#from cuml.svm import SVC
#from cuml.neighbors import KNeighborsClassifier
#from cuml.naive_bayes import MultinomialNB
#from xgboost import XGBClassifier
#from cuml.metrics import accuracy_score
from sklearn.dummy import DummyClassifier

In [None]:
%%time
# Load the train and test splits from the Drive data folder in one pass.
data_train, data_test = (
    pd.read_csv(folder_path + csv_name)
    for csv_name in ('/train.csv', '/test.csv')
)

CPU times: user 5.98 s, sys: 599 ms, total: 6.58 s
Wall time: 15.1 s


In [None]:
# Number of non-missing MEAN_RR values in the training split (sanity check on its size).
data_train['MEAN_RR'].count()

369289

In [None]:
# Number of non-missing MEAN_RR values in the test split (sanity check on its size).
data_test['MEAN_RR'].count()

41033

In [None]:
def run_preprocessing(data, target_column_name='condition', label_encoder=None,
                      imputer=None, scaler=None, fit=True):
  """Turn a raw frame into scaled float features and an integer-encoded target.

  Steps: drop the target column, keep only float64 feature columns, fill
  missing values with the column mean, standardize, and label-encode the
  target.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing the feature columns and the target column.
  target_column_name : str
      Name of the target column (default 'condition').
  label_encoder : LabelEncoder or None
      If None, a new encoder is created and fitted on this frame's target.
      If given, it is used as-is (transform only).
  imputer, scaler : SimpleImputer / StandardScaler or None
      Optional transformers to use instead of freshly created ones. Pass the
      same instances to the training call (fit=True) and the test call
      (fit=False) so the test features are imputed and scaled with the
      training statistics.
  fit : bool
      When True (default) the imputer and scaler are fitted on `data` before
      transforming it, which is the original behavior. When False they are
      only applied, so they must already be fitted.

  Returns
  -------
  (X_scaled, y_encoded, label_encoder)
      X_scaled is a DataFrame of the float64 feature columns built from the
      transformed array, so it carries a fresh default index rather than the
      index of `data`. y_encoded is the output of label_encoder.transform.

  Raises
  ------
  ValueError
      If fit=False is requested without supplying both `imputer` and `scaler`.
  """
  if not fit and (imputer is None or scaler is None):
      raise ValueError("fit=False requires already-fitted `imputer` and `scaler`")

  # Separating the features and the target variable before imputation
  X = data.drop(columns=[target_column_name])  # Features
  y = data[target_column_name]  # Target

  # Only float64 columns survive this step; any other dtype is dropped from
  # the feature set. Selected once and reused for both values and column names.
  X_float = X.select_dtypes(include=['float64'])  # include=['float64', 'int64']

  if imputer is None:
      imputer = SimpleImputer(strategy="mean")
  if scaler is None:
      scaler = StandardScaler()

  # Imputing missing values in the feature set
  imputed = imputer.fit_transform(X_float) if fit else imputer.transform(X_float)
  X_imputed = pd.DataFrame(imputed, columns=X_float.columns)

  # Standardizing the features
  scaled = scaler.fit_transform(X_imputed) if fit else scaler.transform(X_imputed)
  X_scaled = pd.DataFrame(scaled, columns=X_imputed.columns)

  # Check if label_encoder is provided, otherwise create a new one
  if label_encoder is None:
      # Encoding the target values
      label_encoder = LabelEncoder()
      # Fitting the encoder to the target values
      label_encoder.fit(y)

  # Transforming the target values to numeric
  y_encoded = label_encoder.transform(y)
  # y_encoded = pd.Series(y_encoded.astype(np.int32))

  # Transform to CUDA data frame
  # X_cudf = cudf.DataFrame.from_pandas(X_scaled)
  # y_cudf = cudf.Series(y_encoded.values)

  return X_scaled, y_encoded, label_encoder #X_cudf, y_cudf

In [None]:
# Function to run the pipeline
def run_pipeline(data_train, data_test, models, parameter_grid, feature_sets):
    """Tune and evaluate every model on every feature subset.

    For each feature list in `feature_sets` and each entry in `models`, a
    RandomizedSearchCV (n_iter=9, cv=3, random_state=42) is fitted on the
    training split, and its best estimator is scored on the test split.

    Parameters
    ----------
    data_train, data_test : pandas.DataFrame
        Raw frames passed to run_preprocessing.
    models : dict
        Maps a model name to an estimator instance.
    parameter_grid : dict
        Maps the same model names to their search space.
    feature_sets : list of list of str
        Column subsets to evaluate; each name must be a column of the
        preprocessed feature frame.

    Returns
    -------
    pandas.DataFrame
        One row per (model, feature set) with the best parameters, overall
        accuracy, macro-averaged precision/recall/F1, and per-class metrics
        keyed by the decoded class name.
    """
    results = []

    # Splitting the dataset and fit the label encoder on the training data
    # NOTE(review): run_preprocessing is called with its defaults for both
    # frames, so the test features are imputed and scaled independently of the
    # training set — confirm this is intended.
    X_train, y_train, label_encoder = run_preprocessing(data_train)
    X_test, y_test, _ = run_preprocessing(data_test, label_encoder=label_encoder)

    # The label set does not depend on the model or feature subset, so it is
    # computed once. It is the union of train and test labels.
    unique_labels = np.unique(np.concatenate([y_test, y_train]))
    decoded_labels = label_encoder.inverse_transform(unique_labels)

    for features in feature_sets:
        X_train_sub = X_train[features]
        X_test_sub = X_test[features]

        for model_name, model in models.items():
            print(f"Running {model_name} with features {features}")

            # Hyperparameter tuning
            searcher = RandomizedSearchCV(model, parameter_grid[model_name], n_iter=9, cv=3, random_state=42, verbose=3)
            searcher.fit(X_train_sub, y_train)

            best_model = searcher.best_estimator_
            predictions = best_model.predict(X_test_sub)

            # Recording results
            accuracy = accuracy_score(y_test, predictions)
            # `labels` pins the class set so every label in unique_labels has
            # a report entry and a confusion-matrix row at the same position,
            # even when a class is absent from the test split and predictions.
            # zero_division=0 keeps the 0.0 score for never-predicted classes
            # without emitting a warning per call.
            report = classification_report(
                y_test, predictions, labels=unique_labels,
                output_dict=True, zero_division=0,
            )
            cm = confusion_matrix(y_test, predictions, labels=unique_labels)

            # Initialize a dictionary for this model and feature set
            model_results = {
                'model': model_name,
                'features': features,
                'best_params': searcher.best_params_,
                'overall_accuracy': accuracy,
                'overall_precision': report['macro avg']['precision'],
                'overall_recall': report['macro avg']['recall'],
                'overall_f1': report['macro avg']['f1-score'],
                # 'overall_support': report['macro avg']['support']
            }

            # Iterate over each class in the report to add class-specific metrics
            for idx, (label, label_name) in enumerate(zip(unique_labels, decoded_labels)):
                class_report = report[str(label)]
                row_total = cm[idx].sum()
                # Diagonal over row total, i.e. the fraction of true samples of
                # this class that were predicted correctly (same quantity as
                # the class recall); the key name is kept for compatibility.
                class_accuracy = cm[idx, idx] / row_total if row_total > 0 else 0
                model_results[f'accuracy_class_{label_name}'] = class_accuracy
                model_results[f'precision_class_{label_name}'] = class_report['precision']
                model_results[f'recall_class_{label_name}'] = class_report['recall']
                model_results[f'f1_score_class_{label_name}'] = class_report['f1-score']
                # model_results[f'support_class_{label_name}'] = class_report['support']

            # Append the compiled results for this model and feature set
            results.append(model_results)

    return pd.DataFrame(results)

# Models to evaluate, keyed by a display name. The same keys must exist in
# parameter_grid below. Only the DummyClassifier baseline is active; the other
# entries are kept commented out and need their (also commented) imports.
models = {
    #'RandomForest': RandomForestClassifier(),
    #'SVM': SVC(),
    # 'KNN': KNeighborsClassifier(),
    # 'Naive Bayes': MultinomialNB(),
    #'XGBoost': XGBClassifier()
    'Base': DummyClassifier()
}

# Hyperparameter search space for each model, keyed by the same names as `models`.
# The active 'Base' space holds a single candidate, fewer than the n_iter=9
# requested in run_pipeline; the saved log shows 1 candidate being fitted.
parameter_grid = {
    #'RandomForest': {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20, 30]},
    #'SVM': {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01]},
    # 'KNN': {'n_neighbors': [1, 3, 5, 10, 20]},
    # 'Naive Bayes': {'alpha': [0.0, 0.5, 1.0]}, #, 'fit_prior': [True, False]},
    #'XGBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'tree_method': ['hist'], 'device': ['cuda']},
    'Base': {'strategy': ['most_frequent']}
}

# Feature subsets to evaluate; each inner list is one experiment.
# NOTE(review): the saved run log and results table in this notebook list
# Subsets 1 and 2, while Subsets 3 and 4 are the ones enabled here — re-run to
# refresh the outputs.
feature_sets = [
    # ['MEAN_RR', 'MEDIAN_RR'],  # Subset 1
    # ['MEAN_RR', 'MEDIAN_RR', 'SDRR', 'RMSSD', 'SDSD', 'SDRR_RMSSD','HR'],  # Subset 2
    ['MEAN_RR','SDRR'], # Subset 3
    ['MEAN_RR', 'MEDIAN_RR', 'SDRR', 'RMSSD', 'SDSD', 'SDRR_RMSSD','HR','pNN25','pNN50','SD1','SD2'], # Subset 4
]

In [None]:
%%time

# Evaluate every configured model on every configured feature subset.
results_df = run_pipeline(
    data_train=data_train,
    data_test=data_test,
    models=models,
    parameter_grid=parameter_grid,
    feature_sets=feature_sets,
)

# Persist the metrics table to Drive (index column omitted).
# print(results_df)
results_df.to_csv(
    '/content/drive/My Drive/Colab Notebooks/4_Influencing_Signal/results/model_results_base.csv',
    index=False,
)

Running Base with features ['MEAN_RR', 'MEDIAN_RR']
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END ............strategy=most_frequent;, score=0.542 total time=   0.0s
[CV 2/3] END ............strategy=most_frequent;, score=0.542 total time=   0.0s
[CV 3/3] END ............strategy=most_frequent;, score=0.542 total time=   0.0s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Running Base with features ['MEAN_RR', 'MEDIAN_RR', 'SDRR', 'RMSSD', 'SDSD', 'SDRR_RMSSD', 'HR']
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END ............strategy=most_frequent;, score=0.542 total time=   0.0s
[CV 2/3] END ............strategy=most_frequent;, score=0.542 total time=   0.1s
[CV 3/3] END ............strategy=most_frequent;, score=0.542 total time=   0.0s
CPU times: user 954 ms, sys: 295 ms, total: 1.25 s
Wall time: 1.27 s


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Display the metrics table: one row per (model, feature set) combination.
results_df

Unnamed: 0,model,features,best_params,overall_accuracy,overall_precision,overall_recall,overall_f1,accuracy_class_interruption,precision_class_interruption,recall_class_interruption,f1_score_class_interruption,accuracy_class_no stress,precision_class_no stress,recall_class_no stress,f1_score_class_no stress,accuracy_class_time pressure,precision_class_time pressure,recall_class_time pressure,f1_score_class_time pressure
0,Base,"[MEAN_RR, MEDIAN_RR]",{'strategy': 'most_frequent'},0.540004,0.180001,0.333333,0.233767,0.0,0.0,0.0,0.0,1.0,0.540004,1.0,0.701302,0.0,0.0,0.0,0.0
1,Base,"[MEAN_RR, MEDIAN_RR, SDRR, RMSSD, SDSD, SDRR_R...",{'strategy': 'most_frequent'},0.540004,0.180001,0.333333,0.233767,0.0,0.0,0.0,0.0,1.0,0.540004,1.0,0.701302,0.0,0.0,0.0,0.0
