In [None]:
# Mount Google Drive so files under /content/drive are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Folder that holds the input CSVs; the data-loading cell builds the train/test paths from it.
folder_path = '/content/drive/My Drive/Colab Notebooks/4_Influencing_Signal/data-hrv-kaggle'

In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [None]:
# Read the train and test CSVs from the shared data folder.
data_train = pd.read_csv(f"{folder_path}/train.csv")
data_test = pd.read_csv(f"{folder_path}/test.csv")

In [None]:
def run_preprocessing(data, target_column_name='condition'):
  """Turn a raw data frame into model-ready features and integer labels.

  Steps, in order:
    1. Split off the target column.
    2. Keep only the float64/int64 feature columns.
    3. Fill missing feature values with the column mean.
    4. Standardise every feature column with StandardScaler.
    5. Label-encode the target into integer class codes.

  The imputer, scaler and label encoder are all fitted on the frame passed
  in. Two separate calls (e.g. train and test) therefore use independently
  fitted statistics, and the integer codes only line up between calls when
  both frames contain the same set of labels.

  Args:
    data: DataFrame holding the feature columns and the target column.
    target_column_name: Name of the target column in `data`.

  Returns:
    A tuple `(X_scaled, y_encoded)`:
      X_scaled: DataFrame of the imputed, standardised numeric features,
        with a fresh 0..n-1 index.
      y_encoded: 1-D integer array of encoded class labels, in row order.

  Raises:
    KeyError: If `target_column_name` is not a column of `data`.
  """
  # Separating the features and the target variable before imputation
  X = data.drop(columns=[target_column_name])  # Features
  y = data[target_column_name]  # Target

  # Select the numeric columns once and reuse them for both values and names
  X_numeric = X.select_dtypes(include=['float64', 'int64'])

  # Imputing missing values in the feature set
  imputer = SimpleImputer(strategy="mean")
  X_imputed = pd.DataFrame(imputer.fit_transform(X_numeric), columns=X_numeric.columns)

  # Standardizing the features
  scaler = StandardScaler()
  X_scaled = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X_imputed.columns)

  # Encoding the target values as integers
  label_encoder = LabelEncoder()
  y_encoded = label_encoder.fit_transform(y)

  # Return the encoded target: the encoder output was previously discarded
  # in favour of the raw labels, which XGBClassifier cannot train on.
  return X_scaled, y_encoded

In [None]:
# Function to run the pipeline
def run_pipeline(data_train, data_test, models, parameter_grid, feature_sets):
    """Tune and score every model on every feature subset.

    The train and test frames are each passed through `run_preprocessing`
    on their own. For each feature subset and each model, a
    RandomizedSearchCV (n_iter=10, cv=3, random_state=42) is fitted on the
    training columns and its best estimator is scored on the test columns.

    Args:
        data_train: Raw training DataFrame.
        data_test: Raw test DataFrame.
        models: Mapping of model name -> estimator instance.
        parameter_grid: Mapping of model name -> hyperparameter search space.
        feature_sets: Iterable of column-name lists to evaluate.

    Returns:
        DataFrame with one row per (feature subset, model) and the columns
        'model', 'features', 'best_params', 'accuracy' and 'report'.
    """
    # Preprocess the two frames independently
    train_features, train_target = run_preprocessing(data_train)
    test_features, test_target = run_preprocessing(data_test)

    records = []
    for features in feature_sets:
        # Restrict both frames to the columns of the current subset
        train_subset = train_features[features]
        test_subset = test_features[features]

        for model_name, model in models.items():
            print(f"Running {model_name} with features {features}")

            # Hyperparameter tuning
            search = RandomizedSearchCV(
                model,
                parameter_grid[model_name],
                n_iter=10,
                cv=3,
                random_state=42,
            )
            search.fit(train_subset, train_target)

            # Score the refitted best estimator on the held-out frame
            predicted = search.best_estimator_.predict(test_subset)

            records.append({
                'model': model_name,
                'features': features,
                'best_params': search.best_params_,
                'accuracy': accuracy_score(test_target, predicted),
                'report': classification_report(test_target, predicted),
            })

    return pd.DataFrame(records)

# Define your models.
# The stochastic estimators are seeded with the same value the hyperparameter
# search uses (42) so that repeated runs of the notebook give the same results.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'LogisticRegression': LogisticRegression(),
    'XGBoost': XGBClassifier(random_state=42)
}

# Hyperparameter search space per model; keys must match the names in `models`.
parameter_grid = {
    'RandomForest': {
        'n_estimators': [10, 50, 100],
        'max_depth': [None, 10, 20, 30],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'gamma': [1, 0.1, 0.01],
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10],
    },
    'XGBoost': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
    },
}

# Feature subsets to evaluate; each entry is a list of column names.
feature_sets = [
    # Subset 1
    [
        'MEAN_RR',
        'MEDIAN_RR',
    ],
    # Subset 2
    [
        'MEAN_RR',
        'MEDIAN_RR',
        'SDRR',
        'RMSSD',
        'SDSD',
        'SDRR_RMSSD',
        'HR',
    ],
    # Add more subsets as needed
]

In [None]:
# Destination of the results table on Drive.
results_path = '/content/drive/My Drive/Colab Notebooks/4_Influencing_Signal/results/model_sklearn_cv_results.csv'

# Create the output folder before the search starts: to_csv raises OSError
# when the parent directory is missing, and hitting that only after the whole
# hyperparameter search has finished would lose the results.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Run the pipeline
results_df = run_pipeline(data_train, data_test, models, parameter_grid, feature_sets)

# Save or print the results
print(results_df)
results_df.to_csv(results_path, index=False)