In [23]:
from copy import deepcopy
from glob import glob
import os
from os import path as op
import numpy as np
import pandas as pd
import sys
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, balanced_accuracy_score
from sklearn.model_selection import StratifiedGroupKFold, cross_validate, StratifiedKFold, LeaveOneOut, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import itertools
import argparse


## Define classification parameters

In [None]:

# Classifier: standardise the features, then fit a linear-kernel SVM
# with balanced class weights (probability estimates enabled).
model = svm.SVC(
    kernel='linear',
    C=1,
    class_weight='balanced',
    probability=True,
    random_state=127,
)
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', model),
])

# Metrics computed on the held-out portion of every fold
scoring = dict(accuracy='accuracy', balanced_accuracy='balanced_accuracy')

# Stratified, shuffled ten-fold splitter with a fixed seed for reproducibility
cv = StratifiedKFold(n_splits=10, random_state=127, shuffle=True)


## Load data

In [24]:
# Current working directory of the kernel process
current_dir = os.getcwd()

# Directory holding the catch24 results for the real time series
real_TS_catch24_res_path = f"{current_dir}/real/catch24_res/"

# Names of the classification problems to analyse (one results CSV each)
problem_list = [
    "Beef",
    "DodgerLoopWeekend",
    "ECG5000",
    "GunPointOldVersusYoung",
    "InlineSkate",
    "InsectEPGRegularTrain",
    "ItalyPowerDemand",
    "PowerCons",
    "Wafer",
]


## Run classification using a linear SVM on real time-series data with (1) the FTM (mean + SD) features and (2) the full catch24 feature set

In [30]:
# Collect one results DataFrame per (problem, feature set) pair
classification_results_list = []

# Iterate over each problem
for problem in problem_list:
    # Long-format catch24 results for this problem; the code below reads the
    # columns 'id', 'group', 'names' and 'values' from this file.
    problem_catch24_res = pd.read_csv(f"{real_TS_catch24_res_path}{problem}_catch24_res.csv")

    # Separate 'id' into sample ID and problem
    # NOTE(review): assumes every 'id' contains exactly one underscore; any
    # other count makes this two-column assignment fail -- confirm against the data.
    problem_catch24_res[["sample_ID", "problem"]] = problem_catch24_res["id"].str.split("_", expand=True)

    # Drop the 'id' column (reassign instead of mutating in place so the cell
    # stays safe to re-run)
    problem_catch24_res = problem_catch24_res.drop(columns=["id"])

    # Widen the dataframe: one row per (sample_ID, group), one column per feature name
    problem_catch24_res_wide = problem_catch24_res.pivot(index=["sample_ID", "group"], columns="names", values="values")

    # Extract the group labels (classification targets) from the row index
    group_labels = problem_catch24_res_wide.index.get_level_values("group").values

    # Feature matrices to evaluate, in the order their results are appended:
    # the full set of pivoted feature columns, then only the mean + SD columns.
    feature_sets = {
        "catch24": problem_catch24_res_wide.values,
        "Mean_SD": problem_catch24_res_wide.filter(['DN_Spread_Std', 'DN_Mean'], axis=1).values,
    }

    for feature_set_name, feature_values in feature_sets.items():
        # Fit and score the pipeline with cross-validation
        cv_results = cross_validate(pipe, feature_values, group_labels, cv=cv, scoring=scoring)

        # Derive the fold count from the returned scores rather than
        # hard-coding 10, so a different splitter cannot cause a length mismatch.
        n_folds = len(cv_results["test_accuracy"])

        # Save per-fold scores to a dataframe and append to the list
        classification_results_list.append(
            pd.DataFrame({"Problem": problem,
                          "Feature_Set": feature_set_name,
                          "Fold_Number": np.arange(1, n_folds + 1),
                          "accuracy": cv_results["test_accuracy"],
                          "balanced_accuracy": cv_results["test_balanced_accuracy"]})
        )

# Stack all per-problem, per-feature-set fold results into a single dataframe
all_real_TS_classification_results = pd.concat(classification_results_list)

In [None]:
# A