# Utils

In [83]:
import os
from typing import Callable, List, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy.fft import fft
import scipy.stats

from utils import Dataset

# Manual feature extraction

## Utilities and preprocessing

In [80]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Assumes DataFrames with "timestamp", "date" and "activity" columns.
    
    Performs cleaning operations:
    - assure format YYYY-MM-DD HH:MM:SS for "timestamp"
    - drop redundant "date" column
    - assure float32 format for "activity"
    
    :param data: list of DataFrames
    :returns: list of cleaned DataFrames
    """
    data = [df.copy() for df in data]  # create copy to avoid side effects
    
    for df in data:
        df["timestamp"] = pd.to_datetime(df["timestamp"], format="%Y-%m-%d %H:%M:%S")
        df.drop("date", axis=1, inplace=True)
        df["activity"] = df["activity"].astype(np.float32)
    
    return data


def total_power(x: pd.Series) -> pd.Series:
    """
    Calculates total power for given signal, using FFT and PSD (Power Spectral Density).
    
    x = FFT(x)
    PSD = 1/N * sum{i=0}^N |x(i)|^2
    
    :param df: Series, "activity" signal
    :returns: total power
    """
    x = fft(x.to_numpy())
    x = np.mean(np.square(np.abs(x)))
    return pd.Series(x)


def group_by_frequency(df: pd.DataFrame, frequency: str, domain: str) -> pd.DataFrame:
    """
    Groups time series DataFrame by given frequency, aggregating with a window function.
    Grouping result is either in time or frequency domain, depending on "domain" argument value.
    
    Assumes DataFrame with "timestamp", "date" and "activity" columns. 
    
    Options for "frequency":
    - "hour_quarter": 15 minutes
    - "hour_half": 30 minutes
    - "hour": 60 minutes
    
    Options for "domain":
    - "time": aggregates each period with simple mean (average)
    - "freq": aggregates each period calculating total power using Power Spectral Density (PSD)
    
    :param df: DataFrame with columns "datetime" and "activity"
    :param period: "hour_quarter", "hour_half" or "hour"
    :param domain: "time" or "freq"
    :returns: DataFrame with processed "activity" column and appropriately grouped "timestamp" column
    """
    df = df.copy()  # create copy to avoid side effects
    
    # group with given frequency
    if frequency == "hour_quarter":
        df = df.groupby([pd.Grouper(key="timestamp", freq="15min")])
    elif frequency == "hour_half":
        df = df.groupby([pd.Grouper(key="timestamp", freq="30min")])
    elif frequency == "hour":
        df = df.groupby([pd.Grouper(key="timestamp", freq="H")])
    else:
        raise ValueError(f'Frequency should be "hour_quarter", "hour_half" or "hour", got "{frequency}"')
    
    # aggregate in the proper domain
    if domain == "time":
        df = df.mean()
    elif domain == "freq":
        df = df.agg(total_power)
    else:
        raise ValueError(f'Domain should be "time" or "freq", got "{domain}"')
    
    # change index back to RangeIndex, since it became TimeIndex during grouping
    df.reset_index(inplace=True)
    
    return df


def get_day_part(df: pd.DataFrame, part: str) -> pd.DataFrame:
    """
    For given DataFrame with "timestamp" column returns only those rows that correspond to the 
    chosen part of day.
    
    Parts are "day" and "night", defined as:
    - "day": [8:00, 21:00)
    - "night": [21:00, 8:00)
    
    :param df: DataFrame to select rows from
    :param part: part of day, either "day" or "night"
    :returns: DataFrame, subset of rows of df
    """
    if part == "day":
        df = df.loc[(df["timestamp"].dt.hour >= 8) & (df["timestamp"].dt.hour < 21)]
    elif part == "night":
        df = df.loc[(df["timestamp"].dt.hour >= 21) | (df["timestamp"].dt.hour < 8)]
    else:
        raise ValueError(f'Part should be "day" or "night", got "{part}"')
        
    return df


def trim_to_length(data: List[pd.DataFrame], length: int) -> List[pd.DataFrame]:
    """
    Trims list of DataFrames to the given length (in terms of row number).
    
    :param data: list of DataFrames
    :param length: target number of rows
    :returns: list of trimmed DataFrames
    """
    data = [df.copy() for df in data]  # create copy to avoid side effects
    data = [df.head(length) for df in data]
    return data


def fill_missing_data(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Uses forward filling and then backward filling to fill missing values in "activity" column in DataFrames.
    
    :param data: list of DataFrames with "activity" column
    :returns: list of DataFrames with missing values filled
    """
    data = [df.copy() for df in data]  # create copy to avoid side effects
    for df in data:
        df["activity"] = df["activity"].ffill().bfill()
    return data


def spectral_flatness(X: np.ndarray, const: float = 1e-20) -> np.ndarray:
    """
    Calculates spectral flatness for given signal.
    
    :param X: Numpy 1D array with signal
    :param const: small constant to add to X to avoid division by zero
    :returns: spectral flatness value
    """
    norm = X.mean()
    if norm == 0:
        norm = 1
    
    X = np.log(X + 1e-20)  # add small number to avoid infinities
    X = np.exp(X.mean()) / norm

    return X

## Feature extraction

In [None]:
def extract_features(X: np.ndarray, add_spectral_flatness: bool = False) -> pd.DataFrame:
    """
    Extracts features from activity signal in time domain.
    
    :param df: 1D Numpy vector with signal
    :param add_spectral_flatness: whether to add spectral flatness to features or not; it should be applied only 
    if signal is already in 
    :returns: DataFrame with a single row representing features
    """
    features = {
        "minimum": np.min(X),
        "maximum": np.max(X),
        "mean": np.mean(X),
        "median": np.median(X),
        "stddev": np.std(X, ddof=1)  # ddof=1 applies Bessel correction, i.e. division by (N-1) instead of N,
        "variance": np.var(X),
        "kurtosis": sp.stats.kurtosis(X),
        "skewness": sp.stats.skew(X),
        "coeff_of_var": sp.stats.variation(X),
        "iqr": sp.stats.iqr(X),
        "trimmed_mean": sp.stats.trim_mean(X, proportiontocut=0.1),
        "entropy": sp.stats.entrop(X, base=2)
    }
    
    return pd.DataFrame(features)

# Depresjon

In [73]:
dataset = Dataset(dirpath=os.path.join("data", "depresjon"))
condition = dataset.condition

In [75]:
dfs = basic_data_cleaning(condition)
df = dfs[0]
df = group_by_frequency(df, period="hour_quarter", domain="freq")
df.head()

Unnamed: 0,timestamp,activity
0,2003-05-07 12:00:00,1862959.875
1,2003-05-07 12:15:00,3976770.25
2,2003-05-07 12:30:00,2845712.25
3,2003-05-07 12:45:00,2506544.75
4,2003-05-07 13:00:00,703248.0


# Psykose

Time features are extracted according to the article:
- `mean`, `median`, `stddev`, `variance`, `kurtosis`, `minimum`, `maximum` - quite self explanatory statistical features
- `coeff_of_var` - coefficient of variation, the ratio of the biased standard deviation to the mean
- `iqr` - interquartile range, difference between 75 and 25 percentile (3rd and 1st quartile)
- `trimmed_mean` - alternatively truncated mean, mean of the values where the most extreme values (from both ends) are not used; since the article doesn't specify this, I assume that the popular 10% trim percentage is used

Data is saved as a DataFrame, since some machine learning models can provide additional insight when using named columns.

Multiple features are calculated before standardization, since they wouldn't make sense for standardized data, when mean and standard deviation are always 0 and 1, respectively.

In [82]:
from scipy.stats import iqr, kurtosis, trim_mean, variation

Time features are extracted according to the article:
- all of the features that were calculated for time domain
- entropy
- skewness
- spectral flatness

Also the "Spectral Density" feature has been interpreted as total average power, i.e. simply the sum for the frequency domain.

In [86]:
from scipy.stats import entropy, skew

In [88]:
def extract_freq_features(X: np.ndarray) -> pd.DataFrame:
    # features for non-standardized data
    mean = np.nanmean(X, axis=1)
    median = np.nanmedian(X, axis=1)
    stddev = np.nanstd(X, axis=1, ddof=1)  # paper divides by (N - 1)
    var = np.nanvar(X, axis=1)
    kurt = kurtosis(X, axis=1, nan_policy="omit")
    coeff_of_var = variation(X, axis=1, nan_policy="omit")
    iq_range = iqr(X, axis=1, nan_policy="omit")
    minimum = np.nanmin(X, axis=1)
    maximum = np.nanmax(X, axis=1)
    
    # Scipy doesn't have NaN option for trimmed mean or entropy, so we have to calculate them by hand
    trimmed_mean = np.array([trim_mean(row[~np.isnan(row)], proportiontocut=0.1) for row in X])
    
    spectral_density = np.nansum(X, axis=1)
    skewness = skew(X, axis=1, nan_policy="omit")
    entr = np.array([entropy(row[~np.isnan(row)], base=2) for row in X])
    flatness = np.array([spectral_flatness(row[~np.isnan(row)]) for row in X])

    features = {
        "mean": mean,
        "median": median,
        "stddev": stddev,
        "variance": var,
        "kurtosis": kurt,
        "coeff_of_var": coeff_of_var,
        "iqr": iq_range,
        "minimum": minimum,
        "maximum": maximum,
        "trimmed_mean": trimmed_mean,
        "spectral_density": spectral_density,
        "skewness": skewness,
        "entropy": entr,
        "spectral_flatness": flatness
    }
    return pd.DataFrame(features)

In [89]:
freq_features = {}

for arr_name in ["X_night", "X_day", "X_all"]:
    freq_features[arr_name] = extract_freq_features(freq_data[arr_name])

In [90]:
freq_features["X_night"].head()

Unnamed: 0,mean,median,stddev,variance,kurtosis,coeff_of_var,iqr,minimum,maximum,trimmed_mean,spectral_density,skewness,entropy,spectral_flatness
0,34.117133,11.583333,45.756484,2079.014907,2.416487,1.336461,46.425,0.0,206.883333,24.915652,4878.75,1.74496,6.114835,0.014998
1,104.121096,23.633333,158.228121,24861.060346,2.792222,1.514332,74.825,2.183333,763.416667,69.512609,14889.316667,1.871353,5.923931,0.358771
2,91.023893,30.216667,122.794729,14973.10102,1.478461,1.344313,120.35,0.0,502.316667,66.538551,13016.416667,1.539269,6.030191,0.039675
3,80.998834,30.916667,104.402483,10823.655597,3.20087,1.284423,106.9,0.416667,532.35,60.698696,11582.833333,1.797638,6.169682,0.363638
4,136.69965,65.516667,149.172041,22096.687197,0.899069,1.087417,229.35,2.533333,666.816667,113.303333,19548.05,1.209742,6.351476,0.435199


### Datasets with all features

In [91]:
X_night = pd.merge(
    time_features["X_night"],
    freq_features["X_night"],
    left_index=True,
    right_index=True,
    suffixes=["_time", "_freq"]
)

X_day = pd.merge(
    time_features["X_day"],
    freq_features["X_day"],
    left_index=True,
    right_index=True,
    suffixes=["_time", "_freq"]
)

X_all = pd.merge(
    time_features["X_all"],
    freq_features["X_all"],
    left_index=True,
    right_index=True,
    suffixes=["_time", "_freq"]
)

y_night = time_data["y_night"]
y_day = time_data["y_day"]
y_all = time_data["y_all"]

Standardize the data (called "scaling" or "standard scaling" in Scikit-learn):

In [92]:
from sklearn.preprocessing import scale

In [93]:
X_night_stand = scale(X_night)
X_day_stand = scale(X_day)
X_all_stand = scale(X_all)

## Feature selection

For selecting the best sets of features (from 1 to 9 features, depending on an experiment) the paper uses:
- forward selection as the selection algorithm
- logistic regression as a base estimator
- 70%-30% of data for training-validation split

While it's not explicitly stated, we assume that cross validation is used, since the number of patients is very low and it's also used later in the Random Forest (there it's written explicitly).

In [94]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, ShuffleSplit

In [95]:
cross_validator = ShuffleSplit(test_size=0.3, random_state=0)
clf = LogisticRegression(random_state=0)
forward_selector = SequentialFeatureSelector(
    clf,
    n_features_to_select=9,
    direction="forward",
    scoring="accuracy",
    cv=cross_validator,
    n_jobs=-1
)

In [96]:
forward_selector.fit(X_night_stand, y_night)
night_features = X_night.columns[forward_selector.get_support()]
night_features

Index(['mean_time', 'median_time', 'iqr_time', 'maximum_time',
       'trimmed_mean_time', 'mean_freq', 'median_freq', 'iqr_freq',
       'trimmed_mean_freq'],
      dtype='object')

Paper features for night: kurtosis (time), median (time), interquartil rank (time), minimum (time), maximum (time), median (frequency), standard deviation (frequency), coefficient of variance (frequency), spectral flatness (frequency).

In [97]:
forward_selector.fit(X_day_stand, y_night)
day_features = X_day.columns[forward_selector.get_support()]
day_features

Index(['mean_time', 'median_time', 'variance_time', 'coeff_of_var_time',
       'iqr_time', 'mean_freq', 'variance_freq', 'coeff_of_var_freq',
       'entropy'],
      dtype='object')

Paper features for day: kurtosis (time), mean (time), median (time), minimum (time), trim mean (time), median (frequency), standard deviation (frequency), coefficient of variance (frequency), spectral flatness (frequency).

In [98]:
forward_selector.fit(X_all_stand, y_night)
all_features = X_all.columns[forward_selector.get_support()]
all_features

Index(['mean_time', 'median_time', 'stddev_time', 'variance_time',
       'coeff_of_var_time', 'median_freq', 'stddev_freq', 'skewness',
       'entropy'],
      dtype='object')

Paper features for full day: kurtosis (time), median (time), coefficient of variance (time), minimum (time), trim mean (time), median (frequency), standard deviation (frequency), coefficient of variance (frequency), spectral flatness (frequency).

In all cases features differ quite a lot from those in the paper. Let's check performance of features selected above and those from the paper.

## Random Forest training

Since we are using Random Forest, no hyperparameter tuning is performed. Because of very small sample size, instead of choosing a single test set, we opt for cross validation. While this is certainly a controversial choice, there are good reasons for it:
- small sample size would make a single test set not very meaningful, since its generalization approximation would not be very good
- with no hyperparameter tuning, the validation set can be treated as a test set, since it's independent of the classifier
- instead of single accuracy measure, which could be misleading with such small sample size, we get multiple accuracies and can check mean accuracy and standard deviation; if the latter turns out higher, then it would mean that results depend largely on randomly chosen test set (this problem arises because of small sample size)

We measure 2 different cross validation sizes:
- 3-fold cross validation, to be as close as possible to the 30% test set from the paper
- 5-fold cross validation, to have better overwiew of how accuracy changes with different folds

In [50]:
from sklearn.ensemble import RandomForestClassifier

In [107]:
X_night_selected = X_night[night_features]
X_day_selected = X_day[day_features]
X_all_selected = X_all[all_features]

X_night_paper = X_night[[
    "kurtosis_time", "median_time", "iqr_time", "minimum_time",
    "maximum_time", "median_freq", "stddev_freq", "coeff_of_var_freq",
    "spectral_flatness"
]]

X_day_paper = X_day[[
    "kurtosis_time", "mean_time", "median_time", "minimum_time",
    "trimmed_mean_time", "median_freq", "stddev_freq", "coeff_of_var_freq",
    "spectral_flatness"
]]

X_all_paper = X_all[[
    "kurtosis_time", "median_time", "coeff_of_var_time", "minimum_time",
    "trimmed_mean_time", "median_freq", "stddev_freq", "coeff_of_var_freq",
    "spectral_flatness"
]]

Xs = {
    "night": (X_night_selected, X_night_paper),
    "day": (X_day_selected, X_day_paper),
    "all": (X_all_selected, X_all_paper)
}

ys = {"night": y_night, "day": y_day, "all": y_all}

In [108]:
clf = RandomForestClassifier(n_estimators=500, max_features="sqrt", n_jobs=-1, random_state=0)
results = {}

for part in ["night", "day", "all"]:
    X_selected, X_paper = Xs[part]
    y = ys[part]
    
    selected_results_3 = cross_val_score(clf, X_selected, y, scoring="accuracy", cv=3)
    selected_results_5 = cross_val_score(clf, X_selected, y, scoring="accuracy", cv=5)
    
    paper_results_3 = cross_val_score(clf, X_paper, y, scoring="accuracy", cv=3)
    paper_results_5 = cross_val_score(clf, X_paper, y, scoring="accuracy", cv=5)
    
    results[part] = {3: [selected_results_3, paper_results_3], 5: [selected_results_5, paper_results_5]}

In [109]:
def get_cv_raport(cv_results: np.ndarray) -> None:
    result = f"accuracy: {cv_results.mean():.2f} ± {np.std(cv_results):.2f}, "
    result += f"min: {cv_results.min():.2f}, "
    result += f"max: {cv_results.max():.2f}"
    return result

In [110]:
for part in ["night", "day", "all"]:
    selected_results_3, paper_results_3 = results[part][3]
    selected_results_5, paper_results_5 = results[part][5]
    
    print(part.upper())
    print("\t", "3 folds, selected:", get_cv_raport(selected_results_3))
    print("\t", "3 folds, paper:", get_cv_raport(paper_results_3))
    print()
    print("\t", "5 folds, selected:", get_cv_raport(selected_results_5))
    print("\t", "5 folds, paper:", get_cv_raport(paper_results_5))
    print("\n")

NIGHT
	 3 folds, selected: accuracy: 0.80 ± 0.14, min: 0.61, max: 0.94
	 3 folds, paper: accuracy: 0.73 ± 0.16, min: 0.56, max: 0.94

	 5 folds, selected: accuracy: 0.75 ± 0.15, min: 0.64, max: 1.00
	 5 folds, paper: accuracy: 0.75 ± 0.16, min: 0.55, max: 1.00


DAY
	 3 folds, selected: accuracy: 0.73 ± 0.07, min: 0.67, max: 0.83
	 3 folds, paper: accuracy: 0.66 ± 0.13, min: 0.53, max: 0.83

	 5 folds, selected: accuracy: 0.71 ± 0.07, min: 0.64, max: 0.82
	 5 folds, paper: accuracy: 0.64 ± 0.18, min: 0.36, max: 0.91


ALL
	 3 folds, selected: accuracy: 0.71 ± 0.13, min: 0.58, max: 0.89
	 3 folds, paper: accuracy: 0.75 ± 0.06, min: 0.68, max: 0.83

	 5 folds, selected: accuracy: 0.65 ± 0.23, min: 0.27, max: 0.91
	 5 folds, paper: accuracy: 0.71 ± 0.18, min: 0.45, max: 0.91




Features from our selection performed generally better, only exception being all day data with 5-fold cross validation.

There is very high standard deviation apparent in all cases, especially for night data. For all datasets results vary from unacceptably low (e.g. 56% on night 3-fold, 36% on day 5-fold), to exceptionally good (100% on 5-fold night). This makes us highly doubt results from the paper, which does not precisely state the testing procedure. It's apparent that on such small dataset even properly, randomly chosen test set is not enough to measure quality of the classifier.

What's interesting is that on 5-fold cross validation we actually get worse results than on 3-fold, despite larget training set. This may also be attributed to very small dataset - samples randomly chosen to be in the validation set (which equals test set here) may be very different than those on the training set. This actually justifies the relatively high size of the test set from the paper.

## Random Forest with class weights

We have a noticeable class imbalance, so maybe using Random Forest with balanced class weights (using count of samples from each class) can help.

In [117]:
clf = RandomForestClassifier(n_estimators=500, max_features="sqrt", class_weight="balanced", n_jobs=-1, random_state=0)
results = {}

for part in ["night", "day", "all"]:
    X_selected, X_paper = Xs[part]
    y = ys[part]
    
    selected_results_3 = cross_val_score(clf, X_selected, y, scoring="accuracy", cv=3)
    selected_results_5 = cross_val_score(clf, X_selected, y, scoring="accuracy", cv=5)
    
    paper_results_3 = cross_val_score(clf, X_paper, y, scoring="accuracy", cv=3)
    paper_results_5 = cross_val_score(clf, X_paper, y, scoring="accuracy", cv=5)
    
    results[part] = {3: [selected_results_3, paper_results_3], 5: [selected_results_5, paper_results_5]}

In [118]:
for part in ["night", "day", "all"]:
    selected_results_3, paper_results_3 = results[part][3]
    selected_results_5, paper_results_5 = results[part][5]
    
    print(part.upper())
    print("\t", "3 folds, selected:", get_cv_raport(selected_results_3))
    print("\t", "3 folds, paper:", get_cv_raport(paper_results_3))
    print()
    print("\t", "5 folds, selected:", get_cv_raport(selected_results_5))
    print("\t", "5 folds, paper:", get_cv_raport(paper_results_5))
    print("\n")

NIGHT
	 3 folds, selected: accuracy: 0.80 ± 0.14, min: 0.61, max: 0.94
	 3 folds, paper: accuracy: 0.73 ± 0.16, min: 0.56, max: 0.94

	 5 folds, selected: accuracy: 0.75 ± 0.15, min: 0.64, max: 1.00
	 5 folds, paper: accuracy: 0.71 ± 0.16, min: 0.55, max: 1.00


DAY
	 3 folds, selected: accuracy: 0.71 ± 0.09, min: 0.61, max: 0.83
	 3 folds, paper: accuracy: 0.62 ± 0.08, min: 0.53, max: 0.72

	 5 folds, selected: accuracy: 0.73 ± 0.08, min: 0.64, max: 0.82
	 5 folds, paper: accuracy: 0.64 ± 0.17, min: 0.36, max: 0.91


ALL
	 3 folds, selected: accuracy: 0.69 ± 0.15, min: 0.53, max: 0.89
	 3 folds, paper: accuracy: 0.75 ± 0.06, min: 0.68, max: 0.83

	 5 folds, selected: accuracy: 0.65 ± 0.24, min: 0.27, max: 0.91
	 5 folds, paper: accuracy: 0.71 ± 0.18, min: 0.45, max: 0.91




Nothing really changed.

## SVM

As our second classifier we opted for SVM, since it's often used for ML in medicine and performs well for small datasets. It requires hyperparameter tuning, therefore we perform a following procedure to ensure proper results:
- perform process similar to 3-fold cross-validation, splitting the dataset into 3 parts (after shuffling)
- for each fold, take the current fold as a test set, and train classifier on the training set with leave-one-out cross validation

While this approach has high computational requirements because of the leave-one-out cross-validation, we believe it is the only reasonable way to tune this type of classifier on such a small sample.

In [124]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, KFold, LeaveOneOut
from sklearn.svm import SVC

In [144]:
def train_svm(cv_n_splits):
    cv_kfold = KFold(n_splits=cv_n_splits, shuffle=True, random_state=0)
    cv_loo = LeaveOneOut()

    clf = SVC(kernel="rbf", cache_size=1024, random_state=0)

    param_grid = {
        "C": [1, 10, 20, 50, 100, 200, 500, 1000],
        "gamma": [1e-4, 1e-3, 1e-2, 1e-1, "scale", "auto"]
    }

    grid_search = GridSearchCV(clf, param_grid, scoring="accuracy", n_jobs=-1, cv=cv_loo)

    results = {}
    for part in ["night", "day", "all"]:
        X_selected, X_paper = Xs[part]
        X_selected, X_paper = X_selected.values, X_paper.values
        y = ys[part]

        accuracies = {"selected": [], "paper": []}
        
        for version, X in [("selected", X_selected), ("paper", X_paper)]:
            for train_idxs, test_idxs in cv_kfold.split(X):
                
                X_train, X_test = X[train_idxs], X[test_idxs]
                y_train, y_test = y[train_idxs], y[test_idxs]

                best_clf = grid_search.fit(X_train, y_train)
                test_acc = accuracy_score(y_test, best_clf.predict(X_test))
                accuracies[version].append(test_acc)

        results[part] = accuracies
    
    return results

In [145]:
results = train_svm(3)

In [147]:
for part in ["night", "day", "all"]:
    selected_results, paper_results = results[part]["selected"], results[part]["paper"]
    selected_results, paper_results = np.array(selected_results), np.array(paper_results)
    
    print(part.upper())
    print("\t", "selected:", get_cv_raport(selected_results))
    print("\t", "paper:", get_cv_raport(paper_results))
    print("\n")

NIGHT
	 selected: accuracy: 0.63 ± 0.08, min: 0.56, max: 0.74
	 paper: accuracy: 0.64 ± 0.02, min: 0.61, max: 0.67


DAY
	 selected: accuracy: 0.60 ± 0.07, min: 0.50, max: 0.67
	 paper: accuracy: 0.62 ± 0.11, min: 0.47, max: 0.72


ALL
	 selected: accuracy: 0.67 ± 0.10, min: 0.56, max: 0.79
	 paper: accuracy: 0.63 ± 0.08, min: 0.56, max: 0.74




SVM performance is noticably worse than Random Forest. The only positive is that standard deviations are much lower.

Perhaps the 3-fold cross-validation is too harsh for a classifier that requires hyperparameter tuning. Let's check 5-fold.

In [148]:
results = train_svm(5)

In [150]:
for part in ["night", "day", "all"]:
    selected_results, paper_results = results[part]["selected"], results[part]["paper"]
    selected_results, paper_results = np.array(selected_results), np.array(paper_results)
    
    print(part.upper())
    print("\t", "selected:", get_cv_raport(selected_results))
    print("\t", "paper:", get_cv_raport(paper_results))
    print("\n")

NIGHT
	 selected: accuracy: 0.71 ± 0.04, min: 0.64, max: 0.73
	 paper: accuracy: 0.65 ± 0.07, min: 0.55, max: 0.73


DAY
	 selected: accuracy: 0.55 ± 0.08, min: 0.45, max: 0.64
	 paper: accuracy: 0.56 ± 0.16, min: 0.27, max: 0.73


ALL
	 selected: accuracy: 0.65 ± 0.11, min: 0.55, max: 0.82
	 paper: accuracy: 0.65 ± 0.07, min: 0.55, max: 0.73




On some parts we got slight improvements, on others classifier performed slightly worse. Overall, SVM is still heavily underperforming compared to Random Forest.

## Overall results and discussion

The main conclusion from all experiments is that paper results are irreproducible, and such high performance has been achieved due to very small dataset and all problems that stem from it.

The approach presented there has not been properly corrected to take into accounts all the anomalies that occur while using ML algorithms on small samples, e.g. the single test set is not a good approximation of the generalization performance of the classifier. Our analysis performed above shows in detail how skewed can the results be, even when using techniques that are sufficient for typical, larget datasets.

While we were able to achieve the results close to 100% accuracy, their consistency cannot be guaranteed. The overall approach of using sensor data for mental health diagnosis, however, shows great potential. All accuracies have been high enough to justify cautious optimism, but small dataset size remain.

When enough data is gathered (at least hundreds of samples for each class), then this unique approach to depression detection can be fully utilized in real world systems. We agree with the paper about gathering exclusively night data - it showed best results also on our tests, and it is the cheapest and easiest to measure.