# Utils

In [52]:
import os
from typing import Dict, List

import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
from scipy.stats.mstats import gmean

from utils import Dataset

# Manual feature extraction

## Utilities and preprocessing

In [146]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Normalizes raw recordings so that the rest of the pipeline can rely on their column types.

    Every input DataFrame has to contain "timestamp", "date" and "activity" columns. For each one:
    - "timestamp" is parsed as datetime using the YYYY-MM-DD HH:MM:SS format
    - the "date" column is removed
    - "activity" is converted to float32

    The input DataFrames are left untouched; all changes are applied to copies.

    :param data: list of DataFrames
    :returns: list of cleaned DataFrames, in the same order as the input
    """
    # copy everything up front, so the caller's frames are never modified
    cleaned = [frame.copy() for frame in data]

    for frame in cleaned:
        frame["timestamp"] = pd.to_datetime(frame["timestamp"], format="%Y-%m-%d %H:%M:%S")
        del frame["date"]
        frame["activity"] = frame["activity"].astype(np.float32)

    return cleaned


def get_day_part(df: pd.DataFrame, part: str) -> pd.DataFrame:
    """
    Selects the rows of a DataFrame whose "timestamp" falls into the requested part of day.

    The hour of each timestamp decides which part a row belongs to:
    - "day": [8:00, 21:00)
    - "night": [21:00, 8:00)

    :param df: DataFrame with a datetime "timestamp" column
    :param part: part of day, either "day" or "night"
    :returns: DataFrame, subset of rows of df (original index labels are kept)
    :raises ValueError: if part is neither "day" nor "night"
    """
    if part not in ("day", "night"):
        raise ValueError(f'Part should be "day" or "night", got "{part}"')

    hours = df["timestamp"].dt.hour

    if part == "day":
        selected = (hours >= 8) & (hours < 21)
    else:
        # written out explicitly instead of negating the day mask, so rows with a missing
        # timestamp stay excluded from both parts
        selected = (hours >= 21) | (hours < 8)

    return df.loc[selected]


def fill_missing_activity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Puts the signal on a gap-free, one-minute time grid and fills in missing "activity" values.

    The DataFrame is resampled to minute frequency on its "timestamp" column (values sharing a minute are
    averaged), which introduces NaN rows for every minute absent from the input. Those NaNs, together with any
    NaNs already present, are replaced with the mean of the resampled "activity" column.

    :param df: DataFrame with "timestamp" and "activity" columns
    :returns: new DataFrame with "timestamp" restored as a regular column; the input is not modified
    """
    # resampling builds a new frame indexed by minute; reset_index() turns that index back into "timestamp"
    per_minute = df.resample("min", on="timestamp").mean().reset_index()

    # the mean skips NaNs, so it is computed from the observed minutes only
    mean_activity = per_minute["activity"].mean()
    per_minute["activity"] = per_minute["activity"].fillna(mean_activity)

    return per_minute


def total_power(df: pd.DataFrame) -> pd.Series:
    """
    Calculates total power for given signal. Estimates power spectral density (i.e. power spectrum) using
    Welch method and then sums it (integrates discrete signal).

    The segment length passed to Welch's method is capped at the signal length (at most 256 samples).

    The value is wrapped in a one-element Series rather than returned as a bare float; resample_in_domain()
    passes this function to the resampler's agg(), and the wrapping is kept so that its output stays unchanged.

    :param df: DataFrame with "activity" column
    :returns: one-element Series holding the total power
    """
    # Imported explicitly: the file only imports scipy.stats, which does not guarantee that the
    # scipy.signal submodule is loaded and reachable as an attribute of scipy.
    from scipy.signal import welch

    x = df["activity"].values

    # welch() returns (sample frequencies, power spectral density); only the density is needed
    _, power_spectrum = welch(x, nperseg=min(len(x), 256))

    return pd.Series(power_spectrum.sum())


def resample_in_domain(df: pd.DataFrame, domain: str, freq: str = "H") -> pd.DataFrame:
    """
    Aggregates a time series into periods of the given frequency, either in time or in frequency domain.

    The signal is first put on a gap-free minute grid with fill_missing_activity(), then split into
    periods of length "freq". Each period is reduced according to "domain":
    - "time": simple mean (average) of the period; the result keeps a "timestamp" column
    - "frequency": total power of the period, see total_power(); the timestamp index is dropped

    :param df: DataFrame with columns "timestamp" and "activity"
    :param domain: "time" or "frequency"
    :param freq: resampling frequency string passed to Pandas resample() function
    :returns: DataFrame with one row per period
    :raises ValueError: if domain is neither "time" nor "frequency"
    """
    # work on a copy with minute resolution and no missing rows or values
    gap_free = fill_missing_activity(df.copy())

    # split into periods of the requested frequency
    periods = gap_free.resample(freq, on="timestamp")

    if domain == "time":
        return periods.mean().reset_index()

    if domain == "frequency":
        # timestamps carry no meaning for power values, hence the index is discarded
        return periods.agg(total_power).reset_index(drop=True)

    raise ValueError(f'Domain should be "time" or "frequency", got "{domain}"')


def spectral_flatness(df: pd.DataFrame) -> float:
    """
    Calculates spectral flatness of a signal, i.e. a geometric mean of the power spectrum divided by
    the arithmetic mean of the power spectrum.

    The power spectrum is estimated with Welch method; the segment length is capped at the signal length
    (at most 256 samples).

    :param df: DataFrame with "activity" column
    :returns: spectral flatness value
    """
    # Imported explicitly: the file only imports scipy.stats, which does not guarantee that the
    # scipy.signal submodule is loaded and reachable as an attribute of scipy.
    from scipy.signal import welch

    x = df["activity"].values

    # welch() returns (sample frequencies, power spectral density); only the density is needed
    _, power_spectrum = welch(x, nperseg=min(len(x), 256))

    # NOTE(review): welch() removes the segment mean by default, so the zero-frequency bin is close to zero
    # and may weigh heavily on the geometric mean - confirm that this is intended.
    return gmean(power_spectrum) / power_spectrum.mean()

## Feature extraction

In [147]:
def extract_time_features(df_raw: pd.DataFrame, df_resampled: pd.DataFrame) -> pd.DataFrame:
    """
    Extracts features from activity signal in time domain.

    All statistics are computed from the "activity" column of df_resampled; df_raw is accepted for
    interface compatibility but is not used.

    :param df_raw: DataFrame with "timestamp" and "activity" columns, with raw signal values (unused)
    :param df_resampled: DataFrame resampled in time domain with another frequency
    :returns: DataFrame with a single row representing features
    """
    X = df_resampled["activity"].values

    features = {
        "minimum": np.min(X),
        "maximum": np.max(X),
        "mean": np.mean(X),
        "median": np.median(X),
        "stddev": np.std(X, ddof=1),  # ddof=1 applies Bessel correction, i.e. division by (N-1) instead of N
        "variance": np.var(X, ddof=1),  # same correction as "stddev", so that variance == stddev ** 2
        "kurtosis": sp.stats.kurtosis(X),
        "skewness": sp.stats.skew(X),
        "coeff_of_var": sp.stats.variation(X),
        "iqr": sp.stats.iqr(X),
        "trimmed_mean": sp.stats.trim_mean(X, proportiontocut=0.1),
        "entropy": sp.stats.entropy(X, base=2),  # SciPy normalizes the values to sum to 1 first
    }

    # a list with one dict produces a DataFrame with exactly one row
    return pd.DataFrame([features])

In [148]:
def extract_frequency_features(df_raw: pd.DataFrame, df_resampled: pd.DataFrame, df_frequency: pd.DataFrame) -> pd.DataFrame:
    """
    Extracts features from activity signal in frequency domain (precisely, in power domain, since it's assumed
    that df_frequency has been aggregated with total power using PSD).

    The statistics are computed from all values of df_frequency, flattened to one dimension. Spectral flatness
    is the exception: it is computed from the time-domain signal in df_resampled. df_raw is accepted for
    interface compatibility but is not used.

    :param df_raw: DataFrame with "timestamp" and "activity" columns, with raw signal values (unused)
    :param df_resampled: DataFrame resampled in time domain with another frequency
    :param df_frequency: DataFrame in frequency domain
    :returns: DataFrame with a single row representing features
    """
    X = df_frequency.values.ravel()

    features = {
        "minimum": np.min(X),
        "maximum": np.max(X),
        "mean": np.mean(X),
        "median": np.median(X),
        "stddev": np.std(X, ddof=1),  # ddof=1 applies Bessel correction, i.e. division by (N-1) instead of N
        "variance": np.var(X, ddof=1),  # same correction as "stddev", so that variance == stddev ** 2
        "kurtosis": sp.stats.kurtosis(X),
        "skewness": sp.stats.skew(X),
        "coeff_of_var": sp.stats.variation(X),
        "iqr": sp.stats.iqr(X),
        "trimmed_mean": sp.stats.trim_mean(X, proportiontocut=0.1),
        "entropy": sp.stats.entropy(X, base=2),  # SciPy normalizes the values to sum to 1 first
        "spectral_flatness": spectral_flatness(df_resampled)
    }

    # a list with one dict produces a DataFrame with exactly one row
    return pd.DataFrame([features])

In [149]:
def extract_features_for_dataframes(dfs: List[pd.DataFrame], freq: str = "H") -> Dict[str, pd.DataFrame]:
    """
    Builds time and frequency feature tables for a list of recordings.

    Every recording is cleaned with basic_data_cleaning() and then processed three times:
    - full 24hs
    - days: [8:00, 21:00)
    - nights: [21:00, 8:00)

    For each variant the signal is resampled with "freq" in both domains, the time and frequency features are
    extracted and joined into one row. Feature names shared by both domains get the "_time" and "_freq" suffixes.

    :param dfs: list of DataFrames to extract features from; each one has to have "timestamp" and "activity" columns
    :param freq: resampling frequency
    :returns: dictionary with keys "full_24h", "night" and "day"; each value has one row per input DataFrame
    """
    cleaned = basic_data_cleaning(dfs)
    nights = [get_day_part(frame, part="night") for frame in cleaned]
    days = [get_day_part(frame, part="day") for frame in cleaned]

    frames_by_part = {"full_24h": cleaned, "night": nights, "day": days}
    features_by_part = {}

    for part_name, frames in frames_by_part.items():
        rows = []
        for frame in frames:
            in_time = resample_in_domain(frame, domain="time", freq=freq)
            in_frequency = resample_in_domain(frame, domain="frequency", freq=freq)

            # both feature frames hold a single row with index 0, so joining on the index pairs them up
            row = extract_time_features(frame, in_time).merge(
                extract_frequency_features(frame, in_time, in_frequency),
                left_index=True,
                right_index=True,
                suffixes=["_time", "_freq"],
            )
            rows.append(row)

        # stack the single-row frames and renumber the rows from 0
        features_by_part[part_name] = pd.concat(rows).reset_index(drop=True)

    return features_by_part

# Depresjon

In [156]:
# Load the Depresjon recordings; Dataset comes from the project's utils module.
# Presumably .condition and .control are lists of per-participant DataFrames - verify against utils.Dataset.
dataset = Dataset(dirpath=os.path.join("data", "depresjon"))
condition, control = dataset.condition, dataset.control

In [157]:
# Extract hourly features for both groups; each result maps a part of day to a feature DataFrame.
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h", "night", "day"]:
    # pick the features of the part currently processed (not always the full 24h ones)
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]

    # condition rows first, then control rows; pd.concat replaces DataFrame.append, removed in pandas 2.0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    datasets[part] = entire_df

In [159]:
target_dir = "processed_data"

# make sure the output directory exists, so that to_csv() does not fail on a fresh checkout
os.makedirs(target_dir, exist_ok=True)

# one CSV file per part of day, e.g. processed_data/depresjon_night.csv
for part, df in datasets.items():
    filename = f"depresjon_{part}.csv"
    filepath = os.path.join(target_dir, filename)
    df.to_csv(filepath, index=False)

# Psykose

In [160]:
# Load the Psykose recordings; Dataset comes from the project's utils module.
# Presumably .condition and .control are lists of per-participant DataFrames - verify against utils.Dataset.
dataset = Dataset(dirpath=os.path.join("data", "psykose"))
condition, control = dataset.condition, dataset.control

In [161]:
# Extract hourly features for both groups; each result maps a part of day to a feature DataFrame.
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h", "night", "day"]:
    # pick the features of the part currently processed (not always the full 24h ones)
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]

    # condition rows first, then control rows; pd.concat replaces DataFrame.append, removed in pandas 2.0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    datasets[part] = entire_df

In [162]:
target_dir = "processed_data"

# make sure the output directory exists, so that to_csv() does not fail on a fresh checkout
os.makedirs(target_dir, exist_ok=True)

# one CSV file per part of day, e.g. processed_data/psykose_night.csv
for part, df in datasets.items():
    filename = f"psykose_{part}.csv"
    filepath = os.path.join(target_dir, filename)
    df.to_csv(filepath, index=False)