# Utils

In [1]:
import os
from typing import Dict, List

import numpy as np
import pandas as pd
import scipy as sp
import scipy.signal
import scipy.stats
from scipy.stats.mstats import gmean

from utils import Dataset

# Manual feature extraction

## Utilities and preprocessing

In [47]:
# Parameters for Welch's method of estimating the power spectrum.
# All three are passed to scipy.signal.welch() by the spectral helpers in this file.

NPERSEG = 60                    # length of each segment, in samples
NOVERLAP = int(0.75 * NPERSEG)  # overlap of consecutive segments: 75% of segment length
NFFT = NPERSEG                  # length of FFT; same as segment length, so no zero-padding

In [48]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Normalizes raw recordings. Each DataFrame is expected to have "timestamp", "date" and
    "activity" columns.

    For every DataFrame:
    - "timestamp" is parsed to datetime, using the YYYY-MM-DD HH:MM:SS format
    - the redundant "date" column is removed
    - "activity" is cast to float32

    The input DataFrames are not modified; all operations are performed on copies.

    :param data: list of DataFrames
    :returns: list of cleaned DataFrames, in the same order as the input
    """
    cleaned = []

    for original in data:
        frame = original.copy()  # work on a copy, so the caller's DataFrame stays intact

        frame["timestamp"] = pd.to_datetime(frame["timestamp"], format="%Y-%m-%d %H:%M:%S")
        del frame["date"]
        frame["activity"] = frame["activity"].astype(np.float32)

        cleaned.append(frame)

    return cleaned


def get_day_part(df: pd.DataFrame, part: str) -> pd.DataFrame:
    """
    Selects rows of a DataFrame that were recorded during the requested part of day, based on
    the hour of the "timestamp" column.

    Supported parts:
    - "day": hours [8:00, 21:00)
    - "night": hours [21:00, 8:00)

    :param df: DataFrame with "timestamp" column
    :param part: part of day, either "day" or "night"
    :returns: DataFrame with the matching subset of rows of df
    :raises ValueError: if part is neither "day" nor "night"
    """
    # reject unknown parts first, before touching the data
    if part not in ("day", "night"):
        raise ValueError(f'Part should be "day" or "night", got "{part}"')

    hours = df["timestamp"].dt.hour

    if part == "day":
        mask = (hours >= 8) & (hours < 21)
    else:
        # night wraps around midnight, hence "or" instead of "and"
        mask = (hours >= 21) | (hours < 8)

    return df.loc[mask]


def fill_missing_activity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Puts the recording on a regular one-minute grid, spanning from its first to its last timestamp,
    and replaces every NaN in "activity" column with the mean activity value.

    The input DataFrame is not modified.

    :param df: DataFrame with "timestamp" and "activity" columns
    :returns: DataFrame with "timestamp" and "activity" columns and no gaps
    """
    # minute is the basic frequency of the data; minutes absent from df appear as NaN rows,
    # and reset_index() turns the resulting index back into the "timestamp" column
    per_minute = df.resample("min", on="timestamp").mean().reset_index()

    # the mean is calculated from known values only, since NaNs are skipped
    mean_activity = per_minute["activity"].mean()
    per_minute["activity"] = per_minute["activity"].fillna(mean_activity)

    return per_minute


def resample(df: pd.DataFrame, freq: str = "H") -> pd.DataFrame:
    """
    Aggregates a time series DataFrame into segments of given frequency, replacing each segment
    with its mean.

    The input DataFrame is not modified.

    :param df: DataFrame with "timestamp" and "activity" columns
    :param freq: frequency string, passed as is to Pandas resample() function
    :returns: DataFrame with "timestamp" and "activity" columns, one row per segment
    """
    segment_means = df.resample(freq, on="timestamp").mean()

    # resampling moves "timestamp" to the index; bring it back as a regular column
    return segment_means.reset_index()


def proportion_of_zeros(x: np.ndarray) -> float:
    """
    Calculates what fraction of elements of the array are zeros, i.e. the number of zeros
    divided by the length of the array.

    :param x: 1D Numpy array
    :returns: proportion of zeros
    """
    # values are floating point numbers, so zeros are detected with a tolerance
    # instead of an exact comparison
    is_zero = np.isclose(x, 0)
    return is_zero.sum() / len(x)


def power_spectral_density(df: pd.DataFrame) -> np.ndarray:
    """
    Estimates power spectral density (PSD) of the "activity" column of a DataFrame, using
    Welch's method configured with the module-level NPERSEG, NOVERLAP and NFFT parameters.

    :param df: DataFrame with "activity" column
    :returns: 1D Numpy array with power spectral density
    """
    signal = df["activity"].values

    # welch() returns a pair (sample frequencies, spectrum); only the spectrum is needed
    _, psd = scipy.signal.welch(
        x=signal,
        fs=1 / 60,  # sampling frequency passed to welch()
        nperseg=NPERSEG,
        noverlap=NOVERLAP,
        nfft=NFFT,
        scaling="density",
    )

    return psd


def spectral_flatness(df: pd.DataFrame) -> float:
    """
    Calculates spectral flatness of the "activity" signal, defined as the ratio of the geometric
    mean of the power spectrum to its arithmetic mean.

    The power spectrum is estimated with Welch's method. Frequency bins with power close to zero
    are dropped before the means are calculated, since the geometric mean would otherwise require
    log(0).

    :param df: DataFrame with "activity" column
    :returns: spectral flatness value
    """
    signal = df["activity"].values

    # welch() returns a pair (sample frequencies, spectrum); only the spectrum is needed
    _, spectrum = scipy.signal.welch(
        signal,
        fs=1 / 60,
        nperseg=NPERSEG,
        noverlap=NOVERLAP,
        nfft=NFFT,
        scaling="spectrum",
    )

    # keep only bins that are not (numerically) zero
    spectrum = spectrum[~np.isclose(spectrum, 0)]

    geometric_mean = gmean(spectrum)
    arithmetic_mean = spectrum.mean()

    return geometric_mean / arithmetic_mean

## Feature extraction

In [49]:
def extract_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarizes the activity signal with statistics calculated in time domain, i.e. directly from
    values of the "activity" column.

    :param df: DataFrame with "activity" column
    :returns: DataFrame with a single row, where each column is one feature
    """
    activity = df["activity"].values
    stats = sp.stats

    # keys are inserted in the order in which the columns should appear in the result
    row = {}

    row["minimum"] = np.min(activity)
    row["maximum"] = np.max(activity)
    row["mean"] = np.mean(activity)
    row["median"] = np.median(activity)
    row["variance"] = np.var(activity, ddof=1)  # sample variance (Bessel's correction, division by N-1)
    row["kurtosis"] = stats.kurtosis(activity)
    row["skewness"] = stats.skew(activity)
    row["coeff_of_var"] = stats.variation(activity)
    row["iqr"] = stats.iqr(activity)
    row["trimmed_mean"] = stats.trim_mean(activity, proportiontocut=0.1)
    row["entropy"] = stats.entropy(activity, base=2)
    row["proportion_of_zeros"] = proportion_of_zeros(activity)

    return pd.DataFrame([row])

In [50]:
def extract_frequency_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarizes the activity signal with statistics calculated in frequency domain, i.e. from
    its Power Spectral Density (PSD), plus the spectral flatness of the signal.

    :param df: DataFrame with "activity" column
    :returns: DataFrame with a single row, where each column is one feature
    """
    psd = power_spectral_density(df)
    stats = sp.stats

    # keys are inserted in the order in which the columns should appear in the result
    row = {}

    row["minimum"] = np.min(psd)
    row["maximum"] = np.max(psd)
    row["mean"] = np.mean(psd)
    row["median"] = np.median(psd)
    row["variance"] = np.var(psd)  # Numpy default, i.e. division by N (no Bessel's correction)
    row["kurtosis"] = stats.kurtosis(psd)
    row["skewness"] = stats.skew(psd)
    row["coeff_of_var"] = stats.variation(psd)
    row["iqr"] = stats.iqr(psd)
    row["trimmed_mean"] = stats.trim_mean(psd, proportiontocut=0.1)
    row["entropy"] = stats.entropy(psd, base=2)

    # flatness is calculated from the signal itself, not from the PSD estimated above
    row["spectral_flatness"] = spectral_flatness(df)

    return pd.DataFrame([row])

In [51]:
def extract_features_for_dataframes(dfs: List[pd.DataFrame], freq: str = "H") -> Dict[str, pd.DataFrame]:
    """
    Extracts time and frequency features from each of given DataFrames, separately for three
    parts of day:
    - full 24hs
    - nights: [21:00, 8:00)
    - days: [8:00, 21:00)

    Time features are calculated from the signal resampled with given frequency; frequency
    features are calculated from the minute-resolution signal.

    :param dfs: list of DataFrames to extract features from; each one has to have "timestamp" and "activity" columns
    :param freq: resampling frequency
    :returns: dictionary with keys "full_24h", "night" and "day"; each value is a DataFrame of features with one row
        per input DataFrame
    """
    cleaned = basic_data_cleaning(dfs)

    frames_by_part = {
        "full_24h": cleaned,
        "night": [get_day_part(frame, part="night") for frame in cleaned],
        "day": [get_day_part(frame, part="day") for frame in cleaned],
    }

    datasets = {}

    for part, frames in frames_by_part.items():
        rows = []

        for frame in frames:
            per_minute = fill_missing_activity(frame)
            aggregated = resample(per_minute, freq=freq)

            # both feature sets are single-row DataFrames with index 0, so they are joined on index;
            # suffixes tell apart features with the same name
            merged = pd.merge(
                extract_time_features(aggregated),
                extract_frequency_features(per_minute),
                left_index=True,
                right_index=True,
                suffixes=["_time", "_freq"],
            )
            rows.append(merged)

        # every row carries index 0, so renumber rows after stacking them
        datasets[part] = pd.concat(rows).reset_index(drop=True)

    return datasets

# Depresjon

In [52]:
# load the Depresjon dataset and get both of its groups
dataset = Dataset(dirpath=os.path.join("data", "depresjon"))
condition, control = dataset.condition, dataset.control

In [53]:
# preview the first element of the condition group
condition[0]

Unnamed: 0,timestamp,date,activity
0,2003-05-07 12:00:00,2003-05-07,0
1,2003-05-07 12:01:00,2003-05-07,143
2,2003-05-07 12:02:00,2003-05-07,0
3,2003-05-07 12:03:00,2003-05-07,20
4,2003-05-07 12:04:00,2003-05-07,166
...,...,...,...
23239,2003-05-23 15:19:00,2003-05-23,0
23240,2003-05-23 15:20:00,2003-05-23,0
23241,2003-05-23 15:21:00,2003-05-23,0
23242,2003-05-23 15:22:00,2003-05-23,0


In [54]:
# extract features of both groups; each result is a dictionary with features
# for "full_24h", "night" and "day" parts of day
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h", "night", "day"]:
    # take features of the current part of day; indexing with a fixed "full_24h" key here
    # would make all three datasets identical
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]

    # condition rows go first, control rows second; pd.concat() is used, since
    # DataFrame.append() was removed in Pandas 2.0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    datasets[part] = entire_df

In [55]:
target_dir = "processed_data"

# to_csv() does not create missing directories, so make sure the target one exists
os.makedirs(target_dir, exist_ok=True)

# save each part of day to a separate CSV file
for part, df in datasets.items():
    filename = f"manual_depresjon_{part}.csv"
    filepath = os.path.join(target_dir, filename)
    df.to_csv(filepath, index=False)

In [56]:
# preview the features stored under the "full_24h" key
datasets["full_24h"]

Unnamed: 0,minimum_time,maximum_time,mean_time,median_time,variance_time,kurtosis_time,skewness_time,coeff_of_var_time,iqr_time,trimmed_mean_time,...,mean_freq,median_freq,variance_freq,kurtosis_freq,skewness_freq,coeff_of_var_freq,iqr_freq,trimmed_mean_freq,entropy_freq,spectral_flatness
0,0.0,1775.400024,146.843735,79.099998,39430.636719,16.872655,3.217538,1.35052,214.308335,112.766991,...,5247733.0,2438089.0,38050110000000.0,4.74283,2.337443,1.175456,3440097.0,3708409.0,4.271075,0.649165
1,0.0,1184.533325,289.170868,261.433319,73694.851562,-0.23259,0.710244,0.937476,457.241667,258.745453,...,10281790.0,5953788.0,157440400000000.0,6.699326,2.678014,1.220364,6005661.0,7067223.0,4.264182,0.660929
2,0.0,950.25,129.167282,50.816666,27766.009766,2.877282,1.628733,1.288362,204.808334,98.616768,...,5968502.0,2210280.0,85777970000000.0,8.177933,2.938108,1.551753,3372133.0,3515024.0,3.938916,0.537185
3,0.0,1059.31665,151.091476,81.733337,29531.572266,1.933566,1.321734,1.135836,252.179164,123.827675,...,5114222.0,2647666.0,35081880000000.0,4.627537,2.338917,1.158142,3204998.0,3612938.0,4.296206,0.667264
4,0.0,906.483337,221.277527,178.983337,46180.472656,-0.362119,0.730967,0.970038,362.500005,196.210083,...,8363174.0,4522860.0,79345510000000.0,4.904929,2.284223,1.065099,6104334.0,6306054.0,4.368708,0.686094
5,0.0,719.416687,75.061485,42.591667,8648.376953,9.114864,2.439158,1.237227,102.891665,58.222412,...,2850201.0,1023185.0,16144950000000.0,6.23176,2.609037,1.409752,2051578.0,1795513.0,4.044027,0.561175
6,0.0,1099.25,109.48111,52.458332,20248.443359,9.519291,2.511456,1.297954,161.033337,82.409325,...,3818881.0,2039797.0,20330300000000.0,7.447136,2.765593,1.18069,2252710.0,2708095.0,4.307993,0.680214
7,0.0,1501.800049,146.327271,5.541667,54861.113281,3.348586,1.84038,1.599542,243.775002,96.446457,...,5626628.0,2646977.0,47518730000000.0,6.645008,2.651986,1.225136,3841573.0,3893814.0,4.253733,0.653364
8,0.0,1046.016724,85.663643,69.050003,8097.554199,34.586817,3.794976,1.049002,118.437495,74.442741,...,2597630.0,1166017.0,12193240000000.0,8.875479,2.992579,1.344257,1928424.0,1722382.0,4.164455,0.62446
9,0.0,389.283325,70.900734,64.666664,3598.278809,4.015282,1.52417,0.844865,78.016665,63.697311,...,1695482.0,1011177.0,2517059000000.0,4.227085,2.222737,0.935737,917585.4,1319559.0,4.498018,0.756447


# Psykose

In [58]:
# load the Psykose dataset and get both of its groups
dataset = Dataset(dirpath=os.path.join("data", "psykose"))
condition, control = dataset.condition, dataset.control

In [59]:
# extract features of both groups; each result is a dictionary with features
# for "full_24h", "night" and "day" parts of day
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h", "night", "day"]:
    # take features of the current part of day; indexing with a fixed "full_24h" key here
    # would make all three datasets identical
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]

    # condition rows go first, control rows second; pd.concat() is used, since
    # DataFrame.append() was removed in Pandas 2.0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    datasets[part] = entire_df

In [60]:
target_dir = "processed_data"

# to_csv() does not create missing directories, so make sure the target one exists
os.makedirs(target_dir, exist_ok=True)

# save each part of day to a separate CSV file
for part, df in datasets.items():
    filename = f"manual_psykose_{part}.csv"
    filepath = os.path.join(target_dir, filename)
    df.to_csv(filepath, index=False)