# Utils

In [167]:
import os
from typing import Dict, List

import numpy as np
import pandas as pd
import scipy as sp
import scipy.signal
import scipy.stats
from scipy.stats.mstats import gmean

from utils import Dataset

# Manual feature extraction

## Utilities and preprocessing

In [168]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Normalizes raw recordings that carry "timestamp", "date" and "activity" columns.

    For every DataFrame a new, cleaned DataFrame is built (the inputs are left untouched):
    - "timestamp" is parsed as datetime using the YYYY-MM-DD HH:MM:SS format
    - the redundant "date" column is removed
    - "activity" is cast to float32

    :param data: list of DataFrames
    :returns: list of cleaned DataFrames, in the same order as the input
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    # drop() and assign() both return new objects, so the caller's frames are never modified
    return [
        raw.drop(columns="date").assign(
            timestamp=lambda frame: pd.to_datetime(frame["timestamp"], format=timestamp_format),
            activity=lambda frame: frame["activity"].astype(np.float32),
        )
        for raw in data
    ]


def get_day_part(df: pd.DataFrame, part: str) -> pd.DataFrame:
    """
    Selects the rows of a DataFrame whose "timestamp" falls into the requested part of day.

    Supported parts:
    - "day": hours in [8:00, 21:00)
    - "night": hours in [21:00, 8:00)

    :param df: DataFrame with a datetime "timestamp" column
    :param part: part of day, either "day" or "night"
    :returns: DataFrame, subset of rows of df
    :raises ValueError: if part is neither "day" nor "night"
    """
    # reject unknown parts before touching the data
    if part not in ("day", "night"):
        raise ValueError(f'Part should be "day" or "night", got "{part}"')

    hour = df["timestamp"].dt.hour

    if part == "day":
        keep = (hour >= 8) & (hour < 21)
    else:
        # night wraps around midnight, hence the union of two ranges
        keep = (hour < 8) | (hour >= 21)

    return df.loc[keep]


def fill_missing_activity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Puts the recording on a gap-free one-minute grid and imputes missing activity values.

    The data is resampled to minute frequency on "timestamp" (aggregating with a mean), which
    introduces NaN rows for every minute absent between the first and the last timestamp. All
    NaNs in "activity" are then replaced with the mean of the resampled "activity" values.

    :param df: DataFrame with "timestamp" and "activity" columns
    :returns: new DataFrame with "timestamp" restored as a regular column
    """
    # resample() returns a new object, so the input DataFrame is not modified;
    # reset_index() turns the resampled index back into a "timestamp" column
    per_minute = df.resample("min", on="timestamp").mean().reset_index()

    # mean() skips NaNs, so the fill value is the average of the observed minutes
    mean_activity = per_minute["activity"].mean()
    per_minute["activity"] = per_minute["activity"].fillna(mean_activity)

    return per_minute


def resample(df: pd.DataFrame, freq: str = "H") -> pd.DataFrame:
    """
    Aggregates a time series DataFrame into segments of the given frequency, using the mean of each segment.

    Before aggregation the data is passed through fill_missing_activity(), so that it has minute resolution
    without missing rows or missing values.

    :param df: DataFrame with "timestamp" and "activity" columns
    :param freq: resampling frequency string passed to Pandas resample() function
    :returns: DataFrame with "timestamp" and "activity" columns
    """
    gap_free = fill_missing_activity(df)

    # aggregate per segment, then move the segment start from the index back into a "timestamp" column
    return gap_free.resample(freq, on="timestamp").mean().reset_index()


def power_spectral_density(df: pd.DataFrame) -> np.ndarray:
    """
    Estimates the power spectral density (PSD) of the "activity" column with a periodogram.

    scipy.signal.periodogram() is called with its default settings; only the power values are
    returned, the accompanying sample frequencies are discarded.

    :param df: DataFrame with "activity" column
    :returns: 1D Numpy array with power spectral density
    """
    signal = df["activity"].values
    _frequencies, power = scipy.signal.periodogram(signal)
    return power


def spectral_flatness(df: pd.DataFrame) -> float:
    """
    Computes the spectral flatness of the "activity" signal: the geometric mean of its power spectrum
    divided by the arithmetic mean of its power spectrum.

    Frequency bins whose power is close to zero are discarded before the means are taken, so that
    log(0) is never evaluated inside the geometric mean.

    :param df: DataFrame with "activity" column
    :returns: spectral flatness value
    """
    # a small constant offset is added to the signal before the periodogram is computed
    shifted_signal = df["activity"].values + 1e-3
    _frequencies, power = scipy.signal.periodogram(shifted_signal)

    # keep only the bins that are not numerically zero
    is_zero = np.isclose(power, 0)
    power = power[np.logical_not(is_zero)]

    geometric_mean = gmean(power)
    arithmetic_mean = power.mean()
    return geometric_mean / arithmetic_mean

## Feature extraction

In [169]:
def extract_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes summary statistics of the activity signal in the time domain.

    :param df: DataFrame with "activity" column
    :returns: DataFrame with a single row representing features, one column per statistic
    """
    signal = df["activity"].values

    # (feature name, function) pairs; their order defines the column order of the result
    # NOTE(review): "stddev" applies Bessel correction (ddof=1) while "variance" uses NumPy's default
    # (ddof=0), so variance != stddev ** 2 -- confirm whether this is intended
    extractors = (
        ("minimum", np.min),
        ("maximum", np.max),
        ("mean", np.mean),
        ("median", np.median),
        ("stddev", lambda values: np.std(values, ddof=1)),  # division by (N-1) instead of N
        ("variance", np.var),
        ("kurtosis", sp.stats.kurtosis),
        ("skewness", sp.stats.skew),
        ("coeff_of_var", sp.stats.variation),
        ("iqr", sp.stats.iqr),
        ("trimmed_mean", lambda values: sp.stats.trim_mean(values, proportiontocut=0.1)),
        ("entropy", lambda values: sp.stats.entropy(values, base=2)),
    )

    feature_row = {name: compute(signal) for name, compute in extractors}

    return pd.DataFrame([feature_row])

In [170]:
def extract_frequency_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes summary statistics of the activity signal in the frequency domain, i.e. statistics of its
    Power Spectral Density (PSD), plus the spectral flatness of the signal.

    :param df: DataFrame with "activity" column
    :returns: DataFrame with a single row representing features, one column per statistic
    """
    spectrum = power_spectral_density(df)

    # (feature name, function) pairs applied to the PSD; their order defines the column order
    # NOTE(review): "stddev" applies Bessel correction (ddof=1) while "variance" uses NumPy's default
    # (ddof=0), so variance != stddev ** 2 -- confirm whether this is intended
    extractors = (
        ("minimum", np.min),
        ("maximum", np.max),
        ("mean", np.mean),
        ("median", np.median),
        ("stddev", lambda values: np.std(values, ddof=1)),  # division by (N-1) instead of N
        ("variance", np.var),
        ("kurtosis", sp.stats.kurtosis),
        ("skewness", sp.stats.skew),
        ("coeff_of_var", sp.stats.variation),
        ("iqr", sp.stats.iqr),
        ("trimmed_mean", lambda values: sp.stats.trim_mean(values, proportiontocut=0.1)),
        ("entropy", lambda values: sp.stats.entropy(values, base=2)),
    )

    feature_row = {name: compute(spectrum) for name, compute in extractors}

    # spectral flatness is derived from the signal itself (it computes its own periodogram)
    feature_row["spectral_flatness"] = spectral_flatness(df)

    return pd.DataFrame([feature_row])

In [171]:
def extract_features_for_dataframes(dfs: List[pd.DataFrame], freq: str = "H") -> Dict[str, pd.DataFrame]:
    """
    Builds time-domain and frequency-domain feature tables for the given recordings.

    Every recording is cleaned, optionally restricted to a part of day, resampled with the given
    frequency and turned into a single feature row. Feature tables are produced separately for:
    - full 24hs
    - nights: [21:00, 8:00)
    - days: [8:00, 21:00)

    Feature names shared by the time and frequency domains get "_time" / "_freq" suffixes.

    :param dfs: list of DataFrames to extract features from; each one has to have "timestamp" and "activity" columns
    :param freq: resampling frequency
    :returns: dictionary with keys "full_24h", "night" and "day", each mapping to a DataFrame with one row
        per input DataFrame
    """
    cleaned = basic_data_cleaning(dfs)

    frames_by_part = {
        "full_24h": cleaned,
        "night": [get_day_part(frame, part="night") for frame in cleaned],
        "day": [get_day_part(frame, part="day") for frame in cleaned],
    }

    result = {}
    for part_name, frames in frames_by_part.items():
        rows = []
        for frame in frames:
            resampled = resample(frame, freq=freq)

            time_row = extract_time_features(resampled)
            freq_row = extract_frequency_features(resampled)

            # both rows are single-row frames with index 0, so an index merge places them side by side
            rows.append(
                time_row.merge(
                    freq_row,
                    left_index=True,
                    right_index=True,
                    suffixes=["_time", "_freq"],
                )
            )

        # stack the per-recording rows and renumber them 0..N-1
        result[part_name] = pd.concat(rows).reset_index(drop=True)

    return result

# Depresjon

In [172]:
# Load the Depresjon recordings and split them into the condition and control groups.
dataset = Dataset(dirpath=os.path.join("data", "depresjon"))
condition, control = dataset.condition, dataset.control

In [173]:
# Preview the first element of `condition` (rendered as the cell output below).
condition[0]

Unnamed: 0,timestamp,date,activity
0,2003-05-07 12:00:00,2003-05-07,0
1,2003-05-07 12:01:00,2003-05-07,143
2,2003-05-07 12:02:00,2003-05-07,0
3,2003-05-07 12:03:00,2003-05-07,20
4,2003-05-07 12:04:00,2003-05-07,166
...,...,...,...
23239,2003-05-23 15:19:00,2003-05-23,0
23240,2003-05-23 15:20:00,2003-05-23,0
23241,2003-05-23 15:21:00,2003-05-23,0
23242,2003-05-23 15:22:00,2003-05-23,0


In [174]:
# Extract hourly-resampled features for both groups; each result maps a part of day
# ("full_24h", "night", "day") to a feature table.
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h", "night", "day"]:
    # index by the loop variable so that every part of day gets its own features
    # (previously "full_24h" was hard-coded, making all three datasets identical)
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]

    # condition rows first, then control rows; DataFrame.append() was removed in pandas 2.0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    datasets[part] = entire_df

In [175]:
# Write one CSV file per part of day with the combined Depresjon features.
target_dir = "processed_data"

# make sure the output directory exists, otherwise to_csv() fails on a fresh checkout
os.makedirs(target_dir, exist_ok=True)

for part, df in datasets.items():
    filename = f"depresjon_{part}.csv"
    filepath = os.path.join(target_dir, filename)
    df.to_csv(filepath, index=False)

In [176]:
# Display the combined feature table stored under the "full_24h" key (rendered as the cell output below).
datasets["full_24h"]

Unnamed: 0,minimum_time,maximum_time,mean_time,median_time,stddev_time,variance_time,kurtosis_time,skewness_time,coeff_of_var_time,iqr_time,...,median_freq,stddev_freq,variance_freq,kurtosis_freq,skewness_freq,coeff_of_var_freq,iqr_freq,trimmed_mean_freq,entropy_freq,spectral_flatness
0,0.0,1775.400024,146.843735,79.099998,198.571487,39329.011719,16.872655,3.217538,1.35052,214.308335,...,22546.435547,340280.6,115197100000.0,164.046255,12.49582,4.337211,60796.433594,37670.988281,5.405469,0.243786
1,0.0,1184.533325,289.170868,261.433319,271.467957,73490.140625,-0.23259,0.710244,0.937476,457.241667,...,28367.388672,952618.9,902469000000.0,170.321443,13.024874,6.499249,71857.02832,47538.242188,4.264601,0.204497
2,0.0,950.25,129.167282,50.816666,166.631363,27693.703125,2.877282,1.628733,1.288362,204.808334,...,20504.015625,200350.4,39932310000.0,116.724815,10.232612,3.626663,40307.733398,26700.503906,5.600876,0.320028
3,0.0,1059.31665,151.091476,81.733337,171.847519,29451.755859,1.933566,1.321734,1.135836,252.179164,...,18322.316406,223044.5,49481370000.0,119.111967,10.38621,3.796828,34192.206055,25793.839844,5.408532,0.247478
4,0.0,906.483337,221.277527,178.983337,214.896423,46073.574219,-0.362119,0.730967,0.970038,362.500005,...,27314.263672,350045.5,121967200000.0,128.367432,10.54202,3.80755,44623.34375,35685.039062,5.525689,0.251693
5,0.0,719.416687,75.061485,42.591667,92.996651,8624.487305,9.114864,2.439158,1.237227,102.891665,...,8164.378418,51446.82,2632232000.0,142.62792,11.469247,2.990829,15305.28656,10445.613281,6.02136,0.392794
6,0.0,1099.25,109.48111,52.458332,142.297028,20192.816406,9.519291,2.511456,1.297954,161.033337,...,21084.615234,75515.19,5671383000.0,57.125941,6.599064,1.874983,34535.072266,25825.099609,6.411524,0.387154
7,0.0,1501.800049,146.327271,5.541667,234.224487,54782.515625,3.348586,1.84038,1.599542,243.775002,...,27432.849609,625042.6,389562000000.0,153.950392,12.053959,5.71293,49383.888672,33975.730469,5.3861,0.198311
8,0.0,1046.016724,85.663643,69.050003,89.986412,8075.061035,34.586817,3.794976,1.049002,118.437495,...,9559.854492,51982.25,2687225000.0,153.120894,12.094863,3.227619,13701.13208,10606.317383,6.046891,0.455347
9,0.0,389.283325,70.900734,64.666664,59.985657,3588.199463,4.015282,1.52417,0.844865,78.016665,...,4405.757324,9792.058,95348740.0,27.1547,4.275147,1.364475,6947.679871,5328.163574,6.648716,0.509717


# Psykose

In [177]:
# Load the Psykose recordings and split them into the condition and control groups.
dataset = Dataset(dirpath=os.path.join("data", "psykose"))
condition, control = dataset.condition, dataset.control

In [178]:
# Extract hourly-resampled features for both groups; each result maps a part of day
# ("full_24h", "night", "day") to a feature table.
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h", "night", "day"]:
    # index by the loop variable so that every part of day gets its own features
    # (previously "full_24h" was hard-coded, making all three datasets identical)
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]

    # condition rows first, then control rows; DataFrame.append() was removed in pandas 2.0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    datasets[part] = entire_df

In [179]:
# Write one CSV file per part of day with the combined Psykose features.
target_dir = "processed_data"

# make sure the output directory exists, otherwise to_csv() fails on a fresh checkout
os.makedirs(target_dir, exist_ok=True)

for part, df in datasets.items():
    filename = f"psykose_{part}.csv"
    filepath = os.path.join(target_dir, filename)
    df.to_csv(filepath, index=False)