# 2. Calculate normalized stats
In this notebook we:
- Load the DPS curves that were collected in step 1.
- Calculate stats over regular windows.

## Setup

In [None]:
# Installs
import sys
!echo "Purging pip environment and installing packages..."
!{sys.executable} -m pip cache purge 
!{sys.executable} -m pip uninstall -y jhutils 
!{sys.executable} -m pip install -q git+https://github.com/jdchart/jh-py-utils.git

# Imports
print("Importing packages...")
import os
from jhutils.local_files import collect_files
import dps
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import utils
print("Ready!")

## Load data

In [None]:
# Input locations: the folder of JSON analysis files, and the folder holding
# the DPS curves collected in step 1.
VOSK_ANALYSES = "/Users/jacob/Documents/Repos/dps/projects/data/Fusion/input"
DPS_CURVES = "/Users/jacob/Documents/Repos/dps/projects/data/output/dps_curves/dps_curve_32_1920_20"

# Gather the analysis files (presumably filtered on the "json" extension -
# confirm against jhutils.local_files.collect_files) and report how many were found.
analysis_files = collect_files(VOSK_ANALYSES, ["json"])
print("Succesfully found {} files!".format(len(analysis_files)))

## Analysis config
- `WINDOWS`: The number of windows each curve is divided into. Every curve is reduced proportionally so that it ends up with this many values, whatever its original length.

In [None]:
# Number of windows each curve is split into (one row of stats per window).
WINDOWS = 150
# Samples per second of the curves (presumably frames per second); used to
# convert sample indices into seconds and passed to dps.SpeechAnalysis.
FPS = 32
# Folder that receives one CSV of window stats per analysed file.
OUTPUT_DEST = "/Users/jacob/Documents/Repos/dps/projects/data/output/norm-stats/dps_curve_32_1920_20"
# Bin edges handed to np.histogram for the per-window value distribution.
BINS = [0, 1, 2, 3, 5]
# Column-name suffixes for the bins above: one label per consecutive pair of edges.
BIN_LABELS = ["[0–1[", "[1–2[", "[2–3[", "[3–5]"]

## Test on one occurrence
`segment_nonzero = segment[segment > 0]`: We do this in order to exclude moments of silence (zero values) from the stats. This could be refined by setting a minimum duration of successive zeros.

In [None]:
INDEX = 0

# Select one analysis file and load the DPS curve stored under the same base name.
source_file = analysis_files[INDEX]
data_file_name = os.path.splitext(os.path.basename(source_file))[0]
curve = np.load(os.path.join(DPS_CURVES, "data", f"{data_file_name}.npy"))
speech_recognition = dps.SpeechAnalysis(source_file, fps = FPS)

# Work on a float64 copy of the curve.
data = np.array(curve, dtype=np.float64)
# data = np.nan_to_num(data, nan=0.0)

total_len = len(data)
points_per_window = total_len // WINDOWS


def _clock(seconds):
    """Format a duration given in seconds as a zero-padded MM:SS string."""
    return f"{int(seconds // 60):02}:{int(seconds % 60):02}"


window_stats = []
for w in range(WINDOWS):
    # Every window holds points_per_window samples; the final window also
    # takes whatever remains after the integer division.
    start = w * points_per_window
    end = total_len if w == WINDOWS - 1 else start + points_per_window

    segment = data[start:end]
    # Keep strictly positive samples only, so zero values do not weigh on the stats.
    segment_nonzero = segment[segment > 0]
    has_values = len(segment_nonzero) > 0

    start_time, end_time = start / FPS, end / FPS

    # Identification and timing of the window.
    stats = {
        "file": os.path.basename(source_file),
        "window": w,
        "start_time_s": round(start_time, 2),
        "end_time_s": round(end_time, 2),
        "start_time_mn": _clock(start_time),
        "end_time_mn": _clock(end_time),
        "count_words": speech_recognition._count_words_region_ms(start_time * 1000, end_time * 1000),
    }

    # Basic statistics; an empty window reports 0 for each of them.
    for stat_name, reducer in (("mean", np.mean), ("std", np.std), ("min", np.min), ("max", np.max)):
        stats[stat_name] = reducer(segment_nonzero) if has_values else 0

    # Distribution over the bins: raw count and share of the non-zero samples.
    bin_counts, _ = np.histogram(segment_nonzero, bins=BINS)
    for label, count in zip(BIN_LABELS, bin_counts):
        stats[f"count {label}"] = count
        if has_values:
            stats[f"perc {label}"] = count / len(segment_nonzero) * 100
        else:
            stats[f"perc {label}"] = 0

    window_stats.append(stats)

df_windows = pd.DataFrame(window_stats).round(2)
display(df_windows.head(20))

## Process all

In [None]:
# Create the output folder once, up front, instead of on every iteration.
os.makedirs(OUTPUT_DEST, exist_ok = True)

for i, source_file in enumerate(analysis_files):
    print(f"Treating file {i + 1}/{len(analysis_files)}...")

    # The curve collected in step 1 is stored under the analysis file's base name.
    source_name = os.path.basename(source_file)
    data_file_name = os.path.splitext(source_name)[0]
    curve = np.load(os.path.join(DPS_CURVES, "data", f"{data_file_name}.npy"))
    speech_recognition = dps.SpeechAnalysis(source_file, fps = FPS)

    # Convert DPS data to float64. NaN values are not replaced: the
    # `segment > 0` filter below is False for NaN, so they are dropped there.
    data = np.array(curve, dtype=np.float64)

    total_len = len(data)
    points_per_window = total_len // WINDOWS

    window_stats = []
    for w in range(WINDOWS):
        # Each window spans points_per_window samples; the last one runs to the
        # end of the curve and so also absorbs the integer-division remainder.
        start = w * points_per_window
        end = (w + 1) * points_per_window if w < WINDOWS - 1 else total_len
        segment = data[start:end]
        # Keep strictly positive samples only, so zero values do not weigh on the stats.
        segment_nonzero = segment[segment > 0]
        n_nonzero = len(segment_nonzero)

        start_time = start / FPS
        end_time = end / FPS

        # Basic statistics (all reported as 0 when the window has no positive sample).
        stats = {
            "file": source_name,
            "window": w,
            "start_time_s": round(start_time, 2),
            "end_time_s": round(end_time, 2),
            "start_time_mn": f"{int(start_time // 60):02}:{int(start_time % 60):02}",
            "end_time_mn": f"{int(end_time // 60):02}:{int(end_time % 60):02}",
            "count_words": speech_recognition._count_words_region_ms(start_time * 1000, end_time * 1000),
            "mean": np.mean(segment_nonzero) if n_nonzero > 0 else 0,
            "std": np.std(segment_nonzero) if n_nonzero > 0 else 0,
            "min": np.min(segment_nonzero) if n_nonzero > 0 else 0,
            "max": np.max(segment_nonzero) if n_nonzero > 0 else 0
        }

        # Distribution over the bins: raw count and percentage of the non-zero samples.
        bin_counts, _ = np.histogram(segment_nonzero, bins=BINS)
        for label, count in zip(BIN_LABELS, bin_counts):
            stats[f"count {label}"] = count
            stats[f"perc {label}"] = (count / n_nonzero * 100) if n_nonzero > 0 else 0

        window_stats.append(stats)

    # One CSV per analysed file, named after the file's base name.
    df_windows = pd.DataFrame(window_stats).round(2)
    df_windows.to_csv(os.path.join(OUTPUT_DEST, f'{data_file_name}.csv'), index = True)

print("👍 Finished!")

# Process manual data

In [None]:
MANUAL_SOURCE = "/Users/jacob/Documents/Repos/dps/projects/data/output/dps_curves/dps_curve_LMI_Stopwatch_20230509/data/LMI_Stopwatch_20230509.npy"
OUTPUT_DEST = "/Users/jacob/Documents/Repos/dps/projects/data/output/norm-stats/dps_curve_manual"

# NOTE(review): FPS and OUTPUT_DEST overwrite the values set in the config cell.
# Re-run the config cell before re-running any earlier cell.
FPS = 1

# Name the output after the manual curve itself. The previous code reused
# `data_file_name`, a variable left over from earlier cells, so the CSV was
# written under the wrong name (or the cell failed on a fresh kernel).
manual_file_name = os.path.splitext(os.path.basename(MANUAL_SOURCE))[0]

curve = np.load(MANUAL_SOURCE)
window_stats = []

# Convert DPS data to float64. NaN values are not replaced: the
# `segment > 0` filter below is False for NaN, so they are dropped there.
data = np.array(curve, dtype=np.float64)

total_len = len(data)
points_per_window = total_len // WINDOWS

for w in range(WINDOWS):
    # Each window spans points_per_window samples; the last one runs to the end
    # of the curve and so absorbs the remainder (up to WINDOWS - 1 extra samples).
    # NOTE(review): with FPS = 1 that remainder can be large compared with
    # points_per_window - confirm this is acceptable for the manual curve.
    start = w * points_per_window
    end = (w + 1) * points_per_window if w < WINDOWS - 1 else total_len
    segment = data[start:end]
    # Keep strictly positive samples only, so zero values do not weigh on the stats.
    segment_nonzero = segment[segment > 0]
    n_nonzero = len(segment_nonzero)

    start_time = start / FPS
    end_time = end / FPS

    # Basic statistics (all reported as 0 when the window has no positive sample).
    # No "count_words" column here: this cell has no speech-recognition data.
    stats = {
        "file": os.path.basename(MANUAL_SOURCE),
        "window": w,
        "start_time_s": round(start_time, 2),
        "end_time_s": round(end_time, 2),
        "start_time_mn": f"{int(start_time // 60):02}:{int(start_time % 60):02}",
        "end_time_mn": f"{int(end_time // 60):02}:{int(end_time % 60):02}",
        "mean": np.mean(segment_nonzero) if n_nonzero > 0 else 0,
        "std": np.std(segment_nonzero) if n_nonzero > 0 else 0,
        "min": np.min(segment_nonzero) if n_nonzero > 0 else 0,
        "max": np.max(segment_nonzero) if n_nonzero > 0 else 0
    }

    # Distribution over the bins: raw count and percentage of the non-zero samples.
    bin_counts, _ = np.histogram(segment_nonzero, bins=BINS)
    for label, count in zip(BIN_LABELS, bin_counts):
        stats[f"count {label}"] = count
        stats[f"perc {label}"] = (count / n_nonzero * 100) if n_nonzero > 0 else 0

    window_stats.append(stats)

# Build the table once, preview it, then save it.
df_windows = pd.DataFrame(window_stats).round(2)
display(df_windows.head(20))

os.makedirs(OUTPUT_DEST, exist_ok = True)
df_windows.to_csv(os.path.join(OUTPUT_DEST, f'{manual_file_name}.csv'), index = True)
