In [1]:
from pathlib import Path
JSON_PATH = Path("/Users/joregan/Playing/hsi/annotations/final_resolved")

In [2]:
import json

speakers = {}

for file in JSON_PATH.glob("*.json"):
    parts = file.stem.split("_")
    speaker = parts[1]

    if not speaker in speakers:
        speakers[speaker] = []

    with open(file) as f:
        data = json.load(f)
    
    for seg in data:
        start = data[seg]["general"]["start"]
        end = data[seg]["general"]["end"]
        dur = end - start
        speakers[speaker].append(dur)


In [14]:
import numpy as np

longest = {k: np.max(speakers[k]) for k in speakers}
shortest = {k: np.min(speakers[k]) for k in speakers}
avg = {k: np.average(speakers[k]) for k in speakers}
totals = {k: np.sum(speakers[k]) for k in speakers}

median = {k: np.median(speakers[k]) for k in speakers}
std_dev = {k: np.std(speakers[k], ddof=1) for k in speakers}  # sample std dev
count = {k: len(speakers[k]) for k in speakers}
s_range = {k: longest[k] - shortest[k] for k in speakers}

In [18]:
import pandas as pd

summary_df = pd.DataFrame({
    'count': count,
    'mean': avg,
    'median': median,
    'std_dev': std_dev,
    'min': shortest,
    'max': longest,
    'range': s_range,
    'total_duration': totals
})

# Optional: clean formatting
summary_df = summary_df.round(2)

print(summary_df)

   count  mean  median  std_dev   min    max  range  total_duration
4    902  2.59    1.89     2.24  0.16  16.60  16.44         2331.72
7    759  4.71    3.26     5.02  0.16  67.23  67.08         3573.08
6   1598  1.99    1.20     2.24  0.04  18.69  18.65         3186.45
5    960  4.96    2.71     5.78  0.15  28.72  28.57         4759.96
3    751  2.72    1.84     2.57  0.12  17.18  17.06         2045.76


In [19]:
latex_table = summary_df.to_latex(
    index=True,        # Keeps the speaker names as row labels
    caption='Descriptive statistics of utterance durations by speaker',
    label='tab:speaker_stats',
    float_format="%.2f",
    column_format="lrrrrrrr",  # Adjust based on column count
    bold_rows=True
)

print(latex_table)


\begin{table}
\caption{Descriptive statistics of utterance durations by speaker}
\label{tab:speaker_stats}
\begin{tabular}{lrrrrrrr}
\toprule
 & count & mean & median & std_dev & min & max & range & total_duration \\
\midrule
\textbf{4} & 902 & 2.59 & 1.89 & 2.24 & 0.16 & 16.60 & 16.44 & 2331.72 \\
\textbf{7} & 759 & 4.71 & 3.26 & 5.02 & 0.16 & 67.23 & 67.08 & 3573.08 \\
\textbf{6} & 1598 & 1.99 & 1.20 & 2.24 & 0.04 & 18.69 & 18.65 & 3186.45 \\
\textbf{5} & 960 & 4.96 & 2.71 & 5.78 & 0.15 & 28.72 & 28.57 & 4759.96 \\
\textbf{3} & 751 & 2.72 & 1.84 & 2.57 & 0.12 & 17.18 & 17.06 & 2045.76 \\
\bottomrule
\end{tabular}
\end{table}

