In [1]:
import pandas as pd
import numpy as np

# CSV-Datei einlesen
# Ersetze 'dein_dateiname.csv' mit dem tatsächlichen Dateinamen
df = pd.read_csv('/workspace/project/catatonia_VAE-main_bq/metadata_20250110/full_data_with_codiagnosis_and_scores.csv')

# Datentypen korrigieren falls nötig
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Sex'] = df['Sex'].astype(str)

# Zusammenfassungstabelle erstellen
summary_stats = []

# Für jedes Dataset die Statistiken berechnen
for dataset in df['Dataset'].unique():
    dataset_df = df[df['Dataset'] == dataset]
    
    # Alter: Mean ± SD
    age_mean = dataset_df['Age'].mean()
    age_std = dataset_df['Age'].std()
    age_summary = f"{age_mean:.1f} ± {age_std:.1f}"
    
    # Geschlecht zählen
    sex_counts = dataset_df['Sex'].value_counts()
    female_count = sex_counts.get('Female', 0)
    male_count = sex_counts.get('Male', 0)
    
    # Zeile für die Zusammenfassung erstellen
    summary_stats.append({
        'Dataset': dataset,
        'Age (Mean ± SD)': age_summary,
        'Female': female_count,
        'Male': male_count,
        'Total': len(dataset_df)
    })

# DataFrame aus der Zusammenfassung erstellen
summary_df = pd.DataFrame(summary_stats)

# Nach Dataset sortieren
summary_df = summary_df.sort_values('Dataset')

print("Zusammenfassungstabelle:")
print(summary_df.to_string(index=False))

# Optional: Als CSV speichern
summary_df.to_csv('dataset_summary.csv', index=False)

# Für LaTeX-Tabelle formatieren
print("\n\nFür LaTeX formatiert:")
print("\\begin{table}[h]")
print("\\centering")
print("\\caption{Dataset Summary Statistics}")
print("\\begin{tabular}{lcccc}")
print("\\toprule")
print("Dataset & Age (Mean ± SD) & Female & Male & Total \\\\")
print("\\midrule")

for _, row in summary_df.iterrows():
    print(f"{row['Dataset']} & {row['Age (Mean ± SD)']} & {row['Female']} & {row['Male']} & {row['Total']} \\\\")

print("\\bottomrule")
print("\\end{tabular}")
print("\\label{tab:dataset_summary}")
print("\\end{table}")

# Erweiterte Version mit MultiIndex für Sex
print("\n\nErweiterte Tabelle mit MultiIndex:")
pivot_df = df.groupby(['Dataset', 'Sex']).agg({
    'Age': ['count', 'mean', 'std']
}).round(1)

# MultiIndex flach machen
pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]
pivot_df = pivot_df.reset_index()

print(pivot_df)

Zusammenfassungstabelle:
 Dataset Age (Mean ± SD)  Female  Male  Total
   COBRE     38.4 ± 12.6      46   142    188
    EPSY      23.6 ± 3.9      65   109    174
     IXI     59.9 ± 12.3     225   196    421
    MCIC     33.9 ± 11.6      56   148    204
     NSS     38.8 ± 11.3      56    61    117
      NU       0.0 ± 0.0     200   235    435
   SRBPS     39.1 ± 14.0     545   646   1191
whiteCAT     37.5 ± 13.5      80    82    162


Für LaTeX formatiert:
\begin{table}[h]
\centering
\caption{Dataset Summary Statistics}
\begin{tabular}{lcccc}
\toprule
Dataset & Age (Mean ± SD) & Female & Male & Total \\
\midrule
COBRE & 38.4 ± 12.6 & 46 & 142 & 188 \\
EPSY & 23.6 ± 3.9 & 65 & 109 & 174 \\
IXI & 59.9 ± 12.3 & 225 & 196 & 421 \\
MCIC & 33.9 ± 11.6 & 56 & 148 & 204 \\
NSS & 38.8 ± 11.3 & 56 & 61 & 117 \\
NU & 0.0 ± 0.0 & 200 & 235 & 435 \\
SRBPS & 39.1 ± 14.0 & 545 & 646 & 1191 \\
whiteCAT & 37.5 ± 13.5 & 80 & 82 & 162 \\
\bottomrule
\end{tabular}
\label{tab:dataset_summary}
\end{table}