# センサーチャンネルごとの欠損値・分散分析

In [None]:
# センサーチャンネルごとの欠損値・分散分析
import duckdb
import pandas as pd
import numpy as np

# Open the DuckDB database file that holds the Kaggle datasets.
db_path = '/home/wsl/dev/my-study/ml/ml-note/kaggle_datasets.duckdb'
conn = duckdb.connect(db_path)

# Report banner: title line followed by a 60-character rule.
banner_rule = "=" * 60
print("🔍 センサーチャンネル別欠損値・分散分析", banner_rule, sep="\n")

In [None]:
# 1. Missing-value and dispersion statistics for the IMU channels
print("🎯 IMUセンサー (acc_x/y/z, rot_w/x/y/z)")

# Every channel named in the heading above. The query is generated from this
# list so the heading and the analysed columns cannot drift apart (the
# hand-written version had silently dropped rot_y and rot_z).
# NOTE(review): assumes rot_y and rot_z exist in the train table, as the
# heading states - confirm against the table schema.
imu_channels = ["acc_x", "acc_y", "acc_z", "rot_w", "rot_x", "rot_y", "rot_z"]

# One aggregate SELECT per channel, stacked with UNION ALL so the result has
# one row per channel. COUNT(col) skips NULLs, so COUNT(*) - COUNT(col) is the
# number of missing samples. Channel names are hard-coded identifiers above,
# not external input, so interpolating them into the SQL text is safe here.
imu_query = "\n    UNION ALL\n".join(
    f"""
    SELECT
        '{channel}' AS channel,
        COUNT(*) AS total_samples,
        COUNT({channel}) AS valid_samples,
        COUNT(*) - COUNT({channel}) AS missing_count,
        ROUND((COUNT(*) - COUNT({channel})) * 100.0 / COUNT(*), 2) AS missing_pct,
        ROUND(AVG({channel}), 4) AS mean_val,
        ROUND(STDDEV({channel}), 4) AS std_val,
        ROUND(MIN({channel}), 4) AS min_val,
        ROUND(MAX({channel}), 4) AS max_val
    FROM "cmi_detect_behavior_with_sensor_data".train
    """
    for channel in imu_channels
)
imu_analysis = conn.execute(imu_query).fetchdf()

print(imu_analysis.to_string(index=False))

In [None]:
# 2. Missing-value and dispersion statistics for the temperature channels
print("\n🌡️ 温度センサー (thm_1〜5)")

# thm_1 .. thm_5: the same five columns the hand-written query covered.
thm_channels = [f"thm_{index}" for index in range(1, 6)]

# One aggregate SELECT per channel, stacked with UNION ALL so the result has
# one row per channel. Generating the branches from the channel list keeps a
# single copy of the statistics definition instead of five hand-edited ones.
# Channel names are hard-coded identifiers, not external input, so
# interpolating them into the SQL text is safe here.
thm_query = "\n    UNION ALL\n".join(
    f"""
    SELECT
        '{channel}' AS channel,
        COUNT(*) AS total_samples,
        COUNT({channel}) AS valid_samples,
        COUNT(*) - COUNT({channel}) AS missing_count,
        ROUND((COUNT(*) - COUNT({channel})) * 100.0 / COUNT(*), 2) AS missing_pct,
        ROUND(AVG({channel}), 2) AS mean_val,
        ROUND(STDDEV({channel}), 2) AS std_val,
        ROUND(MIN({channel}), 2) AS min_val,
        ROUND(MAX({channel}), 2) AS max_val
    FROM "cmi_detect_behavior_with_sensor_data".train
    """
    for channel in thm_channels
)
thm_analysis = conn.execute(thm_query).fetchdf()

print(thm_analysis.to_string(index=False))

In [None]:
# 3. Missing-value and dispersion statistics for representative ToF channels
print("\n📡 ToFセンサー代表チャンネル (各センサーのv0, v31, v63)")

# The heading promises v0, v31 and v63 for each sensor, so build every
# sensor x pixel combination rather than a hand-picked subset.
# NOTE(review): assumes sensors are numbered 1-5 and that tof_2_*, tof_3_*,
# tof_4_* and tof_5_v31 / tof_5_v63 exist alongside the columns that were
# queried before - confirm against the table schema.
tof_channels = [
    f"tof_{sensor}_v{pixel}"
    for sensor in range(1, 6)
    for pixel in (0, 31, 63)
]

# One aggregate SELECT per channel, stacked with UNION ALL so the result has
# one row per channel. Channel names are generated from the constants above,
# not external input, so interpolating them into the SQL text is safe here.
tof_query = "\n    UNION ALL\n".join(
    f"""
    SELECT
        '{channel}' AS channel,
        COUNT(*) AS total_samples,
        COUNT({channel}) AS valid_samples,
        COUNT(*) - COUNT({channel}) AS missing_count,
        ROUND((COUNT(*) - COUNT({channel})) * 100.0 / COUNT(*), 2) AS missing_pct,
        ROUND(AVG({channel}), 1) AS mean_val,
        ROUND(STDDEV({channel}), 1) AS std_val,
        ROUND(MIN({channel}), 1) AS min_val,
        ROUND(MAX({channel}), 1) AS max_val
    FROM "cmi_detect_behavior_with_sensor_data".train
    """
    for channel in tof_channels
)
tof_analysis = conn.execute(tof_query).fetchdf()

print(tof_analysis.to_string(index=False))

In [None]:
# 4. Sensor quality summary
GOOD_MISSING_PCT = 1  # worst-channel missing rate below this: "excellent"
HIGH_MISSING_PCT = 5  # missing rate at or above this: flagged as high-missing


def _quality_label(max_missing_pct):
    """Return the quality badge for a sensor group's worst missing percentage."""
    if max_missing_pct < GOOD_MISSING_PCT:
        return "🟢 優秀"
    if max_missing_pct < HIGH_MISSING_PCT:
        return "🟡 良好"
    # Also reached when the value is NaN, since both comparisons are False.
    return "🔴 注意"


# (label printed in the summary, per-channel statistics table) for each group.
sensor_groups = [
    ("IMUセンサー品質:", imu_analysis),
    ("温度センサー品質:", thm_analysis),
    ("ToFセンサー品質:", tof_analysis),
]

print("\n📊 センサー品質サマリー")
for heading, analysis in sensor_groups:
    print(heading, _quality_label(analysis['missing_pct'].max()))

# 5. Identify high-missing channels.
# All three groups are scanned (IMU included) so the "all sensors" message
# below is only printed when it is actually true. The cut-off is ">=" so it
# matches both the red badge above and the "<5%" wording of that message.
high_missing = []
for _, analysis in sensor_groups:
    flagged = analysis[analysis['missing_pct'] >= HIGH_MISSING_PCT]
    high_missing.extend(
        f"{channel}: {missing_pct}%"
        for channel, missing_pct in zip(flagged['channel'], flagged['missing_pct'])
    )

if high_missing:
    print("\n⚠️ 高欠損センサー (≥5%):")
    for sensor in high_missing:
        print(f"  • {sensor}")
else:
    print("\n✅ 全センサーが良好な品質 (欠損<5%)")

print("\n" + "=" * 60)