# Flight1 Data Exploratory Data Analysis (EDA)

This notebook provides a comprehensive overview of the Flight1 dataset, including:
- Data structure and types
- Timestamp analysis and sampling frequency
- Sensor data distributions (quaternions, angular velocity, acceleration)
- Time-series views of each sensor channel
- Data quality checks (nulls, duplicate or non-monotonic timestamps, gaps, outliers)
- Correlation analysis and a final summary table


In [None]:
# Setup and imports
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

# Plot appearance: seaborn's white grid theme and a wide default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (14, 6)})

print("Libraries loaded successfully")

## 1. Load Data and Basic Overview

In [None]:
# Data location. Defaults to the path this notebook was originally written
# against; set the EDTH_DATA_DIR environment variable to point at another
# checkout instead of editing the notebook.
DATA_DIR = Path(
    os.environ.get("EDTH_DATA_DIR", "/Users/h33662/Projects/self/edth/data")
)
LABELS_CSV = DATA_DIR / "labels" / "Flight1.csv"
LOGS_CSV = DATA_DIR / "raw" / "logs" / "Flight1.csv"


def print_overview(title, df, leading_blank=False):
    """Print a short structural overview of a DataFrame.

    Parameters
    ----------
    title : str
        Heading printed between two 60-character rules.
    df : pandas.DataFrame
        Frame to describe (shape, column names, dtypes, deep memory usage).
    leading_blank : bool, default False
        When True, emit an empty line before the first rule so consecutive
        overviews are visually separated.
    """
    rule = "=" * 60
    print(("\n" if leading_blank else "") + rule)
    print(title)
    print(rule)
    print(f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
    print(f"Columns: {list(df.columns)}")
    print(f"\nData types:\n{df.dtypes}")
    # deep=True also accounts for the contents of object (string) columns
    print(f"\nMemory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


# Load the labels data (sensor readings aligned with frames)
labels_df = pd.read_csv(LABELS_CSV)

# Load the raw logs data
logs_df = pd.read_csv(LOGS_CSV)

print_overview("LABELS DATASET (sensor data aligned with video frames)", labels_df)
print_overview("LOGS DATASET (raw flight logs)", logs_df, leading_blank=True)

In [None]:
# Peek at both ends of the labels table and at the start of the raw logs
previews = [
    ("First 5 rows of labels:", labels_df.head()),
    ("\nLast 5 rows of labels:", labels_df.tail()),
    ("\nFirst 5 rows of logs:", logs_df.head()),
]
for caption, frame in previews:
    print(caption)
    display(frame)

## 2. Timestamp Analysis & Conversion

The `timestamp` column contains microsecond values. We'll convert these to readable formats and analyze sampling frequency.

In [None]:
# Express the microsecond timestamps in seconds, both absolute and relative
# to the first sample
ts_us = labels_df["timestamp"]
labels_df["timestamp_s"] = ts_us.div(1e6)
labels_df["elapsed_s"] = labels_df["timestamp_s"].sub(labels_df["timestamp_s"].iloc[0])

# Spacing between consecutive samples; the first row has no predecessor,
# so diff() leaves it as NaN
labels_df["dt_us"] = ts_us.diff()  # microseconds
labels_df["dt_s"] = labels_df["dt_us"].div(1e6)  # seconds

# Values reused several times in the report below
duration_s = labels_df["elapsed_s"].iloc[-1]
dt_s = labels_df["dt_s"]
banner = "=" * 60

# Summary report
print(banner)
print("TIMESTAMP STATISTICS")
print(banner)
print(
    f"Total duration: {duration_s:.2f} seconds ({duration_s / 60:.2f} minutes)"
)
print(f"Number of samples: {len(labels_df):,}")
print("\nTimestamp range (microseconds):")
print(f"  Start: {ts_us.iloc[0]:,}")
print(f"  End:   {ts_us.iloc[-1]:,}")
print("\nSampling interval (dt) statistics:")
print(dt_s.describe())
print(f"\nMean sampling rate: {1 / dt_s.mean():.2f} Hz")
print(f"Median sampling rate: {1 / dt_s.median():.2f} Hz")

## 3. Sampling Frequency Distribution Over Time

In [None]:
# Four-panel figure describing how regularly the samples arrive
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(hist_ax, rate_ax), (count_ax, gap_ax) = axes

dt_s = labels_df["dt_s"]
elapsed_s = labels_df["elapsed_s"]
median_dt = dt_s.median()
mean_dt = dt_s.mean()

# Panel 1: histogram of sampling intervals with median / mean markers
dt_s.dropna().hist(bins=100, ax=hist_ax, edgecolor="black", alpha=0.7)
hist_ax.set(
    xlabel="Sampling Interval (seconds)",
    ylabel="Frequency",
    title="Distribution of Sampling Intervals",
)
for value, colour, tag in ((median_dt, "red", "Median"), (mean_dt, "orange", "Mean")):
    hist_ax.axvline(value, color=colour, linestyle="--", label=f"{tag}: {value:.6f}s")
hist_ax.legend()

# Panel 2: instantaneous rate (1/dt) smoothed with a centred rolling mean
window_size = 100
labels_df["sampling_rate_hz"] = 1 / dt_s
rolling_rate = (
    labels_df["sampling_rate_hz"].rolling(window=window_size, center=True).mean()
)
rate_ax.plot(elapsed_s, rolling_rate, alpha=0.7, linewidth=0.8)
rate_ax.set(
    xlabel="Elapsed Time (seconds)",
    ylabel="Sampling Rate (Hz)",
    title=f"Sampling Rate Over Time (rolling window={window_size})",
)
rate_ax.grid(True, alpha=0.3)

# Panel 3: running sample index against elapsed time
count_ax.plot(elapsed_s, np.arange(len(labels_df)), linewidth=1.5)
count_ax.set(
    xlabel="Elapsed Time (seconds)",
    ylabel="Cumulative Sample Count",
    title="Cumulative Samples Over Time",
)
count_ax.grid(True, alpha=0.3)

# Panel 4: intervals longer than twice the median are flagged as gaps
threshold = 2 * median_dt
gaps = labels_df.loc[dt_s > threshold].copy()
gap_ax.scatter(
    gaps["elapsed_s"],
    gaps["dt_s"],
    color="red",
    s=50,
    alpha=0.7,
    label=f"Gaps (>{threshold:.6f}s)",
)
gap_ax.scatter(elapsed_s, dt_s, alpha=0.3, s=1, label="All intervals")
gap_ax.set(
    xlabel="Elapsed Time (seconds)",
    ylabel="Sampling Interval (seconds)",
    title=f"Time Gaps Detection (threshold={threshold:.6f}s)",
)
gap_ax.legend()
gap_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nDetected {len(gaps)} time gaps (intervals > {threshold:.6f}s)")

## 4. Sensor Data Distributions

Analyze the distributions of quaternions (orientation), angular velocities, and accelerations.

In [None]:
# Sensor channels grouped by kind: orientation quaternion, angular velocity,
# linear acceleration
quat_cols = ["qw", "qx", "qy", "qz"]
gyro_cols = ["wx_radDs", "wy_radDs", "wz_radDs"]
accel_cols = ["ax_mDs2", "ay_mDs2", "az_mDs2"]
sensor_cols = quat_cols + gyro_cols + accel_cols

banner = "=" * 60
print(banner)
print("SENSOR DATA STATISTICS")
print(banner)
print(labels_df[sensor_cols].describe())

# Euclidean length of each quaternion: sqrt(qw^2 + qx^2 + qy^2 + qz^2).
# The squares are accumulated in qw, qx, qy, qz order.
squared_sum = sum(labels_df[name] ** 2 for name in quat_cols)
labels_df["quat_norm"] = np.sqrt(squared_sum)
print("\nQuaternion norm statistics (should be ≈1.0 if normalized):")
print(labels_df["quat_norm"].describe())

In [None]:
# Grid of histograms: one row per sensor family, with the last column of
# rows 2 and 3 used for the quaternion norm and the system time
fig, axes = plt.subplots(3, 4, figsize=(18, 12))


def _hist_panel(ax, column, xlabel, title, **hist_kwargs):
    """Draw a 50-bin histogram of ``labels_df[column]`` on ``ax`` and label it."""
    labels_df[column].hist(bins=50, ax=ax, edgecolor="black", alpha=0.7, **hist_kwargs)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")
    ax.set_title(title)
    ax.grid(True, alpha=0.3)


# (grid row, columns, x-label suffix, extra histogram options) per sensor family
families = [
    (0, ["qw", "qx", "qy", "qz"], "", {}),
    (1, ["wx_radDs", "wy_radDs", "wz_radDs"], " (rad/s)", {"color": "orange"}),
    (2, ["ax_mDs2", "ay_mDs2", "az_mDs2"], " (m/s²)", {"color": "red"}),
]
for row, columns, unit_suffix, options in families:
    for position, name in enumerate(columns):
        _hist_panel(
            axes[row, position],
            name,
            f"{name}{unit_suffix}",
            f"Distribution of {name}",
            **options,
        )

# Quaternion norm, with a reference line at the expected value of 1.0
norm_ax = axes[1, 3]
_hist_panel(
    norm_ax,
    "quat_norm",
    "Quaternion Norm",
    "Distribution of Quaternion Norm",
    color="green",
)
norm_ax.axvline(1.0, color="red", linestyle="--", label="Expected: 1.0")
norm_ax.legend()

# System time
_hist_panel(
    axes[2, 3],
    "system_time_s",
    "System Time (s)",
    "Distribution of system_time_s",
    color="purple",
)

plt.tight_layout()
plt.show()

## 5. Time Series Visualization of Sensor Data

In [None]:
# Three stacked panels sharing the same x quantity (elapsed seconds):
# orientation, angular velocity and acceleration
fig, axes = plt.subplots(3, 1, figsize=(16, 12))

# Per panel: (column, legend label) pairs, y-axis label, title
panel_specs = [
    (
        [("qw", "qw"), ("qx", "qx"), ("qy", "qy"), ("qz", "qz")],
        "Quaternion Components",
        "Orientation (Quaternions) Over Time",
    ),
    (
        [("wx_radDs", "ωx"), ("wy_radDs", "ωy"), ("wz_radDs", "ωz")],
        "Angular Velocity (rad/s)",
        "Angular Velocities Over Time",
    ),
    (
        [("ax_mDs2", "ax"), ("ay_mDs2", "ay"), ("az_mDs2", "az")],
        "Acceleration (m/s²)",
        "Accelerations Over Time",
    ),
]

for panel, (channels, y_label, heading) in zip(axes, panel_specs):
    for column, legend_name in channels:
        panel.plot(
            labels_df["elapsed_s"],
            labels_df[column],
            label=legend_name,
            alpha=0.7,
            linewidth=0.8,
        )
    panel.set_xlabel("Elapsed Time (seconds)")
    panel.set_ylabel(y_label)
    panel.set_title(heading)
    panel.legend(loc="best")
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Data Quality Checks

In [None]:
# Columns that earlier cells of this notebook derive from the loaded data.
# The diff-based ones (dt_us, dt_s, sampling_rate_hz) are NaN on the first row
# by construction, so including them would make the missing-value check
# report a problem on perfectly clean input.
DERIVED_COLS = [
    "timestamp_s",
    "elapsed_s",
    "dt_us",
    "dt_s",
    "sampling_rate_hz",
    "quat_norm",
]

# Multiplier applied to the IQR when building the outlier fences
IQR_MULTIPLIER = 3.0

# Check for missing values (source columns only)
print("=" * 60)
print("MISSING VALUES CHECK")
print("=" * 60)
# errors="ignore" keeps this cell runnable even if a derived column
# has not been created yet
missing = labels_df.drop(columns=DERIVED_COLS, errors="ignore").isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "✓ No missing values found")

# Check for duplicate timestamps
print("\n" + "=" * 60)
print("DUPLICATE TIMESTAMPS CHECK")
print("=" * 60)
duplicates = labels_df["timestamp"].duplicated().sum()
print(
    f"Found {duplicates} duplicate timestamps"
    if duplicates > 0
    else "✓ No duplicate timestamps"
)

# Check for monotonicity (timestamps should always increase)
print("\n" + "=" * 60)
print("TIMESTAMP MONOTONICITY CHECK")
print("=" * 60)
# diff() yields NaN on the first row and `NaN <= 0` is False, so the first row
# is never counted. Subtracting 1 here would under-report by one and hide a
# single backwards/repeated timestamp.
non_monotonic = (labels_df["timestamp"].diff() <= 0).sum()
print(
    f"⚠ Found {non_monotonic} non-monotonic timestamp transitions"
    if non_monotonic > 0
    else "✓ Timestamps are strictly monotonic (always increasing)"
)

# Outlier detection using IQR method for sensor data
print("\n" + "=" * 60)
print(f"OUTLIER DETECTION (IQR method, threshold={IQR_MULTIPLIER})")
print("=" * 60)
outlier_summary = {}
for col in sensor_cols:
    Q1 = labels_df[col].quantile(0.25)
    Q3 = labels_df[col].quantile(0.75)
    IQR = Q3 - Q1
    # Values outside [Q1 - k*IQR, Q3 + k*IQR] count as outliers
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR
    outliers = ((labels_df[col] < lower_bound) | (labels_df[col] > upper_bound)).sum()
    outlier_summary[col] = outliers
    if outliers > 0:
        print(f"  {col}: {outliers} outliers ({100 * outliers / len(labels_df):.2f}%)")

if sum(outlier_summary.values()) == 0:
    print("✓ No significant outliers detected in sensor data")

## 7. Correlation Analysis

In [None]:
# Pairwise correlation between all sensor channels
corr_matrix = labels_df[sensor_cols].corr()

# Annotated heatmap of the matrix
plt.figure(figsize=(12, 10))
heatmap_options = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Matrix of Sensor Data", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.show()

# List channel pairs with |r| > 0.7. Only the strict upper triangle is
# visited, so each pair appears once and the diagonal is skipped.
banner = "=" * 60
print(banner)
print("STRONG CORRELATIONS (|r| > 0.7)")
print(banner)
channel_names = corr_matrix.columns
r_values = corr_matrix.to_numpy()
upper_rows, upper_cols = np.triu_indices(len(channel_names), k=1)
strong_corr = [
    (channel_names[a], channel_names[b], r_values[a, b])
    for a, b in zip(upper_rows, upper_cols)
    if abs(r_values[a, b]) > 0.7
]

if not strong_corr:
    print("✓ No strong correlations found (all |r| ≤ 0.7)")
else:
    # Strongest relationships first, regardless of sign
    ranked = sorted(strong_corr, key=lambda pair: abs(pair[2]), reverse=True)
    for first, second, r in ranked:
        print(f"  {first} ↔ {second}: r = {r:.3f}")

## 8. Summary Statistics Table

In [None]:
# Create a comprehensive summary

# Columns derived by earlier cells; the diff-based ones are NaN on the first
# row by construction and must not be counted as missing source data.
derived_cols = [
    "timestamp_s",
    "elapsed_s",
    "dt_us",
    "dt_s",
    "sampling_rate_hz",
    "quat_norm",
]
n_missing = labels_df.drop(columns=derived_cols, errors="ignore").isnull().sum().sum()

# Count of transitions where the timestamp fails to increase. The NaN that
# diff() leaves on the first row compares False, so no correction is needed.
n_non_monotonic = (labels_df["timestamp"].diff() <= 0).sum()

duration_s = labels_df["elapsed_s"].iloc[-1]

# Each metric is kept next to its value so the two columns cannot drift apart
summary_rows = [
    ("Total Samples", f"{len(labels_df):,}"),
    ("Total Duration (s)", f"{duration_s:.2f}"),
    ("Total Duration (min)", f"{duration_s / 60:.2f}"),
    ("Mean Sampling Rate (Hz)", f"{1 / labels_df['dt_s'].mean():.2f}"),
    ("Median Sampling Rate (Hz)", f"{1 / labels_df['dt_s'].median():.2f}"),
    (
        "Timestamp Range (µs)",
        f"{labels_df['timestamp'].iloc[0]:,} → {labels_df['timestamp'].iloc[-1]:,}",
    ),
    # `gaps` is produced by the sampling-interval plotting cell
    ("Number of Time Gaps (>2×median)", f"{len(gaps)}"),
    ("Missing Values", f"{n_missing}"),
    ("Duplicate Timestamps", f"{labels_df['timestamp'].duplicated().sum()}"),
    ("Non-monotonic Transitions", f"{n_non_monotonic}"),
    ("Quaternion Norm Mean", f"{labels_df['quat_norm'].mean():.6f}"),
    ("Quaternion Norm Std", f"{labels_df['quat_norm'].std():.6f}"),
]
summary = {
    "Metric": [metric for metric, _ in summary_rows],
    "Value": [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary)
print("=" * 60)
print("DATASET SUMMARY")
print("=" * 60)
display(summary_df)