# Data Analysis and Visualization

## Data Loading

In [None]:
import pandas as pd

# Read the cleaned parquet file into the working DataFrame.
parquet_path = "../../data/processed/projectile-motion_practice_clean.parquet"
df = pd.read_parquet(parquet_path)

# Sanity check: preview the first rows, then list every column's dtype.
display(df.head())
print(df.dtypes)


## Exploratory Data Analysis with Seaborn

### EDA Configuration


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Apply one seaborn theme (white grid, notebook-sized text) to the plots below.
sns.set_theme(style="whitegrid", context="notebook")


### Histogram of Percent Error (Range)

In [None]:
# Distribution of the range percent error across all rows, in 30 bins.
fig_hist, ax_hist = plt.subplots(figsize=(7, 4))
sns.histplot(df["pct_err_range"], bins=30, ax=ax_hist)
ax_hist.set_title("Percent Error (Range)")
ax_hist.set_xlabel("Percent error (%)")
ax_hist.set_ylabel("Count")
fig_hist.tight_layout()


### Boxplot of Percent Error by Lab Section

In [None]:
# One box per section, comparing the spread of the range percent error.
fig_box, ax_box = plt.subplots(figsize=(7, 4))
sns.boxplot(data=df, x="section", y="pct_err_range", ax=ax_box)
ax_box.set_title("Percent Error (Range) by Section")
ax_box.set_xlabel("Section")
ax_box.set_ylabel("Percent error (%)")
fig_box.tight_layout()


### Scatter: Angle vs Measured Range, Colored by Section

In [None]:
# Measured range against angle; marker colour encodes the section.
fig_scatter, ax_scatter = plt.subplots(figsize=(7, 4))
sns.scatterplot(
    data=df,
    x="angle_deg",
    y="range_meas_m",
    hue="section",
    s=25,
    ax=ax_scatter,
)
ax_scatter.set_title("Measured Range vs Angle")
ax_scatter.set_xlabel("Angle (°)")
ax_scatter.set_ylabel("Measured range (m)")
fig_scatter.tight_layout()


### Correlation Heatmap for Selected Variables

In [None]:
# Columns entering the correlation matrix: the two inputs, then each
# theoretical/measured pair, then the percent-error columns.
num_cols = [
    "angle_deg", "vel_init_mps",
    "range_theor_m", "range_meas_m",
    "height_theor_m", "height_meas_m",
    "time_theor_s", "time_meas_s",
    "pct_err_range", "pct_err_height", "pct_err_time",
]

# Pairwise correlations between the selected columns.
selected = df[num_cols]
corr = selected.corr(numeric_only=True)

# Diverging colormap centred on zero so the sign of a correlation is visible.
fig_corr, ax_corr = plt.subplots(figsize=(7, 6))
sns.heatmap(corr, cmap="vlag", center=0, annot=False, ax=ax_corr)
ax_corr.set_title("Correlation (selected variables)")
fig_corr.tight_layout()


## Statistical Analysis

### Statistical Analysis Configuration

In [None]:
import numpy as np
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf


### Descriptives by Section

In [None]:
# Per-section count, mean and sample standard deviation of the three
# percent-error columns, rounded to three decimals.
summary = (
    df.groupby("section")[["pct_err_range", "pct_err_height", "pct_err_time"]]
    .agg(["count", "mean", "std"])
    .round(3)
)

# Print the heading and the table separately. Passing both to a single
# print() inserts a space separator before the table, which shifts its first
# line (the column header row) out of alignment with the rows below it.
print("Descriptives by section:")
print(summary)
print()


### Pearson Correlation: Angle vs Measured Range

In [None]:
# Pearson correlation between angle and measured range.
# Drop rows where either value is missing before the test, consistent with
# the dropna() calls in the ANOVA and confidence-interval cells. Dropping on
# the two-column frame keeps the x/y pairs aligned row by row.
paired = df[["angle_deg", "range_meas_m"]].dropna()
r, p = stats.pearsonr(paired["angle_deg"], paired["range_meas_m"])
print(f"Pearson r(angle, measured range) = {r:.3f}, p = {p:.3e}")


### OLS: Measured Range ~ Theoretical Range + Angle + Initial Velocity

In [None]:
# Ordinary least squares: measured range regressed on theoretical range,
# angle and initial velocity. Build the model first, then fit it.
ols_formula = "range_meas_m ~ range_theor_m + angle_deg + vel_init_mps"
ols_spec = smf.ols(ols_formula, data=df)
model = ols_spec.fit()
print(model.summary())


### One-Way ANOVA: Error by Section

In [None]:
# One array of non-missing range percent errors per section.
groups = [
    section_errors.dropna().values
    for _, section_errors in df.groupby("section")["pct_err_range"]
]

# The test is run only with at least two sections, each holding at least
# two observations.
enough_data = len(groups) > 1 and all(len(g) > 1 for g in groups)
if not enough_data:
    print("ANOVA skipped (not enough groups or samples).")
else:
    F, p_anova = stats.f_oneway(*groups)
    print(f"One-way ANOVA on pct_err_range by section: F={F:.3f}, p={p_anova:.3e}")


### 95% CI for Mean Percent Error (Range)

In [None]:
# t-based 95% confidence interval for the mean range percent error,
# computed on the non-missing values only.
arr = df["pct_err_range"].dropna().to_numpy()
n_obs = len(arr)
mean = arr.mean()
# Standard error of the mean, using the sample (ddof=1) standard deviation.
se = arr.std(ddof=1) / np.sqrt(n_obs)
ci_low, ci_high = stats.t.interval(0.95, n_obs - 1, loc=mean, scale=se)
print(f"Mean pct_err_range = {mean:.3f}% (95% CI: {ci_low:.3f}%, {ci_high:.3f}%)")


## Figure Generation

### Plotly Figures

#### Plotly Configuration

In [None]:
import plotly.express as px

# Angle-ordered copy of the data, so line traces run left to right.
df_sorted = df.sort_values(by="angle_deg")


#### Line: Angle vs Theoretical & Measured Range

In [None]:
# Interactive line chart of theoretical and measured range against angle,
# drawn from the angle-sorted frame.
line_labels = {"value": "Range (m)", "angle_deg": "Angle (°)", "variable": "Series"}
fig = px.line(
    df_sorted,
    x="angle_deg",
    y=["range_theor_m", "range_meas_m"],
    title="Projectile Range vs Angle (Interactive)",
    labels=line_labels,
)
fig.show()


#### Scatter with Trendline: Theoretical vs Measured Range

In [None]:
# Interactive scatter of measured against theoretical range, with an
# OLS trendline added by plotly.
scatter_labels = {"range_theor_m": "Theoretical (m)", "range_meas_m": "Measured (m)"}
fig2 = px.scatter(
    df,
    x="range_theor_m",
    y="range_meas_m",
    title="Measured vs Theoretical Range (Interactive)",
    labels=scatter_labels,
    trendline="ols",
)
fig2.show()


### Matplotlib Figures

#### Matplotlib Configuration

In [None]:
import matplotlib.pyplot as plt

# Angle-ordered copy of the data, so the plotted lines run left to right.
df_sorted = df.sort_values(by="angle_deg")


#### Theoretical vs Measured Range vs Angle

In [None]:
# Static figure: theoretical (solid) and measured (dashed) range against
# angle, written to disk as a 300-dpi PNG.
fig_range, ax_range = plt.subplots(figsize=(7, 4))
angles = df_sorted["angle_deg"]
ax_range.plot(angles, df_sorted["range_theor_m"], label="Theoretical", linewidth=2)
ax_range.plot(angles, df_sorted["range_meas_m"], "--", label="Measured")
ax_range.set_xlabel("Angle (°)")
ax_range.set_ylabel("Range (m)")
ax_range.set_title("Projectile Range vs Angle")
ax_range.legend()
fig_range.tight_layout()
fig_range.savefig("range_vs_angle.png", dpi=300)


#### Measured vs Theoretical Range (Scatter + Identity Line)

In [None]:
# Static square figure: measured against theoretical range, with a y = x
# reference line. Points on the line are measurements that match theory.
fig_ident, ax_ident = plt.subplots(figsize=(5.5, 5.5))
theor = df["range_theor_m"]
meas = df["range_meas_m"]
ax_ident.scatter(theor, meas, s=20, alpha=0.7)

# The reference line runs from the origin to the larger of the two column maxima.
m = max(theor.max(), meas.max())
ax_ident.plot([0, m], [0, m], linewidth=1)  # y=x

ax_ident.set_xlabel("Theoretical range (m)")
ax_ident.set_ylabel("Measured range (m)")
ax_ident.set_title("Measured vs Theoretical Range")
fig_ident.tight_layout()
fig_ident.savefig("measured_vs_theoretical_range.png", dpi=300)
