# Phase 3: Physics-Informed EDA

**Objective**: Validate the physical consistency of the cleaned "Silver" data.

**Key Physics Checks**:
1.  **Tensile Strength**: Is the distribution normal? (Formula: $\sigma = \frac{2F}{\pi D t}$, with $F$ in N and $D$, $t$ in mm, so $\sigma$ is in MPa)
2.  **Friction (Ejection)**: Are there sticking issues?
3.  **Viscoelasticity**: Does `Main Compression` scale with `Speed`?
4.  **Stationarity**: Check for drift in long batches.


In [ ]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.config import get_boto3_client, BUCKET_SILVER, get_pandas_storage_options

# Notebook-wide settings: key prefix of the cleaned data and the plot theme.
PREFIX = "clean/"
sns.set(style="whitegrid")

# Shared S3 client, used below to list the objects in the Silver bucket.
s3 = get_boto3_client()


## 1. Load Cleaned "Silver" Data
We load every Parquet file under the `clean/` prefix of the Silver bucket and tag each row with the file it came from (`source_file`).


In [ ]:
def load_silver_data():
    """Load every Parquet object under ``PREFIX`` in the Silver bucket.

    Each object is read with ``pd.read_parquet`` and tagged with a
    ``source_file`` column holding its S3 key, so rows can later be grouped
    by the file they came from.

    Returns:
        One DataFrame with all files concatenated under a fresh 0..n-1
        index. If no ``.parquet`` object is found, prints "No data found!"
        and returns an empty DataFrame that still has the ``source_file``
        column, so callers can reference that column without a KeyError.
    """
    paginator = s3.get_paginator('list_objects_v2')

    keys = []
    for page in paginator.paginate(Bucket=BUCKET_SILVER, Prefix=PREFIX):
        # A page may come back without a 'Contents' entry; treat it as empty.
        keys.extend(
            obj['Key']
            for obj in page.get('Contents', [])
            if obj['Key'].endswith('.parquet')
        )

    if not keys:
        print("No data found!")
        # Keep the column every caller relies on, even with zero rows.
        return pd.DataFrame(columns=['source_file'])

    storage_options = get_pandas_storage_options()
    frames = []
    for key in keys:
        frame = pd.read_parquet(
            f"s3://{BUCKET_SILVER}/{key}", storage_options=storage_options
        )
        frame['source_file'] = key
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

# Pull the full Silver dataset into memory and report its size.
df = load_silver_data()
n_rows = len(df)
n_files = df['source_file'].nunique()
print(f"Loaded {n_rows} rows across {n_files} files.")
# Last expression of the cell: shows a preview of the first rows.
df.head()


## 2. Calculate Dynamic Tensile Strength
We use the formula recovered in Phase 0:
$$ \sigma = \frac{2000 \cdot F}{\pi \cdot D \cdot t} $$
Where:
*   $F$ = `main_comp` (kN); the factor $2000 = 2 \times 1000$ in the formula includes the kN → N conversion, so $\sigma$ comes out in MPa (N/mm²)
*   $D$ = `diameter` (mm)
*   $t$ = `cyl_main` (mm) - Proxy for tablet height/thickness


In [ ]:
# Columns needed by the tensile-strength formula.
required = ['main_comp', 'diameter', 'cyl_main']
missing = [c for c in required if c not in df.columns]
if missing:
    print(f"Missing columns: {missing}")
else:
    # Both `diameter` and `cyl_main` sit in the denominator, so a zero in
    # either one would produce an infinite strength and distort the summary
    # statistics. Keep only rows where both are strictly positive (this also
    # excludes rows where either value is missing).
    valid = (df['cyl_main'] > 0) & (df['diameter'] > 0)
    n_dropped = int((~valid).sum())
    if n_dropped:
        print(f"Dropped {n_dropped} rows with non-positive or missing diameter/cyl_main.")
    df = df[valid].copy()

    # sigma = 2 * F / (pi * D * t); the * 1000 is the kN -> N conversion
    # described in the section text above.
    df['tensile_strength'] = (2 * df['main_comp'] * 1000) / (np.pi * df['diameter'] * df['cyl_main'])

    plt.figure(figsize=(10, 5))
    sns.histplot(df['tensile_strength'], bins=50, kde=True, color='teal')
    plt.title("Distribution of Dynamic Tensile Strength (MPa)")
    plt.xlabel("Tensile Strength (MPa)")
    plt.show()

    print(df['tensile_strength'].describe())


## 3. Friction Analysis (Ejection Force)
High ejection force indicates "sticking" or insufficient lubrication.
We plot the distribution of `ejection` as a box plot to look for outliers.


In [ ]:
# Box plot of the ejection force; the cell does nothing when the column is absent.
if 'ejection' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x=df['ejection'], color='orange', ax=ax)
    ax.set_title("Ejection Force Distribution (Friction Check)")
    plt.show()


## 4. Viscoelasticity Check
In viscoelastic materials, Force increases with Strain Rate (Speed).
We expect a positive correlation between `tbl_speed` and `main_comp`.


In [ ]:
# Scatter of turret speed against main compression force.
if 'tbl_speed' in df.columns and 'main_comp' in df.columns:
    # Cap the number of plotted points for scatter performance. The fixed
    # seed makes the plot reproducible from one notebook run to the next.
    sample = df.sample(min(10000, len(df)), random_state=42)

    # Colour by diameter only when that column exists: passing the name of a
    # missing column as `hue` makes seaborn raise.
    hue = 'diameter' if 'diameter' in df.columns else None

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=sample, x='tbl_speed', y='main_comp', alpha=0.3, hue=hue)
    plt.title("Viscoelasticity: Speed vs Main Compression")
    plt.show()


## 5. Stationarity (Drift Analysis)
We pick the largest batch (the source file with the most rows) and plot its signals over time to check for drift.


In [ ]:
# Drift check: plot selected signals over time for the file with the most rows.
# The guards mirror the column checks used in the other cells, so that empty
# data or a missing column produces a message instead of a traceback.
if df.empty or 'source_file' not in df.columns or 'timestamp' not in df.columns:
    print("Skipping stationarity check: need non-empty data with 'source_file' and 'timestamp' columns.")
else:
    # Find largest file/batch (by row count).
    largest_file = df['source_file'].value_counts().idxmax()
    print(f"Analyzing largest batch: {largest_file}")

    batch_df = df[df['source_file'] == largest_file].sort_values('timestamp')

    # Plot only the signals that are actually present in this dataset.
    features = ['tensile_strength', 'ejection', 'tbl_fill', 'srel']
    available = [f for f in features if f in batch_df.columns]

    if not available:
        print(f"Skipping stationarity check: none of {features} are present.")
    else:
        plt.figure(figsize=(15, 10))
        # One stacked subplot per available signal.
        for i, col in enumerate(available, start=1):
            plt.subplot(len(available), 1, i)
            plt.plot(batch_df['timestamp'], batch_df[col], label=col)
            plt.legend(loc='upper right')
            plt.ylabel(col)

        plt.suptitle(f"Stationarity Check: {largest_file}")
        plt.tight_layout()
        plt.show()
