# Exploratory Data Analysis

This notebook introduces the Bionutrient Institute dataset and walks through a basic exploratory data analysis.

## Overview

The NutrientScanner project uses NIR (Near-Infrared) spectroscopy data to predict nutrient content in crops. This notebook will help you understand:

1. The structure of the dataset
2. Data quality and missing values
3. Distribution of target variables
4. NIR spectral characteristics
5. Model performance (if available)


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

# Plot styling: start from matplotlib's "default" style sheet, then apply
# seaborn's "husl" palette. The order matters, because the palette has to be
# set after the style sheet is applied.
plt.style.use("default")
sns.set_palette(palette="husl")

print("📚 Libraries imported successfully!")


In [None]:
# Load the cleaned dataset
data_dir = Path("../data/clean")

# Check what files are available (sorted so the listing is stable between runs)
print("📁 Available data files:")
for file in sorted(data_dir.glob("*.parquet")):
    print(f"   - {file.name}")

# Drop any X / y left over from an earlier run of this cell. The later cells
# decide whether data is available by checking that these names exist, so a
# failed reload must not leave stale (or mismatched) frames behind.
for stale_name in ("X", "y"):
    globals().pop(stale_name, None)

# Load features and target
try:
    # Read into temporaries first so that X and y are only ever bound together:
    # if the target file is missing, X must not be defined on its own.
    features_df = pd.read_parquet(data_dir / "carrots__X.parquet")
    target_df = pd.read_parquet(data_dir / "carrots__y__antioxidants.parquet")
except FileNotFoundError as e:
    print(f"❌ Data files not found: {e}")
    print("   Run: make train")
    print("   Or: python -m src.data.clean_bi --crop carrots --target antioxidants")
else:
    X, y = features_df, target_df

    print("\n✅ Data loaded successfully!")
    print(f"   Features shape: {X.shape}")
    print(f"   Target shape: {y.shape}")

    # Features and target are expected to describe the same samples; warn
    # rather than fail so the rest of the notebook can still be inspected.
    if len(X) != len(y):
        print(f"⚠️ Row count mismatch: {len(X)} feature rows vs {len(y)} target rows")


In [None]:
# Basic dataset information
if 'X' in locals() and 'y' in locals():
    print("📊 Dataset Overview")
    print("=" * 50)

    # Features information
    print(f"Number of samples: {len(X)}")
    print(f"Number of features: {len(X.columns)}")
    # Restrict the range to numeric columns: reducing a frame that also holds
    # text columns mixes strings with floats and cannot be formatted as a number.
    numeric_features = X.select_dtypes(include="number")
    print(f"Feature range: {numeric_features.min().min():.3f} to {numeric_features.max().max():.3f}")

    # Target information
    # Always reduce the target to a single Series. Keeping a multi-column frame
    # here makes .min()/.mean() return a Series per column, which the ':.3f'
    # formats below cannot render (TypeError).
    if y.shape[1] > 1:
        print(f"\n⚠️ Target file has {y.shape[1]} columns; summarising the first one ({y.columns[0]!r}).")
    y_values = y.iloc[:, 0]
    print(f"\nTarget variable: antioxidants")
    print(f"Target range: {y_values.min():.3f} to {y_values.max():.3f}")
    print(f"Target mean: {y_values.mean():.3f} ± {y_values.std():.3f}")

    # Missing values
    print(f"\nMissing values in features: {X.isnull().sum().sum()}")
    print(f"Missing values in target: {y_values.isnull().sum()}")

    # Data types
    print(f"\nFeature data types: {X.dtypes.value_counts().to_dict()}")
else:
    print("❌ No data loaded. Please run the data loading cell first.")


In [None]:
# Target variable distribution
if 'y' in locals():
    # Always work on a single Series; a multi-column frame would make the
    # summary statistics below return one value per column and break the
    # ':.3f' formatting.
    y_values = y.iloc[:, 0]

    # Missing values must be removed before plotting: the histogram cannot
    # derive a bin range from NaN and the box plot statistics become NaN.
    # The pandas statistics further down already skip NaN on their own.
    plot_values = y_values.dropna()

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Histogram
    axes[0].hist(plot_values, bins=30, alpha=0.7, edgecolor='black')
    axes[0].set_xlabel('Antioxidants (mg/g)')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Distribution of Antioxidants')
    axes[0].grid(True, alpha=0.3)

    # Box plot (vertical is matplotlib's default orientation)
    axes[1].boxplot(plot_values)
    axes[1].set_ylabel('Antioxidants (mg/g)')
    axes[1].set_title('Box Plot of Antioxidants')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("📈 Target Variable Statistics:")
    print(f"   Mean: {y_values.mean():.3f}")
    print(f"   Median: {y_values.median():.3f}")
    print(f"   Std: {y_values.std():.3f}")
    print(f"   Min: {y_values.min():.3f}")
    print(f"   Max: {y_values.max():.3f}")
    print(f"   Skewness: {y_values.skew():.3f}")
    print(f"   Kurtosis: {y_values.kurtosis():.3f}")
else:
    print("❌ No data loaded. Please run the data loading cell first.")


In [None]:
# NIR spectral characteristics
if 'X' in locals():
    # Keep only the columns whose name parses as a finite wavelength.
    # Substituting 0 for unparseable names would plot those columns at x = 0
    # and drag the reported wavelength range down to 0 nm.
    spectral = []  # (wavelength, column position) pairs
    for position, col in enumerate(X.columns):
        try:
            wavelength = float(col)
        except (TypeError, ValueError):
            continue
        if np.isfinite(wavelength):
            spectral.append((wavelength, position))

    skipped = len(X.columns) - len(spectral)
    if skipped:
        print(f"ℹ️ Ignoring {skipped} column(s) whose name is not a wavelength.")

    if spectral:
        # Sort by wavelength so every spectrum is drawn left to right even if
        # the columns are stored in a different order.
        spectral.sort(key=lambda pair: pair[0])
        wavelengths = [pair[0] for pair in spectral]
        # Select by position so duplicate column labels cannot duplicate data.
        spectra = X.iloc[:, [pair[1] for pair in spectral]]

        # Plot sample spectra (first 10 samples)
        fig, ax = plt.subplots(figsize=(12, 6))
        for i in range(min(10, len(spectra))):
            ax.plot(wavelengths, spectra.iloc[i].to_numpy(), alpha=0.7, linewidth=1)

        ax.set_xlabel('Wavelength (nm)')
        ax.set_ylabel('Absorbance')
        ax.set_title('Sample NIR Spectra (First 10 samples)')
        ax.grid(True, alpha=0.3)
        plt.show()

        # Plot mean spectrum with a ±1 standard deviation band
        mean_spectrum = spectra.mean()
        std_spectrum = spectra.std()

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(wavelengths, mean_spectrum.to_numpy(), 'b-', linewidth=2, label='Mean')
        ax.fill_between(wavelengths,
                        (mean_spectrum - std_spectrum).to_numpy(),
                        (mean_spectrum + std_spectrum).to_numpy(),
                        alpha=0.3, label='±1 Std')

        ax.set_xlabel('Wavelength (nm)')
        ax.set_ylabel('Absorbance')
        ax.set_title('Mean NIR Spectrum with Standard Deviation')
        ax.legend()
        ax.grid(True, alpha=0.3)
        plt.show()

        print(f"🔬 NIR Spectral Analysis:")
        print(f"   Wavelength range: {min(wavelengths):.1f} - {max(wavelengths):.1f} nm")
        print(f"   Number of wavelengths: {len(wavelengths)}")
        print(f"   Mean absorbance: {mean_spectrum.mean():.3f}")
        print(f"   Absorbance range: {mean_spectrum.min():.3f} - {mean_spectrum.max():.3f}")
    else:
        print("❌ No wavelength-named feature columns found; nothing to plot.")
else:
    print("❌ No data loaded. Please run the data loading cell first.")


In [None]:
# Check for trained models
models_dir = Path("../models")
# glob() yields entries in filesystem order; sort so the listing (and the
# choice of "first" plot below) is the same on every run and machine.
model_files = sorted(models_dir.glob("*.joblib"))

if model_files:
    print("🤖 Found trained models:")
    for model_file in model_files:
        print(f"   - {model_file.name}")

    # Load metrics if available
    metrics_files = sorted(models_dir.glob("*__metrics.json"))
    if metrics_files:
        print(f"\n📊 Model Performance:")
        for metrics_file in metrics_files:
            print(f"\n   Model: {metrics_file.stem}")
            try:
                with open(metrics_file, 'r', encoding='utf-8') as f:
                    metrics = json.load(f)

                # Build every line before printing so that a file with a
                # missing key or a non-numeric score produces no partial output.
                cv_scores = metrics['cv_scores']
                summary_lines = [
                    f"   Crop: {metrics['crop']}",
                    f"   Target: {metrics['target']}",
                    f"   R²: {cv_scores['r2_mean']:.4f} ± {cv_scores['r2_std']:.4f}",
                    f"   RMSE: {cv_scores['rmse_mean']:.4f} ± {cv_scores['rmse_std']:.4f}",
                ]
            except (OSError, KeyError, TypeError, ValueError) as e:
                # ValueError also covers json.JSONDecodeError. Report the bad
                # file and carry on so the remaining models are still shown.
                print(f"   ⚠️ Could not read metrics: {e!r}")
                continue

            for line in summary_lines:
                print(line)

    # Load and display truth vs prediction plot if available
    plot_files = sorted(models_dir.glob("*__truth_vs_pred.png"))
    if plot_files:
        print(f"\n📈 Model Performance Plots:")
        for plot_file in plot_files:
            print(f"   - {plot_file.name}")

        # Display the first plot (first in sorted order)
        from IPython.display import Image, display
        display(Image(str(plot_files[0])))

else:
    print("❌ No trained models found.")
    print("   Run: make train")
    print("   Or: python -m src.models.train_pls --crop carrots --target antioxidants")
