# Stress-Tolerant Seed Varieties Database - Exploratory Data Analysis

**Author:** kd475@cornell.edu  
**Date:** 2025-08-01  
**Purpose:** Comprehensive exploratory data analysis of the stress-tolerant seed varieties database

This notebook provides insights into the collected and processed seed variety data, including:
- Data quality assessment
- Distribution analysis
- Stress tolerance patterns
- Geographic and temporal trends
- Institution and breeding patterns

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
import sqlite3
from pathlib import Path
import warnings
from collections import Counter
from datetime import datetime

# Plot styling: matplotlib's seaborn-v0_8 sheet plus the "husl" palette;
# all warnings are silenced for the rest of the session.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# pandas display options, applied from one table
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")
print(f"Analysis started at: {datetime.now()}")

## 1. Data Loading and Initial Inspection

In [None]:
# Load the final database
data_dir = Path("../data/final")
csv_file = data_dir / "stress_tolerant_seed_database.csv"
sqlite_file = data_dir / "stress_tolerant_seed_database.db"

# Source preference: the SQLite export first, then the CSV export. When
# neither file exists, df becomes an empty frame so that every later cell
# can short-circuit on df.empty.
if sqlite_file.exists():
    print("Loading data from SQLite database...")
    conn = sqlite3.connect(sqlite_file)
    try:
        df = pd.read_sql_query("SELECT * FROM stress_tolerant_varieties", conn)
    finally:
        # Close the handle even when the query raises (for example when the
        # table is missing); otherwise the connection would stay open.
        conn.close()
    print(f"Loaded {len(df)} records from SQLite")
elif csv_file.exists():
    print("Loading data from CSV file...")
    df = pd.read_csv(csv_file)
    print(f"Loaded {len(df)} records from CSV")
else:
    print("No database found. Please run the pipeline first.")
    df = pd.DataFrame()  # Empty dataframe

# Display basic information
if not df.empty:
    print(f"\nDataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
else:
    print("No data to analyze. Please ensure the pipeline has been run.")

In [None]:
# Display first few rows
if not df.empty:
    print("First 5 rows of the dataset:")
    display(df.head())

    print("\nDataset info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() appended a stray "None" line to the output.
    df.info()

## 2. Data Preprocessing for Analysis

In [None]:
# Parse JSON fields for analysis
def parse_json_field(field_value):
    """Decode one JSON-encoded cell value.

    Args:
        field_value: Raw cell value: a JSON string, an already-decoded
            list, or anything else (None, NaN, numbers, ...).

    Returns:
        The list itself when given a list; the result of ``json.loads``
        when given a string that parses; an empty list for empty or
        malformed strings and for every other input.
    """
    # Lists are checked first. They need no decoding, and calling pd.isna()
    # on a list yields an array whose truth value is ambiguous, which made
    # a second run of the parsing cell raise ValueError.
    if isinstance(field_value, list):
        return field_value

    # None, NaN, numbers and other non-strings have nothing to decode.
    if not isinstance(field_value, str) or field_value == '':
        return []

    try:
        return json.loads(field_value)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError. Malformed or pathologically
        # nested text counts as "no entries" instead of aborting the column.
        return []

if not df.empty:
    # Columns that are decoded with parse_json_field
    json_fields = ['stressors_tolerated', 'quality_traits', 'genetic_markers',
                   'parent_lines', 'adaptation_zones', 'recommended_states',
                   'agro_climatic_zones', 'special_features', 'data_sources']

    # Only decode the fields that this dataset actually contains
    present_fields = [name for name in json_fields if name in df.columns]
    for field in present_fields:
        df[field] = df[field].apply(parse_json_field)
        print(f"Parsed {field}")

    print("\nJSON fields parsed successfully!")

## 3. Data Quality Assessment

In [None]:
if not df.empty:
    print("=== DATA QUALITY ASSESSMENT ===")

    # Null count per column (largest first) and the same figure as a
    # percentage of all rows
    print("\nMissing Values Analysis:")
    missing_stats = df.isna().sum().sort_values(ascending=False)
    missing_pct = missing_stats.div(len(df)).mul(100).round(2)

    missing_df = pd.DataFrame({
        'Missing_Count': missing_stats,
        'Missing_Percentage': missing_pct
    })

    # Show only the columns that have at least one null
    display(missing_df.loc[missing_df['Missing_Count'].gt(0)])

    # (column, heading, Series method) for each optional quality column:
    # describe() for the numeric score, value_counts() for the two labels
    quality_summaries = (
        ('data_completeness_score', "\nData Completeness Score Distribution:", 'describe'),
        ('quality_flag', "\nQuality Flag Distribution:", 'value_counts'),
        ('confidence_level', "\nConfidence Level Distribution:", 'value_counts'),
    )
    for column, heading, method_name in quality_summaries:
        if column not in df.columns:
            continue
        print(heading)
        print(getattr(df[column], method_name)())

In [None]:
# Visualize data quality
if not df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Data completeness distribution
    if 'data_completeness_score' in df.columns:
        axes[0, 0].hist(df['data_completeness_score'], bins=20, alpha=0.7, color='skyblue')
        axes[0, 0].set_title('Data Completeness Score Distribution')
        axes[0, 0].set_xlabel('Completeness Score')
        axes[0, 0].set_ylabel('Frequency')
    else:
        # Hide the panel instead of leaving an empty frame in the figure
        axes[0, 0].set_visible(False)

    # Quality flag distribution
    if 'quality_flag' in df.columns:
        quality_counts = df['quality_flag'].value_counts()
        axes[0, 1].pie(quality_counts.values, labels=quality_counts.index, autopct='%1.1f%%')
        axes[0, 1].set_title('Quality Flag Distribution')
    else:
        axes[0, 1].set_visible(False)

    # Confidence level distribution
    if 'confidence_level' in df.columns:
        conf_counts = df['confidence_level'].value_counts()
        axes[1, 0].bar(conf_counts.index, conf_counts.values, color=['green', 'orange', 'red'])
        axes[1, 0].set_title('Confidence Level Distribution')
        axes[1, 0].set_ylabel('Count')
    else:
        axes[1, 0].set_visible(False)

    # Missing values heatmap.
    # A column that is never (or always) missing has a constant indicator,
    # so its correlation with anything is undefined (NaN). Keep only the
    # columns whose missingness actually varies between rows.
    missing_matrix = df.isnull().astype(int)
    varying_missing = missing_matrix.loc[:, missing_matrix.nunique() > 1]

    if varying_missing.shape[1] >= 2:
        # Per-cell annotations are unreadable on a wide matrix, so they are
        # only drawn when the matrix is small.
        sns.heatmap(varying_missing.corr(), annot=varying_missing.shape[1] <= 12,
                    cmap='RdYlBu_r', ax=axes[1, 1])
    else:
        # Nothing to correlate: say so instead of drawing an all-NaN map
        axes[1, 1].text(0.5, 0.5, 'Fewer than two columns have missing values',
                        ha='center', va='center', transform=axes[1, 1].transAxes)
        axes[1, 1].set_xticks([])
        axes[1, 1].set_yticks([])
    axes[1, 1].set_title('Missing Values Correlation')

    plt.tight_layout()
    plt.show()

## 4. Basic Statistics and Distribution Analysis

In [None]:
if not df.empty:
    print("=== BASIC STATISTICS ===")

    # Headline counts: total rows, then distinct values of three key columns
    print(f"Total varieties: {len(df)}")
    headline_columns = (
        ("Unique varieties", 'variety_name'),
        ("Unique crops", 'crop_type'),
        ("Unique institutions", 'breeding_institution'),
    )
    for label, column in headline_columns:
        print(f"{label}: {df[column].nunique()}")

    # Number of records per crop type
    print("\nCrop Type Distribution:")
    crop_dist = df['crop_type'].value_counts()
    print(crop_dist)

    # describe() summary of the release year, when that column exists
    if 'year_of_release' in df.columns:
        year_stats = df['year_of_release'].describe()
        print("\nYear of Release Statistics:")
        print(year_stats)

    # describe() summary of days to maturity, when that column exists
    if 'maturity_days' in df.columns:
        maturity_stats = df['maturity_days'].describe()
        print("\nMaturity Days Statistics:")
        print(maturity_stats)

In [None]:
# Visualize basic distributions
if not df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    # Name the four panels once instead of indexing the grid repeatedly
    (ax_crop, ax_year), (ax_inst, ax_maturity) = axes

    # Top-left: share of records per crop type
    crop_counts = df['crop_type'].value_counts()
    ax_crop.pie(crop_counts.values, labels=crop_counts.index, autopct='%1.1f%%')
    ax_crop.set_title('Distribution by Crop Type')

    # Top-right: histogram of release years
    if 'year_of_release' in df.columns:
        df['year_of_release'].hist(bins=20, ax=ax_year, alpha=0.7, color='lightgreen')
        ax_year.set_title('Distribution by Year of Release')
        ax_year.set_xlabel('Year')
        ax_year.set_ylabel('Number of Varieties')

    # Bottom-left: the ten institutions with the most records; names longer
    # than 30 characters are cut and marked with an ellipsis
    top_institutions = df['breeding_institution'].value_counts().head(10)
    bar_positions = range(len(top_institutions))
    institution_labels = [
        inst if len(inst) <= 30 else inst[:30] + '...'
        for inst in top_institutions.index
    ]
    ax_inst.barh(bar_positions, top_institutions.values)
    ax_inst.set_yticks(bar_positions)
    ax_inst.set_yticklabels(institution_labels)
    ax_inst.set_title('Top 10 Breeding Institutions')
    ax_inst.set_xlabel('Number of Varieties')

    # Bottom-right: histogram of days to maturity
    if 'maturity_days' in df.columns:
        df['maturity_days'].hist(bins=15, ax=ax_maturity, alpha=0.7, color='salmon')
        ax_maturity.set_title('Distribution by Maturity Days')
        ax_maturity.set_xlabel('Days to Maturity')
        ax_maturity.set_ylabel('Number of Varieties')

    plt.tight_layout()
    plt.show()

## 5. Stress Tolerance Analysis

In [None]:
if not df.empty and 'stressors_tolerated' in df.columns:
    print("=== STRESS TOLERANCE ANALYSIS ===")

    # Rows whose stressor cell is a non-empty list, and the flat list of
    # every stressor those rows mention
    populated_lists = [
        entry for entry in df['stressors_tolerated']
        if isinstance(entry, list) and len(entry) > 0
    ]
    all_stresses = [stress for entry in populated_lists for stress in entry]
    stress_variety_count = len(populated_lists)

    print(f"Varieties with stress tolerance information: {stress_variety_count}")
    print(f"Total stress tolerance entries: {len(all_stresses)}")

    # Frequency table of stressors, most frequent first
    stress_counter = Counter(all_stresses)
    stress_df = (
        pd.DataFrame.from_dict(stress_counter, orient='index', columns=['Count'])
        .sort_values('Count', ascending=False)
    )

    print("\nTop Stress Tolerances:")
    print(stress_df.head(15))

    # One (crop, stress) record per stressor of each row that holds a list
    stress_crop_data = [
        {'Crop': crop, 'Stress': stress}
        for crop, entry in zip(df['crop_type'], df['stressors_tolerated'])
        if isinstance(entry, list)
        for stress in entry
    ]

    if stress_crop_data:
        stress_crop_df = pd.DataFrame(stress_crop_data)
        print("\nStress tolerance by crop type:")
        # Rows are stressors, columns are crops, cells are pair counts
        stress_crop_crosstab = pd.crosstab(stress_crop_df['Stress'], stress_crop_df['Crop'])
        print(stress_crop_crosstab)

In [None]:
# Visualize stress tolerance analysis
if not df.empty and 'stressors_tolerated' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_top, ax_per_variety), (ax_heat, ax_share) = axes

    # Top-left: the fifteen most frequent stressors
    if len(stress_df) > 0:
        top_stresses = stress_df.head(15)
        bar_positions = range(len(top_stresses))
        ax_top.barh(bar_positions, top_stresses['Count'])
        ax_top.set_yticks(bar_positions)
        ax_top.set_yticklabels(top_stresses.index)
        ax_top.set_title('Top 15 Stress Tolerances')
        ax_top.set_xlabel('Number of Varieties')

    # Top-right: how many stressors each variety lists (non-lists count as 0),
    # with one unit-wide bin per count
    stress_counts_per_variety = df['stressors_tolerated'].apply(
        lambda cell: len(cell) if isinstance(cell, list) else 0
    )
    unit_bins = range(0, stress_counts_per_variety.max() + 2)
    ax_per_variety.hist(stress_counts_per_variety, bins=unit_bins,
                        alpha=0.7, color='lightcoral')
    ax_per_variety.set_title('Number of Stress Tolerances per Variety')
    ax_per_variety.set_xlabel('Number of Stress Tolerances')
    ax_per_variety.set_ylabel('Number of Varieties')

    # Bottom-left: stressor-by-crop counts, limited to the ten stressors and
    # eight crops with the largest totals (only if the crosstab was built)
    if 'stress_crop_crosstab' in locals() and not stress_crop_crosstab.empty:
        stress_totals = stress_crop_crosstab.sum(axis=1)
        crop_totals = stress_crop_crosstab.sum(axis=0)
        top_stresses_for_heatmap = stress_totals.sort_values(ascending=False).head(10)
        top_crops_for_heatmap = crop_totals.sort_values(ascending=False).head(8)

        heatmap_data = stress_crop_crosstab.loc[top_stresses_for_heatmap.index,
                                                top_crops_for_heatmap.index]

        sns.heatmap(heatmap_data, annot=True, cmap='YlOrRd', ax=ax_heat)
        ax_heat.set_title('Stress Tolerance by Crop Type (Top 10 x Top 8)')
        ax_heat.set_xlabel('Crop Type')
        ax_heat.set_ylabel('Stress Tolerance')

    # Bottom-right: varieties with at least one stressor versus the rest
    has_stress_info = int((stress_counts_per_variety > 0).sum())
    no_stress_info = len(df) - has_stress_info

    stress_info_data = [has_stress_info, no_stress_info]
    stress_info_labels = ['With Stress Info', 'Without Stress Info']

    ax_share.pie(stress_info_data, labels=stress_info_labels, autopct='%1.1f%%',
                 colors=['lightgreen', 'lightgray'])
    ax_share.set_title('Varieties with Stress Tolerance Information')

    plt.tight_layout()
    plt.show()

## 6. Geographic and Temporal Analysis

In [None]:
if not df.empty:
    print("=== GEOGRAPHIC AND TEMPORAL ANALYSIS ===")

    # How often each state appears across all recommendation lists
    if 'recommended_states' in df.columns:
        all_states = [
            state
            for entry in df['recommended_states']
            if isinstance(entry, list)
            for state in entry
        ]

        if all_states:
            state_counter = Counter(all_states)
            state_df = (
                pd.DataFrame.from_dict(state_counter, orient='index', columns=['Count'])
                .sort_values('Count', ascending=False)
            )

            print("\nTop 15 Recommended States:")
            print(state_df.head(15))

    # Releases per decade, accumulated from the per-year counts
    if 'year_of_release' in df.columns:
        year_trends = df['year_of_release'].value_counts().sort_index()
        print("\nVarieties released by decade:")

        decades = {}
        for year, count in year_trends.items():
            if pd.isna(year):
                continue
            # 1997 -> "1990s"
            decade = f"{int(year) // 10 * 10}s"
            decades[decade] = decades.get(decade, 0) + count

        for decade in sorted(decades):
            print(f"{decade}: {decades[decade]}")

    # Records per crop for release years from 2010 onwards
    if 'year_of_release' in df.columns and 'crop_type' in df.columns:
        crop_year_data = (
            df.groupby(['crop_type', 'year_of_release'])
            .size()
            .reset_index(name='count')
        )
        print("\nRecent trends (2010 onwards) by crop:")
        recent_data = crop_year_data.loc[crop_year_data['year_of_release'].ge(2010)]
        recent_summary = (
            recent_data.groupby('crop_type')['count']
            .sum()
            .sort_values(ascending=False)
        )
        print(recent_summary.head(10))

In [None]:
# Visualize geographic and temporal patterns
if not df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_states, ax_years), (ax_decades, ax_recent) = axes

    # Top-left: fifteen most frequently recommended states (drawn only if
    # the analysis cell produced state_df)
    if 'state_df' in locals() and len(state_df) > 0:
        top_states = state_df.head(15)
        bar_positions = range(len(top_states))
        ax_states.barh(bar_positions, top_states['Count'])
        ax_states.set_yticks(bar_positions)
        ax_states.set_yticklabels(top_states.index)
        ax_states.set_title('Top 15 Recommended States')
        ax_states.set_xlabel('Number of Varieties')

    # Top-right: number of releases per year, in chronological order
    if 'year_of_release' in df.columns:
        year_counts = df['year_of_release'].value_counts().sort_index()
        ax_years.plot(year_counts.index, year_counts.values, marker='o', linewidth=2)
        ax_years.set_title('Varieties Released by Year')
        ax_years.set_xlabel('Year')
        ax_years.set_ylabel('Number of Varieties')
        ax_years.grid(True, alpha=0.3)

    # Bottom-left: releases per decade, in the dict's insertion order
    if 'decades' in locals():
        decade_names = list(decades)
        decade_counts = [decades[name] for name in decade_names]
        ax_decades.bar(decade_names, decade_counts, alpha=0.7, color='skyblue')
        ax_decades.set_title('Varieties Released by Decade')
        ax_decades.set_xlabel('Decade')
        ax_decades.set_ylabel('Number of Varieties')
        ax_decades.tick_params(axis='x', rotation=45)

    # Bottom-right: crop shares among releases from 2010 onwards
    if 'recent_summary' in locals() and len(recent_summary) > 0:
        top_recent_crops = recent_summary.head(10)
        ax_recent.pie(top_recent_crops.values, labels=top_recent_crops.index, autopct='%1.1f%%')
        ax_recent.set_title('Recent Varieties by Crop (2010+)')

    plt.tight_layout()
    plt.show()

## 7. Institution and Breeding Analysis

In [None]:
if not df.empty:
    print("=== INSTITUTION AND BREEDING ANALYSIS ===")

    # Records per breeding institution, most frequent first
    inst_counts = df['breeding_institution'].value_counts()
    print("\nTop 15 Breeding Institutions:")
    print(inst_counts.head(15))

    if 'crop_type' in df.columns:
        # One row per (institution, crop) pair with its record count
        inst_crop_analysis = (
            df.groupby(['breeding_institution', 'crop_type'])
            .size()
            .reset_index(name='count')
        )

        # Number of distinct crops per institution, most diverse first
        inst_diversity = (
            inst_crop_analysis.groupby('breeding_institution')['crop_type']
            .nunique()
            .sort_values(ascending=False)
        )
        print("\nInstitutions with highest crop diversity:")
        print(inst_diversity.head(10))

        # For each of the five most common crops, its three leading institutions
        print("\nTop specialists by crop:")
        leading_crops = df['crop_type'].value_counts().head(5).index
        for crop in leading_crops:
            breeders_of_crop = df.loc[df['crop_type'] == crop, 'breeding_institution']
            crop_specialists = breeders_of_crop.value_counts().head(3)
            print(f"\n{crop}:")
            for inst, count in crop_specialists.items():
                print(f"  {inst}: {count} varieties")

    # Frequency of each quality trait across all trait lists
    if 'quality_traits' in df.columns:
        all_quality_traits = [
            trait
            for entry in df['quality_traits']
            if isinstance(entry, list)
            for trait in entry
        ]

        if all_quality_traits:
            quality_counter = Counter(all_quality_traits)
            quality_df = (
                pd.DataFrame.from_dict(quality_counter, orient='index', columns=['Count'])
                .sort_values('Count', ascending=False)
            )

            print("\nTop 15 Quality Traits:")
            print(quality_df.head(15))

In [None]:
# Visualize institution and breeding patterns
if not df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_inst, ax_diversity), (ax_traits, ax_trait_counts) = axes

    # Top-left: fifteen institutions with the most records; names longer
    # than 40 characters are cut and marked with an ellipsis
    top_institutions = inst_counts.head(15)
    inst_positions = range(len(top_institutions))
    truncated_names = [
        name if len(name) <= 40 else name[:40] + '...'
        for name in top_institutions.index
    ]
    ax_inst.barh(inst_positions, top_institutions.values)
    ax_inst.set_yticks(inst_positions)
    ax_inst.set_yticklabels(truncated_names)
    ax_inst.set_title('Top 15 Breeding Institutions')
    ax_inst.set_xlabel('Number of Varieties')

    # Top-right: ten institutions covering the most distinct crops; x labels
    # are cut at 15 characters
    if 'inst_diversity' in locals():
        top_diverse = inst_diversity.head(10)
        diverse_positions = range(len(top_diverse))
        truncated_diverse_names = [
            name if len(name) <= 15 else name[:15] + '...'
            for name in top_diverse.index
        ]
        ax_diversity.bar(diverse_positions, top_diverse.values, alpha=0.7, color='lightgreen')
        ax_diversity.set_xticks(diverse_positions)
        ax_diversity.set_xticklabels(truncated_diverse_names, rotation=45, ha='right')
        ax_diversity.set_title('Institution Crop Diversity')
        ax_diversity.set_ylabel('Number of Different Crops')

    # Bottom-left: fifteen most frequent quality traits
    if 'quality_df' in locals() and len(quality_df) > 0:
        top_quality = quality_df.head(15)
        trait_positions = range(len(top_quality))
        ax_traits.barh(trait_positions, top_quality['Count'])
        ax_traits.set_yticks(trait_positions)
        ax_traits.set_yticklabels(top_quality.index)
        ax_traits.set_title('Top 15 Quality Traits')
        ax_traits.set_xlabel('Number of Varieties')

    # Bottom-right: how many quality traits each variety lists (non-lists
    # count as 0), with one unit-wide bin per count
    if 'quality_traits' in df.columns:
        quality_counts_per_variety = df['quality_traits'].apply(
            lambda cell: len(cell) if isinstance(cell, list) else 0
        )
        unit_bins = range(0, quality_counts_per_variety.max() + 2)
        ax_trait_counts.hist(quality_counts_per_variety, bins=unit_bins,
                             alpha=0.7, color='lightpink')
        ax_trait_counts.set_title('Number of Quality Traits per Variety')
        ax_trait_counts.set_xlabel('Number of Quality Traits')
        ax_trait_counts.set_ylabel('Number of Varieties')

    plt.tight_layout()
    plt.show()

## 8. Advanced Analytics and Insights

In [None]:
if not df.empty:
    print("=== ADVANCED ANALYTICS AND INSIGHTS ===")

    # Multi-stress tolerant varieties.
    # stress_counts holds the number of listed stressors per row (0 for
    # non-lists). When the column is absent it is all zeros, so this cell and
    # the plotting cell that reads stress_counts still run.
    if 'stressors_tolerated' in df.columns:
        stress_counts = df['stressors_tolerated'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    else:
        stress_counts = pd.Series(0, index=df.index)
    multi_stress_mask = stress_counts >= 3
    multi_stress_varieties = df[multi_stress_mask]

    print(f"\nVarieties with 3+ stress tolerances: {len(multi_stress_varieties)}")
    if len(multi_stress_varieties) > 0:
        print("\nTop multi-stress tolerant varieties:")
        # Rank by number of stressors (descending) so "top" really means the
        # most tolerant varieties, not the first ten rows. The sort is
        # stable, so ties keep their original order. Positions are used
        # rather than labels so a non-unique index cannot duplicate rows.
        ranking = np.argsort(-stress_counts[multi_stress_mask].to_numpy(), kind='stable')[:10]
        for _, row in multi_stress_varieties.iloc[ranking].iterrows():
            # str() keeps join() from failing on non-string list entries
            stresses = ', '.join(map(str, row['stressors_tolerated']))
            print(f"  {row['variety_name']} ({row['crop_type']}): {stresses}")

    # Correlation analysis
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 1:
        correlation_matrix = df[numeric_columns].corr()
        print("\nCorrelation analysis (significant correlations):")

        # Visit each unordered column pair once (upper triangle, diagonal
        # excluded) and keep pairs with |r| > 0.5. NaN never passes the
        # comparison, so undefined correlations are skipped automatically.
        corr_columns = correlation_matrix.columns
        upper_rows, upper_cols = np.triu_indices(len(corr_columns), k=1)
        strong_correlations = [
            (corr_columns[i], corr_columns[j], correlation_matrix.iloc[i, j])
            for i, j in zip(upper_rows, upper_cols)
            if abs(correlation_matrix.iloc[i, j]) > 0.5
        ]

        if strong_correlations:
            for col1, col2, corr in strong_correlations:
                print(f"  {col1} vs {col2}: {corr:.3f}")
        else:
            print("  No strong correlations found")

    # Data completeness by source: every source listed on a row is credited
    # with that row's completeness score, then scores are averaged per source
    if 'data_sources' in df.columns and 'data_completeness_score' in df.columns:
        source_completeness = [
            {'source': source, 'completeness': completeness}
            for sources, completeness in zip(df['data_sources'], df['data_completeness_score'])
            if isinstance(sources, list) and pd.notna(completeness)
            for source in sources
        ]

        if source_completeness:
            source_comp_df = pd.DataFrame(source_completeness)
            avg_completeness_by_source = source_comp_df.groupby('source')['completeness'].mean().sort_values(ascending=False)
            print("\nAverage data completeness by source:")
            print(avg_completeness_by_source.head(10))

In [None]:
# Advanced visualizations
if not df.empty:
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_stress, ax_corr), (ax_complete, ax_scatter) = axes

    # Top-left: number of varieties for each stressor count
    stress_count_dist = stress_counts.value_counts().sort_index()
    ax_stress.bar(stress_count_dist.index, stress_count_dist.values, alpha=0.7, color='mediumpurple')
    ax_stress.set_title('Distribution of Stress Tolerance Count')
    ax_stress.set_xlabel('Number of Stress Tolerances')
    ax_stress.set_ylabel('Number of Varieties')

    # Top-right: correlation matrix of the numeric columns, colour scale
    # centred on zero (same guard as the cell that computed the matrix)
    if len(numeric_columns) > 1:
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax_corr)
        ax_corr.set_title('Correlation Matrix')

    # Bottom-left: mean completeness score per crop, highest first
    if 'data_completeness_score' in df.columns:
        completeness_by_crop = (
            df.groupby('crop_type')['data_completeness_score']
            .mean()
            .sort_values(ascending=False)
        )
        crop_positions = range(len(completeness_by_crop))
        ax_complete.bar(crop_positions, completeness_by_crop.values, alpha=0.7, color='orange')
        ax_complete.set_xticks(crop_positions)
        ax_complete.set_xticklabels(completeness_by_crop.index, rotation=45, ha='right')
        ax_complete.set_title('Average Data Completeness by Crop Type')
        ax_complete.set_ylabel('Completeness Score')

    # Bottom-right: maturity against release year, restricted to rows with
    # maturity under 300 days and release year after 1990
    if 'maturity_days' in df.columns and 'year_of_release' in df.columns:
        in_plot_range = df['maturity_days'].lt(300) & df['year_of_release'].gt(1990)
        clean_data = df.loc[in_plot_range]
        ax_scatter.scatter(clean_data['year_of_release'], clean_data['maturity_days'], alpha=0.6)
        ax_scatter.set_title('Maturity Days vs Year of Release')
        ax_scatter.set_xlabel('Year of Release')
        ax_scatter.set_ylabel('Maturity Days')
        ax_scatter.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 9. Summary and Key Insights

In [None]:
if not df.empty:
    print("=== SUMMARY ===")

    def _shorten(label, limit=50):
        """Return label as text, cut to limit characters plus '...' only when it is longer."""
        text = str(label)
        return text[:limit] + '...' if len(text) > limit else text

    # The completeness score is an optional column: other cells check for it
    # before use, so the summary does the same instead of raising KeyError.
    has_completeness = 'data_completeness_score' in df.columns

    print(f"\n DATASET OVERVIEW:")
    print(f"   • Total varieties in database: {len(df):,}")
    print(f"   • Unique crops covered: {df['crop_type'].nunique()}")
    print(f"   • Breeding institutions involved: {df['breeding_institution'].nunique()}")
    if has_completeness:
        print(f"   • Average data completeness: {df['data_completeness_score'].mean():.2f}")

    print(f"\n CROP INSIGHTS:")
    crop_tally = df['crop_type'].value_counts()
    # value_counts() drops nulls, so it is empty when every crop is missing
    if not crop_tally.empty:
        top_crop = crop_tally.index[0]
        top_crop_count = crop_tally.iloc[0]
        print(f"   • Most represented crop: {top_crop} ({top_crop_count} varieties)")
    print(f"   • Crop diversity index: {df['crop_type'].nunique() / len(df):.3f}")

    print(f"\n STRESS TOLERANCE INSIGHTS:")
    if 'stressors_tolerated' in df.columns:
        # Number of listed stressors per row; anything that is not a list
        # counts as zero
        stress_sizes = [len(x) if isinstance(x, list) else 0 for x in df['stressors_tolerated']]
        stress_variety_count = sum(1 for size in stress_sizes if size > 0)
        print(f"   • Varieties with stress tolerance info: {stress_variety_count} ({stress_variety_count/len(df)*100:.1f}%)")

        # stress_df only exists if the stress-analysis cell has been run
        if 'stress_df' in locals() and len(stress_df) > 0:
            top_stress = stress_df.index[0]
            top_stress_count = stress_df.iloc[0, 0]
            print(f"   • Most common stress tolerance: {top_stress} ({top_stress_count} varieties)")

        multi_stress_count = sum(1 for size in stress_sizes if size >= 3)
        print(f"   • Multi-stress tolerant varieties (3+): {multi_stress_count}")

    print(f"\n INSTITUTIONAL INSIGHTS:")
    institution_tally = df['breeding_institution'].value_counts()
    if not institution_tally.empty:
        top_institution = institution_tally.index[0]
        top_inst_count = institution_tally.iloc[0]
        # The ellipsis is added only when the name was actually cut
        print(f"   • Most productive institution: {_shorten(top_institution)} ({top_inst_count} varieties)")

    if 'inst_diversity' in locals() and len(inst_diversity) > 0:
        most_diverse = inst_diversity.index[0]
        diversity_count = inst_diversity.iloc[0]
        print(f"   • Most diverse institution: {_shorten(most_diverse)} ({diversity_count} crops)")

    print(f"\n TEMPORAL INSIGHTS:")
    if 'year_of_release' in df.columns:
        release_years = df['year_of_release']
        recent_varieties = int((release_years >= 2010).sum())
        print(f"   • Varieties released since 2010: {recent_varieties}")

        # The 1990+ average depends on 1990+ rows existing, not on whether
        # any variety was released after 2010.
        since_1990 = release_years[release_years >= 1990]
        if not since_1990.empty:
            avg_year = since_1990.mean()
            print(f"   • Average release year (1990+): {avg_year:.0f}")

    print(f"\n GEOGRAPHIC INSIGHTS:")
    if 'state_df' in locals() and len(state_df) > 0:
        top_state = state_df.index[0]
        top_state_count = state_df.iloc[0, 0]
        print(f"   • Most recommended state: {top_state} ({top_state_count} varieties)")
        print(f"   • States covered: {len(state_df)}")

    print(f"\n DATA QUALITY INSIGHTS:")
    if 'quality_flag' in df.columns:
        good_quality = len(df[df['quality_flag'] == 'GOOD'])
        print(f"   • High quality records: {good_quality} ({good_quality/len(df)*100:.1f}%)")

    if 'confidence_level' in df.columns:
        high_confidence = len(df[df['confidence_level'] == 'HIGH'])
        print(f"   • High confidence records: {high_confidence} ({high_confidence/len(df)*100:.1f}%)")

    print(f"\n RECOMMENDATIONS FOR FURTHER ANALYSIS:")
    print(f"   • Focus on multi-stress tolerant varieties for climate resilience")
    print(f"   • Investigate regional adaptation patterns")
    print(f"   • Analyze breeding institution collaboration networks")
    print(f"   • Study temporal trends in stress tolerance development")
    print(f"   • Examine quality trait combinations for market preferences")

    print(f"\n DATABASE GROWTH POTENTIAL:")
    if has_completeness:
        low_completeness = len(df[df['data_completeness_score'] < 0.5])
        print(f"   • Records with improvement potential: {low_completeness}")
    print(f"   • Opportunity for data enrichment exists in stress tolerance and quality traits")

else:
    print("No data available for analysis. Please ensure the pipeline has been executed successfully.")