# US Tree Data Analysis

This notebook provides in-depth analysis of tree distribution, health, and environmental impact across the United States. The insights generated here will be integrated into our interactive dashboard.

In [None]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import plotly.express as px
import plotly.graph_objects as go
from osgeo import gdal
from pathlib import Path

# Notebook-wide pandas display configuration:
# show every column of wide frames, but cap printed rows at 50.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

In [None]:
from src.ml_utils import TreeHealthPredictor, TreeGrowthForecaster, calculate_environmental_impact

## Data Loading and Preprocessing

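We'll load our tree data and inspect its structure before starting the analysis. For now the data is a synthetic sample generated inside the notebook; swap in the real dataset when it becomes available.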

In [None]:
# Load sample data (replace with actual data when available)
def create_sample_data(n_trees=1000, n_counties=100, seed=42):
    """Generate synthetic tree and canopy-coverage tables for demonstration.

    Parameters
    ----------
    n_trees : int, default 1000
        Number of rows in the tree table.
    n_counties : int, default 100
        Number of rows in the canopy table.
    seed : int or None, default 42
        Seed for the random generator. A fixed seed makes the notebook
        reproducible under "Restart Kernel -> Run All"; pass ``None`` to get
        fresh random data on every call.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_trees, df_canopy)``.

        ``df_trees`` has the columns ``state``, ``city``, ``species``,
        ``health``, ``dbh``, ``height``, ``lat`` and ``lon``.
        ``df_canopy`` has the columns ``state``, ``county`` and ``canopy_pct``.

    Notes
    -----
    ``lat``/``lon`` are drawn uniformly from latitude 25..49 and longitude
    -125..-70, independently of ``state``; they are placeholders only.
    """
    # Local generator: results do not depend on (or disturb) global RNG state.
    rng = np.random.default_rng(seed)

    # Each state is paired with exactly one city so a row can never combine,
    # for example, state 'NY' with city 'Los Angeles'.
    state_to_city = {'CA': 'Los Angeles', 'NY': 'New York', 'TX': 'Austin'}
    states = list(state_to_city)

    # Normal draws can come out zero or negative, which is impossible for a
    # physical size; clip both size columns to this small positive floor.
    min_size = 1.0

    # Create sample tree data
    tree_states = rng.choice(states, n_trees)
    df_trees = pd.DataFrame({
        'state': tree_states,
        'city': pd.Series(tree_states).map(state_to_city).to_numpy(),
        'species': rng.choice(['Oak', 'Maple', 'Pine'], n_trees),
        'health': rng.choice(['Good', 'Fair', 'Poor'], n_trees),
        'dbh': np.clip(rng.normal(30, 10, n_trees), min_size, None),  # Diameter at breast height
        'height': np.clip(rng.normal(50, 15, n_trees), min_size, None),
        'lat': rng.uniform(25, 49, n_trees),
        'lon': rng.uniform(-125, -70, n_trees),
    })

    # Create sample canopy coverage data
    df_canopy = pd.DataFrame({
        'state': rng.choice(states, n_counties),
        'county': [f'County_{i}' for i in range(n_counties)],
        'canopy_pct': rng.uniform(10, 60, n_counties),
    })

    return df_trees, df_canopy

# Load the data
df_trees, df_canopy = create_sample_data()

# Display basic information.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print('Tree Dataset Info:')
df_trees.info()
print('\nCanopy Dataset Info:')
df_canopy.info()

## Geographic Distribution Analysis

Let's analyze the geographic distribution of trees and canopy coverage across different states and cities.

In [None]:
# Plot every tree as a point, colored by species, with health and size
# attributes available on hover.
# NOTE(review): recent Plotly releases deprecate px.scatter_mapbox in favor of
# px.scatter_map — confirm the installed version before switching.
fig = px.scatter_mapbox(
    df_trees,
    lat='lat',
    lon='lon',
    color='species',
    hover_data=['health', 'dbh', 'height'],
    zoom=3,
    title='Tree Distribution Map',
)
fig.update_layout(mapbox={'style': 'carto-positron'})
fig.show()

# Per-state summary: number of trees plus mean diameter and mean height,
# rounded to two decimals.
state_stats = (
    df_trees
    .groupby('state')[['species', 'dbh', 'height']]
    .agg({'species': 'count', 'dbh': 'mean', 'height': 'mean'})
    .round(2)
)

print('\nState-level Statistics:', state_stats, sep='\n')

## Species Diversity Analysis

This section analyzes the tree species distribution and computes the Shannon diversity index for each state.

In [None]:
# Calculate species diversity by state
def calculate_diversity_index(data):
    """Return the Shannon diversity index of the ``species`` column.

    The index is ``H = -sum(p_i * ln(p_i))`` where ``p_i`` is the share of
    species ``i`` among the rows that have a species value.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``species`` column.

    Returns
    -------
    float
        The Shannon index (natural log). ``0.0`` when ``data`` contains no
        species values or only a single species.
    """
    # normalize=True divides by the number of non-missing observations, so rows
    # with a missing species do not deflate the proportions.
    proportions = data['species'].value_counts(normalize=True)

    # Categorical columns report unobserved categories with a share of zero;
    # drop them so log(0) is never evaluated.
    proportions = proportions[proportions > 0]
    if proportions.empty:
        return 0.0

    shannon_diversity = -np.sum(proportions * np.log(proportions))
    # Adding 0.0 turns the -0.0 produced for a single-species group into 0.0.
    return float(shannon_diversity) + 0.0

# Select the 'species' column explicitly before apply(): the grouping column
# is then not handed to the applied function, which avoids the
# DeprecationWarning pandas >= 2.2 raises for apply() on the full frame.
diversity_by_state = df_trees.groupby('state')[['species']].apply(calculate_diversity_index)

# Create species distribution visualization
fig = px.sunburst(df_trees,
                  path=['state', 'species'],
                  title='Tree Species Distribution by State')
fig.show()

print('\nShannon Diversity Index by State:')
print(diversity_by_state)

## Tree Health Assessment

Analyzing tree health conditions and identifying patterns or concerns.

In [None]:
# Count trees for every species/health combination
health_by_species = pd.crosstab(df_trees['species'], df_trees['health'])

# Create health distribution visualization.
# The chart is drawn from the aggregated counts: feeding the raw rows to
# px.bar with barmode='group' yields one height-1 bar per tree, overlaid on
# each other, instead of a bar whose height is the number of trees.
health_counts = (
    health_by_species
    .reset_index()
    .melt(id_vars='species', var_name='health', value_name='count')
)
fig = px.bar(health_counts,
             x='species',
             y='count',
             color='health',
             title='Tree Health Distribution by Species',
             barmode='group')
fig.show()

# Calculate health metrics: percentage of trees per state rated 'Good'
health_metrics = (
    df_trees['health'].eq('Good')
    .groupby(df_trees['state'])
    .mean()
    .mul(100)
    .round(2)
    .to_frame('Healthy Trees %')
)

print('\nHealth Metrics by State:')
print(health_metrics)

## Machine Learning Analysis

Let's use our ML utilities to predict tree health and forecast growth patterns.

In [None]:
# Initialize and train health predictor
health_predictor = TreeHealthPredictor()
metrics = health_predictor.train(df_trees)

print('Health Prediction Model Performance:')
print(f'Accuracy: {metrics["accuracy"]:.2f}')
print(f'F1 Score: {metrics["f1_score"]:.2f}')

# Predict health for a handful of trees. A fixed random_state makes the same
# row positions come back on every run, so the printed predictions can be
# compared across runs.
sample_trees = df_trees.sample(5, random_state=42)
predicted_health = health_predictor.predict(sample_trees)

print('\nSample Health Predictions:')
# Iterate over the predictions themselves instead of using iterrows() only to
# obtain a positional index.
# NOTE(review): assumes predict() returns one prediction per input row as a
# flat sequence (list / array / Series) — confirm against src.ml_utils.
for tree_number, health_label in enumerate(predicted_health, start=1):
    print(f'Tree {tree_number} - Predicted Health: {health_label}')

In [None]:
# Fit the growth model on the full tree table
growth_forecaster = TreeGrowthForecaster()
metrics = growth_forecaster.train(df_trees)

# Report the fit quality returned by train()
print('\nGrowth Forecasting Model Performance:')
r2_value = metrics["r2_score"]
print(f'R² Score: {r2_value:.2f}')
rmse_value = metrics["rmse"]
print(f'RMSE: {rmse_value:.2f}')

# Forecast growth for the trees sampled in the health-prediction cell
growth_forecast = growth_forecaster.forecast(sample_trees)

print('\nGrowth Forecasts (DBH in cm):', growth_forecast, sep='\n')

## Environmental Impact Analysis

Analyze the environmental benefits of the tree population.

In [None]:
# Compute the impact figures for the whole tree table
impact_metrics = calculate_environmental_impact(df_trees)

# Print each metric with a human-readable label and thousands separators
print('Environmental Impact Metrics (Annual):')
for metric, value in impact_metrics.items():
    label = metric.replace("_", " ").title()
    print(f'{label}: {value:,.0f}')

# Bar chart with one bar per metric
fig = px.bar(
    x=list(impact_metrics.keys()),
    y=list(impact_metrics.values()),
    title='Annual Environmental Impact',
)
fig.update_layout(
    xaxis={'title': {'text': 'Metric'}, 'tickangle': 45},
    yaxis={'title': {'text': 'Value'}},
)
fig.show()