In [None]:
# AgriData Explorer - Rice Production EDA
# File: analysis/eda_rice.ipynb

import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
# Silence library warnings so the printed analysis output stays readable.
# NOTE(review): this is a blanket filter, so genuine deprecation/runtime
# warnings are hidden as well.
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("Set2")

# Every chart below is exported into plotly_exports/. Neither matplotlib's
# savefig nor plotly's write_html creates a missing folder, so make sure it
# exists before the first export instead of failing with FileNotFoundError.
os.makedirs('plotly_exports', exist_ok=True)

#%% Load Data
# Read the pre-cleaned dataset. The path is relative, so it resolves against
# the directory the script/notebook is run from.
data_path = '../data/processed/agri_data_cleaned.csv'
print("Loading cleaned agricultural data...")
df = pd.read_csv(data_path)
print(f"Data loaded: {df.shape}")
print(f"Years covered: {df['year'].min()} to {df['year'].max()}")

#%% Data Overview
banner = "=" * 80
print("\n" + banner)
print("RICE PRODUCTION DATA OVERVIEW")
print(banner)

# Columns summarised by describe() below: rice area, production and yield
rice_cols = [
    'rice_area_1000_ha',
    'rice_production_1000_tons',
    'rice_yield_kg_per_ha',
]
rice_summary = df[rice_cols].describe()
print("\nRice Statistics:")
print(rice_summary)

#%% ============================================================================
# EDA 1: TOP 7 RICE PRODUCING STATES (BAR PLOT)
#%% ============================================================================

# Aggregate rice production by state (summed over every row of df, i.e. all
# years present in the data), rank descending and keep the seven largest.
state_rice = df.groupby('state_name')['rice_production_1000_tons'].sum().sort_values(ascending=False)
top7_rice_states = state_rice.head(7)

# Bar plot - Matplotlib
fig, ax = plt.subplots(figsize=(12, 6))
top7_rice_states.plot(kind='bar', color='#2ecc71', edgecolor='black', ax=ax)
ax.set_title('Top 7 Rice Producing States in India', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('State Name', fontsize=12, fontweight='bold')
ax.set_ylabel('Total Rice Production (1000 tons)', fontsize=12, fontweight='bold')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars.
# bar_label pads in points rather than data units, so the gap above each bar
# looks the same whatever the magnitude of the totals; a fixed data-unit
# offset would only look right for one data scale.
ax.bar_label(
    ax.containers[0],
    labels=[f'{total:,.0f}' for total in top7_rice_states],
    padding=3,
    fontsize=10,
)

plt.tight_layout()
plt.savefig('plotly_exports/top7_rice_states_bar.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 7 Rice Producing States:")
print(top7_rice_states)

# Interactive Plotly version
# The bars are coloured by the same totals that set their height. The colour
# values are passed as a plain array, so plotly names that dimension "color";
# give it a readable label so the colour bar title and hover text are
# meaningful instead of showing the literal word "color".
fig_plotly = px.bar(
    x=top7_rice_states.index,
    y=top7_rice_states.values,
    labels={
        'x': 'State',
        'y': 'Rice Production (1000 tons)',
        'color': 'Rice Production (1000 tons)',
    },
    title='Top 7 Rice Producing States in India',
    color=top7_rice_states.values,
    color_continuous_scale='Greens'
)
# NOTE(review): the continuous colour bar is not a legend entry, so
# showlegend=False does not hide it. If hiding it was the intent, set
# coloraxis_showscale=False as well.
fig_plotly.update_layout(
    title_font_size=18,
    showlegend=False,
    height=500
)
fig_plotly.write_html('plotly_exports/top7_rice_states_interactive.html')
fig_plotly.show()

#%% ============================================================================
# EDA 2: RICE PRODUCTION BY WEST BENGAL DISTRICTS
#%% ============================================================================

# Filter West Bengal data (exact match on the state_name value)
wb_data = df[df['state_name'] == 'West Bengal'].copy()

# Aggregate by district: total production over all rows, largest first
wb_districts = wb_data.groupby('district_name')['rice_production_1000_tons'].sum().sort_values(ascending=False)
top15_wb_districts = wb_districts.head(15)

# Create visualization
fig, ax = plt.subplots(figsize=(14, 8))
top15_wb_districts.plot(kind='barh', color='#3498db', edgecolor='black', ax=ax)
# barh places the first row at the bottom of the axis. Flip the axis so the
# ranking reads top-down with the largest producer first.
ax.invert_yaxis()
ax.set_title('Top 15 Rice Producing Districts in West Bengal', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Total Rice Production (1000 tons)', fontsize=12, fontweight='bold')
ax.set_ylabel('District Name', fontsize=12, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('plotly_exports/west_bengal_districts_rice.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop 15 Rice Producing Districts in West Bengal:")
print(top15_wb_districts)

#%% ============================================================================
# EDA 3: INDIA'S RICE PRODUCTION OVER LAST 50 YEARS (LINE PLOT)
#%% ============================================================================

# One row per year holding the summed production of all rows for that year
yearly_rice = df.groupby('year')['rice_production_1000_tons'].sum().reset_index()

years = yearly_rice['year']
production = yearly_rice['rice_production_1000_tons']
series_color = '#27ae60'
axis_label_font = dict(fontsize=12, fontweight='bold')

# Line with markers, plus a translucent fill under the curve
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(years, production,
        color=series_color, marker='o', markersize=6, linewidth=2.5)
ax.fill_between(years, production, color=series_color, alpha=0.3)

# Straight-line (degree-1) least-squares fit drawn as the dashed trend line
z = np.polyfit(years, production, 1)
p = np.poly1d(z)
trend_values = p(years)
ax.plot(years, trend_values,
        label='Trend Line', color='red', linestyle='--', linewidth=2)
ax.legend()

ax.set_title("India's Rice Production Trend Over Last 50 Years",
             fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Year', **axis_label_font)
ax.set_ylabel('Total Rice Production (1000 tons)', **axis_label_font)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('plotly_exports/india_rice_production_50years.png', dpi=300, bbox_inches='tight')
plt.show()

# First vs. last row of the year-ordered table, and the relative change between them
print("\nRice Production Growth Analysis:")
print(f"Starting Year ({yearly_rice['year'].min()}): {yearly_rice['rice_production_1000_tons'].iloc[0]:,.0f} thousand tons")
print(f"Ending Year ({yearly_rice['year'].max()}): {yearly_rice['rice_production_1000_tons'].iloc[-1]:,.0f} thousand tons")
print(f"Total Growth: {((yearly_rice['rice_production_1000_tons'].iloc[-1] / yearly_rice['rice_production_1000_tons'].iloc[0]) - 1) * 100:.2f}%")

#%% ============================================================================
# EDA 4: RICE PRODUCTION VS WHEAT PRODUCTION (LAST 50 YEARS)
#%% ============================================================================

# Yearly totals for both crops, one row per year
crop_columns = ['rice_production_1000_tons', 'wheat_production_1000_tons']
comparison = df.groupby('year')[crop_columns].sum().reset_index()

color1 = '#27ae60'
color2 = '#e67e22'
axis_label_font = dict(fontsize=12, fontweight='bold')

# Two y-axes sharing one x-axis: rice on the left, wheat on the right
fig, ax1 = plt.subplots(figsize=(14, 7))

# Left axis: rice
ax1.plot(comparison['year'], comparison['rice_production_1000_tons'],
         label='Rice', color=color1, marker='o', linewidth=2.5)
ax1.set_xlabel('Year', **axis_label_font)
ax1.set_ylabel('Rice Production (1000 tons)', color=color1, **axis_label_font)
ax1.tick_params(axis='y', labelcolor=color1)
ax1.grid(True, alpha=0.3)

# Right axis: wheat
ax2 = ax1.twinx()
ax2.plot(comparison['year'], comparison['wheat_production_1000_tons'],
         label='Wheat', color=color2, marker='s', linewidth=2.5)
ax2.set_ylabel('Wheat Production (1000 tons)', color=color2, **axis_label_font)
ax2.tick_params(axis='y', labelcolor=color2)

plt.title('Rice vs Wheat Production in India (Last 50 Years)',
          fontsize=16, fontweight='bold', pad=20)

# Each axis tracks its own legend entries; merge them into a single legend
rice_handles, rice_labels = ax1.get_legend_handles_labels()
wheat_handles, wheat_labels = ax2.get_legend_handles_labels()
ax1.legend([*rice_handles, *wheat_handles], [*rice_labels, *wheat_labels], loc='upper left')

plt.tight_layout()
plt.savefig('plotly_exports/rice_vs_wheat_50years.png', dpi=300, bbox_inches='tight')
plt.show()

# Plotly interactive version
# Both crops share a single y-axis here; each entry is (column, legend name, colour).
crop_traces = (
    ('rice_production_1000_tons', 'Rice Production', '#27ae60'),
    ('wheat_production_1000_tons', 'Wheat Production', '#e67e22'),
)

fig_plotly = go.Figure()
for crop_column, trace_name, trace_color in crop_traces:
    fig_plotly.add_trace(
        go.Scatter(
            x=comparison['year'],
            y=comparison[crop_column],
            name=trace_name,
            mode='lines+markers',
            marker={'size': 6},
            line={'color': trace_color, 'width': 3},
        )
    )

layout_options = {
    'title': 'Rice vs Wheat Production in India (Interactive)',
    'xaxis_title': 'Year',
    'yaxis_title': 'Production (1000 tons)',
    'hovermode': 'x unified',
    'height': 600,
}
fig_plotly.update_layout(**layout_options)

fig_plotly.write_html('plotly_exports/rice_vs_wheat_interactive.html')
fig_plotly.show()

#%% ============================================================================
# EDA 5: RICE YIELD EFFICIENCY ACROSS TOP STATES
#%% ============================================================================

# Calculate average yield for top producing states
top_states = top7_rice_states.index
top_state_rows = df[df['state_name'].isin(top_states)]

# Mask non-positive yields before averaging so they do not drag the state
# mean down. The summary statistics at the end of this file apply the same
# "> 0" filter to the overall average, so the two figures stay comparable.
# (Presumably a zero yield means no rice was grown/recorded for that row;
# confirm against the data dictionary.) Production and area are still summed
# over every row.
positive_yield = top_state_rows['rice_yield_kg_per_ha'].where(
    top_state_rows['rice_yield_kg_per_ha'] > 0
)
state_yields = (
    top_state_rows
    .assign(rice_yield_kg_per_ha=positive_yield)
    .groupby('state_name')
    .agg({
        'rice_yield_kg_per_ha': 'mean',
        'rice_production_1000_tons': 'sum',
        'rice_area_1000_ha': 'sum'
    })
    .sort_values('rice_yield_kg_per_ha', ascending=False)
)

# Create bar plot
fig, ax = plt.subplots(figsize=(12, 6))
state_yields['rice_yield_kg_per_ha'].plot(kind='bar', color='#9b59b6', edgecolor='black', ax=ax)
ax.set_title('Average Rice Yield in Top Producing States', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('State Name', fontsize=12, fontweight='bold')
ax.set_ylabel('Average Yield (Kg per Ha)', fontsize=12, fontweight='bold')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

# Add value labels.
# bar_label pads in points rather than data units, so the gap above each bar
# does not depend on the magnitude of the yields.
ax.bar_label(
    ax.containers[0],
    labels=[f'{avg_yield:,.0f}' for avg_yield in state_yields['rice_yield_kg_per_ha']],
    padding=3,
    fontsize=10,
)

plt.tight_layout()
plt.savefig('plotly_exports/rice_yield_efficiency.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nRice Yield Efficiency:")
print(state_yields[['rice_yield_kg_per_ha']])

#%% ============================================================================
# EDA 6: GEOGRAPHICAL HEATMAP OF RICE PRODUCTION
#%% ============================================================================

# Total rice production per state, restricted to the most recent year in df
latest_year = df['year'].max()
latest_rows = df[df['year'] == latest_year]
latest_data = latest_rows.groupby('state_name')['rice_production_1000_tons'].sum().reset_index()

# No map is drawn in this cell: a choropleth would first need the state names
# mapped to the format the map source expects. For now only the ten largest
# producers of that year are printed.
top10_latest = latest_data.sort_values('rice_production_1000_tons', ascending=False).head(10)
print(f"\nRice Production Data for Year {latest_year}:")
print(top10_latest)

#%% Summary Statistics
rule = "=" * 80
print("\n" + rule)
print("RICE PRODUCTION SUMMARY STATISTICS")
print(rule)

# Headline figures, in print order: grand total over all rows, mean of the
# per-year totals, the top-ranked state, and the mean yield over rows whose
# yield is positive.
summary_lines = [
    f"\nTotal Rice Production (All Years): {df['rice_production_1000_tons'].sum():,.0f} thousand tons",
    f"Average Annual Rice Production: {df.groupby('year')['rice_production_1000_tons'].sum().mean():,.0f} thousand tons",
    f"Highest Producing State: {top7_rice_states.index[0]} ({top7_rice_states.iloc[0]:,.0f} thousand tons)",
    f"Average Rice Yield: {df[df['rice_yield_kg_per_ha'] > 0]['rice_yield_kg_per_ha'].mean():,.0f} kg/ha",
]
for summary_line in summary_lines:
    print(summary_line)

print("\n✓ Rice EDA completed successfully!")
print("✓ All visualizations saved to plotly_exports/")