# 🌾 Spark Analysis: Crop Yield Forecasting
This notebook performs data analysis on the `clean_crop_weather_historical.csv` file using PySpark.
- Ensure Spark is correctly configured and Java 8 or 11 is installed.
- Works in local mode (no Hadoop needed).
- Uses PySpark + Pandas for hybrid analysis.

In [None]:
# ✅ Start (or reuse) the Spark session used by the rest of the notebook
import findspark

# findspark.init() has to run before pyspark is imported: it is what makes
# the pyspark package importable from this kernel.
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("CropWeatherAnalysis")
    .getOrCreate()
)

# Bare last expression: renders the session summary as the cell output.
spark

In [None]:
# 📥 Read the historical crop/weather CSV into a Spark DataFrame
# header      -> first line of the file supplies the column names
# inferSchema -> Spark scans the data to pick column types instead of using strings
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("clean_crop_weather_historical.csv")
)

# Show the inferred schema and a small preview of the data.
df.printSchema()
df.show(5)

## 🔍 Basic Data Info

In [None]:
# Dataset dimensions (rows, columns), followed by the list of column names
print(
    f"Total Rows: {df.count()}",
    f"Total Columns: {len(df.columns)}",
    sep="\n",
)
df.columns

## 🔧 Data Cleaning (required — later cells use `clean_df`)

In [None]:
# Keep only rows where all of the key fields (year, state, rice yield) are present
clean_df = df.na.drop(subset=["year", "state_name", "rice_yield_kg_per_ha"])

# Report how many rows remain after the null filter.
print(f"Cleaned Rows: {clean_df.count()}")

## 📊 Group-wise Yield Summary

In [None]:
from pyspark.sql.functions import avg

# Example: mean rice yield for every (year, state) pair.
#
# The result is sorted on BOTH grouping keys. Sorting on "year" alone leaves
# the order of states within a year undefined (it depends on how Spark
# shuffles the groups), so the rows printed by show(10) could differ between
# runs. Adding "state_name" as a tie-breaker gives a total order and a
# reproducible preview.
summary_df = (
    clean_df
    .groupBy("year", "state_name")
    .agg(avg("rice_yield_kg_per_ha").alias("avg_rice_yield"))
    .orderBy("year", "state_name")
)

summary_df.show(10)

## 📈 Optional: Convert to Pandas for Plotting

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Collect the aggregated Spark result into a pandas DataFrame so seaborn can plot it.
pandas_df = summary_df.toPandas()

# One line per state: year on the x-axis, average rice yield on the y-axis.
fig, ax = plt.subplots(figsize=(14, 6))
sns.lineplot(
    x="year",
    y="avg_rice_yield",
    hue="state_name",
    data=pandas_df,
    ax=ax,
)
ax.set_title("Average Rice Yield per State (kg/ha)")

# Tilt the x tick labels.
plt.setp(ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()