In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Read the raw crop-production table into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this exact file exists.
df = pd.read_csv(r"C:\Users\Aadi\crop-yield-prediction\data\Crop_production.csv")

# Normalise the header row one step at a time: trim surrounding whitespace,
# turn spaces into underscores, then drop commas. The column names used
# below refer to these cleaned names.
tidy_names = df.columns.str.strip()
tidy_names = tidy_names.str.replace(' ', '_')
tidy_names = tidy_names.str.replace(',', '')
df.columns = tidy_names

# Input columns handed to the model, in the order they are selected.
feature_columns = [
    'State_Name',
    'Crop_Type',
    'Crop',
    'N',
    'P',
    'K',
    'pH',
    'rainfall',
    'temperature',
    'Area_in_hectares',
]
features = df[feature_columns]

# Column the model is trained to predict.
target_column = 'Yield_ton_per_hec'
target = df[target_column]

# Columns that get one-hot encoded; every other input column is forwarded
# unchanged because of remainder='passthrough'.
categorical_columns = ['State_Name', 'Crop_Type', 'Crop']

# handle_unknown='ignore' keeps transform from raising on a category that
# was not present when the encoder was fitted.
category_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[('cat', category_encoder, categorical_columns)],
    remainder='passthrough',
)

# Fixed random_state so repeated runs on the same data build the same forest.
forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Full estimator: preprocessing followed by the regressor, so callers can
# pass raw feature rows straight to fit/predict.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Fit the whole pipeline (preprocessing + regressor) on every selected row;
# no rows are held out here.
model.fit(X=features, y=target)

# Persist the fitted pipeline to 'model.pkl', resolved against the current
# working directory. Left as a bare expression so a notebook still echoes
# the value returned by joblib.dump.
joblib.dump(value=model, filename='model.pkl')

['model.pkl']