In [1]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score
import numpy as np

# Data Preprocessing Function
def preprocess_data():
    merged_data = pd.read_csv('final_dataset.csv')
    prices_data = pd.read_csv('prices_districtwise.csv')
    prices_data.columns = prices_data.columns.str.lower()
    prices_data = prices_data.applymap(lambda x: x.lower() if isinstance(x, str) else x)
    prices_data['arrival_date'] = pd.to_datetime(prices_data['arrival_date'], format='%d/%m/%Y')
    merged_data = pd.merge(merged_data, prices_data, left_on=['district_name'], right_on=['district'])
    merged_data.ffill(inplace=True)
    merged_data['average_price'] = merged_data[['min_price', 'max_price']].mean(axis=1)
    return merged_data

# Train Model for Each District
def train_models_by_district():
    merged_data = preprocess_data()
    districts = merged_data['district_name'].unique()
    for district in districts:
        district_data = merged_data[merged_data['district_name'] == district]
        if district_data.empty:
            continue

        features = ['area', 'actual rainfall', 'normal rainfall', 'crop_year', 'average_price']
        target_production = 'production'
        target_deficiency = 'nutrient_deficiency'

        # Train Random Forest Model for Crop Production
        X = district_data[features]
        y = district_data[target_production]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        crop_production_model = RandomForestRegressor()
        crop_production_model.fit(X_train, y_train)
        with open(f'{district}_crop_production_model.pkl', 'wb') as f:
            pickle.dump(crop_production_model, f)

        # Train Logistic Regression Model for Nutrient Deficiency
        X = district_data[features]
        y = district_data[target_deficiency]
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
        nutrient_deficiency_model = LogisticRegression()
        nutrient_deficiency_model.fit(X_train, y_train)
        with open(f'{district}_nutrient_deficiency_model.pkl', 'wb') as f:
            pickle.dump((nutrient_deficiency_model, scaler), f)

# Predict Outcomes for Given Input
def predict_outcomes(district, input_data):
    with open(f'{district}_crop_production_model.pkl', 'rb') as f:
        crop_production_model = pickle.load(f)
    with open(f'{district}_nutrient_deficiency_model.pkl', 'rb') as f:
        nutrient_deficiency_model, scaler = pickle.load(f)
    
    input_data_scaled = scaler.transform(input_data)
    production_prediction = crop_production_model.predict(input_data)
    deficiency_prediction = nutrient_deficiency_model.predict(input_data_scaled)
    
    return production_prediction, deficiency_prediction

# Train models for all districts
train_models_by_district()

# Example input data for prediction
example_input = pd.DataFrame({
    'area': [2000],
    'actual rainfall': [310],
    'normal rainfall': [250],
    'crop_year': [2025],
    'average_price': [800]
})

# Predict outcomes for a specific district
district = 'kurnool'
production, deficiency = predict_outcomes(district, example_input)
print(f'Production Prediction: {production}')
print(f'Deficiency Prediction: {deficiency}')


  prices_data = prices_data.applymap(lambda x: x.lower() if isinstance(x, str) else x)


KeyError: 'nutrient_deficiency'