In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [None]:
# Re-read the corn dataset from disk so later steps work on a fresh copy.
corn_data_file_path = '/mnt/data/CornCorn_treatment_weatherdata_NUE2024.csv'
corn_data = pd.read_csv(corn_data_file_path)

# Derive the modelling target by scaling the 'Yield_Mgha' column by 1000
# (the column names indicate Mg/ha -> kg/ha).
corn_data['Yield_kg_ha'] = corn_data['Yield_Mgha'] * 1000

# Columns that must be numeric for modelling. Values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
numeric_columns = (
    'Nrate_lbac',
    'Min_RH_Prct',
    'Max_RH_Prct',
    'Avg_8inST_F',
    'HTDD Heating degree days',
    'Total_Eva_In',
    'CLDD cooling degree day',
    'Avg_DewP_F',
    'TMIN_F',
    'Max_WS_Mph',
)
for column_name in numeric_columns:
    corn_data[column_name] = pd.to_numeric(corn_data[column_name], errors='coerce')

# Define features and target
features = [
    'Nrate_lbac', 'Min_RH_Prct', 'Max_RH_Prct', 'Avg_8inST_F',
    'HTDD Heating degree days', 'Total_Eva_In', 'CLDD cooling degree day',
    'Avg_DewP_F', 'TMIN_F', 'Max_WS_Mph'
]
target = 'Yield_kg_ha'

# With test_size=0.2 scikit-learn holds out ceil(0.2 * n) rows. A region needs
# at least 6 complete rows for the held-out split to contain 2 samples, which
# is the minimum for R² to be defined; a single-row region would make
# train_test_split raise because the training split would be empty.
min_rows_per_region = 6

# Train one Random Forest per region and collect its held-out predictions and
# error metrics. The result maps region -> dict with keys
# 'y_test', 'y_pred', 'rmse', 'mae', 'r2'.
corn_regions = corn_data['Regions'].unique()
predictions_by_region = {}

for region in corn_regions:
    # Keep only rows of this region that have every feature and the target.
    region_data = corn_data[corn_data['Regions'] == region].dropna(subset=features + [target])
    if len(region_data) < min_rows_per_region:
        print(
            f'Skipping region {region!r}: {len(region_data)} complete rows '
            f'(need at least {min_rows_per_region}).'
        )
        continue

    X = region_data[features]
    y = region_data[target]
    # Fixed random_state keeps the split and the forest reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)

    predictions_by_region[region] = {
        'y_test': y_test,
        'y_pred': y_pred,
        # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
        # removed in 1.6; taking the root of the MSE works on every version.
        'rmse': mean_squared_error(y_test, y_pred) ** 0.5,
        'mae': mean_absolute_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred)
    }

# Show the per-region results as a table (one column per region).
# NOTE(review): `ace_tools` appears to be a helper that is only available in
# a hosted notebook sandbox — confirm. Fall back to a plain print so the
# script still runs where that module is not installed.
predictions_frame = pd.DataFrame(predictions_by_region)
try:
    import ace_tools as tools
except ImportError:
    print(predictions_frame)
else:
    tools.display_dataframe_to_user(name="Predictions by Region", dataframe=predictions_frame)


# Draw one actual-vs-predicted scatter plot per region, using enlarged blue
# markers, a dashed identity line, and a text box with the error metrics.
for region, result in predictions_by_region.items():
    actual = result['y_test']
    predicted = result['y_pred']

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(actual, predicted, label='Predictions', color='blue', alpha=0.6, s=100)

    # Identity line spanning the range of observed yields: points on this
    # line are perfect predictions.
    low, high = actual.min(), actual.max()
    ax.plot([low, high], [low, high], 'k--', lw=2)

    ax.set_xlabel('Actual Yield (kg/ha)', fontsize=14)
    ax.set_ylabel('Predicted Yield (kg/ha)', fontsize=14)
    ax.set_title(f'Actual vs Predicted Yield for {region}', fontsize=16)
    ax.legend(fontsize=12)

    # Metrics box anchored to the top-left corner in axes coordinates.
    metrics_text = (
        f"RMSE: {result['rmse']:.2f}\n"
        f"MAE: {result['mae']:.2f}\n"
        f"R²: {result['r2']:.2f}"
    )
    box_style = dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.8)
    ax.text(0.05, 0.95, metrics_text, transform=ax.transAxes,
            fontsize=12, verticalalignment='top', bbox=box_style)

    plt.show()