In [16]:
import sys

# Make the current working directory importable so that the ``src`` package
# generated below can be imported.  The membership check keeps repeated runs
# of this cell from appending the same entry to ``sys.path`` again and again.
if '.' not in sys.path:
    sys.path.append('.')


import os

# Create the package directory.  ``exist_ok=True`` makes the call idempotent
# without a separate existence check (and without the check-then-create race).
os.makedirs('src', exist_ok=True)

# (Re)create an empty ``__init__.py`` so ``src`` is a regular package.
# Opening in 'w' mode intentionally truncates any previous content.
with open('src/__init__.py', 'w') as f:
    pass


def _write_module(filename, source):
    """Write a generated module into the ``src`` package.

    Args:
        filename: File name (e.g. ``'train.py'``) to create inside ``src``.
        source: Module source text; surrounding whitespace is stripped before
            writing, so the leading/trailing newlines of the triple-quoted
            literals below do not end up in the file.

    The directory is created on demand so each call stands on its own, and the
    encoding is pinned to UTF-8 instead of the platform default.
    """
    os.makedirs('src', exist_ok=True)
    with open(os.path.join('src', filename), 'w', encoding='utf-8') as f:
        f.write(source.strip())


# src/data_loader.py: thin wrapper around ``pandas.read_csv``.
data_loader_content = """
import pandas as pd

def load_data(path):
    '''Loads data from a specified CSV path.'''
    return pd.read_csv(path)
"""
_write_module('data_loader.py', data_loader_content)

# src/preprocess.py: separates the 'yield' target column from the remaining
# columns and makes a seeded 80/20 train/test split.
preprocess_content = """
import pandas as pd
from sklearn.model_selection import train_test_split

def preprocess(data):
    # Using 'yield' as the target variable based on data inspection
    X = data.drop('yield', axis=1)
    y = data['yield']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
"""
_write_module('preprocess.py', preprocess_content)


# src/train.py: fits a seeded 100-tree RandomForestRegressor and persists a
# model under ``models/`` with joblib.
train_content = """
import joblib
from sklearn.ensemble import RandomForestRegressor
import os

def train_model(X_train, y_train):
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return model

def save_model(model, filename='random_forest_model.pkl'):
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, os.path.join('models', filename))
"""
_write_module('train.py', train_content)

# src/evaluate.py: returns a dict with the R2 score and the RMSE of the
# model's predictions on the test split.
evaluate_content = """
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error

def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    return {'R2 Score': r2, 'RMSE': rmse}
"""
_write_module('evaluate.py', evaluate_content)


# src/predict.py: builds a one-row DataFrame with the columns 'rainfall',
# 'temperature' and 'fertilizer' and returns the model's single prediction.
predict_content = """
import pandas as pd

def predict(model, rainfall, temperature, fertilizer):
    input_data = pd.DataFrame({
        'rainfall': [rainfall],
        'temperature': [temperature],
        'fertilizer': [fertilizer]
    })
    prediction = model.predict(input_data)[0]
    return prediction
"""
_write_module('predict.py', predict_content)



import importlib

# Forget any previously imported copies of the generated modules so that the
# ``from src.<module> import ...`` statements that follow execute the files
# written above instead of reusing objects cached by an earlier run.
for _stale_name in (
    'src.data_loader',
    'src.preprocess',
    'src.train',
    'src.evaluate',
    'src.predict',
):
    # ``pop`` with a default is a no-op when the module was never imported.
    sys.modules.pop(_stale_name, None)

# The module files were created while the interpreter was already running;
# tell the import finders to drop their cached directory listings so the new
# files are guaranteed to be noticed.
importlib.invalidate_caches()

from src.data_loader import load_data
from src.preprocess import preprocess
from src.train import train_model, save_model
from src.evaluate import evaluate_model
from src.predict import predict
import joblib

# Default CSV file read by ``main`` when no explicit path is given.
DATA_PATH = "/content/crop_yield.csv"


def main(data_path=None, model_filename='random_forest_model.pkl'):
    """Run the pipeline end to end: load, split, train, save, evaluate, predict.

    Args:
        data_path: CSV file to load.  ``None`` (the default) falls back to the
            module-level ``DATA_PATH``, looked up at call time.
        model_filename: File name handed to ``save_model`` and used again to
            reload the model from the ``models`` directory, so the save and
            load locations cannot drift apart.

    Prints the loaded column names, every evaluation metric returned by
    ``evaluate_model``, and one sample prediction made with the reloaded model.
    """
    if data_path is None:
        data_path = DATA_PATH

    data = load_data(data_path)
    print("Loaded data columns:", data.columns.tolist())

    X_train, X_test, y_train, y_test = preprocess(data)

    model = train_model(X_train, y_train)
    save_model(model, model_filename)

    metrics = evaluate_model(model, X_test, y_test)

    print("\n=== MODEL EVALUATION ===")
    for k, v in metrics.items():
        print(f"{k}: {v}")

    # Predict with the model read back from disk rather than the in-memory
    # one, which also exercises the save/load round trip.  ``save_model``
    # writes to os.path.join('models', filename), mirrored here.
    loaded_model = joblib.load(os.path.join('models', model_filename))
    sample_prediction = predict(
        loaded_model,
        rainfall=800,
        temperature=26,
        fertilizer=120
    )

    print("\n=== SAMPLE PREDICTION ===")
    print(f"Predicted Crop Yield: {round(sample_prediction, 2)} tons/hectare")


if __name__ == "__main__":
    main()


Loaded data columns: ['rainfall', 'temperature', 'fertilizer', 'yield']

=== MODEL EVALUATION ===
R2 Score: 0.9860269892973478
RMSE: 0.05479781017522463

=== SAMPLE PREDICTION ===
Predicted Crop Yield: 3.25 tons/hectare
