In [None]:
# %pip install streamlit

In [7]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from tqdm import tqdm

# Function to load data
@st.cache_data
def load_data(path='dataset/crop_yield.csv'):
    """Read the crop-yield dataset from a CSV file.

    The function is wrapped in ``st.cache_data``, so the ``path`` argument
    takes part in the cache lookup performed by Streamlit.

    Args:
        path: Location of the CSV file. Defaults to the path the app
            originally hard-coded, so existing ``load_data()`` calls
            behave exactly as before.

    Returns:
        The DataFrame produced by ``pd.read_csv(path)``.
    """
    return pd.read_csv(path)

# Function to preprocess data
def preprocess_data(data):
    """Label-encode the categorical columns and split features from target.

    The columns 'Region', 'Soil_Type', 'Crop' and 'Weather_Condition' are
    each replaced by the output of a freshly fitted ``LabelEncoder``. The
    work is done on a copy, so the DataFrame passed in by the caller is
    left untouched.

    Args:
        data: DataFrame containing the four categorical columns above and
            the target column 'Yield_tons_per_hectare'.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the encoded frame without the
        target column and ``y`` is the 'Yield_tons_per_hectare' column.

    Raises:
        KeyError: If any of the expected columns is missing from ``data``.
    """
    # Copy first: the original version overwrote the caller's columns.
    encoded = data.copy()
    for column in ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])

    X = encoded.drop('Yield_tons_per_hectare', axis=1)
    y = encoded['Yield_tons_per_hectare']
    return X, y

# Function to train models and display progress
def train_models(X_train, y_train, X_test, y_test):
    """Fit each candidate regressor and score it on the test split.

    Iteration over the candidates is wrapped in a ``tqdm`` progress bar
    labelled "Training Models".

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Test features used for prediction.
        y_test: True test targets used for scoring.

    Returns:
        A list with one dict per model, holding the keys 'Model',
        'RMSE' (square root of the mean squared error) and 'R2 Score'.
    """
    # Only the two linear models are active; the remaining candidates are
    # kept commented out so they can be switched back on individually.
    candidates = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        # 'Random Forest': RandomForestRegressor(),
        # 'Support Vector Regressor': SVR(),
        # 'K-Nearest Neighbors': KNeighborsRegressor(),
        # 'XGBoost': xgb.XGBRegressor(),
        # 'LightGBM': lgb.LGBMRegressor(),
        # 'CatBoost': CatBoostRegressor(verbose=0)
    }

    scoreboard = []
    for label, estimator in tqdm(candidates.items(), desc="Training Models"):
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        root_mse = np.sqrt(mean_squared_error(y_test, predictions))
        r_squared = r2_score(y_test, predictions)
        scoreboard.append(
            {'Model': label, 'RMSE': root_mse, 'R2 Score': r_squared}
        )
    return scoreboard

# Streamlit App
def main():
    """Render the Streamlit page: load, preprocess, split, train and report.

    Steps, in order:
      1. Load the dataset via ``load_data`` and show its first rows.
      2. Encode and split it into features/target via ``preprocess_data``.
      3. Hold out 20% of the rows as a test set (``random_state=42``).
      4. Fit each configured model, showing a spinner per model and a
         progress bar for the whole run.
      5. Display a table with each model's RMSE and R2 score.
    """
    st.title("Crop Yield Prediction App")

    # Load data
    data = load_data()
    st.write("### Raw Data")
    st.write(data.head())

    # Preprocess data
    X, y = preprocess_data(data)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Display shapes of training and testing sets
    st.write("### Data Split")
    st.write(f"Training data: {X_train.shape[0]} rows, Testing data: {X_test.shape[0]} rows")

    # Candidate models; re-enable the commented entries to train them too.
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(),
        # 'Random Forest': RandomForestRegressor(),
        # 'Support Vector Regressor': SVR(),
        # 'K-Nearest Neighbors': KNeighborsRegressor(),
        # 'XGBoost': xgb.XGBRegressor(),
        # 'LightGBM': lgb.LGBMRegressor(),
        # 'CatBoost': CatBoostRegressor(verbose=0)
    }

    # Train models and show progress with a Streamlit progress bar
    st.write("### Model Training Progress")
    progress_bar = st.progress(0)
    results = []

    for done, (model_name, model) in enumerate(models.items(), start=1):
        with st.spinner(f"Training {model_name}..."):
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            r2 = r2_score(y_test, y_pred)
            results.append({
                'Model': model_name,
                'RMSE': rmse,
                'R2 Score': r2
            })

        # Divide by the number of active models rather than a fixed 8, so
        # the bar reaches 100% however many entries are enabled above.
        progress_bar.progress(done / len(models))

    # Show results
    st.write("### Model Results")
    results_df = pd.DataFrame(results)
    st.write(results_df)

# Script entry point: build the Streamlit page by calling main(), but only
# when this file is executed directly (__name__ == "__main__"), not when it
# is imported as a module.
if __name__ == "__main__":
    main()


2024-10-20 11:21:26.522 No runtime found, using MemoryCacheStorageManager
2024-10-20 11:21:26.539 No runtime found, using MemoryCacheStorageManager
