In [1]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import joblib

st.set_page_config(page_title="Commodity Export Predictor")

@st.cache_data
def load_data():
    return pd.read_csv('exports.csv')

export_21 = load_data()
st.write("Original data shape:", export_21.shape)

features = ['year', 'state_name', 'port_of_export', 'general_principal_commodity_category']
target = 'quantity_of_commodity'

@st.cache_resource
def create_label_encoder_with_unknown(categories):
    le = LabelEncoder()
    le.fit(list(categories) + ['Unknown'])
    return le

le_state = create_label_encoder_with_unknown(export_21['state_name'].unique())
le_port = create_label_encoder_with_unknown(export_21['port_of_export'].unique())
le_commodity = create_label_encoder_with_unknown(export_21['general_principal_commodity_category'].unique())

def transform_with_unknown(encoder, series):
    return series.map(lambda x: encoder.transform(['Unknown'])[0] if x not in encoder.classes_ else encoder.transform([x])[0])

X = export_21[features]
y = export_21[target]

st.write("\nFeatures info:")
st.write(X.info())
st.write("\nTarget info:")
st.write(y.info())

st.write("\nColumns with NaN values:")
st.write(X.columns[X.isna().any()].tolist())
st.write("Target has NaN values:", y.isna().any())

numeric_columns = X.select_dtypes(include=[np.number]).columns
inf_columns = numeric_columns[np.isinf(X[numeric_columns]).any()].tolist()
st.write("\nNumeric columns with infinite values:")
st.write(inf_columns)
st.write("Target has infinite values:", np.isinf(y).any())

mask = ~(X.isna().any(axis=1) | y.isna() | np.isinf(y))
X = X[mask]
y = y[mask]

st.write(f"\nNumber of samples after removing NaN and infinite values: {len(X)}")

X['state_name'] = transform_with_unknown(le_state, X['state_name'])
X['port_of_export'] = transform_with_unknown(le_port, X['port_of_export'])
X['general_principal_commodity_category'] = transform_with_unknown(le_commodity, X['general_principal_commodity_category'])

st.write("\nAfter transformations:")
st.write("X shape:", X.shape)
st.write("y shape:", y.shape)

st.write("\nCheck for NaN values after transformations:")
st.write("X has NaN:", X.isna().any().any())
st.write("y has NaN:", y.isna().any())

st.write("\nCheck for infinite values after transformations:")
st.write("X has inf:", np.isinf(X).any().any())
st.write("y has inf:", np.isinf(y).any())

mask = ~(X.isna().any(axis=1) | y.isna() | np.isinf(y) | np.isinf(X).any(axis=1))
X = X[mask]
y = y[mask]

st.write(f"\nFinal number of samples: {len(X)}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

@st.cache_resource
def train_model(X_train_scaled, y_train):
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train_scaled, y_train)
    return rf_model

rf_model = train_model(X_train_scaled, y_train)

y_pred = rf_model.predict(X_test_scaled)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

y_range = y_test.max() - y_test.min()
normalized_rmse = rmse / y_range

r2 = r2_score(y_test, y_pred)

st.write(f"\nNormalized RMSE: {normalized_rmse}")
st.write(f"R-squared Score: {r2}")

feature_importance = pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)

st.write("\nFeature Importance:")
st.write(feature_importance)

st.write("\nFeature statistics:")
st.write(X.describe())
st.write("\nTarget variable statistics:")
st.write(y.describe())

joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(le_state, 'le_state.joblib')
joblib.dump(le_port, 'le_port.joblib')
joblib.dump(le_commodity, 'le_commodity.joblib')

st.write("\nModel and encoders saved successfully.")

st.title("Commodity Export Quantity Predictor")

year = st.selectbox("Select Year", sorted(export_21['year'].unique()))
state = st.selectbox("Select State", sorted(export_21['state_name'].unique()))
port = st.selectbox("Select Port", sorted(export_21[export_21['state_name'] == state]['port_of_export'].unique()))
commodity = st.selectbox("Select Commodity", sorted(export_21['general_principal_commodity_category'].unique()))

if st.button("Predict"):
    input_data = pd.DataFrame({
        'year': [year],
        'state_name': [state],
        'port_of_export': [port],
        'general_principal_commodity_category': [commodity]
    })
    
    input_data['state_name'] = transform_with_unknown(le_state, input_data['state_name'])
    input_data['port_of_export'] = transform_with_unknown(le_port, input_data['port_of_export'])
    input_data['general_principal_commodity_category'] = transform_with_unknown(le_commodity, input_data['general_principal_commodity_category'])
    
    input_scaled = scaler.transform(input_data)
    prediction = rf_model.predict(input_scaled)[0]
    
    st.success(f"Predicted Quantity of Commodity: {prediction:.2f}")

2024-09-25 00:53:22.693 
  command:

    streamlit run C:\Users\jasbi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]
2024-09-25 00:53:22.693 No runtime found, using MemoryCacheStorageManager


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73317 entries, 0 to 73316
Data columns (total 4 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   year                                  73317 non-null  int64 
 1   state_name                            73317 non-null  object
 2   port_of_export                        73317 non-null  object
 3   general_principal_commodity_category  73317 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.2+ MB
<class 'pandas.core.series.Series'>
RangeIndex: 73317 entries, 0 to 73316
Series name: quantity_of_commodity
Non-Null Count  Dtype
--------------  -----
73317 non-null  int64
dtypes: int64(1)
memory usage: 572.9 KB


2024-09-25 00:54:27.604 Session state does not function when running a script without `streamlit run`
