In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# --- Configuration ---------------------------------------------------------
# Literals used by the pipeline, named once so each step below reads clearly.
DROPPED_COLS = ['make', 'model']  # treated as high-cardinality categoricals
CATEGORICAL_COLS = ['vehicle_class', 'transmission', 'fuel_type']
TARGET_COL = 'co2_emissions'
SEED = 42  # shared by the split and the tree

# --- Data preparation ------------------------------------------------------
# Read the CSV and discard the dropped columns in a single expression.
df = pd.read_csv("co2_emissions.csv").drop(columns=DROPPED_COLS)

# Expand the remaining categoricals into indicator columns; drop_first=True
# omits the first level of each feature.
df_encoded = pd.get_dummies(df, columns=CATEGORICAL_COLS, drop_first=True)

# Target first, then every other column as the feature matrix.
y = df_encoded[TARGET_COL]
X = df_encoded.drop(columns=[TARGET_COL])

# --- Training --------------------------------------------------------------
# Hold out 20% of the rows; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# fit() returns the estimator itself, so construction and training chain.
dt_model = DecisionTreeRegressor(random_state=SEED).fit(X_train, y_train)

# --- Persistence -----------------------------------------------------------
# Write the feature column order first, then the fitted model. The column
# list lets a consumer (the original notes mention a Streamlit app) rebuild
# inputs in the same layout the tree was trained on.
joblib.dump(X.columns.tolist(), "feature_columns.pkl")
joblib.dump(dt_model, "co2_emission_model.pkl")

print("✅ Model and feature columns saved successfully!")


✅ Model and feature columns saved successfully!
