In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('train_cleaned.csv')

# Load the testing dataset
df_test = pd.read_csv('test_cleaned.csv')

# Combine training and testing data for preprocessing
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Data preprocessing
X_combined = df_combined.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_combined['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data back into training and testing sets
X_train_scaled = X_combined_scaled[:len(df_train)]
X_test_scaled = X_combined_scaled[len(df_train):]

# Target variable for training
y_train = y_combined[:len(df_train)]

# Base models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Meta-model
meta_model = LinearRegression()

# Stacking ensemble
base_models = [('rf', rf_model), ('gb', gb_model)]
ensemble_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Train the ensemble model
ensemble_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_ensemble = ensemble_model.predict(X_test_scaled)

# Evaluate the performance (you may need a separate validation set for this in a real scenario)
# For this example, let's assume you have a validation set
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

y_pred_val_ensemble = ensemble_model.predict(X_val_split)
rmse_val_ensemble = mean_squared_error(y_val_split, y_pred_val_ensemble, squared=False)
print(f'Validation RMSE for Ensemble Model: {rmse_val_ensemble}')

# Note: You can adjust hyperparameters, add more base models, or experiment with different meta-models as needed.
# Calculate RMSE for the test set
rmse_test = mean_squared_error(y_test, y_pred_ensemble, squared=False)
print(f"RMSE for test set: {rmse_test}")

# Calculate RMSE for the validation set
rmse_val = mean_squared_error(y_val_split, y_pred_val_ensemble, squared=False)
print(f"RMSE for validation set: {rmse_val}")


Validation RMSE for Ensemble Model: 9.513209111508864


NameError: name 'y_test' is not defined

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Load the training dataset
df_train = pd.read_csv('train_cleaned.csv')

# Load the testing dataset
df_test = pd.read_csv('test_cleaned.csv')

# Data preprocessing
X_train = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_train = df_train['CO2 Emissions(g/km)']

X_test = df_test.copy()

# One-hot encode categorical variables
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Meta-model
meta_model = LinearRegression()

# Stacking ensemble
base_models = [('rf', rf_model), ('gb', gb_model)]
ensemble_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Train the ensemble model on the entire training data
ensemble_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_test = ensemble_model.predict(X_test_scaled)

# Save predictions with Id
result_df = pd.DataFrame({'Id': df_test['Id'], 'CO2 Emissions(g/km)': y_pred_test})

# Save the predictions to a CSV file
result_df.to_csv('ensemble_submission.csv', index=False)
