In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training data and discard rows that contain missing values
df_train = pd.read_csv('cleaned_train.csv').dropna()

# Drop the 'Id' column so it is not used as a feature
df_train = df_train.drop(columns=['Id'])

# Separate the features from the target column
X_combined = df_train.drop(columns=['CO2 Emissions(g/km)'])
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables (before splitting, so both parts share the same columns)
X_combined = pd.get_dummies(X_combined)

# Split BEFORE scaling: the hold-out rows must not influence any fitted preprocessing step
X_fit, X_holdout, y_train, y_test = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_fit)
X_test_scaled = scaler.transform(X_holdout)

# Kept under its original name in case other cells use it; it is now scaled
# with the training-split statistics instead of statistics from all rows.
X_combined_scaled = scaler.transform(X_combined)

# RandomForest model
rf_model = RandomForestRegressor(
    n_estimators=150,
    max_depth=30,
    random_state=42,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)

# GradientBoosting model
gb_model = GradientBoostingRegressor(
    n_estimators=150,
    max_depth=10,
    random_state=42,
    learning_rate=0.1,
)
gb_model.fit(X_train_scaled, y_train)
gb_pred = gb_model.predict(X_test_scaled)

# Blend the two predictions with fixed weights: 30% RandomForest, 70% GradientBoosting
ensemble_pred = 0.30 * rf_pred + 0.70 * gb_pred

# Evaluate RMSE as the square root of the MSE; the `squared=False` argument is
# deprecated in recent scikit-learn releases, while this form works on all versions.
ensemble_rmse = mean_squared_error(y_test, ensemble_pred) ** 0.5
print(f"RMSE on the test set using Ensemble: {ensemble_rmse}")
# Output recorded from the earlier run (scaler fitted on all rows before the split):
# RMSE on the test set using Ensemble: 19.50849679504109

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training data and discard rows that contain missing values
df_train = pd.read_csv('cleaned_train.csv').dropna()

# Drop the 'Id' column so it is not used as a feature
df_train = df_train.drop(columns=['Id'])

# Separate the features from the target column
X_train = df_train.drop(columns=['CO2 Emissions(g/km)'])
y_train = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_train = pd.get_dummies(X_train)

# Standardize the training features; the same fitted scaler is reused for the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# RandomForest model
rf_model = RandomForestRegressor(
    n_estimators=150,
    max_depth=30,
    random_state=42,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=True,
)
rf_model.fit(X_train_scaled, y_train)

# GradientBoosting model
gb_model = GradientBoostingRegressor(
    n_estimators=150,
    max_depth=10,
    random_state=42,
    learning_rate=0.1,
)
gb_model.fit(X_train_scaled, y_train)

# Load the test data
df_test = pd.read_csv('cleaned_test.csv')

# Keep the 'Id' column so it can be attached to the predictions
test_ids = df_test['Id']

# Drop the 'Id' column from the test features
df_test = df_test.drop(columns=['Id'])

# One-hot encode the test features, then align them to the training columns.
# Encoding the two files independently can yield different dummy columns when a
# category appears in only one of them; reindexing adds the missing columns as 0,
# drops columns the models never saw, and restores the training column order.
# NOTE(review): rows with missing values are not dropped from the test file here
# (doing so would remove Ids from the output) - confirm the file has none.
X_test = pd.get_dummies(df_test).reindex(columns=X_train.columns, fill_value=0)

# Standardize the test data with the scaler fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Predict with the RandomForest model
rf_test_pred = rf_model.predict(X_test_scaled)

# Predict with the GradientBoosting model
gb_test_pred = gb_model.predict(X_test_scaled)

# Blend the two predictions with fixed weights: 30% RandomForest, 70% GradientBoosting
ensemble_test_pred = 0.3 * rf_test_pred + 0.7 * gb_test_pred

# Build the result DataFrame for the test data
ensemble_result_df = pd.DataFrame({'Id': test_ids, 'CO2 Emissions(g/km)': ensemble_test_pred})

# Save the predictions to a CSV file
ensemble_result_df.to_csv('ensemble2.csv', index=False)
