In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('train_cleaned.csv')

# Load the testing dataset
df_test = pd.read_csv('test_cleaned.csv')

# Combine training and testing data for preprocessing
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Data preprocessing
X_combined = df_combined.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_combined['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data back into training and testing sets
X_train_scaled = X_combined_scaled[:len(df_train)]
X_test_scaled = X_combined_scaled[len(df_train):]

# Target variable for training
y_train = y_combined[:len(df_train)]

# Base models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Meta-model
meta_model = LinearRegression()

# Stacking ensemble
base_models = [('rf', rf_model), ('gb', gb_model)]
ensemble_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Train the ensemble model
ensemble_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_ensemble = ensemble_model.predict(X_test_scaled)

# Evaluate the performance (you may need a separate validation set for this in a real scenario)
# For this example, let's assume you have a validation set
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

y_pred_val_ensemble = ensemble_model.predict(X_val_split)
rmse_val_ensemble = mean_squared_error(y_val_split, y_pred_val_ensemble, squared=False)
print(f'Validation RMSE for Ensemble Model: {rmse_val_ensemble}')

# Note: You can adjust hyperparameters, add more base models, or experiment with different meta-models as needed.
# Calculate RMSE for the test set
rmse_test = mean_squared_error(y_test, y_pred_ensemble, squared=False)
print(f"RMSE for test set: {rmse_test}")

# Calculate RMSE for the validation set
rmse_val = mean_squared_error(y_val_split, y_pred_val_ensemble, squared=False)
print(f"RMSE for validation set: {rmse_val}")


Validation RMSE for Ensemble Model: 9.513209111508864


NameError: name 'y_test' is not defined

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Load the training dataset
df_train = pd.read_csv('train_cleaned.csv')

# Load the testing dataset
df_test = pd.read_csv('test_cleaned.csv')

# Data preprocessing
X_train = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_train = df_train['CO2 Emissions(g/km)']

X_test = df_test.copy()

# One-hot encode categorical variables
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base models
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Meta-model
meta_model = LinearRegression()

# Stacking ensemble
base_models = [('rf', rf_model), ('gb', gb_model)]
ensemble_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Train the ensemble model on the entire training data
ensemble_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_test = ensemble_model.predict(X_test_scaled)

# Save predictions with Id
result_df = pd.DataFrame({'Id': df_test['Id'], 'CO2 Emissions(g/km)': y_pred_test})

# Save the predictions to a CSV file
result_df.to_csv('ensemble_submission.csv', index=False)


In [11]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Membaca data
df_train = pd.read_csv('cleaned_train.csv')
df_train = df_train.dropna()

# Membuang kolom 'Id'
df_train = df_train.drop(['Id'], axis=1)

# Memilih fitur dan target
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split data
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Model RandomForest
rf_model = RandomForestRegressor(n_estimators=150, max_depth=30, random_state=42, min_samples_split=10, min_samples_leaf=2, bootstrap=True)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)

# Model GradientBoosting
gb_model = GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=42, learning_rate=0.1)
gb_model.fit(X_train_scaled, y_train)
gb_pred = gb_model.predict(X_test_scaled)

# Gabungkan hasil prediksi
ensemble_pred = 0.30 * rf_pred + 0.70 * gb_pred

# Evaluasi RMSE
ensemble_rmse = mean_squared_error(y_test, ensemble_pred, squared=False)
print(f"RMSE on the test set using Ensemble: {ensemble_rmse}")
# RMSE on the test set using Ensemble: 19.50849679504109

RMSE on the test set using Ensemble: 19.50849679504109


In [10]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Membaca data
df_train = pd.read_csv('cleaned_train.csv')
df_train = df_train.dropna()

# Membuang kolom 'Id'
df_train = df_train.drop(['Id'], axis=1)

# Memilih fitur dan target
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split data
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Base models
base_models = [
    ('rf', RandomForestRegressor(n_estimators=150, max_depth=30, random_state=42, min_samples_split=10, min_samples_leaf=2, bootstrap=True)),
    ('gb', GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=42, learning_rate=0.1))
]

meta_model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)

# Stacking ensemble with different weights
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Train the stacking model
stacking_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
stacking_pred = stacking_model.predict(X_test_scaled)

# Evaluasi RMSE
stacking_rmse = mean_squared_error(y_test, stacking_pred, squared=False)
print(f"RMSE on the test set using Stacking Ensemble: {stacking_rmse}")

RMSE on the test set using Stacking Ensemble: 19.50987184527166


In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Membaca data
df_train = pd.read_csv('cleaned_train.csv')
df_train = df_train.dropna()

# Membuang kolom 'Id'
df_train = df_train.drop(['Id'], axis=1)

# Memilih fitur dan target
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split data
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Model GradientBoosting
gb_model = GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=42, learning_rate=0.1)
gb_model.fit(X_train_scaled, y_train)
gb_pred = gb_model.predict(X_test_scaled)

# Evaluasi RMSE
gb_rmse = mean_squared_error(y_test, gb_pred, squared=False)
print(f"RMSE on the test set using Gradient Boosting: {gb_rmse}")
# RMSE on the test set using Gradient Boosting: 19.552591571981765


RMSE on the test set using Gradient Boosting: 19.65937153473047


In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Membaca data
df_train = pd.read_csv('cleaned_train.csv')
df_train = df_train.dropna()

# Membuang kolom 'Id'
df_train = df_train.drop(['Id'], axis=1)

# Memilih fitur dan target
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split data
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Model GradientBoosting
gb_model = GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=42, learning_rate=0.1)
gb_model.fit(X_train_scaled, y_train)
gb_pred = gb_model.predict(X_test_scaled)

# Evaluasi RMSE
gb_rmse = mean_squared_error(y_test, gb_pred, squared=False)
print(f"RMSE on the test set using Gradient Boosting: {gb_rmse}")
# RMSE on the test set using Gradient Boosting: 19.552591571981765


RMSE on the test set using Gradient Boosting: 19.65937153473047


In [12]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Membaca data training
df_train = pd.read_csv('cleaned_train.csv')
df_train = df_train.dropna()

# Membuang kolom 'Id'
df_train = df_train.drop(['Id'], axis=1)

# Memilih fitur dan target
X_train = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_train = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_train = pd.get_dummies(X_train)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Model RandomForest
rf_model = RandomForestRegressor(n_estimators=150, max_depth=30, random_state=42, min_samples_split=10, min_samples_leaf=2, bootstrap=True)
rf_model.fit(X_train_scaled, y_train)

# Model GradientBoosting
gb_model = GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=42, learning_rate=0.1)
gb_model.fit(X_train_scaled, y_train)

# Membaca data test
df_test = pd.read_csv('cleaned_test.csv')

# Simpan kolom 'Id' untuk digunakan dalam penggabungan hasil prediksi
test_ids = df_test['Id']

# Membuang kolom 'Id'
df_test = df_test.drop(['Id'], axis=1)

# One-hot encode categorical variables
X_test = pd.get_dummies(df_test)

# Standardize the data menggunakan scaler yang sama dari data training
X_test_scaled = scaler.transform(X_test)

# Prediksi menggunakan model RandomForest
rf_test_pred = rf_model.predict(X_test_scaled)

# Prediksi menggunakan model GradientBoosting
gb_test_pred = gb_model.predict(X_test_scaled)

# Gabungkan hasil prediksi
ensemble_test_pred = 0.3 * rf_test_pred + 0.7 * gb_test_pred

# Buat DataFrame hasil prediksi untuk data test
ensemble_result_df = pd.DataFrame({'Id': test_ids, 'CO2 Emissions(g/km)': ensemble_test_pred})

# Simpan hasil prediksi dalam file CSV
ensemble_result_df.to_csv('ensemble2.csv', index=False)


In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Membaca data
df_train = pd.read_csv('cleaned_train.csv')
df_train = df_train.dropna()

# Membuang kolom 'Id'
df_train = df_train.drop(['Id'], axis=1)

# Memilih fitur dan target
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split data
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Model RandomForest
rf_model = RandomForestRegressor(n_estimators=150, max_depth=30, random_state=42, min_samples_split=10, min_samples_leaf=2, bootstrap=True)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)

# Model GradientBoosting
gb_model = GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=42, learning_rate=0.1)
gb_model.fit(X_train_scaled, y_train)
gb_pred = gb_model.predict(X_test_scaled)

# Model Lasso Regression
lasso_model = Lasso(alpha=0.01, random_state=42)
lasso_model.fit(X_train_scaled, y_train)
lasso_pred = lasso_model.predict(X_test_scaled)

# Gabungkan hasil prediksi
ensemble_pred = 0.4 * rf_pred + 0.4 * gb_pred + 0.2 * lasso_pred

# Evaluasi RMSE
ensemble_rmse = mean_squared_error(y_test, ensemble_pred, squared=False)
print(f"RMSE on the test set using Ensemble: {ensemble_rmse}")


RMSE on the test set using Ensemble: 19.882205504901215


  model = cd_fast.enet_coordinate_descent(


In [10]:
pip install imbalanced-learn

Collecting imbalanced-learnNote: you may need to restart the kernel to use updated packages.

  Downloading imbalanced_learn-0.11.0-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
   ---------------------------------------- 0.0/235.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/235.6 kB ? eta -:--:--
   - -------------------------------------- 10.2/235.6 kB ? eta -:--:--
   - -------------------------------------- 10.2/235.6 kB ? eta -:--:--
   ------ -------------------------------- 41.0/235.6 kB 487.6 kB/s eta 0:00:01
   --------------------------- ------------ 163.8/235.6 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------- 235.6/235.6 kB 1.4 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.11.0


In [11]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd

# Membaca data
df_train = pd.read_csv('cleaned_train.csv')
df_train = df_train.dropna()

# Membuang kolom 'Id'
df_train = df_train.drop(['Id'], axis=1)

# Memilih fitur dan target
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Melakukan resampling menggunakan SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_combined_scaled, y_combined)

# Split data
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model RandomForest
rf_model = RandomForestRegressor(n_estimators=150, max_depth=30, random_state=42, min_samples_split=10, min_samples_leaf=2, bootstrap=True)
rf_model.fit(X_train_resampled, y_train_resampled)
rf_pred = rf_model.predict(X_test_resampled)

# Model GradientBoosting
gb_model = GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=42, learning_rate=0.1)
gb_model.fit(X_train_resampled, y_train_resampled)
gb_pred = gb_model.predict(X_test_resampled)

# Gabungkan hasil prediksi
ensemble_pred = 0.5 * rf_pred + 0.5 * gb_pred

# Evaluasi RMSE
ensemble_rmse = mean_squared_error(y_test_resampled, ensemble_pred, squared=False)
print(f"RMSE on the test set using Ensemble after resampling: {ensemble_rmse}")


ValueError: Expected n_neighbors <= n_samples,  but n_samples = 5, n_neighbors = 6