In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('cleaned_train.csv')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Separate the predictor columns from the regression target.
X = df[['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100km)', 'Fuel Consumption Hwy (L/100km)', 'Fuel Consumption Comb (L/100km)']]
y = df['CO2 Emissions(g/km)']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear regression baseline (deterministic, no seed needed).
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
linear_pred = linear_model.predict(X_test)

# Random forest regressor. The estimator is randomised, so it is seeded with
# the same value as the split; otherwise the reported error changes on every
# re-run of the notebook.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Gradient boosting regressor, seeded for the same reproducibility reason.
gb_model = GradientBoostingRegressor(random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

# Report the test-set mean squared error of each model.
print('Linear Regression MSE:', mean_squared_error(y_test, linear_pred))
print('Random Forest Regression MSE:', mean_squared_error(y_test, rf_pred))
print('Gradient Boosting Regression MSE:', mean_squared_error(y_test, gb_pred))

def rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    The squared error is computed by sklearn's mean_squared_error and its
    square root is taken with NumPy.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

# Test-set RMSE for each baseline model.
linear_rmse, rf_rmse, gb_rmse = (
    rmse(y_test, predictions) for predictions in (linear_pred, rf_pred, gb_pred)
)

# Print the three scores in the same order they were computed.
for label, score in (
    ('Linear Regression RMSE:', linear_rmse),
    ('Random Forest Regression RMSE:', rf_rmse),
    ('Gradient Boosting Regression RMSE:', gb_rmse),
):
    print(label, score)


Linear Regression MSE: 4691.260606912452
Random Forest Regression MSE: 4876.326765398191
Gradient Boosting Regression MSE: 4486.666685941531
Linear Regression RMSE: 68.49277777191149
Random Forest Regression RMSE: 69.83070073684061
Gradient Boosting Regression RMSE: 66.98258494520445


In [4]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Rebuild the features, target and 80/20 split with the same seed (42) used for
# the baseline models, so the tuned forest is scored on the same held-out rows.
X = df[['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100km)', 'Fuel Consumption Hwy (L/100km)', 'Fuel Consumption Comb (L/100km)']]
y = df['CO2 Emissions(g/km)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter candidates: 2 values for each of 4 settings, 16 combinations.
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Seed the forest so the cross-validation scores, the selected parameters and
# the final test RMSE are reproducible across notebook re-runs. Scoring is the
# negated MSE because GridSearchCV maximises its score.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Predict on the held-out rows with the best estimator found by the search.
best_rf_model = grid_search.best_estimator_
rf_pred_tuned = best_rf_model.predict(X_test)

def rmse(y_true, y_pred):
    """Return the square root of sklearn's mean squared error.

    NOTE(review): an identical rmse is defined earlier in this notebook; this
    copy shadows it and could be dropped if the cells always run in order.
    """
    mse_value = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse_value)

# Score the tuned forest's predictions against the held-out targets and report.
rf_rmse_tuned = rmse(y_true=y_test, y_pred=rf_pred_tuned)
print('Tuned Random Forest Regression RMSE:', rf_rmse_tuned)

Tuned Random Forest Regression RMSE: 67.13190999998557
