In [2]:
import pandas as pd

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('cleaned_train.csv')

# Combine training and testing data for preprocessing
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data (optional but can be beneficial for some models)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data into training and testing sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Choose a regression model (Random Forest in this example)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model with the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE on the training set: {rmse}")


RMSE on the training set: 20.707022769634516


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('cleaned_train.csv')

# Select only the relevant columns for prediction
selected_features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption Comb (L/100km)']
X_combined = df_train[selected_features]
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables (not needed for these features)
# X_combined = pd.get_dummies(X_combined)

# Standardize the data (optional but can be beneficial for some models)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data into training and testing sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Choose a regression model (Random Forest in this example)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model with the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE on the training set: {rmse}")


RMSE on the training set: 32.86453441635217


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('cleaned_train.csv')

# Select features for prediction
selected_features = ['Make', 'Vehicle Class', 'Engine Size(L)', 'Cylinders', 'Transmission', 'Fuel Type',
                     'Fuel Consumption City (L/100km)', 'Fuel Consumption Hwy (L/100km)']

X_combined = df_train[selected_features]
y_combined = df_train['CO2 Emissions(g/km)']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Create transformers for numerical and categorical features
numeric_features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100km)', 'Fuel Consumption Hwy (L/100km)']
categorical_features = ['Make', 'Vehicle Class', 'Transmission', 'Fuel Type']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessing and the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model with the training data
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE on the testing set: {rmse}")


RMSE on the testing set: 20.934895187832716


In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('cleaned_train.csv')

# Select features for prediction
selected_features = ['Make', 'Vehicle Class', 'Engine Size(L)', 'Cylinders', 'Transmission', 'Fuel Type',
                     'Fuel Consumption City (L/100km)', 'Fuel Consumption Hwy (L/100km)']

X_combined = df_train[selected_features]
y_combined = df_train['CO2 Emissions(g/km)']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# Create transformers for numerical and categorical features
numeric_features = ['Engine Size(L)', 'Cylinders', 'Fuel Consumption City (L/100km)', 'Fuel Consumption Hwy (L/100km)']
categorical_features = ['Make', 'Vehicle Class', 'Transmission', 'Fuel Type']

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with preprocessing and the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Hyperparameter tuning using Grid Search
param_grid = {
    'regressor__n_estimators': [50, 100, 150],
    'regressor__max_depth': [None, 10, 20, 30],
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best parameters from grid search
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Train the model with the best hyperparameters
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE on the testing set: {rmse}")


KeyboardInterrupt: 

In [4]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd
from scipy.stats import randint

# Load the training dataset
df_train = pd.read_csv('cleaned_train.csv')

# Combine training and testing data for preprocessing
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data (optional but can be beneficial for some models)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data into training and testing sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Define the parameter grid
param_dist = {
    'n_estimators': randint(10, 200),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False]
}

# Create a RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
)

# Fit the model
random_search.fit(X_train_scaled, y_train)

# Print the best parameters found
print("Best Parameters: ", random_search.best_params_)

# Get the best model
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred_random_search = best_model.predict(X_test_scaled) 

# Calculate RMSE
rmse_random_search = mean_squared_error(y_test, y_pred_random_search, squared=False)
print(f"RMSE on the test set with best parameters: {rmse_random_search}")


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
17 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\UsEr\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\UsEr\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\UsEr\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\UsEr\AppData\Local\Programs\Python\Python312\Lib\site

Best Parameters:  {'bootstrap': True, 'max_depth': 30, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 13, 'n_estimators': 167}
RMSE on the test set with best parameters: 21.120915688707832


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('train_cleaned.csv')

# Combine training and testing data for preprocessing
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data (optional but can be beneficial for some models)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data into training and testing sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Choose a regression model (Random Forest in this example)
# model = RandomForestRegressor(n_estimators=100, random_state=42)
model = RandomForestRegressor(n_estimators=150, max_depth=None, min_samples_split=10, min_samples_leaf=2)


# Train the model with the training data
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"RMSE on the training set: {rmse}")


RMSE on the training set: 20.5849803925713


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the cleaned training dataset
df_train_cleaned = pd.read_csv('train_cleaned.csv')

# Separate features and target variable
X_train_cleaned = df_train_cleaned.drop(['CO2 Emissions(g/km)'], axis=1)
y_train_cleaned = df_train_cleaned['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_train_cleaned = pd.get_dummies(X_train_cleaned)

# Standardize the data
scaler = StandardScaler()
X_train_cleaned_scaled = scaler.fit_transform(X_train_cleaned)

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'de'
    # 'max_depth': [None, 10, 20],
    # 'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)

# Fit the grid search to the data
grid_search.fit(X_train_cleaned_scaled, y_train_cleaned)

# Print the best parameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model
best_model = grid_search.best_estimator_


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('train_cleaned.csv')

# Load the testing dataset
df_test = pd.read_csv('test_cleaned.csv')

# Combine training and testing data for preprocessing
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Data preprocessing
X_combined = df_combined.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_combined['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data (optional but can be beneficial for some models)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data back into training and testing sets
X_train_scaled = X_combined_scaled[:len(df_train)]
X_test_scaled = X_combined_scaled[len(df_train):]

# Target variable for training
y_train = y_combined[:len(df_train)]

# Choose a regression model (Random Forest in this example)
model = RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_split=2, random_state=42)

# Train the model with all available data
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Since this is a test set, you might use the predictions as needed for your specific use case.
# You might save the predictions to a file or use them for further analysis.

# Note: You may need to adjust the code based on the specifics of your datasets and problem.
df_test['CO2 Emissions(g/km)'] = y_pred

# Create a DataFrame with 'Id' and 'CO2 Emissions(g/km)' columns
result_df = df_test[['Id', 'CO2 Emissions(g/km)']]

# Save the result to a CSV file
result_df.to_csv('submission7.csv', index=False)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('train_cleaned.csv')

# Load the testing dataset
df_test = pd.read_csv('test_cleaned.csv')

# Combine training and testing data for preprocessing
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Data preprocessing
X_combined = df_combined.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_combined['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data (optional but can be beneficial for some models)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data back into training and testing sets
X_train_scaled = X_combined_scaled[:len(df_train)]
X_test_scaled = X_combined_scaled[len(df_train):]

# Target variable for training
y_train = y_combined[:len(df_train)]

# Choose a regression model (Random Forest in this example)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model with all available data
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Since this is a test set, you might use the predictions as needed for your specific use case.
# You might save the predictions to a file or use them for further analysis.

# Note: You may need to adjust the code based on the specifics of your datasets and problem.
df_test['CO2 Emissions(g/km)'] = y_pred

# Create a DataFrame with 'Id' and 'CO2 Emissions(g/km)' columns
result_df = df_test[['Id', 'CO2 Emissions(g/km)']]

# Save the result to a CSV file
result_df.to_csv('randomforest.csv', index=False)

In [12]:
df = pd.read_csv('submission6.csv')

# Mengubah format kolom CO2 Emissions(g/km) menjadi satu angka dibelakang koma
df['CO2 Emissions(g/km)'] = df['CO2 Emissions(g/km)'].round(1)

# Menyimpan dataframe yang telah diubah ke dalam file CSV baru
df.to_csv('submission.csv', index=False)

In [7]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('cleaned_train.csv')

# Combine training and testing data for preprocessing
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Standardize the data (optional but can be beneficial for some models)
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Split the data into training and testing sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_combined_scaled, y_combined, test_size=0.2, random_state=42)

# Create a base model
base_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Create the RFE model and select 3 attributes
rfe = RFE(base_model, n_features_to_select=3)
rfe.fit(X_combined_scaled, y_combined)

# Selected features
selected_features_rfe = X_combined.columns[rfe.support_]

# Use the selected features to train the model
X_train_rfe = X_train_scaled[selected_features_rfe]
X_test_rfe = X_test_scaled[selected_features_rfe]

# Train the model with the selected features
model_rfe = RandomForestRegressor(n_estimators=100, random_state=42)
model_rfe.fit(X_train_rfe, y_train)

# Make predictions on the test set
y_pred_rfe = model_rfe.predict(X_test_rfe)

# Calculate RMSE
rmse_rfe = mean_squared_error(y_test, y_pred_rfe, squared=False)
print(f"RMSE with RFE selected features: {rmse_rfe}")


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [8]:
print("Selected Features with RFE:")
print(selected_features_rfe)


Selected Features with RFE:
Index(['Engine Size(L)', 'Cylinders', 'Fuel Consumption Comb (L/100km)'], dtype='object')
