In [2]:
import pandas as pd

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Hold-out evaluation of a baseline random forest on the cleaned training file.

# Load the training dataset
# NOTE(review): other cells in this notebook read 'train_cleaned.csv' instead;
# confirm both files exist and contain the same data.
df_train = pd.read_csv('cleaned_train.csv')

# Separate the features from the target column (only the training file is used here)
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Split BEFORE scaling so that the held-out rows do not contribute to the
# scaler's mean/variance (fitting the scaler on all rows leaks test information).
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# Standardize: fit on the training split only, then apply the same transform
# to the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Choose a regression model (Random Forest in this example)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model on the training split
model.fit(X_train_scaled, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test_scaled)

# Calculate RMSE as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed, so take the square root explicitly to work on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE on the held-out test split: {rmse}")


RMSE on the training set: 20.707022769634516


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Hold-out evaluation of a random forest with tuned hyperparameters.

# Load the training dataset
# NOTE(review): the first evaluation cell reads 'cleaned_train.csv' instead;
# confirm both files exist and contain the same data.
df_train = pd.read_csv('train_cleaned.csv')

# Separate the features from the target column (only the training file is used here)
X_combined = df_train.drop(['CO2 Emissions(g/km)'], axis=1)
y_combined = df_train['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_combined = pd.get_dummies(X_combined)

# Split BEFORE scaling so that the held-out rows do not contribute to the
# scaler's mean/variance (fitting the scaler on all rows leaks test information).
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# Standardize: fit on the training split only, then apply the same transform
# to the held-out split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest using the hyperparameters reported as best by the grid-search
# cell of this notebook. random_state is fixed so the reported RMSE is
# reproducible, matching the other model cells.
model = RandomForestRegressor(
    n_estimators=150,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=42,
)

# Train the model on the training split
model.fit(X_train_scaled, y_train)

# Make predictions on the held-out split
y_pred = model.predict(X_test_scaled)

# Calculate RMSE as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed, so take the square root explicitly to work on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE on the held-out test split: {rmse}")


RMSE on the training set: 20.5849803925713


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Hyperparameter search for the random forest using 5-fold cross-validation.

# Load the cleaned training dataset
df_train_cleaned = pd.read_csv('train_cleaned.csv')

# Separate features and target variable
X_train_cleaned = df_train_cleaned.drop(['CO2 Emissions(g/km)'], axis=1)
y_train_cleaned = df_train_cleaned['CO2 Emissions(g/km)']

# One-hot encode categorical variables
X_train_cleaned = pd.get_dummies(X_train_cleaned)

# Standardize the data
scaler = StandardScaler()
X_train_cleaned_scaled = scaler.fit_transform(X_train_cleaned)

# Define the parameter grid.
# A stray 'de' string literal used to sit before 'min_samples_leaf'; Python
# concatenates adjacent string literals, which turned the key into
# 'demin_samples_leaf' and made the search fail on an unknown parameter.
# The grid below contains every parameter listed in the recorded
# "Best Hyperparameters" output of this cell.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create the RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Instantiate GridSearchCV. The grid is 3*3*3*3 = 81 combinations evaluated
# on 5 folds, so run the fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit the grid search to the data
grid_search.fit(X_train_cleaned_scaled, y_train_cleaned)

# Print the best parameters
print("Best Hyperparameters:", grid_search.best_params_)

# Get the best model (refit on the full data by GridSearchCV)
best_model = grid_search.best_estimator_


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Train on the whole training file and write test-set predictions to
# 'submission7.csv'.
target_col = 'CO2 Emissions(g/km)'

# Read both files; the training row count is needed later to undo the concat.
df_train = pd.read_csv('cleaned_train.csv')
df_test = pd.read_csv('cleaned_test.csv')
n_train = len(df_train)

# Stack train on top of test so that one-hot encoding and scaling are applied
# to both with a single, shared set of columns.
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Target and feature matrix (categoricals expanded into dummy columns).
# NOTE(review): only the target is dropped, so the 'Id' column of the test
# file is passed to the model as a feature -- confirm this is intended.
y_combined = df_combined[target_col]
X_combined = pd.get_dummies(df_combined.drop(columns=[target_col]))

# Standardize all rows together.
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Undo the concat: the first n_train rows are training rows, the rest are test rows.
X_train_scaled, X_test_scaled = X_combined_scaled[:n_train], X_combined_scaled[n_train:]
y_train = y_combined[:n_train]

# Depth-limited random forest, fitted on every training row.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=2,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

# Predict the target for the test rows and attach it to the test frame.
y_pred = model.predict(X_test_scaled)
df_test[target_col] = y_pred

# The submission file holds just the row id and the predicted emission.
result_df = df_test[['Id', target_col]]
result_df.to_csv('submission7.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Train a default-depth random forest on the whole training file and write
# test-set predictions to 'submission6.csv'.
target_col = 'CO2 Emissions(g/km)'

# Read the two input files.
df_train = pd.read_csv('cleaned_train.csv')
df_test = pd.read_csv('cleaned_test.csv')
n_train = len(df_train)

# Put train and test rows in one frame (train first) so that encoding and
# scaling produce the same columns for both.
df_combined = pd.concat([df_train, df_test], axis=0, ignore_index=True)

# Split off the target, then expand categorical features into dummy columns.
# NOTE(review): only the target is dropped, so the 'Id' column of the test
# file is passed to the model as a feature -- confirm this is intended.
y_combined = df_combined[target_col]
X_combined = df_combined.drop(columns=[target_col])
X_combined = pd.get_dummies(X_combined)

# Standardize all rows together.
scaler = StandardScaler()
X_combined_scaled = scaler.fit_transform(X_combined)

# Rows [0, n_train) came from the training file; the remainder from the test file.
X_train_scaled = X_combined_scaled[:n_train]
X_test_scaled = X_combined_scaled[n_train:]
y_train = y_combined[:n_train]

# Fit on every training row, then predict the test rows.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Attach the predictions to the test frame and keep only the submission columns.
df_test[target_col] = y_pred
result_df = df_test[['Id', target_col]]

# Write the submission without the pandas index column.
result_df.to_csv('submission6.csv', index=False)

In [12]:
# Round the predicted emissions to one decimal place and save the result as
# the final submission file.
# NOTE(review): this reads 'submission6.csv', while the most recently executed
# model cell writes 'submission7.csv' -- confirm which file is meant.
emission_col = 'CO2 Emissions(g/km)'
df = pd.read_csv('submission6.csv')

# Replace the CO2 Emissions(g/km) column with its values rounded to one digit
# after the decimal point.
df = df.assign(**{emission_col: df[emission_col].round(1)})

# Save the modified dataframe to a new CSV file.
df.to_csv('submission.csv', index=False)