In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load Data
# Both CSV files are read from the current working directory, training set first.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Identify numeric and categorical columns separately
numeric_cols_train = train.select_dtypes(include=[np.number]).columns
numeric_cols_test = test.select_dtypes(include=[np.number]).columns

# Fill missing values for numeric columns.
# The fill values are learned from the training data only, so that train and
# test are imputed with the same statistics.
numeric_fill_values = train[numeric_cols_train].mean()
train[numeric_cols_train] = train[numeric_cols_train].fillna(numeric_fill_values)

# Reuse the training means for the test set. A column that is numeric in test
# but has no training mean (absent or non-numeric in train) falls back to the
# test set's own mean so it is still imputed.
test_fill_values = numeric_fill_values.reindex(numeric_cols_test)
test_fill_values = test_fill_values.fillna(test[numeric_cols_test].mean())
test[numeric_cols_test] = test[numeric_cols_test].fillna(test_fill_values)

# Identify categorical columns
categorical_cols = train.select_dtypes(exclude=[np.number]).columns

# Fill missing values for categorical columns using mode.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas deprecates and which
# does not update the parent frame under Copy-on-Write.
for col in categorical_cols:
    # Most frequent training value; reused for the test set so both frames are
    # filled consistently and an all-missing test column cannot break mode().
    most_frequent = train[col].mode()[0]
    train[col] = train[col].fillna(most_frequent)
    test[col] = test[col].fillna(most_frequent)

# Encode categorical features
# The encoder is re-fitted on the training values of each column in turn.
label_enc = LabelEncoder()
for col in categorical_cols:
    train[col] = label_enc.fit_transform(train[col])
    # LabelEncoder.transform raises ValueError on labels it has not seen, so
    # the test column is mapped through the fitted classes instead; values
    # absent from the training column (or still missing) are encoded as -1.
    code_by_label = {label: code for code, label in enumerate(label_enc.classes_)}
    test[col] = test[col].map(code_by_label).fillna(-1).astype(np.int64)

# Feature Selection
# The identifier and the target are excluded from the training features.
X = train.drop(columns=['uid', 'output_electricity_generation'])
y = train['output_electricity_generation']
# Pick the test features by the training column names so both matrices have
# the same columns in the same order; a feature missing from the test file
# raises a KeyError here instead of failing later inside the scaler or model.
X_test = test.drop(columns=['uid'])[list(X.columns)]

# Scale Features
# The scaler is fitted on the full training feature matrix and then applied
# to both the training and the test features.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)

# Split Data
# test_size=0.2 with a fixed random_state for a repeatable hold-out split.
holdout_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = holdout_parts

# Hyperparameter Tuning for RandomForest
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20],
}

# The forest is given a fixed seed so that the cross-validation scores, the
# selected parameters and the refitted model are reproducible between runs.
grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Estimator refitted by GridSearchCV with the best parameter combination.
best_model = grid.best_estimator_
print(f"Best Parameters for RandomForestRegressor: {grid.best_params_}")

# Evaluate on Validation Set
# RMSE is the square root of the mean squared error on the held-out split.
y_pred = best_model.predict(X_val)
validation_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(validation_mse)
print(f"RandomForest RMSE: {rmse}")

# Generate Test Predictions
final_preds = best_model.predict(X_test)

# Create Submission File
# Two columns: the test identifiers followed by the predicted target.
submission = test[['uid']].assign(output_electricity_generation=final_preds)
submission.to_csv('final_submission.csv', index=False)

print("Submission file saved as 'final_submission.csv'")
