In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv('/kaggle/input/aoml-ass/train (1).csv')  # Replace with actual dataset path

target_col = 'output_electricity_generation'  # Replace with actual target column

# A row without a target cannot be used for fitting or scoring. Drop it rather
# than letting the imputer fabricate a mean-valued target for it.
df = df.dropna(subset=[target_col])

# Convert categorical (object) columns to integer codes.
# One encoder per column is kept so the same mapping can be reused later.
# NOTE(review): LabelEncoder is documented for targets; OrdinalEncoder is the
# feature-side equivalent. Integer codes impose an arbitrary order, which the
# tree-based models below tolerate.
categorical_cols = df.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoder for future use

# The earlier one-hot step ran after label encoding, when no object columns
# were left, so it was a no-op and has been removed.

# Define features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# Split data BEFORE imputing so validation rows never influence the fill values
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below do not write into views of X
X_train = X_train.copy()
X_val = X_val.copy()

# Impute missing numeric feature values: column means are learned from the
# training split only and then applied unchanged to the validation split.
numeric_cols = X_train.select_dtypes(include=['number']).columns
imputer_num = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer_num.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = imputer_num.transform(X_val[numeric_cols])

def _fit_and_report(model, label):
    """Fit ``model`` on the training split, print ``label`` followed by the
    validation MAE, and return the validation predictions."""
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    print(label, mean_absolute_error(y_val, val_predictions))
    return val_predictions


# 1. Bagging - Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
y_pred_rf = _fit_and_report(rf, "Random Forest MAE:")

# 2. Boosting - AdaBoost
ab = AdaBoostRegressor(n_estimators=100, random_state=42)
y_pred_ab = _fit_and_report(ab, "AdaBoost MAE:")

# 3. Boosting - Gradient Boosting
gb = GradientBoostingRegressor(n_estimators=100, random_state=42)
y_pred_gb = _fit_and_report(gb, "Gradient Boosting MAE:")

# 4. Boosting - XGBoost
xgb = XGBRegressor(n_estimators=100, random_state=42)
y_pred_xgb = _fit_and_report(xgb, "XGBoost MAE:")

# The same three base learners feed both ensembles below
estimators = [('rf', rf), ('gb', gb), ('xgb', xgb)]

# 5. Stacking - a linear model combines the base learners' predictions
st = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
y_pred_st = _fit_and_report(st, "Stacking MAE:")

# 6. Voting Regressor
vr = VotingRegressor(estimators=estimators)
y_pred_vr = _fit_and_report(vr, "Voting MAE:")


Random Forest MAE: 0.8217390823582249
AdaBoost MAE: 18.58954486471503
Gradient Boosting MAE: 4.625791140922708
XGBoost MAE: 1.7084602334209094
Stacking MAE: 1.25808695750052
Voting MAE: 1.9664814804044426


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv('/kaggle/input/aoml-ass/train (1).csv')  # Replace with actual dataset path

target_col = 'output_electricity_generation'

# Use only 50% of data for faster testing. Rows whose target is missing are
# dropped afterwards: they cannot be used for fitting or scoring, and
# mean-imputing the target would fabricate ground truth.
df_sample = df.sample(frac=0.5, random_state=42).dropna(subset=[target_col])

# Define features and target
X = df_sample.drop(columns=[target_col])
y = df_sample[target_col]

# Identify numeric and categorical feature columns (before any encoding)
numeric_cols = X.select_dtypes(include=['number']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Convert categorical variables to numerical using one-hot encoding.
# Done before the split so both splits share the same dummy columns.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Hold out 10% for validation. A smaller hold-out leaves MORE rows for fitting,
# so it shortens prediction/scoring time, not training time.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Work on explicit copies so the column assignments below do not write into views of X
X_train = X_train.copy()
X_val = X_val.copy()

# Handle missing values: column means are learned from the training split only
# and then applied unchanged to the validation split (no leakage).
imputer_num = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer_num.fit_transform(X_train[numeric_cols])
X_val[numeric_cols] = imputer_num.transform(X_val[numeric_cols])

# Base learners, each limited to 50 estimators to keep this run quick
rf = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)  # bagging; n_jobs=-1 for parallel processing
ab = AdaBoostRegressor(n_estimators=50, random_state=42)                 # boosting
gb = GradientBoostingRegressor(n_estimators=50, random_state=42)         # boosting
xgb = XGBRegressor(n_estimators=50, random_state=42, verbosity=0)        # boosting; verbose output disabled

# Validation predictions keyed by a short model tag
val_preds = {}

# 1-4. Fit each base learner, predict on the validation split, report its MAE
for model_key, report_label, fitted_model in (
    ("rf", "Random Forest MAE:", rf),
    ("ab", "AdaBoost MAE:", ab),
    ("gb", "Gradient Boosting MAE:", gb),
    ("xgb", "XGBoost MAE:", xgb),
):
    fitted_model.fit(X_train, y_train)
    val_preds[model_key] = fitted_model.predict(X_val)
    print(report_label, mean_absolute_error(y_val, val_preds[model_key]))

# 5-6. Stacking and voting ensembles built on the same three base learners
estimators = [('rf', rf), ('gb', gb), ('xgb', xgb)]
st = StackingRegressor(estimators=estimators, final_estimator=LinearRegression(), n_jobs=-1)
vr = VotingRegressor(estimators=estimators, n_jobs=-1)

for model_key, report_label, fitted_model in (
    ("st", "Stacking MAE:", st),
    ("vr", "Voting MAE:", vr),
):
    fitted_model.fit(X_train, y_train)
    val_preds[model_key] = fitted_model.predict(X_val)
    print(report_label, mean_absolute_error(y_val, val_preds[model_key]))

# Expose the predictions under the per-model names
y_pred_rf = val_preds["rf"]
y_pred_ab = val_preds["ab"]
y_pred_gb = val_preds["gb"]
y_pred_xgb = val_preds["xgb"]
y_pred_st = val_preds["st"]
y_pred_vr = val_preds["vr"]


Random Forest MAE: 0.9893979148288411
AdaBoost MAE: 17.55364284393764
Gradient Boosting MAE: 5.791378776626978
XGBoost MAE: 2.055901752047036
Stacking MAE: 1.0314701326094076
Voting MAE: 2.417699397143361
