In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
    BayesianRidge,
    SGDRegressor,
    HuberRegressor,
    TheilSenRegressor,
    RANSACRegressor
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.cross_decomposition import PLSRegression # Partial Least Squares
from sklearn.kernel_ridge import KernelRidge # Kernel Ridge Regression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Optional gradient-boosting backends. Each alias below is bound to the
# imported module when the library is installed, or to None (after printing a
# notice) when it is not, so later cells can test `xgb` / `lgb` / `cb`.
def optional_import(module_name, display_name):
    """Import ``module_name`` if it is installed, otherwise return None.

    Args:
        module_name (str): Importable module name, e.g. ``"xgboost"``.
        display_name (str): Human-readable library name used in the notice.

    Returns:
        module or None: The imported module, or None when the import raises
        ImportError (a "not installed" notice is printed in that case).
    """
    import importlib  # local import keeps this helper self-contained

    try:
        return importlib.import_module(module_name)
    except ImportError:
        print(f"{display_name} not installed. Skipping {display_name} Regressor.")
        return None


xgb = optional_import("xgboost", "XGBoost")
lgb = optional_import("lightgbm", "LightGBM")
cb = optional_import("catboost", "CatBoost")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv


In [2]:
# Read the student-performance CSV from the Kaggle input directory and display it
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/student-performance-multiple-linear-regression/Student_Performance.csv'
)
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


In [3]:
# Replace the CSV headers with snake_case names.
# The names are assigned positionally, so the order below must match the
# column order of the loaded CSV.
# NOTE(review): 'simple_ques_papers_prac' labels the "Sample Question Papers
# Practiced" column; the name is kept as-is because other cells may depend on it.
df.columns = [
    'hours_studied',
    'previous_score',
    'extra_curr_activities',
    'sleep_hours',
    'simple_ques_papers_prac',
    'performance_index',
]

In [4]:
# Display the frame again to check the renamed column headers
df

Unnamed: 0,hours_studied,previous_score,extra_curr_activities,sleep_hours,simple_ques_papers_prac,performance_index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


In [5]:
# Count rows that are exact repeats (all columns equal) of an earlier row
df.duplicated(keep='first').sum()

127

In [6]:
# Keep the first occurrence of each duplicated row and discard the repeats,
# so repeated records do not carry extra weight in training or evaluation.
# `df` itself is left untouched; the de-duplicated copy is `new_df`.
new_df = df.drop_duplicates(keep='first')

In [7]:
# Inspect column dtypes and non-null counts of the de-duplicated frame
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9873 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   hours_studied            9873 non-null   int64  
 1   previous_score           9873 non-null   int64  
 2   extra_curr_activities    9873 non-null   object 
 3   sleep_hours              9873 non-null   int64  
 4   simple_ques_papers_prac  9873 non-null   int64  
 5   performance_index        9873 non-null   float64
dtypes: float64(1), int64(4), object(1)
memory usage: 539.9+ KB


In [8]:
# Separate the target (dependent variable) from the remaining feature columns
y = new_df['performance_index']
X = new_df.drop(columns='performance_index')

In [9]:
# Group the feature columns by dtype so each group can get its own preprocessing:
# numeric columns vs. object-dtype (string) columns
numerical_cols = list(X.select_dtypes(include='number').columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

In [10]:
# Column-wise preprocessing: standardise the numeric features and one-hot
# encode the categorical ones. Each entry is (name, transformer, columns).
# Columns that appear in neither list are passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ],
    remainder='passthrough',
)

In [11]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
# Transforming training and testing data:
# the preprocessor is fitted on the training split only and the fitted
# transformation is then applied unchanged to the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [13]:
# Training Model over preprocessed Data
# (baseline: LinearRegression with default settings, fitted on the transformed training split)
lr = LinearRegression()
lr.fit(X_train_transformed , y_train)

In [14]:
# Score the baseline model on the held-out test split (R^2)
predictions = lr.predict(X_test_transformed)
score = r2_score(y_true=y_test, y_pred=predictions)
score

0.9884301209927054

In [15]:
def evaluate_regression_models(X_train, y_train, X_test, y_test, random_state=42):
    """
    Train a suite of regression models and score each one on a held-out test set.

    Every model is fitted on (X_train, y_train) and evaluated on
    (X_test, y_test). No preprocessing is applied here; the features are
    passed to each estimator as given. A failure in one model is caught and
    recorded so that the remaining models are still evaluated.

    XGBoost, LightGBM and CatBoost regressors are included only when the
    module-level names `xgb`, `lgb` and `cb` are not None.

    Args:
        X_train (pd.DataFrame or np.ndarray): Training features.
        y_train (pd.Series or np.ndarray): Training target.
        X_test (pd.DataFrame or np.ndarray): Test features.
        y_test (pd.Series or np.ndarray): Test target.
        random_state (int): Seed passed to every estimator that accepts one.

    Returns:
        dict: Maps each model name to a dictionary with the keys 'MAE',
              'MSE', 'RMSE' and 'R2_Score'. If fitting or scoring a model
              raised an exception, its entry is {'Error': <message>} instead.
    """

    # List of models to evaluate
    # Some models have explicit parameters set for convergence / reproducibility
    models = [
        ("Linear Regression", LinearRegression()),
        ("Ridge Regression", Ridge(random_state=random_state)),
        ("Lasso Regression", Lasso(random_state=random_state)),
        ("ElasticNet Regression", ElasticNet(random_state=random_state)),
        ("Bayesian Ridge Regression", BayesianRidge()),
        ("SGD Regressor", SGDRegressor(random_state=random_state, max_iter=1000, tol=1e-3)),  # explicit iteration cap and tolerance
        ("Huber Regressor", HuberRegressor(max_iter=1000)),
        ("Theil-Sen Regressor", TheilSenRegressor(random_state=random_state)),
        ("RANSAC Regressor", RANSACRegressor(random_state=random_state, min_samples=0.5)),  # min_samples can be adjusted
        ("Decision Tree Regressor", DecisionTreeRegressor(random_state=random_state)),
        ("Random Forest Regressor", RandomForestRegressor(random_state=random_state, n_estimators=100)),
        ("Gradient Boosting Regressor", GradientBoostingRegressor(random_state=random_state, n_estimators=100)),
        ("AdaBoost Regressor", AdaBoostRegressor(random_state=random_state, n_estimators=100)),
        ("K-Neighbors Regressor", KNeighborsRegressor()),
        ("SVR (RBF Kernel)", SVR(kernel='rbf')),  # RBF is common for non-linear
        ("SVR (Linear Kernel)", SVR(kernel='linear')),
        ("MLP Regressor", MLPRegressor(random_state=random_state, max_iter=500, early_stopping=True, n_iter_no_change=50)),  # early stopping on a validation split
        ("PLS Regression", PLSRegression(n_components=2)),  # n_components is important for PLS
        ("Kernel Ridge Regression", KernelRidge(alpha=1.0, kernel='rbf')),  # alpha and kernel are important for KRR
    ]

    # Add external models if installed (the import cell binds the alias to
    # None when a library is missing).
    if xgb is not None:
        # `use_label_encoder` is an XGBClassifier-only, deprecated option; on a
        # regressor it is merely reported as an unused parameter, so it is omitted.
        models.append(("XGBoost Regressor", xgb.XGBRegressor(random_state=random_state, n_estimators=100, eval_metric='rmse')))
    if lgb is not None:
        models.append(("LightGBM Regressor", lgb.LGBMRegressor(random_state=random_state, n_estimators=100)))
    if cb is not None:
        models.append(("CatBoost Regressor", cb.CatBoostRegressor(random_state=random_state, verbose=0, n_estimators=100)))

    results = {}

    print("Starting model evaluation...")
    for name, model in models:
        print(f"\n--- Training {name} ---")
        try:
            # Fit the model
            model.fit(X_train, y_train)

            # Make predictions
            y_pred = model.predict(X_test)

            # Calculate metrics
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)  # RMSE is the square root of MSE
            r2 = r2_score(y_test, y_pred)

            results[name] = {
                'MAE': mae,
                'MSE': mse,
                'RMSE': rmse,
                'R2_Score': r2
            }
            print(f"{name} - MAE: {mae:.4f}, MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")

        except Exception as e:
            # Deliberate best-effort: record the failure and continue with
            # the next model instead of aborting the whole comparison.
            results[name] = {'Error': str(e)}
            print(f"Error training/evaluating {name}: {e}")

    print("\nModel evaluation complete.")
    return results

In [16]:
# Compare all candidate regressors on the preprocessed train/test splits
results = evaluate_regression_models(
    X_train=X_train_transformed,
    y_train=y_train,
    X_test=X_test_transformed,
    y_test=y_test,
)

Starting model evaluation...

--- Training Linear Regression ---
Linear Regression - MAE: 1.6470, MSE: 4.3059, RMSE: 2.0751, R2: 0.9884

--- Training Ridge Regression ---
Ridge Regression - MAE: 1.6470, MSE: 4.3060, RMSE: 2.0751, R2: 0.9884

--- Training Lasso Regression ---
Lasso Regression - MAE: 2.2132, MSE: 7.7267, RMSE: 2.7797, R2: 0.9792

--- Training ElasticNet Regression ---
ElasticNet Regression - MAE: 6.0225, MSE: 52.4413, RMSE: 7.2416, R2: 0.8591

--- Training Bayesian Ridge Regression ---
Bayesian Ridge Regression - MAE: 1.6470, MSE: 4.3059, RMSE: 2.0751, R2: 0.9884

--- Training SGD Regressor ---
SGD Regressor - MAE: 1.6483, MSE: 4.3164, RMSE: 2.0776, R2: 0.9884

--- Training Huber Regressor ---
Huber Regressor - MAE: 1.6473, MSE: 4.3069, RMSE: 2.0753, R2: 0.9884

--- Training Theil-Sen Regressor ---
Theil-Sen Regressor - MAE: 1.6480, MSE: 4.3141, RMSE: 2.0771, R2: 0.9884

--- Training RANSAC Regressor ---
RANSAC Regressor - MAE: 1.6470, MSE: 4.3059, RMSE: 2.0751, R2: 0.98