## Model Training

In [7]:
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

# Others
from typing import Tuple

**Import the CSV data as a pandas DataFrame**

In [8]:
df_students = pd.read_csv('../data/raw/stud.csv')

**Preparing features and target variable**

In [9]:
df_students['average_score'] = (df_students['math_score'] + df_students['reading_score'] + df_students['writing_score']) / 3
X = df_students.drop(columns=['math_score','writing_score','reading_score','average_score'])
y = df_students['average_score'].values

**Encoding**

In [10]:
# Create Column Transformer for encoding categorical features
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = X.columns
oh = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh, categorical_features)
    ],
    sparse_threshold=0
)

X = preprocessor.fit_transform(X)

**Train test split**

In [127]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

**Machine Learning models training and experimentation**

In [128]:
# Evaluator function
def evaluate_model(true: np.array, predicted: np.array) -> Tuple[float, float, float, float]:
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

# Trainer function
def model_trainer(model_dict: dict, 
                  X_train: np.array, 
                  y_train: np.array,
                  X_test: np.array,
                  y_test: np.array):
    
    results = {}

    for model_name, model in model_dict.items():
        model.fit(X_train, y_train)

        # Make the predictions
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        
        # Evaluate predictions
        train_mae, train_mse, train_rmse, train_r2_square = evaluate_model(y_train, y_pred_train)
        test_mae, test_mse, test_rmse, test_r2_square = evaluate_model(y_test, y_pred_test)

        # Print results
        print(model_name)
        print('Model performance for training set')
        print(f'- Mean Absolute error: {train_mae}')
        print(f'- Mean Squared Error: {train_mse}')
        print(f'- Root Mean Squared Error: {train_rmse}')
        print(f'- R2 Score: {train_r2_square}')

        print('----------------------------------')

        print('Model performance for test set')
        print(f'- Mean Absolute error: {test_mae}')
        print(f'- Mean Squared Error: {test_mse}')
        print(f'- Root Mean Squared Error: {test_rmse}')
        print(f'- R2 Score: {test_r2_square}')

        print('\n==================================\n')
        results[model_name] = test_mae
        
    return results

In [129]:
models_dict = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

results = model_trainer(models_dict, X_train, y_train, X_test, y_test)

Linear Regression
Model performance for training set
- Mean Absolute error: 10.038125
- Mean Squared Error: 156.14420138888892
- Root Mean Squared Error: 12.495767338938771
- R2 Score: 0.24300269967215204
----------------------------------
Model performance for test set
- Mean Absolute error: 10.054166666666667
- Mean Squared Error: 148.89975694444445
- Root Mean Squared Error: 12.202448809335134
- R2 Score: 0.20421915220371267


Lasso
Model performance for training set
- Mean Absolute error: 10.722523378542315
- Mean Squared Error: 182.14676265591294
- Root Mean Squared Error: 13.496175853030108
- R2 Score: 0.11694058205484459
----------------------------------
Model performance for test set
- Mean Absolute error: 10.669282569606043
- Mean Squared Error: 170.31874600546945
- Root Mean Squared Error: 13.050622437472837
- R2 Score: 0.08974736511891956


Ridge
Model performance for training set
- Mean Absolute error: 10.021669384072897
- Mean Squared Error: 155.95760345923273
- Root Mean

Random Forest Regressor
Model performance for training set
- Mean Absolute error: 8.705461149671837
- Mean Squared Error: 124.19173660019655
- Root Mean Squared Error: 11.144134627695259
- R2 Score: 0.3979103386924374
----------------------------------
Model performance for test set
- Mean Absolute error: 11.297281481533796
- Mean Squared Error: 196.16320592916898
- Root Mean Squared Error: 14.005827570306904
- R2 Score: -0.04837593777265137


XGBRegressor
Model performance for training set
- Mean Absolute error: 8.486024688084921
- Mean Squared Error: 122.08479427412924
- Root Mean Squared Error: 11.04919880688773
- R2 Score: 0.4081249328854494
----------------------------------
Model performance for test set
- Mean Absolute error: 11.815309098561602
- Mean Squared Error: 216.96097258255054
- Root Mean Squared Error: 14.729595126226332
- R2 Score: -0.15952765970509497


CatBoosting Regressor
Model performance for training set
- Mean Absolute error: 8.5765312646881
- Mean Squared Error

**Results**

In [131]:
results_df = pd.DataFrame([results]).T
results_df

Unnamed: 0,0
Linear Regression,10.054167
Lasso,10.669283
Ridge,10.032402
K-Neighbors Regressor,11.153
Decision Tree,11.820872
Random Forest Regressor,11.297281
XGBRegressor,11.815309
CatBoosting Regressor,11.562582
AdaBoost Regressor,10.433965
