## Model Training

### Import necessary packages and data

In [88]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import warnings

In [89]:
# Read the student math-grades dataset from the project's data directory
df = pd.read_csv(filepath_or_buffer='../data/student-math.csv')

In [90]:
# Remove the leftover index column written by a previous `to_csv`.
# errors='ignore' keeps this cell re-runnable: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [91]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


### Model Training with All Attributes

#### 1. Prepare X and Y Variables

In [92]:
# Build the feature frame from every attribute except the three grade
# columns (the period grades G1/G2 and the target G3)
grade_columns = ['G1', 'G2', 'G3']
X = df.drop(columns=grade_columns)
X.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,yes,no,no,4,3,4,1,1,3,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,yes,yes,no,5,3,3,1,1,3,4
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,yes,yes,no,4,3,2,2,3,3,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,yes,yes,yes,3,2,2,1,1,5,2
4,GP,F,16,U,GT3,T,3,3,other,other,...,yes,no,no,4,3,2,1,2,5,4


In [93]:
# Target variable: the final grade column
y = df.loc[:, 'G3']
y

0       6
1       6
2      10
3      15
4      10
       ..
390     9
391    16
392     7
393    10
394     9
Name: G3, Length: 395, dtype: int64

#### 2. Column Transformers

In [94]:
# Split the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# One transformer per column group
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# One-hot encoded columns come first in the output, scaled numeric columns after
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ],
)

In [95]:
# Encode the categorical columns and scale the numeric ones.
# The raw input is re-selected from `df` by column name instead of being read
# from `X`: this cell rebinds `X` to the transformed matrix, so reading `X`
# here would hand the already-transformed matrix back to the preprocessor
# whenever the cell is re-run.
# NOTE(review): the preprocessor is fit on all rows before the train/test
# split in the next section, so test rows contribute to the scaling
# statistics — consider fitting on the training rows only.
X = preprocessor.fit_transform(df[[*cat_features, *num_features]])
X.shape

(395, 56)

#### 3. Separate Dataset Into Train and Test

In [96]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

(X_train.shape, X_test.shape)

((316, 56), (79, 56))

#### 4. Evaluate Function

In [97]:
# Define the evaluation function
def evaluate_model(true, predicted):
    """Score a set of predictions against the observed values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` in that order.
    """
    squared_error = mean_squared_error(true, predicted)
    return (
        mean_absolute_error(true, predicted),
        squared_error,
        np.sqrt(squared_error),  # RMSE is the square root of the MSE above
        r2_score(true, predicted),
    )

#### 5. Models

In [98]:
# Train every candidate regressor on all attributes and report its
# performance on the training and test sets.
# The stochastic estimators (tree, forest, AdaBoost) get a fixed random_state
# so that Restart & Run All reproduces the same scores.
RANDOM_STATE = 42

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Decision Tree": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random Forest Regressor": RandomForestRegressor(random_state=RANDOM_STATE),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=RANDOM_STATE)
}

# Parallel lists: model names and their test-set R2, in training order
model_list = []
r2_list = []

for model_name, model in models.items():
    # Train Model
    model.fit(X_train, y_train)

    # Make Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train & Test dataset
    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Performance For Training Set')
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- Mean Squared Error: {:.4f}".format(model_train_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model Performance For Test Set')
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- Mean Squared Error: {:.4f}".format(model_test_mse))
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    # Only the test-set R2 is kept for the later comparison
    r2_list.append(model_test_r2)

    print('-'*35)
    print('\n')

Linear Regression
Model Performance For Training Set
- Mean Absolute Error: 2.9433
- Mean Squared Error: 15.0674
- Root Mean Squared Error: 3.8817
- R2 Score: 0.2827
----------------------------------
Model Performance For Test Set
- Mean Absolute Error: 3.4399
- Mean Squared Error: 17.9609
- Root Mean Squared Error: 4.2380
- R2 Score: 0.1241
-----------------------------------


Lasso
Model Performance For Training Set
- Mean Absolute Error: 3.2275
- Mean Squared Error: 19.0928
- Root Mean Squared Error: 4.3695
- R2 Score: 0.0910
----------------------------------
Model Performance For Test Set
- Mean Absolute Error: 3.4759
- Mean Squared Error: 19.1649
- Root Mean Squared Error: 4.3778
- R2 Score: 0.0654
-----------------------------------


Ridge
Model Performance For Training Set
- Mean Absolute Error: 2.9461
- Mean Squared Error: 14.9400
- Root Mean Squared Error: 3.8652
- R2 Score: 0.2887
----------------------------------
Model Performance For Test Set
- Mean Absolute Error: 3.3

### Model Training with Selected Attributes

#### 1. Prepare X and Y Variables

In [100]:
# Prepare the X variables using only the selected attributes.
# The kept columns are listed explicitly (instead of dropping the other 24)
# so the selection is visible at a glance; the order matches the frame's
# original column order.
selected_features = ['sex', 'traveltime', 'studytime', 'failures', 'schoolsup', 'nursery', 'higher', 'Dalc', 'Walc']
Xs = df[selected_features]
Xs.head()

Unnamed: 0,sex,traveltime,studytime,failures,schoolsup,nursery,higher,Dalc,Walc
0,F,2,2,0,yes,yes,yes,1,1
1,F,1,2,0,no,no,yes,1,1
2,F,1,2,3,yes,yes,yes,2,3
3,F,1,3,0,no,yes,yes,1,1
4,F,1,2,0,no,yes,yes,1,2


In [101]:
# Target variable for the selected-attribute models: the final grade column
ys = df.loc[:, 'G3']
ys

0       6
1       6
2      10
3      15
4      10
       ..
390     9
391    16
392     7
393    10
394     9
Name: G3, Length: 395, dtype: int64

#### 2. Column Transformers

In [102]:
# Split the selected feature columns by dtype: object columns are treated as
# categorical, everything else as numeric
cat_features_selected = Xs.select_dtypes(include="object").columns
num_features_selected = Xs.select_dtypes(exclude="object").columns

# One transformer per column group
oh_transformer_selected = OneHotEncoder()
numeric_transformer_selected = StandardScaler()

# One-hot encoded columns come first in the output, scaled numeric columns after
preprocessor_selected = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer_selected, cat_features_selected),
        ("StandardScaler", numeric_transformer_selected, num_features_selected),
    ],
)

In [103]:
# Encode the categorical columns and scale the numeric ones.
# The raw input is re-selected from `df` by column name instead of being read
# from `Xs`: this cell rebinds `Xs` to the transformed matrix, so reading `Xs`
# here would hand the already-transformed matrix back to the preprocessor
# whenever the cell is re-run.
# NOTE(review): the preprocessor is fit on all rows before the train/test
# split in the next section, so test rows contribute to the scaling
# statistics — consider fitting on the training rows only.
Xs = preprocessor_selected.fit_transform(df[[*cat_features_selected, *num_features_selected]])
Xs.shape

(395, 13)

#### 3. Separate Dataset Into Train and Test

In [104]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    Xs,
    ys,
    test_size=0.2,
    random_state=42,
)

(Xs_train.shape, Xs_test.shape)

((316, 13), (79, 13))

#### 4. Evaluate Function

In [None]:
# Define the evaluation function
def evaluate_model_selected(true, predicted):
    """Score predictions of the selected-attribute models.

    Computes the same four metrics as ``evaluate_model`` and therefore
    delegates to it rather than repeating the implementation.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)`` in that order.
    """
    return evaluate_model(true, predicted)