In [20]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [3]:
# Read the student score table from the CSV (path is relative to the
# working directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/stud.csv')
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [29]:
# Separate the regression target (math_score) from the predictor columns.
y = df['math_score'].values
X = df.drop(columns=['math_score'])
y

array([ 72,  69,  90,  47,  76,  71,  88,  40,  64,  38,  58,  40,  65,
        78,  50,  69,  88,  18,  46,  54,  66,  65,  44,  69,  74,  73,
        69,  67,  70,  62,  69,  63,  56,  40,  97,  81,  74,  50,  75,
        57,  55,  58,  53,  59,  50,  65,  55,  66,  57,  82,  53,  77,
        53,  88,  71,  33,  82,  52,  58,   0,  79,  39,  62,  69,  59,
        67,  45,  60,  61,  39,  58,  63,  41,  61,  49,  44,  30,  80,
        61,  62,  47,  49,  50,  72,  42,  73,  76,  71,  58,  73,  65,
        27,  71,  43,  79,  78,  65,  63,  58,  65,  79,  68,  85,  60,
        98,  58,  87,  66,  52,  70,  77,  62,  54,  51,  99,  84,  75,
        78,  51,  55,  79,  91,  88,  63,  83,  87,  72,  65,  82,  51,
        89,  53,  87,  75,  74,  58,  51,  70,  59,  71,  76,  59,  42,
        57,  88,  22,  88,  73,  68, 100,  62,  77,  59,  54,  62,  70,
        66,  60,  61,  66,  82,  75,  49,  52,  81,  96,  53,  58,  68,
        67,  72,  94,  79,  63,  43,  81,  46,  71,  52,  97,  6

In [24]:
# Split the predictor columns by dtype: object columns are handled as
# categorical features, every other column as a numeric feature.
numerical_features = X.select_dtypes(exclude='object').columns
categorical_features = X.select_dtypes(include='object').columns

numeric_transformer = StandardScaler()
onehot_transformer = OneHotEncoder()

# Preprocessing pipeline: one-hot encode the categorical columns and
# standardize the numeric ones. The transformed blocks are concatenated in
# the order they are listed here (encoded categoricals first).
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",onehot_transformer,categorical_features),
        ('StandardScaler',numeric_transformer,numerical_features)
    ]
)


In [25]:
# Fit the preprocessor and replace X with its transformed output.
# NOTE(review): the fit runs on every row, before the train/test split in
# the following cell, so the fitted transformers have seen the rows that end
# up in the test set. The cell also rebinds X, so re-running it on its own
# would feed the already-transformed X back into the preprocessor.
X = preprocessor.fit_transform(X)

In [26]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [35]:
def evaluate_model(true,predicted):
    """Score regression predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``, each rounded to two decimal places.
    """
    mae = mean_absolute_error(true, predicted)
    # Take the square root of the *unrounded* MSE. Rounding the MSE first
    # and then rounding its root again loses precision for small errors
    # (e.g. an MSE of 0.004 would be reported as an RMSE of 0.0, not 0.06).
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2 = r2_score(true, predicted)
    return round(mae, 2), round(rmse, 2), round(r2, 2)


In [36]:
# Seed for every estimator that has a random component, so that re-running
# the notebook reproduces the metrics printed below. The value matches the
# random_state used for the train/test split.
RANDOM_STATE = 42

# Candidate regressors, keyed by the display name used in the report.
models = {
    "Linear Regression": LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    "AdaBoost Regression": AdaBoostRegressor(random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(random_state=RANDOM_STATE),
    # verbose=False keeps CatBoost from printing one log line per boosting
    # iteration into the cell output.
    "CatBoost Regression": CatBoostRegressor(random_seed=RANDOM_STATE, verbose=False),
    "XGBoost Regression":  XGBRegressor(random_state=RANDOM_STATE),
}
model_list = []  # model names, in evaluation order
r2_list = []     # test-set R2 of each model, aligned with model_list

for model_name,model in models.items():

    model.fit(X_train,y_train)

    # Predict on both splits: a large train/test gap signals over-fitting.
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae,model_train_rmse,model_train_r2 = evaluate_model(y_train,y_train_pred)
    model_test_mae,model_test_rmse,model_test_r2 = evaluate_model(y_test,y_test_pred)

    model_list.append(model_name)
    r2_list.append(model_test_r2)

    print(model_name)

    print("Model performance on Training Data")
    print("MAE: ",model_train_mae)
    print("RMSE: ",model_train_rmse)
    print("R2: ",model_train_r2)

    print("----------------------------------------")

    print("Model performance on Test Data")
    print("MAE: ",model_test_mae)
    print("RMSE: ",model_test_rmse)
    print("R2: ",model_test_r2)

    print('******************************************************')





Linear Regression
Model performance on Training Data
MAE:  4.27
RMSE:  5.32
R2:  0.87
----------------------------------------
Model performance on Test Data
MAE:  4.22
RMSE:  5.4
R2:  0.88
******************************************************
Decision Tree
Model performance on Training Data
MAE:  0.02
RMSE:  0.28
R2:  1.0
----------------------------------------
Model performance on Test Data
MAE:  6.42
RMSE:  8.04
R2:  0.73
******************************************************
AdaBoost Regression
Model performance on Training Data
MAE:  4.76
RMSE:  5.8
R2:  0.85
----------------------------------------
Model performance on Test Data
MAE:  4.75
RMSE:  6.17
R2:  0.84
******************************************************
Random Forest
Model performance on Training Data
MAE:  1.82
RMSE:  2.28
R2:  0.98
----------------------------------------
Model performance on Test Data
MAE:  4.56
RMSE:  5.93
R2:  0.86
******************************************************
Learning rate set to 0.03