In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

#models: 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings


In [2]:
df = pd.read_csv(r"C:\Users\haris\Downloads\stud.csv")
df.head(4)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44


X and y

In [3]:
X = df.drop(columns=['math_score'], axis = 1)
y = df['math_score']

In [4]:
X.shape

(1000, 7)

In [6]:
X.head(3)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93


We have some string/word features along with numerical. They need to be converted to numbers using ONE HOT encoding. The numeric ones have to be converted to standard scaler values

In [8]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
one_hot_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [ 
        ("OneHotEncoder", one_hot_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [9]:
X = preprocessor.fit_transform(X)

In [10]:
print(X)

[[ 1.          0.          0.         ...  1.          0.19399858
   0.39149181]
 [ 1.          0.          0.         ...  0.          1.42747598
   1.31326868]
 [ 1.          0.          0.         ...  1.          1.77010859
   1.64247471]
 ...
 [ 1.          0.          0.         ...  0.          0.12547206
  -0.20107904]
 [ 1.          0.          0.         ...  0.          0.60515772
   0.58901542]
 [ 1.          0.          0.         ...  1.          1.15336989
   1.18158627]]


In [11]:
X.shape

(1000, 19)

Train - Test Split

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [14]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(800, 19) (200, 19) (800,) (200,)


Create an EVALUATE function for comprehensive evaluation, after training

In [15]:
def evaluate_model(true, predicted):
    MAE = mean_absolute_error(true, predicted)
    MSE = mean_squared_error(true, predicted)
    RMSE = np.sqrt(mean_squared_error(true, predicted))
    R2_SQUARE = r2_score(true, predicted)
    return MAE, MSE, RMSE, R2_SQUARE

All models together:

In [None]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Reg": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "Catboost Regressor": CatBoostRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()


}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = 