In [9]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [10]:
# Load the dataset; the CSV path is relative to the notebook's working directory.
df = pd.read_csv("data/stud.csv")

In [11]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [12]:
# Feature frame: every column of df except the target column.
X = df.drop(columns="math_score")

In [13]:
# Preview the feature frame; math_score should no longer be present.
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [14]:
# Frequency of each value in the gender column.
df["gender"].value_counts()

gender
female    518
male      482
Name: count, dtype: int64

In [16]:
# Frequency of each value in the lunch column.
df["lunch"].value_counts()

lunch
standard        645
free/reduced    355
Name: count, dtype: int64

In [17]:
# Frequency of each value in the parental_level_of_education column.
df["parental_level_of_education"].value_counts()

parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [18]:
# Frequency of each value in the race_ethnicity column.
df["race_ethnicity"].value_counts()

race_ethnicity
group C    319
group D    262
group B    190
group E    140
group A     89
Name: count, dtype: int64

In [19]:
# Regression target: the math_score column.
y = df["math_score"]

In [24]:
# Split the feature columns by dtype: numeric columns are standardised and
# every other column is treated as categorical and one-hot encoded.
# Selecting on "number" (instead of on "object") keeps the split independent
# of the dtype pandas happens to use for text columns.
num_feature = X.select_dtypes(include="number").columns
cat_feature = X.select_dtypes(exclude="number").columns
print(num_feature)
print(cat_feature)

# One transformer per column group; columns are routed by the index objects
# computed above. (ColumnTransformer / StandardScaler / OneHotEncoder are
# imported in the notebook's top import cell.)
preprocessor = ColumnTransformer([
    ("Numerical transformer", StandardScaler(), num_feature),
    ("Categorical transformer", OneHotEncoder(), cat_feature)
])

Index(['reading_score', 'writing_score'], dtype='object')
Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course'],
      dtype='object')


In [25]:
# Fit the preprocessor and replace X with the transformed feature matrix.
# NOTE(review): this fits on every row before the train/test split in a later
# cell, so the scaler's statistics also reflect the test rows; fitting on the
# training split only would avoid that leakage.
# NOTE(review): X is rebound from the raw frame to the transformed output, so
# re-running this cell on its own presumably fails — re-run the cell that
# builds X first.
X = preprocessor.fit_transform(X)

In [27]:
# Shape of the transformed feature matrix: (rows, columns after encoding).
X.shape

(1000, 19)

In [28]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs. (train_test_split is imported in the notebook's
# top import cell.)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape

((800, 19), (200, 19))

In [29]:
def evaluate(true, predicted):
    """Score a set of predictions against the ground-truth values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)`` — mean absolute error, root mean squared error
        and the R² score, in that order.
    """
    abs_error = mean_absolute_error(true, predicted)
    # RMSE is derived from the MSE so it is expressed in the target's units.
    root_sq_error = np.sqrt(mean_squared_error(true, predicted))
    determination = r2_score(true, predicted)
    return abs_error, root_sq_error, determination

In [32]:
# Seed shared by every estimator with a stochastic component, so repeated
# runs of this cell report the same scores.
SEED = 42

# Candidate regressors, keyed by the display name used in the printed report
# and in the results lists.
models = {
    "Linear regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "SVR": SVR(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=SEED),
    "Random Forest": RandomForestRegressor(random_state=SEED),
    "AdaBoost": AdaBoostRegressor(random_state=SEED),
    "GradientBoost": GradientBoostingRegressor(random_state=SEED),
    # verbose=False suppresses CatBoost's per-iteration training log.
    "CatBoost": CatBoostRegressor(random_state=SEED, verbose=False),
}

# Rebuilt from scratch on every run so re-executing the cell never appends
# duplicate entries.
model_list = []
r2_list = []

for model_name, model in models.items():
    model.fit(x_train, y_train)

    ytrain_pred = model.predict(x_train)
    ytest_pred = model.predict(x_test)

    train_mae, train_rmse, train_r2 = evaluate(y_train, ytrain_pred)
    test_mae, test_rmse, test_r2 = evaluate(y_test, ytest_pred)

    print(model_name)
    model_list.append(model_name)

    print("training mae: ", train_mae)
    print("training rmse: ", train_rmse)
    print("training r2 score: ", train_r2)
    print("-"*35)
    print("testing mae: ", test_mae)
    print("testing rmse: ", test_rmse)
    print("testing r2 score: ", test_r2)
    # Only the held-out R² is kept for the ranking table built afterwards.
    r2_list.append(test_r2)
    print("\n")

Linear regression
trianing mae:  4.266711846071956
training rmse:  5.323050852720514
training r2 score:  0.8743172040139593
-----------------------------------
testing mae:  4.214763142474852
testing rmse:  5.393993869732843
testing r2 score:  0.8804332983749565


Ridge
trianing mae:  4.264987823725977
training rmse:  5.323324922741656
training r2 score:  0.8743042615212908
-----------------------------------
testing mae:  4.211100688014261
testing rmse:  5.390387016935636
testing r2 score:  0.880593148502874


Lasso
trianing mae:  5.206296077972952
training rmse:  6.593807540619166
training r2 score:  0.8071466723085148
-----------------------------------
testing mae:  5.157879138921815
testing rmse:  6.5196880562856245
testing r2 score:  0.825320079562973


SVR
trianing mae:  4.869189452384868
training rmse:  6.57700724251018
training r2 score:  0.8081281585902299
-----------------------------------
testing mae:  5.4015392444969965
testing rmse:  8.126623218622633
testing r2 score:  

In [34]:
# Pair each model name with its test R² and rank the models best-first.
score_pairs = list(zip(model_list, r2_list))
pd.DataFrame(score_pairs, columns=["model name", "r2 score"]).sort_values(
    by="r2 score", ascending=False
)


Unnamed: 0,model name,r2 score
1,Ridge,0.880593
0,Linear regression,0.880433
8,GradientBoost,0.872172
9,CatBoost,0.851632
6,Random Forest,0.851429
7,AdaBoost,0.850334
2,Lasso,0.82532
4,KNN,0.78377
3,SVR,0.7286
5,Decision Tree,0.7178
