In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import RandomizedSearchCV

%pip install xgboost
from xgboost import XGBRegressor





In [21]:
pip install catboost




In [22]:
from catboost import CatBoostRegressor

In [23]:
df=pd.read_csv('data/StudentsPerformance.csv')

In [24]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [25]:
df['Average']=(df['math score']+df['reading score']+df['writing score'])/3

In [26]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,Average
0,female,group B,bachelor's degree,standard,none,72,72,74,72.666667
1,female,group C,some college,standard,completed,69,90,88,82.333333
2,female,group B,master's degree,standard,none,90,95,93,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,49.333333
4,male,group C,some college,standard,none,76,78,75,76.333333


In [27]:
y = df['Average']               # Save target variable


In [28]:
y

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Name: Average, Length: 1000, dtype: float64

In [29]:
X = df.drop('Average', axis=1)

In [30]:
y

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Name: Average, Length: 1000, dtype: float64

In [31]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [32]:
num_feature=X.select_dtypes(exclude='object').columns
cat_feature=X.select_dtypes(include='object').columns
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()
preprocessor=ColumnTransformer(
    [
        ('OneHotEncoder',oh_transformer,cat_feature),
        ('StandardScaler',numeric_transformer,num_feature)
    ]
)


In [33]:
X=preprocessor.fit_transform(X)

In [34]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [35]:
X_train.shape

(800, 20)

In [36]:
from sklearn.metrics import mean_absolute_error

In [37]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    score=r2_score(true,predicted)
    return mae,mse,rmse,score

In [41]:
models={
    "Linear Regression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "XGBregressor":XGBRegressor(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False),
    "AdaBoostRegressor":AdaBoostRegressor()

}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)
    mae,mse,rmse,score=evaluate_model(y_train,y_train_pred)
    mae_,mse_,rmse_,score_=evaluate_model(y_test,y_test_pred)
    print('--------------------------------')
    print(list(models.keys())[i])
    print('*****************')
    print('Model perfomance for training test')
    print(f"mean squared error : { mse}")
    print(f"mean absolute error : {mae}")
    print(f"root mean squared error : {rmse}")
    print(f"r2_score  : { score}")
    print('******************')
    print('#######################')
    print("Model performance for test data ")
    print(f"mean squared error : { mse_}")
    print(f"mean absolute error : {mae_}")
    print(f"root mean squared error : {rmse_}")
    print(f"r2_score  : { score_}")
    print('########################')
    print('----------------------------------')


--------------------------------
Linear Regression
*****************
Model perfomance for training test
mean squared error : 8.98307467211375e-28
mean absolute error : 2.3638868640318832e-14
root mean squared error : 2.9971777845356037e-14
r2_score  : 1.0
******************
#######################
Model performance for test data 
mean squared error : 1.0169994790109285e-27
mean absolute error : 2.3856472353145363e-14
root mean squared error : 3.189042926978137e-14
r2_score  : 1.0
########################
----------------------------------
--------------------------------
Lasso
*****************
Model perfomance for training test
mean squared error : 1.1331525964023306
mean absolute error : 0.8506012933972129
root mean squared error : 1.0644964050678285
r2_score  : 0.9943185972010731
******************
#######################
Model performance for test data 
mean squared error : 1.2413568666700616
mean absolute error : 0.8768807904727827
root mean squared error : 1.1141619571094956
r2_s