In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso

In [22]:
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV


In [3]:
df=pd.read_csv('data/StudentsPerformance.csv')

In [4]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

In [7]:
X=df.drop(columns=['math score'], axis=1)
Y=df['math score']

In [13]:
num_features=[feature for feature in X.columns if X[feature].dtype !='O' ]
categorical_features=[feature for feature in X.columns if X[feature].dtype =='O']


In [15]:
num_features

['reading score', 'writing score']

In [16]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# columns transformer makes a pieline

numeric_transformer=StandardScaler()
oh_transformer=OneHotEncoder()
preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,categorical_features),
        ("StandardScaler",numeric_transformer,num_features)
    ]
)


In [17]:
X=preprocessor.fit_transform(X)

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)


In [29]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    r2=r2_score(true,predicted)

    return mae,r2

In [30]:
models={
    "Linear Regression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K Neighbours":KNeighborsRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "Random Forest Regressor":RandomForestRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBooosting Regressor":CatBoostRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor()
}


In [33]:



model_list=[]
r2_list=[]
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,Y_train)
    
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    model_train_mae,model_train_r2=evaluate_model(Y_train,y_train_pred)
    model_test_mae,model_test_r2=evaluate_model(Y_test,y_test_pred)
    
    print("Model performance")
    print("R2_score{:.4f}".format(model_test_r2))
    print("MAE {:.4f}".format(model_test_mae))
    r2_list.append(model_test_r2)
    model_list.append(list(models.keys())[i])



Model performance
R2_score0.8797
MAE 4.2259
Model performance
R2_score0.8253
MAE 5.1579
Model performance
R2_score0.8806
MAE 4.2111
Model performance
R2_score0.7839
MAE 5.6160
Model performance
R2_score0.7406
MAE 6.2500
Model performance
R2_score0.8543
MAE 4.5756
Model performance
R2_score0.8278
MAE 5.0577
Learning rate set to 0.039525
0:	learn: 14.5987177	total: 1.12ms	remaining: 1.12s
1:	learn: 14.2251886	total: 1.96ms	remaining: 976ms
2:	learn: 13.8866124	total: 2.99ms	remaining: 995ms
3:	learn: 13.5235688	total: 3.98ms	remaining: 991ms
4:	learn: 13.1887021	total: 4.84ms	remaining: 964ms
5:	learn: 12.9124226	total: 5.66ms	remaining: 938ms
6:	learn: 12.6000335	total: 6.41ms	remaining: 909ms
7:	learn: 12.3299057	total: 7.24ms	remaining: 898ms
8:	learn: 12.0660619	total: 8.14ms	remaining: 896ms
9:	learn: 11.7730981	total: 9.1ms	remaining: 901ms
10:	learn: 11.4922764	total: 10.1ms	remaining: 905ms
11:	learn: 11.2626483	total: 11.1ms	remaining: 913ms
12:	learn: 11.0426039	total: 12ms	rem

In [46]:
df2=pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model',"R2_score"]).sort_values(by=['R2_score'],ascending=False)

In [47]:
df2.reset_index(drop=True,inplace=True)

In [48]:
df2

Unnamed: 0,Model,R2_score
0,Ridge,0.880593
1,Linear Regression,0.87974
2,Random Forest Regressor,0.854334
3,CatBooosting Regressor,0.851632
4,AdaBoostRegressor,0.84949
5,XGBRegressor,0.827797
6,Lasso,0.82532
7,K Neighbours,0.783898
8,Decision Tree,0.740649
