# Gradient Boosting with scikit-learn on Toy Data

In [1]:
import numpy as np
import pandas as pd

# Render floats with two decimals in displayed frames.
pd.options.display.precision = 2

# Toy dataset, written one row per tuple: (Height, Fav Color, Gender, Weight).
df0 = pd.DataFrame(
    [
        (1.6, "Blue", "Male", 88),
        (1.6, "Green", "Female", 76),
        (1.5, "Blue", "Female", 56),
        (1.8, "Red", "Male", 73),
        (1.5, "Green", "Male", 77),
        (1.4, "Blue", "Female", 57),
    ],
    columns=["Height", "Fav Color", "Gender", "Weight"],
)
df0

Unnamed: 0,Height,Fav Color,Gender,Weight
0,1.6,Blue,Male,88
1,1.6,Green,Female,76
2,1.5,Blue,Female,56
3,1.8,Red,Male,73
4,1.5,Green,Male,77
5,1.4,Blue,Female,57


In [2]:
# One-hot encode the categorical columns of the toy frame, discard the
# 'Gender_Female' indicator (Gender only takes two values here, so
# 'Gender_Male' already carries that information), and order the columns
# so that the target 'Weight' comes last.
df = pd.get_dummies(df0).drop(columns=['Gender_Female'])[
    ['Height', 'Fav Color_Blue', 'Fav Color_Green', 'Fav Color_Red', 'Gender_Male', 'Weight']
]
df

Unnamed: 0,Height,Fav Color_Blue,Fav Color_Green,Fav Color_Red,Gender_Male,Weight
0,1.6,True,False,False,True,88
1,1.6,False,True,False,False,76
2,1.5,True,False,False,False,56
3,1.8,False,False,True,True,73
4,1.5,False,True,False,True,77
5,1.4,True,False,False,False,57


In [3]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Feature matrix: every encoded column except the target. The explicit float
# dtype turns the boolean indicator columns into 0.0/1.0 instead of letting
# the mixed bool/float frame collapse into an object-dtype array.
x = np.array(
    df[['Height', 'Fav Color_Blue', 'Fav Color_Green', 'Fav Color_Red', 'Gender_Male']],
    dtype=float,
)
# Target vector.
y = np.array(df['Weight'])

# Fixed seed so that re-running the notebook reproduces the same fitted model.
regressor = GradientBoostingRegressor(random_state=0)
regressor.fit(x, y)

# Predict on the same rows the model was fit on and keep the result next to
# the true weights for inspection.
y_predict = regressor.predict(x)
df['W_pred'] = y_predict

# check for loss -- this is the training error, since x was also used to fit
mean_squared_error(y, y_predict)

9.1108507711039e-08

In [4]:
# Show the toy frame, which now carries the W_pred column added by the previous cell.
df


Unnamed: 0,Height,Fav Color_Blue,Fav Color_Green,Fav Color_Red,Gender_Male,Weight,W_pred
0,1.6,True,False,False,True,88,88.0
1,1.6,False,True,False,False,76,76.0
2,1.5,True,False,False,False,56,56.0
3,1.8,False,False,True,True,73,73.0
4,1.5,False,True,False,True,77,77.0
5,1.4,True,False,False,False,57,57.0


# A slightly bigger dataset: I will use the gradebook of my class

In [6]:
import numpy as np
import pandas as pd
# Render floats with three decimals from here on.
pd.options.display.precision = 3

# Gradebook CSV, given as a path relative to the working directory.
DATA_PATH = "data/MAT104-Spring2023-Recordbook.csv"

# Read the gradebook and preview its last five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.tail(n=5)

Unnamed: 0,Quiz 1,Quiz 2,Quiz 3,Quiz 4,Quiz 5,Quiz 6,Quiz 7,Quiz 8,Quiz 9,Quiz 10,...,Challenge 1,Challenge 2,Challenge 3,Apr 15 - Challenge,Goal Setting Task,Proxy Penalty,Total Extra points,Total - best,Total/100,Grade
221,10,10,10.0,9.5,10,0.0,0.0,10,0,7,...,,,,10.0,6.9,,16.9,114,100,A
222,10,9,0.0,9.5,9,9.0,0.0,10,10,7,...,1.0,,,10.0,1.2,,12.2,103,100,A
223,10,10,10.0,10.0,10,10.0,3.0,10,10,10,...,,1.0,,,3.2,,4.2,104,100,A
224,10,0,10.0,9.0,10,5.0,10.0,9,3,10,...,1.0,1.0,,8.0,0.6,,10.6,107,100,A
225,8,10,10.0,10.0,9,5.0,10.0,10,5,10,...,,1.0,1.0,,9.4,,11.4,101,100,A


In [11]:
# Replace every missing entry with 0, modifying the frame itself,
# then preview the last five rows to confirm the NaNs are gone.
df.fillna(value=0, inplace=True)
df.tail(n=5)

Unnamed: 0,Quiz 1,Quiz 2,Quiz 3,Quiz 4,Quiz 5,Quiz 6,Quiz 7,Quiz 8,Quiz 9,Quiz 10,...,Challenge 1,Challenge 2,Challenge 3,Apr 15 - Challenge,Goal Setting Task,Proxy Penalty,Total Extra points,Total - best,Total/100,Grade
221,10,10,10.0,9.5,10,0.0,0.0,10,0,7,...,0.0,0.0,0.0,10.0,6.9,0.0,16.9,114,100,A
222,10,9,0.0,9.5,9,9.0,0.0,10,10,7,...,1.0,0.0,0.0,10.0,1.2,0.0,12.2,103,100,A
223,10,10,10.0,10.0,10,10.0,3.0,10,10,10,...,0.0,1.0,0.0,0.0,3.2,0.0,4.2,104,100,A
224,10,0,10.0,9.0,10,5.0,10.0,9,3,10,...,1.0,1.0,0.0,8.0,0.6,0.0,10.6,107,100,A
225,8,10,10.0,10.0,9,5.0,10.0,10,5,10,...,0.0,1.0,1.0,0.0,9.4,0.0,11.4,101,100,A


In [12]:
# split the data into train and test
from sklearn.model_selection import train_test_split

In [17]:
# Feature matrix: quiz, exam, challenge and bonus/penalty columns.
# NOTE(review): the target below looks like it may be derived from these same
# columns, which would make the regression nearly deterministic -- confirm.
x = np.array(df[['Quiz 1', 'Quiz 2', 'Quiz 3', 'Quiz 4', 'Quiz 5', 'Quiz 6', 'Quiz 7',
       'Quiz 8', 'Quiz 9', 'Quiz 10', 'Quiz - best 7', 'Mid Sem',
       'EndSemMarks', 'Challenge 1', 'Challenge 2', 'Challenge 3',
       'Apr 15 - Challenge', 'Goal Setting Task', 'Proxy Penalty',
       'Total  Extra points']])

# Target vector.
y = np.array(df['Total - best'])

# Fixed seed: without it the shuffle differs on every run, so the train/test
# membership and every metric computed afterwards would change between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [18]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Fit on the training split only; the fixed seed makes the fitted model
# (and therefore the errors reported below) reproducible across runs.
regressor = GradientBoostingRegressor(random_state=0)
regressor.fit(x_train, y_train)

# Training error: predictions on the same rows the model was fit on.
y_train_predict = regressor.predict(x_train)
mean_squared_error(y_train, y_train_predict)

0.39300450462243885

In [19]:
# Held-out error: score the fitted model on the rows kept out of training.
y_test_predict = regressor.predict(X=x_test)
mean_squared_error(y_true=y_test, y_pred=y_test_predict)

11.30041648720978

In [20]:
# Attach the model's prediction for every row (training and test rows alike)
# as a new 'pred' column, then display the frame.
df["pred"] = regressor.predict(X=x)
df

Unnamed: 0,Quiz 1,Quiz 2,Quiz 3,Quiz 4,Quiz 5,Quiz 6,Quiz 7,Quiz 8,Quiz 9,Quiz 10,...,Challenge 2,Challenge 3,Apr 15 - Challenge,Goal Setting Task,Proxy Penalty,Total Extra points,Total - best,Total/100,Grade,pred
0,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,-10.0,-10.0,9,9,F,9.269
1,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,10,10,F,10.129
2,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,10,10,F,10.129
3,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,10,10,F,10.129
4,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,10,10,F,10.129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,10,10,10.0,9.5,10,0.0,0.0,10,0,7,...,0.0,0.0,10.0,6.9,0.0,16.9,114,100,A,113.871
222,10,9,0.0,9.5,9,9.0,0.0,10,10,7,...,0.0,0.0,10.0,1.2,0.0,12.2,103,100,A,102.902
223,10,10,10.0,10.0,10,10.0,3.0,10,10,10,...,1.0,0.0,0.0,3.2,0.0,4.2,104,100,A,103.869
224,10,0,10.0,9.0,10,5.0,10.0,9,3,10,...,1.0,0.0,8.0,0.6,0.0,10.6,107,100,A,104.320


# A bit of parameter tuning

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Base estimator for the search. The fixed seed keeps the cross-validation
# scores reproducible and makes the candidates differ only by the tuned
# parameter.
# NOTE: this rebinds `regressor` to a new, unfitted estimator. GridSearchCV
# fits clones of it, so `regressor` itself stays unfitted after this cell.
regressor = GradientBoostingRegressor(random_state=0)

# Candidate values to compare.
param_grid = {
    'learning_rate': [0.1, 0.5],
}

# Cross-validated search over the grid, run on the training split only.
gs = GridSearchCV(estimator=regressor, param_grid=param_grid)
# fit() returns the fitted search object, so `result` and `gs` are the same object.
result = gs.fit(x_train, y_train)

In [27]:
# Inspect the raw grid-search results: fit/score timings, the parameters tried,
# per-split and mean test scores, and the resulting ranking.
result.cv_results_

{'mean_fit_time': array([0.09466009, 0.09053721]),
 'std_fit_time': array([0.01457865, 0.01089184]),
 'mean_score_time': array([0.00040398, 0.0031249 ]),
 'std_score_time': array([0.00049479, 0.00624981]),
 'param_learning_rate': masked_array(data=[0.1, 0.5],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'learning_rate': 0.1}, {'learning_rate': 0.5}],
 'split0_test_score': array([0.97573054, 0.95178497]),
 'split1_test_score': array([0.98172205, 0.97237772]),
 'split2_test_score': array([0.94731258, 0.92716886]),
 'split3_test_score': array([0.97050021, 0.95224035]),
 'split4_test_score': array([0.97345863, 0.95753666]),
 'mean_test_score': array([0.9697448 , 0.95222171]),
 'std_test_score': array([0.01180552, 0.01457635]),
 'rank_test_score': array([1, 2])}