In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import xgboost as xgb

In [2]:
!wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

--2025-09-17 20:18:11--  https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv [following]
--2025-09-17 20:18:11--  https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 391501 (382K) [text/plain]
Saving to: ‘jamb_exam_results.csv’


2025-09-17 20:18:11 (74.3 MB/s) - ‘jamb_exam_results.csv’ saved [391501/391501]



# Data preparation

* Remove the `student_id` column.
* Fill missing values with zeros.
* Do train/validation/test split with 60%/20%/20% distribution. 
* Use the `train_test_split` function and set the `random_state` parameter to 1.
* Use `DictVectorizer(sparse=True)` to turn the dataframes into matrices.

In [16]:
df=pd.read_csv("jamb_exam_results.csv")

df.head(5)

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [9]:
df.isnull().sum()

JAMB_Score                        0
Study_Hours_Per_Week              0
Attendance_Rate                   0
Teacher_Quality                   0
Distance_To_School                0
School_Type                       0
School_Location                   0
Extra_Tutorials                   0
Access_To_Learning_Materials      0
Parent_Involvement                0
IT_Knowledge                      0
Student_ID                        0
Age                               0
Gender                            0
Socioeconomic_Status              0
Parent_Education_Level          891
Assignments_Completed             0
dtype: int64

In [17]:
from sklearn.model_selection import train_test_split

df.columns = df.columns.str.lower().str.replace(' ', '_')

df.fillna(0, inplace=True)

del df['student_id']

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

y_train = df_train['jamb_score'].values
y_val = df_val['jamb_score'].values
y_test = df_test['jamb_score'].values

del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [18]:
df.isnull().sum()

jamb_score                      0
study_hours_per_week            0
attendance_rate                 0
teacher_quality                 0
distance_to_school              0
school_type                     0
school_location                 0
extra_tutorials                 0
access_to_learning_materials    0
parent_involvement              0
it_knowledge                    0
age                             0
gender                          0
socioeconomic_status            0
parent_education_level          0
assignments_completed           0
dtype: int64

In [19]:
columns = df.columns

categorical_columns = list(set(df.dtypes[(df.dtypes == 'object') & (df.columns != 'jamb_score')].index) & set(columns))
numerical_columns = list(set(df.dtypes[(df.dtypes != 'object') & (df.columns != 'jamb_score')].index) & set(columns))

In [20]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dict)

# Q1

Let's train a decision tree regressor to predict the `jamb_score` variable. 

* Train a model with `max_depth=1`.

In [21]:
from sklearn.tree import DecisionTreeRegressor, export_text

model = DecisionTreeRegressor(max_depth=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)


print(export_text(model, feature_names=list(dv.get_feature_names_out())))


|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



# Q2

Train a random forest regressor with these parameters:

* `n_estimators=10`
* `random_state=1`
* `n_jobs=-1` (optional - to make training faster)

In [23]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

np.sqrt(np.mean((y_val-y_pred)**2))

np.float64(42.13724207871227)

# Q3

Now let's experiment with the `n_estimators` parameter

* Try different values of this parameter from 10 to 200 with step 10.
* Set `random_state` to `1`.
* Evaluate the model on the validation dataset.

In [26]:
for n in range(10, 201, 10):
    model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)

    print(f"{n}: RMSE = {np.sqrt(np.mean((y_val-y_pred)**2)):0.3f}")

10: RMSE = 42.137
20: RMSE = 41.461
30: RMSE = 41.106
40: RMSE = 40.917
50: RMSE = 40.852
60: RMSE = 40.784
70: RMSE = 40.677
80: RMSE = 40.539
90: RMSE = 40.504
100: RMSE = 40.517
110: RMSE = 40.593
120: RMSE = 40.625
130: RMSE = 40.651
140: RMSE = 40.595
150: RMSE = 40.597
160: RMSE = 40.604
170: RMSE = 40.628
180: RMSE = 40.641
190: RMSE = 40.631
200: RMSE = 40.601


# Q4

Let's select the best `max_depth`:

* Try different values of `max_depth`: `[10, 15, 20, 25]`
* For each of these values,
  * try different values of `n_estimators` from 10 till 200 (with step 10)
  * calculate the mean RMSE 
* Fix the random seed: `random_state=1`

In [45]:
rmses = []

for d in 10, 15, 20, 25:
    rmse = []
    for n in range(10, 201, 10):
        model = RandomForestRegressor(n_estimators=n, random_state=1, max_depth=d, n_jobs=-1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)

        print(f"d={d}, n={n}: RMSE = {np.sqrt(np.mean((y_val-y_pred)**2)):0.3f}")
        rmse.append(np.sqrt(np.mean((y_val-y_pred)**2)))

    rmses.append(rmse)

d=10, n=10: RMSE = 41.258
d=10, n=20: RMSE = 40.881
d=10, n=30: RMSE = 40.625
d=10, n=40: RMSE = 40.270
d=10, n=50: RMSE = 40.317
d=10, n=60: RMSE = 40.277
d=10, n=70: RMSE = 40.285
d=10, n=80: RMSE = 40.210
d=10, n=90: RMSE = 40.174
d=10, n=100: RMSE = 40.250
d=10, n=110: RMSE = 40.286
d=10, n=120: RMSE = 40.315
d=10, n=130: RMSE = 40.329
d=10, n=140: RMSE = 40.300
d=10, n=150: RMSE = 40.314
d=10, n=160: RMSE = 40.354
d=10, n=170: RMSE = 40.360
d=10, n=180: RMSE = 40.364
d=10, n=190: RMSE = 40.354
d=10, n=200: RMSE = 40.325
d=15, n=10: RMSE = 42.004
d=15, n=20: RMSE = 41.456
d=15, n=30: RMSE = 41.168
d=15, n=40: RMSE = 40.931
d=15, n=50: RMSE = 40.783
d=15, n=60: RMSE = 40.724
d=15, n=70: RMSE = 40.689
d=15, n=80: RMSE = 40.534
d=15, n=90: RMSE = 40.497
d=15, n=100: RMSE = 40.505
d=15, n=110: RMSE = 40.531
d=15, n=120: RMSE = 40.580
d=15, n=130: RMSE = 40.558
d=15, n=140: RMSE = 40.520
d=15, n=150: RMSE = 40.529
d=15, n=160: RMSE = 40.523
d=15, n=170: RMSE = 40.534
d=15, n=180: RMSE =

In [48]:
np.asarray(rmses).shape

(4, 20)

In [47]:
np.mean(np.asarray(rmses),axis=1)

array([40.39249799, 40.73528172, 40.73973432, 40.78786566])

# Q5

We can extract feature importance information from tree-based models. 

At each step of the decision tree learning algorithm, it finds the best split. 
When doing it, we can calculate "gain" - the reduction in impurity before and after the split. 
This gain is quite useful in understanding what are the important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the
[`feature_importances_`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor.feature_importances_)
field. 

For this homework question, we'll find the most important feature:

* Train the model with these parameters:
  * `n_estimators=10`,
  * `max_depth=20`,
  * `random_state=1`,
  * `n_jobs=-1` (optional)
* Get the feature importance information from this model

In [41]:
model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1,  n_jobs=-1)
model.fit(X_train, y_train)

importances = model.feature_importances_
feature_names = dv.get_feature_names_out()

In [42]:
feature_importance_df = pd.DataFrame({'Feature': feature_names,'Importance': importances}).sort_values(by='Importance', ascending=False)

feature_importance_df.head(5)

Unnamed: 0,Feature,Importance
27,study_hours_per_week,0.248354
4,attendance_rate,0.149729
5,distance_to_school,0.136486
28,teacher_quality,0.082682
2,age,0.069311


# Q6

Now let's train an XGBoost model! For this question, we'll tune the `eta` parameter:

* Install XGBoost
* Create DMatrix for train and validation
* Create a watchlist
* Train a model with these parameters for 100 rounds:

```
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
```

Now change `eta` from `0.3` to `0.1`.

In [43]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dtrain, 'train'), (dval, 'val')]

model =  xgb.train(xgb_params, dtrain, num_boost_round=200, verbose_eval=5, evals=watchlist)
y_pred = model.predict(dval)

np.sqrt(np.mean((y_pred - y_val)**2))

[0]	train-rmse:42.69384	val-rmse:44.89114
[5]	train-rmse:34.57756	val-rmse:40.69096
[10]	train-rmse:31.63404	val-rmse:40.48319
[15]	train-rmse:29.41497	val-rmse:40.86107
[20]	train-rmse:27.49658	val-rmse:41.27921
[25]	train-rmse:26.34353	val-rmse:41.57975
[30]	train-rmse:24.21076	val-rmse:41.72928
[35]	train-rmse:22.46394	val-rmse:42.03417
[40]	train-rmse:21.35340	val-rmse:42.24363
[45]	train-rmse:20.24355	val-rmse:42.27966
[50]	train-rmse:19.25157	val-rmse:42.43824
[55]	train-rmse:18.28398	val-rmse:42.54750
[60]	train-rmse:17.12178	val-rmse:42.64446
[65]	train-rmse:16.41573	val-rmse:42.77416
[70]	train-rmse:15.78314	val-rmse:42.84909
[75]	train-rmse:14.80007	val-rmse:43.00760
[80]	train-rmse:13.96907	val-rmse:43.08250
[85]	train-rmse:13.39102	val-rmse:43.16297
[90]	train-rmse:12.46485	val-rmse:43.25161
[95]	train-rmse:11.95568	val-rmse:43.37919
[100]	train-rmse:11.27088	val-rmse:43.45019
[105]	train-rmse:10.64345	val-rmse:43.57431
[110]	train-rmse:10.23002	val-rmse:43.70054
[115]	trai

np.float64(44.289253983871625)

In [44]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dtrain, 'train'), (dval, 'val')]

model =  xgb.train(xgb_params, dtrain, num_boost_round=200, verbose_eval=5, evals=watchlist)
y_pred = model.predict(dval)

np.sqrt(np.mean((y_pred - y_val)**2))

[0]	train-rmse:45.49999	val-rmse:47.00533
[5]	train-rmse:40.17514	val-rmse:43.11181
[10]	train-rmse:37.07490	val-rmse:41.39235
[15]	train-rmse:35.08521	val-rmse:40.61341
[20]	train-rmse:33.67389	val-rmse:40.25010
[25]	train-rmse:32.55850	val-rmse:40.12003
[30]	train-rmse:31.76039	val-rmse:40.13806
[35]	train-rmse:31.01425	val-rmse:40.16103
[40]	train-rmse:30.13427	val-rmse:40.17753
[45]	train-rmse:29.49040	val-rmse:40.27366
[50]	train-rmse:28.75947	val-rmse:40.29573
[55]	train-rmse:28.17535	val-rmse:40.40072
[60]	train-rmse:27.77264	val-rmse:40.47477
[65]	train-rmse:27.10119	val-rmse:40.47659
[70]	train-rmse:26.61847	val-rmse:40.55225
[75]	train-rmse:26.21281	val-rmse:40.62564
[80]	train-rmse:25.69135	val-rmse:40.61309
[85]	train-rmse:25.14363	val-rmse:40.66530
[90]	train-rmse:24.60413	val-rmse:40.84708
[95]	train-rmse:24.03404	val-rmse:40.99952
[100]	train-rmse:23.47206	val-rmse:41.06487
[105]	train-rmse:22.90972	val-rmse:41.04114
[110]	train-rmse:22.51649	val-rmse:41.08693
[115]	trai

np.float64(41.77134546070907)