In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


In [9]:
# Location of the JAMB exam results CSV, kept in one named place.
DATA_URL = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'

data = pd.read_csv(DATA_URL)
data.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [10]:
# Normalise the headers to snake_case: lower-case every name and replace
# spaces with underscores.
data.columns = [column.lower().replace(' ', '_') for column in data.columns]
data.columns

Index(['jamb_score', 'study_hours_per_week', 'attendance_rate',
       'teacher_quality', 'distance_to_school', 'school_type',
       'school_location', 'extra_tutorials', 'access_to_learning_materials',
       'parent_involvement', 'it_knowledge', 'student_id', 'age', 'gender',
       'socioeconomic_status', 'parent_education_level',
       'assignments_completed'],
      dtype='object')

### Data preparation

In [11]:
# Remove the student_id column. errors='ignore' keeps this cell re-runnable:
# `data` is reassigned here, so on a second execution the column is already
# gone and a plain drop() would raise KeyError.
data = data.drop(columns=['student_id'], errors='ignore')
# Fill the missing values with 0
data = data.fillna(0)

# Split the data set 60% / 20% / 20% (train / validation / test):
# 40% is held out first, then that hold-out is split in half.
df_train, df_temp = train_test_split(data, test_size=0.4, random_state=1)
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=1)

# Separate the target variable (jamb_score) from the feature matrices
y_train = df_train['jamb_score']
y_val = df_val['jamb_score']
y_test = df_test['jamb_score']

X_train = df_train.drop(columns=['jamb_score'])
X_val = df_val.drop(columns=['jamb_score'])
X_test = df_test.drop(columns=['jamb_score'])


# Convert dataframes to matrices using DictVectorizer.
# The vectorizer is fitted on the training records only and then reused, so
# validation and test rows are encoded with the same feature columns.
dv = DictVectorizer(sparse=True)

train_dict = X_train.to_dict(orient='records')
X_train_matrix = dv.fit_transform(train_dict)

val_dict = X_val.to_dict(orient='records')
X_val_matrix = dv.transform(val_dict)

test_dict = X_test.to_dict(orient='records')
X_test_matrix = dv.transform(test_dict)


Question 1

In [12]:
# Let's train a decision tree regressor to predict the jamb_score variable.
# max_depth=1 limits the tree to at most one split, so the root node alone
# determines the model. DecisionTreeRegressor is imported in the notebook's
# top import cell together with the other scikit-learn names.
dt_regressor = DecisionTreeRegressor(max_depth=1, random_state=1)
dt_regressor.fit(X_train_matrix, y_train)


In [13]:
# Look up which feature the depth-1 tree splits on at its root (node 0).
feature_names = dv.get_feature_names_out()
split_feature_index = dt_regressor.tree_.feature[0]

# scikit-learn stores a negative sentinel (TREE_UNDEFINED, -2) in
# tree_.feature for nodes that do not split. Without this guard a negative
# value would index feature_names from the end and silently report the wrong
# feature.
if split_feature_index < 0:
    raise ValueError("The decision tree has no split at the root node.")

split_feature = feature_names[split_feature_index]

print(f"The feature used for splitting is: {split_feature}")


The feature used for splitting is: study_hours_per_week


Question 2


In [15]:
# Baseline random forest with 10 trees and a fixed seed.
# fit() returns the fitted estimator, so construction and training are chained.
rf_regressor = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
).fit(X_train_matrix, y_train)

# Predictions on the validation split, scored in the next cell.
y_val_pred = rf_regressor.predict(X_val_matrix)


In [16]:
# RMSE is the square root of the mean squared error on the validation split.
val_mse = mean_squared_error(y_val, y_val_pred)
rmse = np.sqrt(val_mse)
print(f"RMSE on the validation data: {rmse:.2f}")

RMSE on the validation data: 43.16


Question 3

In [18]:
# Sweep the forest size from 10 to 200 trees (step 10) and record the
# validation RMSE obtained for each setting.
rmse_values = []

for n in range(10, 201, 10):
    rf_regressor = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1,
    ).fit(X_train_matrix, y_train)
    y_val_pred = rf_regressor.predict(X_val_matrix)

    # Store the RMSE next to the number of trees that produced it
    rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_values.append((n, rmse))

# One report line per setting, in the order the models were trained
print("\n".join(f"n_estimators: {n}, RMSE: {rmse:.3f}" for n, rmse in rmse_values))

n_estimators: 10, RMSE: 43.158
n_estimators: 20, RMSE: 41.790
n_estimators: 30, RMSE: 41.556
n_estimators: 40, RMSE: 41.076
n_estimators: 50, RMSE: 40.957
n_estimators: 60, RMSE: 40.774
n_estimators: 70, RMSE: 40.588
n_estimators: 80, RMSE: 40.503
n_estimators: 90, RMSE: 40.435
n_estimators: 100, RMSE: 40.365
n_estimators: 110, RMSE: 40.348
n_estimators: 120, RMSE: 40.302
n_estimators: 130, RMSE: 40.286
n_estimators: 140, RMSE: 40.263
n_estimators: 150, RMSE: 40.254
n_estimators: 160, RMSE: 40.200
n_estimators: 170, RMSE: 40.187
n_estimators: 180, RMSE: 40.136
n_estimators: 190, RMSE: 40.152
n_estimators: 200, RMSE: 40.138


Question 5

In [19]:
# Forest used for the feature-importance question: 10 trees, each limited to
# a depth of 20, with the same seed as the earlier forests.
rf_regressor = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_regressor.fit(X_train_matrix, y_train)

In [20]:
# Pair every DictVectorizer feature name with its importance in the fitted forest.
feature_names = dv.get_feature_names_out()
importances = rf_regressor.feature_importances_
feature_importance_dict = dict(zip(feature_names, importances))

# Rank the (name, importance) pairs from most to least important.
sorted_features = sorted(
    feature_importance_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)

# The first entry of the ranking is the top feature.
most_important_feature = sorted_features[0]
print(f"The most important feature is: {most_important_feature[0]} with importance {most_important_feature[1]:.3f}")


The most important feature is: study_hours_per_week with importance 0.254


Question 6


In [21]:
#Install XGBoost
%pip install xgboost


Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [22]:
import xgboost as xgb

# Wrap the vectorised feature matrices and their targets in XGBoost's
# DMatrix container, one for training and one for validation.
dtrain = xgb.DMatrix(data=X_train_matrix, label=y_train)
dval = xgb.DMatrix(data=X_val_matrix, label=y_val)


In [23]:
# Datasets to score during training, each paired with the label that prefixes
# its metric in the training log.
watchlist = [
    (dtrain, 'train'),
    (dval, 'eval'),
]

In [24]:
# Hyper-parameters passed to xgb.train for the XGBoost runs.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)


In [None]:
# Train the model: at most 100 boosting rounds, scoring the watchlist after
# each round and stopping early after 10 rounds without improvement.
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10,
)


[0]	train-rmse:42.84835	eval-rmse:44.52338
[1]	train-rmse:39.96423	eval-rmse:42.83406
[2]	train-rmse:37.91231	eval-rmse:41.62607
[3]	train-rmse:36.51126	eval-rmse:41.25491
[4]	train-rmse:35.52212	eval-rmse:40.84075
[5]	train-rmse:34.77126	eval-rmse:40.71677
[6]	train-rmse:34.03898	eval-rmse:40.72669
[7]	train-rmse:33.62820	eval-rmse:40.68822
[8]	train-rmse:32.94729	eval-rmse:40.81273
[9]	train-rmse:32.27703	eval-rmse:40.84939
[10]	train-rmse:31.73818	eval-rmse:40.83759
[11]	train-rmse:31.31360	eval-rmse:40.80575
[12]	train-rmse:30.72949	eval-rmse:40.84238
[13]	train-rmse:30.11486	eval-rmse:40.96020
[14]	train-rmse:29.43538	eval-rmse:40.98775
[15]	train-rmse:29.23018	eval-rmse:41.04798
[16]	train-rmse:28.64113	eval-rmse:41.08375
[17]	train-rmse:28.42128	eval-rmse:41.15979


In [26]:
# Train the model with eta=0.1.
# The parameters are derived into a separate dict instead of assigning
# xgb_params['eta'] in place: mutating the shared dict would make the earlier
# eta=0.3 training cell silently use eta=0.1 if it were re-run afterwards.
# NOTE: xgb_params itself is left unchanged by this cell.
xgb_params_eta_0_1 = {**xgb_params, 'eta': 0.1}

model_eta_0_1 = xgb.train(
    params=xgb_params_eta_0_1,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    early_stopping_rounds=10
)


[0]	train-rmse:45.64414	eval-rmse:46.63724
[1]	train-rmse:44.26862	eval-rmse:45.58724
[2]	train-rmse:43.08569	eval-rmse:44.76209
[3]	train-rmse:42.05227	eval-rmse:44.02498
[4]	train-rmse:41.10533	eval-rmse:43.40640
[5]	train-rmse:40.28309	eval-rmse:42.92195
[6]	train-rmse:39.54133	eval-rmse:42.49211
[7]	train-rmse:38.87686	eval-rmse:42.15780
[8]	train-rmse:38.27674	eval-rmse:41.84104
[9]	train-rmse:37.74058	eval-rmse:41.58026
[10]	train-rmse:37.26338	eval-rmse:41.35829
[11]	train-rmse:36.82810	eval-rmse:41.19143
[12]	train-rmse:36.41091	eval-rmse:41.02571
[13]	train-rmse:36.01019	eval-rmse:40.90308
[14]	train-rmse:35.67454	eval-rmse:40.79701
[15]	train-rmse:35.33492	eval-rmse:40.66274
[16]	train-rmse:35.01425	eval-rmse:40.60840
[17]	train-rmse:34.72687	eval-rmse:40.55942
[18]	train-rmse:34.40588	eval-rmse:40.46321
[19]	train-rmse:34.16207	eval-rmse:40.42760
[20]	train-rmse:33.94837	eval-rmse:40.40272
[21]	train-rmse:33.67900	eval-rmse:40.33790
[22]	train-rmse:33.44365	eval-rmse:40.2589