In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer  # Updated import
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import numpy as np

# Download the JAMB exam results CSV and parse it into a DataFrame
url = 'https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Bare trailing expression: lets the notebook render the frame as the cell output
df

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,183,20,74,2,10.6,Public,Urban,Yes,No,Low,Low,4996,16,Male,Medium,Primary,2
4996,179,0,80,2,20.0,Public,Rural,No,Yes,Medium,Medium,4997,22,Male,Low,Secondary,1
4997,261,17,89,3,11.3,Public,Urban,No,No,Low,High,4998,18,Male,Medium,Primary,3
4998,183,15,96,2,15.9,Public,Rural,No,No,Low,Medium,4999,18,Male,Medium,Secondary,1


In [4]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.2-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m754.5 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:05[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m979.8 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:06[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m


In [6]:

# Prepare the dataset
# Normalise column names: lower-case, spaces replaced by underscores
df.columns = df.columns.str.lower().str.replace(' ', '_')
# Remove the student_id column. errors='ignore' keeps this cell re-runnable:
# `df` is overwritten in place, so on a second execution the column is already
# gone and a plain drop would raise KeyError.
df = df.drop(columns=['student_id'], errors='ignore')
# Fill missing values with zeros (applies to every column, text columns included)
df = df.fillna(0)

# Train/Validation/Test split: 60% train, then the remaining 40% is halved
# into 20% validation and 20% test. Fixed seeds make the split reproducible.
train, temp = train_test_split(df, test_size=0.4, random_state=1)
val, test = train_test_split(temp, test_size=0.5, random_state=1)

# Use DictVectorizer to convert DataFrames to matrices.
# The vectorizer is fitted on the training rows only and then reused unchanged
# for validation; the target column is removed from the features first.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train.drop(columns=['jamb_score']).to_dict(orient='records'))
y_train = train['jamb_score'].values
X_val = dv.transform(val.drop(columns=['jamb_score']).to_dict(orient='records'))
y_val = val['jamb_score'].values


In [7]:
# Fit a depth-1 regression tree (a single decision stump) on the training data
tree = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

# A depth-1 tree makes at most one split, so the feature carrying the largest
# importance is the one the stump splits on
feature_importances = tree.feature_importances_
most_important_feature_index = feature_importances.argmax()
most_important_feature = dv.get_feature_names_out()[most_important_feature_index]

print(f"The feature used for splitting is: {most_important_feature}")

The feature used for splitting is: study_hours_per_week


In [15]:
# Baseline forest: 10 trees, fixed seed, all available cores
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Root-mean-squared error of the forest's predictions on the validation split
val_predictions = rf.predict(X_val)
rmse_val = np.sqrt(((val_predictions - y_val) ** 2).mean())

print(f"RMSE on validation dataset: {rmse_val}")


RMSE on validation dataset: 43.157758977963624


In [9]:
# Sweep the forest size and record the validation RMSE for each setting
n_estimators_grid = list(range(10, 201, 10))
rmse_results = []
for n in n_estimators_grid:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    val_predictions = rf.predict(X_val)
    rmse_val = np.sqrt(np.mean((val_predictions - y_val) ** 2))
    rmse_results.append(rmse_val)

# Determine after which value RMSE stops improving.
# The previous np.argmax(np.diff(...)) located the step with the largest RMSE
# *increase*, which is unrelated to where the curve flattens out. Instead, take
# the smallest forest that already reaches the best RMSE on the grid: no larger
# setting improves on it. Scores are compared at 3 decimal places so that
# negligible fluctuations are not counted as improvements; np.argmin returns
# the first occurrence of the minimum, i.e. the smallest such n_estimators.
rounded_rmse = np.round(rmse_results, 3)
optimal_n_estimators = n_estimators_grid[int(np.argmin(rounded_rmse))]
print(f"Optimal n_estimators where RMSE stops improving: {optimal_n_estimators}")


Optimal n_estimators where RMSE stops improving: 180


In [10]:
# Grid-search tree depth and forest size, remembering the depth of the single
# forest with the lowest validation RMSE seen so far
best_rmse = np.inf
best_max_depth = None

for max_depth in (10, 15, 20, 25):
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=max_depth,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)
        val_predictions = rf.predict(X_val)
        rmse_val = np.sqrt(((val_predictions - y_val) ** 2).mean())

        # Only a strictly lower RMSE replaces the current best
        if rmse_val < best_rmse:
            best_rmse, best_max_depth = rmse_val, max_depth

print(f"Best max_depth: {best_max_depth}")


Best max_depth: 10


In [11]:
# Fit a 10-tree forest capped at depth 20 purely to inspect its feature importances
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)

# Look up the name of the feature with the largest importance score; the
# DictVectorizer's feature names are indexed in the same order as the matrix columns
importances = rf.feature_importances_
most_important_feature_index = importances.argmax()
most_important_feature = dv.get_feature_names_out()[most_important_feature_index]

print(f"The most important feature is: {most_important_feature}")


The most important feature is: study_hours_per_week


In [12]:
# Wrap the train/validation matrices (with their targets) for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Datasets whose metric is reported after every boosting round
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Shared booster settings; only the learning rate (eta) varies between the runs
params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# --- eta = 0.3: 100 boosting rounds, then RMSE on the validation split ---
model_03 = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)
val_predictions_03 = model_03.predict(dval)
rmse_val_03 = np.sqrt(((val_predictions_03 - y_val) ** 2).mean())

# --- eta = 0.1: same settings, the params dict is updated in place ---
params.update({'eta': 0.1})
model_01 = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)
val_predictions_01 = model_01.predict(dval)
rmse_val_01 = np.sqrt(((val_predictions_01 - y_val) ** 2).mean())

print(f"RMSE for eta=0.3: {rmse_val_03}")
print(f"RMSE for eta=0.1: {rmse_val_01}")

# Report whichever learning rate gave the lower validation RMSE
if rmse_val_03 < rmse_val_01:
    print("Best eta is 0.3")
elif rmse_val_01 < rmse_val_03:
    print("Best eta is 0.1")
else:
    print("Both give equal value")


[0]	train-rmse:42.84835	val-rmse:44.52338
[1]	train-rmse:39.96423	val-rmse:42.83406
[2]	train-rmse:37.91231	val-rmse:41.62607
[3]	train-rmse:36.51126	val-rmse:41.25491
[4]	train-rmse:35.52212	val-rmse:40.84075
[5]	train-rmse:34.77126	val-rmse:40.71677
[6]	train-rmse:34.03898	val-rmse:40.72669
[7]	train-rmse:33.62820	val-rmse:40.68822
[8]	train-rmse:32.94729	val-rmse:40.81273
[9]	train-rmse:32.27703	val-rmse:40.84939
[10]	train-rmse:31.73818	val-rmse:40.83759
[11]	train-rmse:31.31360	val-rmse:40.80575
[12]	train-rmse:30.72949	val-rmse:40.84238
[13]	train-rmse:30.11486	val-rmse:40.96020
[14]	train-rmse:29.43538	val-rmse:40.98775
[15]	train-rmse:29.23018	val-rmse:41.04798
[16]	train-rmse:28.64113	val-rmse:41.08375
[17]	train-rmse:28.42128	val-rmse:41.15979
[18]	train-rmse:28.36245	val-rmse:41.18138
[19]	train-rmse:27.97787	val-rmse:41.23551
[20]	train-rmse:27.52551	val-rmse:41.28381
[21]	train-rmse:27.27774	val-rmse:41.36930
[22]	train-rmse:26.85970	val-rmse:41.42601
[23]	train-rmse:26.82