In [1]:
# Import libraries
import pandas as pd

In [2]:
# Load the JAMB exam results CSV (comma-separated) into a DataFrame
df = pd.read_csv('jamb_exam_results.csv', sep=',')

In [3]:
# Show the first five rows
df.head(n=5)

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


### Data cleaning and preparation

In [4]:
# Normalise the column names: lowercase, with spaces replaced by underscores
normalized_names = df.columns.str.lower().str.replace(' ', '_')
df.columns = normalized_names

In [5]:
# Show the first five rows again, now with the normalised column names
df.head(n=5)

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [6]:
# Remove the student_id column.
# drop(..., errors='ignore') keeps this cell idempotent: re-running it after the
# column is already gone is a no-op, whereas `del df['student_id']` would raise KeyError.
df = df.drop(columns=['student_id'], errors='ignore')

In [7]:
# Summary statistics, rounded to whole numbers.
# There is no numeric column with missing values to fill with zeros
# (see the `count` row of the result).
df.describe().round(0)

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,age,assignments_completed
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,174.0,20.0,84.0,3.0,10.0,18.0,2.0
std,48.0,10.0,9.0,1.0,5.0,2.0,1.0
min,100.0,0.0,50.0,1.0,0.0,15.0,1.0
25%,135.0,13.0,78.0,2.0,7.0,16.0,1.0
50%,170.0,19.0,84.0,2.0,10.0,18.0,1.0
75%,209.0,26.0,91.0,3.0,13.0,20.0,2.0
max,367.0,40.0,100.0,5.0,20.0,22.0,5.0


In [8]:
# Show the DataFrame summary: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   jamb_score                    5000 non-null   int64  
 1   study_hours_per_week          5000 non-null   int64  
 2   attendance_rate               5000 non-null   int64  
 3   teacher_quality               5000 non-null   int64  
 4   distance_to_school            5000 non-null   float64
 5   school_type                   5000 non-null   object 
 6   school_location               5000 non-null   object 
 7   extra_tutorials               5000 non-null   object 
 8   access_to_learning_materials  5000 non-null   object 
 9   parent_involvement            5000 non-null   object 
 10  it_knowledge                  5000 non-null   object 
 11  age                           5000 non-null   int64  
 12  gender                        5000 non-null   object 
 13  soc

In [9]:
# Count the NaN values in the 'parent_education_level' column
df['parent_education_level'].isnull().sum()

np.int64(891)

In [10]:
# Impute the missing 'parent_education_level' entries with the column's mode
# (its most frequent value).
# NOTE(review): the mode is computed on the full dataset, before the
# train/validation/test split — consider deriving it from the training part only.
education_level = df['parent_education_level']
most_frequent_value = education_level.mode().iloc[0]
print(f'most_frequent_value: {most_frequent_value}')

df['parent_education_level'] = education_level.fillna(most_frequent_value)
df['parent_education_level']

most_frequent_value: Secondary


0        Tertiary
1       Secondary
2        Tertiary
3        Tertiary
4        Tertiary
          ...    
4995      Primary
4996    Secondary
4997      Primary
4998    Secondary
4999    Secondary
Name: parent_education_level, Length: 5000, dtype: object

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# 60%/20%/20% train/validation/test split, both steps seeded with random_state=1:
#   1) hold out 20% of all rows as the test set;
#   2) take 25% of the remaining 80% (i.e. 20% of all rows) as the validation set.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.25,
    random_state=1,
)

In [13]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original one
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [16]:
# Extract the target (jamb_score) of each split as a NumPy array
y_train = df_train['jamb_score'].to_numpy()
y_val = df_val['jamb_score'].to_numpy()
y_test = df_test['jamb_score'].to_numpy()

In [17]:
# Drop the target column from the train/validation/test DataFrames so it cannot
# be used as a feature.
# drop(..., errors='ignore') keeps the cell idempotent: re-running it once the
# column is gone is a no-op, whereas `del frame['jamb_score']` would raise KeyError.
df_train = df_train.drop(columns=['jamb_score'], errors='ignore')
df_val = df_val.drop(columns=['jamb_score'], errors='ignore')
df_test = df_test.drop(columns=['jamb_score'], errors='ignore')

### Decision trees

In [18]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.tree import export_text

In [19]:
# Convert every training row into a {column: value} dict — the input fed to DictVectorizer
train_dicts = df_train.to_dict(orient='records')

In [20]:
# Learn the feature vocabulary from the training dicts, then encode them as a
# dense (non-sparse) feature matrix.
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [24]:
# Fit a depth-1 regression tree (a single split) on the training data
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X=X_train, y=y_train)

In [25]:
# Decision tree with max_depth=1: print its split rule as text,
# labelled with the DictVectorizer feature names
tree_feature_names = list(dv.get_feature_names_out())
print(export_text(dt, feature_names=tree_feature_names))

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



In [26]:
# Encode the validation rows with the vectorizer already fitted on the training
# data (transform only, no re-fit).
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [27]:
# Validation RMSE of the depth-1 decision tree
y_pred = dt.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 45.80335905740839


### Ensembles and random forest

In [28]:
from sklearn.ensemble import RandomForestRegressor

In [29]:
# Baseline random forest:
#   n_estimators=10
#   random_state=1
#   n_jobs=-1 (optional - only makes training faster)
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Validation RMSE of the baseline forest, reported to 3 decimals
y_pred = rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print("RMSE:", round(rmse, 3))

RMSE: 42.253


### Parameter tuning

In [30]:
# Try different values of n_estimators from 10 to 200 with step 10.
# Set random_state to 1.
# Evaluate each model on the validation dataset and collect (n_estimators, rmse) pairs.
#
# n_jobs=-1 trains the trees of each forest in parallel, consistent with the
# baseline forest above. With a fixed random_state the same trees are built, so
# the scores are unchanged (up to floating-point summation order) while the
# 20 fits finish much faster.

scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    scores.append((n, rmse))

In [31]:
# Tabulate the validation RMSE obtained for each n_estimators value
score_columns = ['n_estimators', 'rmse']
df_scores = pd.DataFrame(data=scores, columns=score_columns)
df_scores

Unnamed: 0,n_estimators,rmse
0,10,42.253361
1,20,41.505156
2,30,41.107667
3,40,40.832698
4,50,40.729474
5,60,40.702008
6,70,40.661194
7,80,40.554018
8,90,40.542533
9,100,40.570061


In [32]:
# Select the best max_depth:
# Try different values of max_depth: [10, 15, 20, 25]
# For each of these values, try different values of n_estimators from 10 till 200 (with step 10)
# Calculate the mean RMSE. Fix the random seed: random_state=1
#
# n_jobs=-1 parallelises each of the 80 fits; with a fixed random_state the same
# trees are built, so the reported means are unchanged (up to floating-point
# summation order).

mean_rmse_by_depth = {}

for d in [10, 15, 20, 25]:
    rmse_list = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n,
                                   max_depth=d,
                                   random_state=1,
                                   n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        rmse_list.append(rmse)

    # Calculate mean RMSE for the current max_depth and remember it
    mean_rmse = np.mean(rmse_list)
    mean_rmse_by_depth[d] = mean_rmse
    print(f"Mean RMSE for max_depth {d}: {mean_rmse}")

# Actually select the best depth: the one with the lowest mean validation RMSE
best_max_depth = min(mean_rmse_by_depth, key=mean_rmse_by_depth.get)
print(f"Best max_depth: {best_max_depth}")


Mean RMSE for max_depth 10: 40.45470810847495
Mean RMSE for max_depth 15: 40.74116012810272
Mean RMSE for max_depth 20: 40.83127965552485
Mean RMSE for max_depth 25: 40.77762977339274


### Feature importance

In [33]:
# Train the model with these parameters:
# n_estimators=10,
# max_depth=20,
# random_state=1,
# n_jobs=-1 (optional)
# Get the feature importance information from this model
#
# n_jobs is -1 as documented above (the cell previously passed n_jobs=1, which
# contradicted the stated parameters). It only affects training speed: the
# fitted trees, and therefore the importances, are fixed by random_state.

rf = RandomForestRegressor(n_estimators=10,
                           max_depth=20,
                           random_state=1,
                           n_jobs=-1)
rf.fit(X_train, y_train)

In [34]:
# Feature names come from the DictVectorizer; importances come from the fitted forest
feature_names = list(dv.get_feature_names_out())
feature_importances = rf.feature_importances_

# Pair each feature with its importance and rank from most to least important
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Display the ranked feature importances
print(importance_df)

                             Feature  Importance
26              study_hours_per_week    0.246282
4                    attendance_rate    0.150772
5                 distance_to_school    0.138008
27                   teacher_quality    0.083338
2                                age    0.069844
3              assignments_completed    0.031370
23         socioeconomic_status=High    0.026226
16           parent_involvement=High    0.023173
10                 it_knowledge=High    0.018012
13    parent_education_level=Primary    0.014293
15   parent_education_level=Tertiary    0.014208
14  parent_education_level=Secondary    0.014001
17            parent_involvement=Low    0.012685
11                  it_knowledge=Low    0.012609
6                 extra_tutorials=No    0.012254
18         parent_involvement=Medium    0.011838
1   access_to_learning_materials=Yes    0.011491
24          socioeconomic_status=Low    0.011153
8                      gender=Female    0.011132
19             schoo

### XGBoost

In [35]:
import xgboost as xgb

In [36]:
# Wrap the train/validation matrices and targets in XGBoost DMatrix objects,
# labelled with the DictVectorizer feature names
features = dv.get_feature_names_out().tolist()
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=X_val, label=y_val, feature_names=features)

In [37]:
# (DMatrix, name) pairs passed as `evals` to xgb.train; the names prefix the
# metrics in the evaluation log (train-rmse / val-rmse)
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [38]:
%%capture output
# The cell magic above must stay on the first line; it stores this cell's
# printed output in `output` instead of displaying it.

# XGBoost run with eta=0.3
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

# Train for 100 boosting rounds, evaluating on the watchlist and logging every 5th round
model = xgb.train(xgb_params, dtrain, num_boost_round=100,
                  verbose_eval=5,
                  evals=watchlist)

In [39]:
# Text that the %%capture cell collected from stdout (the XGBoost evaluation log)
s = output.stdout

In [40]:
# Print the captured evaluation log
print(s)

[0]	train-rmse:42.69552	val-rmse:44.86028
[5]	train-rmse:34.49649	val-rmse:40.84184
[10]	train-rmse:31.56153	val-rmse:40.80227
[15]	train-rmse:29.01959	val-rmse:41.36759
[20]	train-rmse:27.27990	val-rmse:41.78413
[25]	train-rmse:25.18211	val-rmse:41.99333
[30]	train-rmse:23.68205	val-rmse:42.21573
[35]	train-rmse:22.33470	val-rmse:42.52157
[40]	train-rmse:20.98280	val-rmse:43.02910
[45]	train-rmse:19.98490	val-rmse:43.19633
[50]	train-rmse:18.61074	val-rmse:43.38986
[55]	train-rmse:17.64030	val-rmse:43.42739
[60]	train-rmse:16.84922	val-rmse:43.48350
[65]	train-rmse:16.18551	val-rmse:43.53254
[70]	train-rmse:15.18989	val-rmse:43.71754
[75]	train-rmse:14.55442	val-rmse:43.86634
[80]	train-rmse:13.73842	val-rmse:43.95759
[85]	train-rmse:13.16943	val-rmse:43.94845
[90]	train-rmse:12.61807	val-rmse:44.02690
[95]	train-rmse:11.99401	val-rmse:44.02270
[99]	train-rmse:11.43062	val-rmse:44.07732



In [41]:
# Same XGBoost setup as the previous run, but with eta lowered to 0.1
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 100 boosting rounds, evaluating on the watchlist and logging every 5th round
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=5,
)

[0]	train-rmse:45.50072	val-rmse:46.99373
[5]	train-rmse:40.15644	val-rmse:43.06403
[10]	train-rmse:37.10597	val-rmse:41.56735
[15]	train-rmse:35.08682	val-rmse:40.73805
[20]	train-rmse:33.60731	val-rmse:40.24213
[25]	train-rmse:32.51863	val-rmse:40.21095
[30]	train-rmse:31.66233	val-rmse:40.17986
[35]	train-rmse:30.74873	val-rmse:40.15090
[40]	train-rmse:29.90334	val-rmse:40.31970
[45]	train-rmse:29.19963	val-rmse:40.35637
[50]	train-rmse:28.55035	val-rmse:40.41019
[55]	train-rmse:27.94288	val-rmse:40.44086
[60]	train-rmse:27.46119	val-rmse:40.51542
[65]	train-rmse:26.88442	val-rmse:40.62075
[70]	train-rmse:26.16724	val-rmse:40.73249
[75]	train-rmse:25.58513	val-rmse:40.86778
[80]	train-rmse:24.96647	val-rmse:40.90245
[85]	train-rmse:24.60413	val-rmse:40.96153
[90]	train-rmse:24.11593	val-rmse:41.09852
[95]	train-rmse:23.57277	val-rmse:41.14289
[99]	train-rmse:23.18554	val-rmse:41.18903
