In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
data = pd.read_csv('jamb_exam_results.csv')

In [3]:
data.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [4]:
data.columns = data.columns.str.lower().str.replace(' ', '_')

In [5]:
del data['student_id']

In [6]:
data.fillna(0, inplace=True)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   jamb_score                    5000 non-null   int64  
 1   study_hours_per_week          5000 non-null   int64  
 2   attendance_rate               5000 non-null   int64  
 3   teacher_quality               5000 non-null   int64  
 4   distance_to_school            5000 non-null   float64
 5   school_type                   5000 non-null   object 
 6   school_location               5000 non-null   object 
 7   extra_tutorials               5000 non-null   object 
 8   access_to_learning_materials  5000 non-null   object 
 9   parent_involvement            5000 non-null   object 
 10  it_knowledge                  5000 non-null   object 
 11  age                           5000 non-null   int64  
 12  gender                        5000 non-null   object 
 13  soc

In [8]:
data['jamb_score'].describe()

count    5000.000000
mean      174.074600
std        47.616477
min       100.000000
25%       135.000000
50%       170.000000
75%       209.000000
max       367.000000
Name: jamb_score, dtype: float64

In [9]:
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['jamb_score']
y_val = df_val['jamb_score']
y_test = df_test['jamb_score']

del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [10]:
len(df_train), len(df_val), len(df_test)

(3000, 1000, 1000)

In [11]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(df_train.to_dict(orient='records'))
X_val = dv.transform(df_val.to_dict(orient='records'))
X_test = dv.transform(df_test.to_dict(orient='records'))

# Q 1

In [12]:
def answer_to_question_1(max_depth):
    dt = DecisionTreeClassifier(max_depth=max_depth)
    dt.fit(X_train, y_train)
    print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

In [13]:
max_depth = 1

In [14]:
answer_to_question_1(max_depth)

|--- study_hours_per_week <= 18.50
|   |--- class: 118
|--- study_hours_per_week >  18.50
|   |--- class: 190



# Q 2

In [15]:
def answer_to_question_2(n_estimators, random_state, n_jobs):
    random_forest = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state, n_jobs=n_jobs)
    random_forest.fit(X_train, y_train)
    y_val_predict = random_forest.predict(X_val)
    return round(root_mean_squared_error(y_val, y_val_predict), 3)

In [16]:
n_estimators = 10
random_state = 1
n_jobs = -1

In [17]:
answer_to_question_2(n_estimators, random_state, n_jobs)

42.137

# Q 3

In [18]:
def answer_to_question_3(range_of_estimators, random_state):
    for n in range_of_estimators:
        model = RandomForestRegressor(n_estimators=n, random_state=random_state)
        model.fit(X_train, y_train)
        score = round(root_mean_squared_error(model.predict(X_val), y_val), 3)
        print(f'n_estimators={n}, score={score}')

In [19]:
random_state = 1
range_of_estimators = range(10, 201, 10)

In [20]:
answer_to_question_3(range_of_estimators, random_state)

n_estimators=10, score=42.137
n_estimators=20, score=41.461
n_estimators=30, score=41.106
n_estimators=40, score=40.917
n_estimators=50, score=40.852
n_estimators=60, score=40.784
n_estimators=70, score=40.677
n_estimators=80, score=40.539
n_estimators=90, score=40.504
n_estimators=100, score=40.517
n_estimators=110, score=40.593
n_estimators=120, score=40.625
n_estimators=130, score=40.651
n_estimators=140, score=40.595
n_estimators=150, score=40.597
n_estimators=160, score=40.604
n_estimators=170, score=40.628
n_estimators=180, score=40.641
n_estimators=190, score=40.631
n_estimators=200, score=40.601


# Q 4

In [21]:
def answer_to_question_4(range_of_estimators, range_of_depth, random_state):
    answer = []
    for d in range_of_depth:
        scores = []
        for n in range_of_estimators:
            model = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=random_state)
            model.fit(X_train, y_train)
            score = root_mean_squared_error(model.predict(X_val), y_val)
            scores.append(score)
        answer.append((d, np.mean(scores)))
    answer.sort(key=lambda x: x[1], reverse=False)
    return answer

In [22]:
range_of_estimators = range(10, 201, 10)
range_of_depth = [10, 15, 20, 25]
random_state = 1

In [23]:
answer_to_question_4(range_of_estimators, range_of_depth, random_state)

[(10, 40.39249798892396),
 (15, 40.73528172486332),
 (20, 40.739734321829275),
 (25, 40.78786565962805)]

# Q 5

In [24]:
def answer_to_question_5(params, working_features):
    features = list(df_train.columns)
    feature_names = []
    for feature in features:
        if df_train[feature].dtype == 'object':
            feature_names.extend([feature] * df_train[feature].nunique())
        else:
            feature_names.append(feature)
    assert len(feature_names) == X_train.shape[1]

    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    importances = dict(zip(feature_names, model.feature_importances_))
    
    working_importances = []
    for feature in working_features:
        working_importances.append((feature, importances[feature]))
    working_importances.sort(key=lambda x: x[1], reverse=True)
    return working_importances
    

In [25]:
params = {"n_estimators": 10, "max_depth": 20,"random_state": 1, "n_jobs": -1}
working_features = ['study_hours_per_week', 'attendance_rate', 'distance_to_school','teacher_quality']

In [26]:
answer_to_question_5(params, working_features)

[('teacher_quality', 0.06931145469695396),
 ('distance_to_school', 0.031516789454600694),
 ('study_hours_per_week', 0.012325395818561702),
 ('attendance_rate', 0.010261891957053595)]

# Q 6

In [77]:
def answer_to_question_6(dv, y_val, xgb_params, num_boost_round, eta_range):
    answer = []
    
    features = list(dv.get_feature_names_out())
    dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
    dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

    watchlist = [(dtrain, 'train'), (dval, 'val')]
    for eta in eta_range:
        xgb_params['eta'] = eta

        model = xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=num_boost_round)
        answer.append((eta, root_mean_squared_error(model.predict(dval), y_val)))
    
    answer.sort(key=lambda x: x[1], reverse=False)
    return answer

In [71]:
eta_range = [0.1, 0.3]
num_boost_round = 100
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

In [78]:
answer = answer_to_question_6(dv, y_val, xgb_params, num_boost_round, eta_range)

[0]	train-rmse:45.50072	val-rmse:46.99373
[1]	train-rmse:44.12583	val-rmse:45.89771
[2]	train-rmse:42.92815	val-rmse:45.01746
[3]	train-rmse:41.90445	val-rmse:44.27569
[4]	train-rmse:40.94238	val-rmse:43.62411
[5]	train-rmse:40.15460	val-rmse:43.05644
[6]	train-rmse:39.43451	val-rmse:42.64605
[7]	train-rmse:38.77902	val-rmse:42.28889
[8]	train-rmse:38.17103	val-rmse:42.00035
[9]	train-rmse:37.60436	val-rmse:41.73633
[10]	train-rmse:37.11353	val-rmse:41.55631
[11]	train-rmse:36.65501	val-rmse:41.33670
[12]	train-rmse:36.26337	val-rmse:41.18307
[13]	train-rmse:35.84634	val-rmse:41.03057
[14]	train-rmse:35.45796	val-rmse:40.89024
[15]	train-rmse:35.07766	val-rmse:40.70892
[16]	train-rmse:34.73181	val-rmse:40.60601
[17]	train-rmse:34.40922	val-rmse:40.47982
[18]	train-rmse:34.13466	val-rmse:40.43780
[19]	train-rmse:33.88018	val-rmse:40.38390
[20]	train-rmse:33.57997	val-rmse:40.37859
[21]	train-rmse:33.33993	val-rmse:40.33974
[22]	train-rmse:33.08144	val-rmse:40.31700
[23]	train-rmse:32.90

In [79]:
answer

[(0.1, 41.043354592351655), (0.3, 44.432098039002526)]