In [12]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline 
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error 
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the raw JAMB exam results from CSV and preview the first rows.
csv_path = 'jamb_exam_results.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   JAMB_Score                    5000 non-null   int64  
 1   Study_Hours_Per_Week          5000 non-null   int64  
 2   Attendance_Rate               5000 non-null   int64  
 3   Teacher_Quality               5000 non-null   int64  
 4   Distance_To_School            5000 non-null   float64
 5   School_Type                   5000 non-null   object 
 6   School_Location               5000 non-null   object 
 7   Extra_Tutorials               5000 non-null   object 
 8   Access_To_Learning_Materials  5000 non-null   object 
 9   Parent_Involvement            5000 non-null   object 
 10  IT_Knowledge                  5000 non-null   object 
 11  Student_ID                    5000 non-null   int64  
 12  Age                           5000 non-null   int64  
 13  Gen

In [4]:
# Normalise column labels: lower-case, with spaces replaced by underscores.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

In [5]:
# Inspect the column labels of the frame.
df.columns

Index(['jamb_score', 'study_hours_per_week', 'attendance_rate',
       'teacher_quality', 'distance_to_school', 'school_type',
       'school_location', 'extra_tutorials', 'access_to_learning_materials',
       'parent_involvement', 'it_knowledge', 'student_id', 'age', 'gender',
       'socioeconomic_status', 'parent_education_level',
       'assignments_completed'],
      dtype='object')

## Preparation:
- Remove the student_id column.
- Fill missing values with zeros.
- Do train/validation/test split with 60%/20%/20% distribution.
- Use the train_test_split function and set the random_state parameter to 1.
- Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [8]:
# Prepare the data: drop the identifier column, split 60/20/20 into
# train/validation/test, separate the target and vectorize the training features.

# errors='ignore' keeps this cell re-runnable: on a second run `student_id`
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns='student_id', errors='ignore')

# 20% is held out for test; 25% of the remaining 80% gives the 20% validation share.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Every row must land in exactly one of the three splits.
assert len(df_train) + len(df_val) + len(df_test) == len(df)

# pop() returns the target column and removes it from the feature frame in one step.
y_train = df_train.pop('jamb_score')
y_val = df_val.pop('jamb_score')
y_test = df_test.pop('jamb_score')

# Missing values are filled with zeros before turning rows into feature dicts.
train_dicts = df_train.fillna(0).to_dict(orient='records')
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

## Question 1. Most important feature

In [9]:
# Question 1: fit a depth-1 regression tree (a single split) on the training matrix.
dtr_q1 = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)
dtr_q1

In [10]:
# Print the fitted tree's rules, labelled with the vectorizer's feature names.
feature_names = list(dv.get_feature_names_out())
tree_rules = export_text(dtr_q1, feature_names=feature_names)
print(tree_rules)

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



## Question 2. Random forest RMSE

In [17]:
# Vectorize the validation split with the already-fitted vectorizer `dv`.
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Fit a 10-tree random forest on the training data.
rfr_q2 = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rfr_q2.fit(X_train, y_train)

# Score the forest on the validation split.
y_pred_q2 = rfr_q2.predict(X_val)
rmse_q2 = root_mean_squared_error(y_val, y_pred_q2)
print('Random forest RMSE : ', rmse_q2)

Random forest RMSE :  42.13724207871227


## Question 3. Number of estimators

In [32]:
# Question 3: validation RMSE as the number of trees grows from 10 to 200 (step 10).
s_q3 = np.arange(10, 210, 10)
rmse_q3 = []

for n_trees in s_q3:
    # n_jobs=-1 matches the Question 2 forest and builds the trees in parallel;
    # random_state=1 keeps the fitted trees reproducible.
    rfr_q3 = RandomForestRegressor(n_estimators=n_trees, random_state=1, n_jobs=-1)
    rfr_q3.fit(X_train, y_train)
    y_pred_q3 = rfr_q3.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred_q3)
    rmse_q3.append(rmse)

# The RMSE values become the index and the n_estimators values the single
# column (labelled 0); the following cell renames both into proper columns.
df_q3 = pd.DataFrame(data=s_q3, index=rmse_q3)
df_q3

Unnamed: 0,0
42.137242,10
41.461215,20
41.106171,30
40.917194,40
40.852279,50
40.784281,60
40.677098,70
40.539333,80
40.504346,90
40.516805,100


In [33]:
# Tidy the results: name the RMSE index, label the tree-count column,
# then move the index into a regular column.
df_q3.rename_axis('rmse').rename(columns={0: 'n_estimators'}).reset_index()

Unnamed: 0,rmse,n_estimators
0,42.137242,10
1,41.461215,20
2,41.106171,30
3,40.917194,40
4,40.852279,50
5,40.784281,60
6,40.677098,70
7,40.539333,80
8,40.504346,90
9,40.516805,100


## Question 4. Best max_depth

## Question 5. Most important feature

## Question 6. Eta for XGBoost 

## Train, test and validation dataset

In [None]:
# Display the DataFrame `df` as this cell's output.
df