# CRS Math Profs Predictor

By:

Mariano, Isaiah

Montealto, Meluisa

Regalario, Jeremiah

# Import Dependencies

In [147]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
import datetime
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
import random
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load DataFrame

In [62]:
model_df = pd.read_csv("CRS Math Profs (2018-2024).csv")
model_df = model_df[['Number', 'Day', 'Room', 'Prof', 'Year', 'Semester', 'Start_time', 'End_time']]
model_df.head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time
0,2.0,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,08:30,10:00
1,2.0,TTh,TBA,"CEJO, ROBERT JAY",2018,1st sem,10:00,11:30
2,2.0,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,11:30,13:00
3,2.0,WF,TBA,"AGUILAR, ADJANI",2018,1st sem,11:30,13:00
4,10.0,TTh,TBA,"ARCEO, CARLENE PERPETUA",2018,1st sem,08:30,10:00


In [63]:
model_df.shape

(3114, 8)

# Preprocessing

## Midyear

In midyear classes, the days are not listed. We add "TWThF".

In [64]:
model_df.loc[model_df['Semester'] == "Midyear", 'Day'] = 'TWThF'
model_df[model_df['Semester'] == 'Midyear'].head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time
482,2.0,TWThF,MBAN 306,"CEJO, ROBERT JAY",2018,Midyear,07:00,09:00
483,10.0,TWThF,TBA,"ARCEO, CARLENE PERPETUA",2018,Midyear,09:00,11:00
484,11.0,TWThF,MB 121,"CEJO, ROBERT JAY",2018,Midyear,11:00,13:00
485,14.0,TWThF,MB 107,"RAMOS, AARON",2018,Midyear,09:00,11:00
486,20.0,TWThF,MB 126,"STA. ANA, ANGELICA",2018,Midyear,09:35,12:10


## Drop Rows with Missing Critical Values

In [65]:
model_df = model_df.dropna(subset=["Prof", "Day", "Start_time", "End_time"]).copy()
model_df.head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time
0,2.0,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,08:30,10:00
1,2.0,TTh,TBA,"CEJO, ROBERT JAY",2018,1st sem,10:00,11:30
2,2.0,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,11:30,13:00
3,2.0,WF,TBA,"AGUILAR, ADJANI",2018,1st sem,11:30,13:00
4,10.0,TTh,TBA,"ARCEO, CARLENE PERPETUA",2018,1st sem,08:30,10:00


In [66]:
model_df.shape

(2552, 8)

## Convert Time

In [67]:
# Convert times to minutes since midnight
def time_to_minutes(t):
    h, m = map(int, t.split(":"))
    return h * 60 + m

model_df["Start_minutes"] = model_df["Start_time"].apply(time_to_minutes)
model_df["End_minutes"] = model_df["End_time"].apply(time_to_minutes)
model_df.head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time,Start_minutes,End_minutes
0,2.0,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,08:30,10:00,510,600
1,2.0,TTh,TBA,"CEJO, ROBERT JAY",2018,1st sem,10:00,11:30,600,690
2,2.0,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,11:30,13:00,690,780
3,2.0,WF,TBA,"AGUILAR, ADJANI",2018,1st sem,11:30,13:00,690,780
4,10.0,TTh,TBA,"ARCEO, CARLENE PERPETUA",2018,1st sem,08:30,10:00,510,600


## Remove Profs with One Record

In [68]:
prof_counts = model_df["Prof"].value_counts()
valid_profs = prof_counts[prof_counts > 1].index
model_df = model_df[model_df["Prof"].isin(valid_profs)].copy()

## Encode Categorical Variables

In [69]:
# Encode categorical variables
label_encoders = {}
categorical_cols = ["Day", "Room", "Semester", "Prof"]

for col in categorical_cols:
    le = LabelEncoder()
    model_df[col] = le.fit_transform(model_df[col].astype(str))
    label_encoders[col] = le
    
model_df.head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time,Start_minutes,End_minutes
0,2.0,2,43,338,2018,0,08:30,10:00,510,600
1,2.0,2,43,49,2018,0,10:00,11:30,600,690
2,2.0,2,43,338,2018,0,11:30,13:00,690,780
3,2.0,4,43,3,2018,0,11:30,13:00,690,780
4,10.0,2,43,9,2018,0,08:30,10:00,510,600


## Remap Prof to Consecutive IDs

In [70]:
prof_map = {prof: i for i, prof in enumerate(model_df["Prof"].unique())}
model_df["Prof"] = model_df["Prof"].map(prof_map)

## Train-Test Split

In [71]:
X = model_df[["Number", "Day", "Room", "Semester", "Start_minutes", "End_minutes"]]
y = model_df["Prof"]

# Ensure all classes are represented in training data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, stratify=y
)

# XGBoost

In [154]:
xgb_model = XGBClassifier(
    n_estimators = 100,
    max_depth = 3,
    learning_rate = 0.1,
    colsample_bytree = 0.95,
    reg_lambda = 1,
    objective="multi:softprob",
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

# Train model
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)]
)

[0]	validation_0-mlogloss:5.54930	validation_1-mlogloss:5.60474
[1]	validation_0-mlogloss:5.30743	validation_1-mlogloss:5.40942
[2]	validation_0-mlogloss:5.10013	validation_1-mlogloss:5.25041
[3]	validation_0-mlogloss:4.91876	validation_1-mlogloss:5.11405
[4]	validation_0-mlogloss:4.75493	validation_1-mlogloss:4.98931
[5]	validation_0-mlogloss:4.61083	validation_1-mlogloss:4.88472
[6]	validation_0-mlogloss:4.47776	validation_1-mlogloss:4.78885
[7]	validation_0-mlogloss:4.36045	validation_1-mlogloss:4.70796
[8]	validation_0-mlogloss:4.25007	validation_1-mlogloss:4.63808
[9]	validation_0-mlogloss:4.14939	validation_1-mlogloss:4.57174
[10]	validation_0-mlogloss:4.05591	validation_1-mlogloss:4.51165
[11]	validation_0-mlogloss:3.97006	validation_1-mlogloss:4.45949
[12]	validation_0-mlogloss:3.88790	validation_1-mlogloss:4.40869
[13]	validation_0-mlogloss:3.81053	validation_1-mlogloss:4.36186
[14]	validation_0-mlogloss:3.73708	validation_1-mlogloss:4.31530
[15]	validation_0-mlogloss:3.66754	

In [155]:
# Make predictions
probs = xgb_model.predict_proba(X_test) 
y_pred = np.argmax(probs, axis=1)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 0.2585


## Hyperparameter Tuning

In [156]:
'''
# Parameter search space
param_dist = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 1, 5],
    "min_child_weight": [1, 3, 5]
}

# Base model: multi:softprob gives probability distribution
xgb_clf = XGBClassifier(
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
    tree_method="hist",
    random_state=42
)

# Stratified CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized search (optimize for accuracy)
random_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=param_dist,
    n_iter=30,
    scoring="accuracy",
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit search
random_search.fit(X, y)

print("Best parameters:", random_search.best_params_)
print("Best CV accuracy:", random_search.best_score_)
'''

Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters: {'subsample': 1.0, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 1, 'colsample_bytree': 0.6}
Best CV accuracy: 0.21840476447515203


- Best parameters: {'subsample': 1.0, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 1, 'colsample_bytree': 0.6}- 
Best CV accuracy: 0.21840476447515203

## Apply Best Parameters

In [157]:
best_xgb_model = XGBClassifier(
    subsample = 1.0,
    n_estimators= 100,
    min_child_weight = 1,
    max_depth = 7,
    learning_rate = 0.2,
    gamma = 1,
    colsample_bytree = 0.6,
    objective="multi:softprob",
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

# Train model
best_xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)]
)

[0]	validation_0-mlogloss:5.38327	validation_1-mlogloss:5.47336
[1]	validation_0-mlogloss:5.01223	validation_1-mlogloss:5.17340
[2]	validation_0-mlogloss:4.75998	validation_1-mlogloss:4.99298
[3]	validation_0-mlogloss:4.54435	validation_1-mlogloss:4.82824
[4]	validation_0-mlogloss:4.34892	validation_1-mlogloss:4.68098
[5]	validation_0-mlogloss:4.19234	validation_1-mlogloss:4.55828
[6]	validation_0-mlogloss:4.05764	validation_1-mlogloss:4.45399
[7]	validation_0-mlogloss:3.94010	validation_1-mlogloss:4.37334
[8]	validation_0-mlogloss:3.83805	validation_1-mlogloss:4.30054
[9]	validation_0-mlogloss:3.74568	validation_1-mlogloss:4.23743
[10]	validation_0-mlogloss:3.67195	validation_1-mlogloss:4.19722
[11]	validation_0-mlogloss:3.61740	validation_1-mlogloss:4.16066
[12]	validation_0-mlogloss:3.56684	validation_1-mlogloss:4.12693
[13]	validation_0-mlogloss:3.52010	validation_1-mlogloss:4.09517
[14]	validation_0-mlogloss:3.48308	validation_1-mlogloss:4.06801
[15]	validation_0-mlogloss:3.45056	

In [158]:
# Make predictions
probs = best_xgb_model.predict_proba(X_test) 
y_pred = np.argmax(probs, axis=1)

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 0.2331


# Test Predictor

In [182]:
def predictor(class_number, day, room, semester, start_minutes, end_minutes):

    # Convert
    day = label_encoders['Day'].classes_.tolist().index(day)
    room = label_encoders['Room'].classes_.tolist().index(room)
    semester = label_encoders['Semester'].classes_.tolist().index(semester)
    start_minutes = pd.to_datetime(start_minutes).hour * 60 + pd.to_datetime(start_minutes).minute
    end_minutes = pd.to_datetime(end_minutes).hour * 60 + pd.to_datetime(end_minutes).minute

    # Dataframe
    input_data = {
        "Number": class_number,
        "Day": day,
        "Room" : room,
        "Semester": semester,
        "Start_minutes": start_minutes,
        "End_minutes": end_minutes
    }
    
    input_df = pd.DataFrame([input_data])

    # Prediction
    probs = best_xgb_model.predict_proba(input_df) 
    y_pred = np.argmax(probs, axis=1)[0]

    return le.inverse_transform([y_pred])[0]

In [186]:
predictor(126, 'WF', 'MB 126', '2nd sem', '14:30', '16:00')

'CONSORTE, ODESSA; DACAYMAT, JOHN MEL'