# CRS Math Profs Predictor

By:

Mariano, Isaiah

Montealto, Meluisa

Regalario, Jeremiah

# Import Dependencies

In [181]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
import re
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
import random
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime
import pickle

# Lift pandas' display limits so rendered frames are not truncated:
# None means "no cap" for both the row and the column count.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load DataFrame

In [154]:
# Read the raw class listing; "Number" is kept as text so that course numbers
# are not coerced to a numeric dtype while parsing.
raw_columns = ['Number', 'Day', 'Room', 'Prof', 'Year', 'Semester', 'Start_time', 'End_time']
model_df = pd.read_csv(
    "CRS Math Profs (2018-2024).csv",
    dtype={"Number": object},
).loc[:, raw_columns]
model_df.head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time
0,2,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,08:30,10:00
1,2,TTh,TBA,"CEJO, ROBERT JAY",2018,1st sem,10:00,11:30
2,2,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,11:30,13:00
3,2,WF,TBA,"AGUILAR, ADJANI",2018,1st sem,11:30,13:00
4,10,TTh,TBA,"ARCEO, CARLENE PERPETUA",2018,1st sem,08:30,10:00


In [155]:
# (rows, columns) of the frame right after loading, before any cleaning.
model_df.shape

(3114, 8)

In [156]:
# Per-column dtype and non-null count, used to spot columns with missing values.
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3114 entries, 0 to 3113
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Number      3114 non-null   object
 1   Day         2435 non-null   object
 2   Room        3114 non-null   object
 3   Prof        3112 non-null   object
 4   Year        3114 non-null   int64 
 5   Semester    3114 non-null   object
 6   Start_time  2560 non-null   object
 7   End_time    2560 non-null   object
dtypes: int64(1), object(7)
memory usage: 194.8+ KB


# Preprocessing

## Midyear

Midyear classes have no day listed in the data, so we assign them the day pattern "TWThF".

In [157]:
# Every midyear row gets the fixed day pattern 'TWThF'.
is_midyear = model_df['Semester'] == "Midyear"
model_df.loc[is_midyear, 'Day'] = 'TWThF'
model_df[model_df['Semester'] == 'Midyear'].head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time
482,2,TWThF,MBAN 306,"CEJO, ROBERT JAY",2018,Midyear,07:00,09:00
483,10,TWThF,TBA,"ARCEO, CARLENE PERPETUA",2018,Midyear,09:00,11:00
484,11,TWThF,MB 121,"CEJO, ROBERT JAY",2018,Midyear,11:00,13:00
485,14,TWThF,MB 107,"RAMOS, AARON",2018,Midyear,09:00,11:00
486,20,TWThF,MB 126,"STA. ANA, ANGELICA",2018,Midyear,09:35,12:10


## Drop Rows with Missing Critical Values

In [158]:
# Keep only rows where the target and all schedule fields are present.
required_cols = ["Prof", "Day", "Start_time", "End_time"]
has_all_required = model_df[required_cols].notna().all(axis=1)
model_df = model_df[has_all_required].copy()
model_df.head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time
0,2,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,08:30,10:00
1,2,TTh,TBA,"CEJO, ROBERT JAY",2018,1st sem,10:00,11:30
2,2,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,11:30,13:00
3,2,WF,TBA,"AGUILAR, ADJANI",2018,1st sem,11:30,13:00
4,10,TTh,TBA,"ARCEO, CARLENE PERPETUA",2018,1st sem,08:30,10:00


In [159]:
# (rows, columns) after dropping rows with missing Prof/Day/Start_time/End_time.
model_df.shape

(2552, 8)

## Convert Time

In [160]:
# Convert times to minutes since midnight
def time_to_minutes(t):
    """Return the number of minutes since midnight for an "HH:MM" string.

    Parameters
    ----------
    t : str
        Clock time with exactly one ":" separating integer hours and minutes.

    Returns
    -------
    int
        ``hours * 60 + minutes``.

    Raises
    ------
    ValueError
        If ``t`` does not split into exactly two integer parts.
    """
    hour_text, minute_text = t.split(":")
    return int(hour_text) * 60 + int(minute_text)

# Derive a numeric minutes-since-midnight column from each "HH:MM" text column.
for source_col, target_col in (("Start_time", "Start_minutes"), ("End_time", "End_minutes")):
    model_df[target_col] = model_df[source_col].map(time_to_minutes)
model_df.head()

Unnamed: 0,Number,Day,Room,Prof,Year,Semester,Start_time,End_time,Start_minutes,End_minutes
0,2,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,08:30,10:00,510,600
1,2,TTh,TBA,"CEJO, ROBERT JAY",2018,1st sem,10:00,11:30,600,690
2,2,TTh,TBA,"WALO, MA. LAILANI",2018,1st sem,11:30,13:00,690,780
3,2,WF,TBA,"AGUILAR, ADJANI",2018,1st sem,11:30,13:00,690,780
4,10,TTh,TBA,"ARCEO, CARLENE PERPETUA",2018,1st sem,08:30,10:00,510,600


## Encode Categorical Variables

In [161]:
# Integer-encode "Semester" and "Prof", keeping each fitted encoder in
# `label_encoders` so the integer codes can be mapped back to labels later.
categorical_cols = ["Semester", "Prof"]
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    text_values = model_df[col].astype(str)
    le.fit(text_values)
    model_df[col] = le.transform(text_values)
    label_encoders[col] = le
# After the loop `le` is still bound to the last encoder fitted ("Prof").

# One-hot encode the remaining categorical columns.
one_hot_cols = ['Day', 'Number', 'Room']
model_df = pd.get_dummies(model_df, columns=one_hot_cols)

model_df.head()

Unnamed: 0,Prof,Year,Semester,Start_time,End_time,Start_minutes,End_minutes,Day_M,Day_S,Day_TTh,Day_TWThF,Day_WF,Number_10,Number_100,Number_108,Number_109,Number_11,Number_110.1,Number_110.2,Number_110.3,Number_117,Number_121,Number_122,Number_123.1,Number_123.2,Number_126,Number_128,Number_131,Number_133,Number_14,Number_140,Number_142,Number_146,Number_147,Number_148,Number_150.1,Number_150.2,Number_158,Number_162,Number_164,Number_166,Number_17,Number_171,Number_180.1,Number_190,Number_196,Number_197,Number_2,Number_20,Number_201,Number_202.1,Number_202.2,Number_203,Number_204,Number_205,Number_208,Number_209.1,Number_209.2,Number_21,Number_210.1,Number_210.2,Number_211,Number_214,Number_216,Number_217,Number_218,Number_22,Number_220.1,Number_220.2,Number_221,Number_222,Number_227,Number_228,Number_229,Number_23,Number_235,Number_236,Number_240,Number_241,Number_242,Number_243,Number_246,Number_247,Number_249,Number_250,Number_260,Number_261,Number_262,Number_262.1,Number_262.2,Number_265,Number_266,Number_268.2,Number_271.1,Number_271.2,Number_280,Number_281,Number_288,Number_290,Number_294,Number_295,Number_296,Number_297,Number_30,Number_300,Number_40,Number_400,Number_53,Number_54,Number_55,Room_MB 105,Room_MB 106,Room_MB 107,Room_MB 108,Room_MB 115,Room_MB 116,Room_MB 117,Room_MB 118,Room_MB 120,Room_MB 121,Room_MB 126,Room_MB 301,Room_MB 302,Room_MB 303,Room_MB 304,Room_MB 305,Room_MB 306,Room_MB 307,Room_MB 308,Room_MB 312,Room_MB 313,Room_MB 314,Room_MB 318,Room_MB 319,Room_MB 320,Room_MB 321,Room_MB 322,Room_MB 323,Room_MB 328,Room_MB 329,Room_MBAN 102,Room_MBAN 103,Room_MBAN 301,Room_MBAN 304,Room_MBAN 305,Room_MBAN 306,Room_MBAN 307,Room_MBAN 310,Room_MBAN 312,Room_MBAN 313,Room_MBAN 314,Room_MBAN 401,Room_MBAN 403,Room_TBA
0,532,2018,0,08:30,10:00,510,600,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,94,2018,0,10:00,11:30,600,690,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,532,2018,0,11:30,13:00,690,780,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
3,4,2018,0,11:30,13:00,690,780,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4,22,2018,0,08:30,10:00,510,600,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


## Handle One-Class Professors

In [162]:
# Step 1: Count classes per professor
prof_counts = model_df["Prof"].value_counts()
one_class_profs = prof_counts[prof_counts == 1].index
is_one_class = model_df["Prof"].isin(one_class_profs)

# Step 2: Map features → professor for one-class profs
# The key must hold exactly the columns the model receives as features, in
# frame order, because the lookup side builds its key from a feature-only row
# (`tuple(row.values)`). Year, Start_time and End_time are not model features,
# so leaving them in the key would make every lookup miss.
non_feature_cols = ["Prof", "Year", "Start_time", "End_time"]
one_class_mapping = {}
one_class_data = model_df[is_one_class]

for _, row in one_class_data.iterrows():
    key = tuple(row.drop(non_feature_cols).values)
    # If two one-class professors share identical features, the later row wins.
    one_class_mapping[key] = row["Prof"]

# Step 3: Keep only multi-class profs for training
multi_class_data = model_df[~is_one_class]

# Train-Test Split

In [163]:
# Columns that are not model inputs: the target itself, the year, and the raw
# time strings (their numeric minute counterparts are used instead).
elements_to_remove = ['Prof', 'Year', 'Start_time', 'End_time']
interest = [col_name for col_name in multi_class_data.columns if col_name not in elements_to_remove]

# Feature matrix and encoded professor target.
X = multi_class_data.loc[:, interest]
y = multi_class_data.loc[:, "Prof"]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random Forest Classifier

In [164]:
# === Train model ===
# Baseline forest: 200 trees, fixed seed for repeatable results.
base_rf_params = {"n_estimators": 200, "random_state": 42}
rf_model = RandomForestClassifier(**base_rf_params)
rf_model.fit(X_train, y_train)

## Custom predictor

In [165]:
# === Custom predictor with special rule: if there is a class with one prof, simply predict that prof ===
def predict_with_rule(X_input, model):
    """Predict an encoded professor label for every row of ``X_input``.

    Rows whose feature tuple is a key of the module-level ``one_class_mapping``
    take the mapped professor directly; all remaining rows are sent to
    ``model`` in a single batched ``predict`` call instead of one call per row.

    Parameters
    ----------
    X_input : pandas.DataFrame
        Feature rows to predict.
    model : fitted estimator exposing ``predict``
        Used for every row not covered by ``one_class_mapping``.

    Returns
    -------
    list
        One prediction per input row, in input order.
    """
    # Column order the model was fitted with; fall back to the input's own
    # order for estimators that do not record feature names.
    feature_cols = list(getattr(model, "feature_names_in_", X_input.columns))

    preds = [None] * len(X_input)
    pending_positions = []
    for position, (_, row) in enumerate(X_input.iterrows()):
        key = tuple(row.values)
        if key in one_class_mapping:  # special rule
            preds[position] = one_class_mapping[key]
        else:
            pending_positions.append(position)

    if pending_positions:
        # Ensure input has correct feature names and order before predicting.
        batch = X_input.iloc[pending_positions].reindex(columns=feature_cols)
        for position, label in zip(pending_positions, model.predict(batch)):
            preds[position] = label
    return preds

# === Evaluate on test set ===
# Score the baseline forest (plus the one-class rule) on the held-out rows.
y_pred = predict_with_rule(X_test, rf_model)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.3199152542372881


## Hyperparameter Tuning

In [175]:
# Base estimator; only the seed is fixed, everything else comes from the grid.
rf = RandomForestClassifier(random_state=42)

# Shuffled, seeded 5-fold stratified splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Search space: 2 * 3 * 3 * 3 * 2 = 108 candidate combinations.
param_grid = dict(
    n_estimators=[200, 400],          # number of trees
    max_depth=[None, 20, 40],         # depth of trees
    min_samples_split=[2, 5, 10],     # min samples to split
    min_samples_leaf=[1, 2, 4],       # min samples per leaf
    max_features=["sqrt", None],      # features considered for best split
)

# Exhaustive search scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=10,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits




Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best CV Score: 0.2990610921645404


- Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
- Best CV Score: **0.2990610921645404**

## Apply Best Parameters

In [176]:
# === Train model ===
# Alternative parameter set kept for reference (not used):
# {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
#
# The values below are hard-coded rather than read from `grid_search`; they
# match the "Best Params" line printed by the grid search.
best_params = {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)
# === Evaluate on test set ===
y_pred = predict_with_rule(X_test, best_rf_model)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.3326271186440678


# Test Predictor

In [177]:
def predictor(class_number, day, room, semester, start_time, end_time):
    """Predict the professor for a single class offering.

    Builds a one-row feature frame in the column order of the tuned model
    (``best_rf_model``), predicts the encoded professor, and decodes it with
    the "Prof" encoder stored in ``label_encoders``.

    Parameters
    ----------
    class_number : object
        Course number; matched by its string form against ``Number_<value>``.
    day : str
        Day pattern; matched against ``Day_<value>``.
    room : str
        Room name; matched against ``Room_<value>``.
    semester : str
        Must be one of ``label_encoders['Semester'].classes_``.
    start_time, end_time : object
        Anything ``pd.to_datetime`` can parse; only hour and minute are used.

    Returns
    -------
    The decoded professor label for the predicted class.

    Raises
    ------
    ValueError
        If ``semester`` is not among the fitted Semester classes.
    """
    import warnings

    # Convert semester to the integer code assigned by its LabelEncoder
    semester_code = label_encoders['Semester'].classes_.tolist().index(semester)

    # Convert start and end times to minutes (parse each value only once)
    start_ts = pd.to_datetime(start_time)
    end_ts = pd.to_datetime(end_time)
    start_minutes = start_ts.hour * 60 + start_ts.minute
    end_minutes = end_ts.hour * 60 + end_ts.minute

    # Column order the model was fitted with; fall back to the training frame
    # for estimators that do not record feature names.
    feature_cols = list(getattr(best_rf_model, "feature_names_in_", X_train.columns))

    # Start with every feature at 0, then fill in the numeric inputs
    input_data = dict.fromkeys(feature_cols, 0)
    input_data["Semester"] = semester_code
    input_data["Start_minutes"] = start_minutes
    input_data["End_minutes"] = end_minutes

    # Switch on the matching one-hot column of each categorical group
    for prefix, value in (("Day", day), ("Number", class_number), ("Room", room)):
        one_hot_col = f"{prefix}_{value}"
        if one_hot_col in input_data:
            input_data[one_hot_col] = 1
        else:
            # An unseen category leaves the whole group at 0; the model still
            # returns a prediction, so make the mismatch visible to the caller.
            warnings.warn(
                f"{prefix} value {value!r} has no matching training column "
                f"({one_hot_col!r}); its one-hot group is all zeros.",
                stacklevel=2,
            )

    # Create dataframe with same columns as training data
    input_df = pd.DataFrame([input_data], columns=feature_cols)

    # Prediction, decoded with the Prof encoder explicitly rather than whatever
    # the module-level loop variable `le` happens to be bound to.
    pred_num = best_rf_model.predict(input_df)[0]
    return label_encoders['Prof'].inverse_transform([pred_num])[0]

In [180]:
# Example query; arguments are (class_number, day, room, semester, start_time, end_time).
predictor(171, 'TTh', 'MB 120', '2nd sem', '14:30', '16:00')

'MENDOZA, VICTORIA MAY'

# Save Model

In [182]:
# Serialize the tuned forest to disk. Only the estimator is written here;
# the fitted label encoders are not part of this file.
with open(r'best_rf_model.pkl', mode='wb') as model_file:
    pickle.dump(best_rf_model, model_file)