In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import ndcg_score, average_precision_score, precision_score

# Read the three input tables from the working directory.
student_info = pd.read_csv("studentInfo.csv")
assessments = pd.read_csv("assessments.csv")
student_assessment = pd.read_csv("studentAssessment.csv")

# One row per submission, enriched with the metadata of the assessment it
# belongs to. Rows missing either the submission day or the assessment
# date are dropped because the features below need both.
df = pd.merge(
    student_assessment,
    assessments,
    how="left",
    on="id_assessment",
).dropna(subset=["date_submitted", "date"])

# Positive delta_days: submitted before the assessment date; negative: late.
df["delta_days"] = df["date"] - df["date_submitted"]

# NOTE(review): 5 is a hard-coded reference day -- presumably the "current
# day" the ranking is simulated for; confirm and consider naming it.
df["days_left"] = df["date"] - 5

# Share of each student's submissions that were late (computed over all of
# that student's rows in df).
df["late_rate"] = (df["delta_days"] < 0).groupby(df["id_student"]).transform("mean")

# Weight per remaining day. days_left is floored at 1 so that a deadline on
# or before the reference day neither divides by zero nor flips the sign of
# urgency (the previous replace(0, 1) only guarded the exact-zero case).
df["urgency"] = df["weight"] / df["days_left"].clip(lower=1)

# Urgency scaled by how often the student has been late.
df["risk"] = df["late_rate"] * df["urgency"]

# Integer-encode the string-valued columns so the tree models can consume
# them. Columns that are absent from df are skipped silently.
categorical_cols = [
    "assessment_type",
    "code_module",
    "code_presentation",
]
for col in categorical_cols:
    if col not in df.columns:
        continue
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Continuous priority score: higher means the item should be handled sooner.
df["label_float"] = df["urgency"] + df["risk"]

# Integer relevance grade within each student. LightGBM, XGBoost and
# sklearn's ndcg_score all treat LARGER labels as MORE relevant, so the rank
# must be ascending: the highest label_float gets the largest grade.
# (Ranking descending would give the most urgent item grade 1 and train the
# models to put the least urgent item first.)
# NOTE(review): grades go up to the number of rows a student has; LightGBM's
# default label_gain table is limited in size -- confirm no student exceeds it.
df["label"] = (
    df.groupby("id_student")["label_float"]
    .rank(method="min", ascending=True)
    .astype(int)
)

# Everything except identifiers, the targets and delta_days is a feature.
# NOTE(review): urgency and risk, from which label_float is computed
# directly, remain in the feature set -- confirm this is intended.
feature_cols = [
    c
    for c in df.columns
    if c not in {"id_student", "id_assessment", "label", "label_float", "delta_days"}
]

# Split by student rather than by row so that every student's full list of
# items lands in exactly one of train/test (a ranking "query" must not be
# scattered across both sets). test_size therefore refers to students.
student_ids = df["id_student"].unique()
train_ids, test_ids = train_test_split(student_ids, test_size=0.2, random_state=42)

# The rankers and eval_ranking interpret the group sizes as consecutive row
# runs, so rows must be contiguous per student and in the same order as
# groupby(...).size() reports them (ascending id_student). A stable sort
# keeps the original within-student row order.
train = df[df["id_student"].isin(train_ids)].sort_values("id_student", kind="stable")
test = df[df["id_student"].isin(test_ids)].sort_values("id_student", kind="stable")

X_train = train[feature_cols]
y_train = train["label"]
group_train = train.groupby("id_student", sort=True).size().tolist()

X_test = test[feature_cols]
y_test = test["label"]
group_test = test.groupby("id_student", sort=True).size().tolist()

# LambdaRank model from LightGBM, 300 boosting rounds, NDCG as metric.
lgb_ranker = lgb.LGBMRanker(n_estimators=300, objective="lambdarank", metric="ndcg")

# group / eval_group give the number of consecutive rows per student;
# the test split is used as the evaluation set, scored at cutoff 3.
lgb_ranker.fit(
    X=X_train,
    y=y_train,
    group=group_train,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    eval_at=[3],
)

# One raw ranking score per test row.
lgb_preds = lgb_ranker.predict(X_test)

# Wrap both splits in DMatrix objects and attach the per-student group sizes.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
dtrain.set_group(group_train)
dtest.set_group(group_test)

# Pairwise ranking objective, monitored with NDCG at cutoff 3.
params = dict(objective="rank:pairwise", eval_metric="ndcg@3")

# 300 boosting rounds; the test split is reported under the name "test".
xgb_ranker = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=300,
    evals=[(dtest, "test")],
)

# One raw ranking score per test row.
xgb_preds = xgb_ranker.predict(dtest)

# Pointwise baseline: regress the integer label directly and use the
# predicted value as the ranking score (fit returns the estimator itself).
rf = RandomForestRegressor(random_state=42, n_estimators=300).fit(X_train, y_train)
rf_preds = rf.predict(X_test)

def eval_ranking(preds, y_true, groups, topk=3, rel_threshold=None):
    """Return mean NDCG@k, mean average precision and mean Precision@k.

    The inputs are laid out group after group: the first ``groups[0]`` rows
    belong to the first group, the next ``groups[1]`` rows to the second,
    and so on. Groups with fewer than two rows are skipped because there is
    nothing to rank.

    Args:
        preds: Predicted scores, one per row (higher = ranked earlier).
        y_true: True relevance labels, one per row (higher = more relevant).
            Any array-like is accepted (list, ndarray, pandas Series).
        groups: Iterable of group sizes; must sum to ``len(preds)``.
        topk: Cutoff for NDCG and precision; ``None`` uses the whole group.
        rel_threshold: How labels are binarised for precision and average
            precision. ``None`` (default) marks an item relevant when its
            label is among the ``topk`` largest labels of its group (ties
            included). A number marks items with ``label >= rel_threshold``
            relevant; pass ``1`` for the previous behaviour, which is
            degenerate (always 1.0) when every label is >= 1.

    Returns:
        Tuple ``(mean_ndcg, mean_average_precision, mean_precision)``.
        Each value is NaN when no group had at least two rows.

    Raises:
        ValueError: If ``topk`` is not positive, or the lengths of ``preds``,
            ``y_true`` and ``sum(groups)`` disagree.
    """
    if topk is not None and topk < 1:
        raise ValueError("topk must be a positive integer or None")

    scores = np.asarray(preds, dtype=float).ravel()
    labels = np.asarray(y_true).ravel()
    sizes = [int(g) for g in groups]
    # A mismatch means the group sizes do not describe this row layout and
    # every per-group slice below would mix rows of different groups.
    if len(scores) != len(labels) or sum(sizes) != len(labels):
        raise ValueError(
            "preds, y_true and groups disagree on the number of rows: "
            f"{len(scores)}, {len(labels)}, {sum(sizes)}"
        )

    ndcg_list, map_list, prec_list = [], [], []
    start = 0
    for g in sizes:
        end = start + g
        if g > 1:
            true = labels[start:end]
            pred = scores[start:end]

            ndcg_list.append(
                ndcg_score(true.reshape(1, -1), pred.reshape(1, -1), k=topk)
            )

            # Effective cutoff: a group may be shorter than topk.
            k = g if topk is None else min(topk, g)
            if rel_threshold is None:
                # Relevant = at least as good as the k-th best true label.
                rel = true >= np.sort(true)[-k]
            else:
                rel = true >= rel_threshold

            top_idx = np.argsort(-pred)[:k]
            prec_list.append(rel[top_idx].mean())

            # Average precision is undefined without any relevant item;
            # count such a group as 0 instead of calling into sklearn.
            if rel.any():
                map_list.append(average_precision_score(rel.astype(int), pred))
            else:
                map_list.append(0.0)
        start = end

    def _mean_or_nan(values):
        # np.mean([]) warns and returns NaN; return NaN quietly instead.
        return float(np.mean(values)) if values else float("nan")

    return _mean_or_nan(ndcg_list), _mean_or_nan(map_list), _mean_or_nan(prec_list)

# Score every model's test predictions with the same labels and groups.
lgb_metrics, xgb_metrics, rf_metrics = (
    eval_ranking(model_preds, y_test, group_test)
    for model_preds in (lgb_preds, xgb_preds, rf_preds)
)

# Tab-separated comparison table; the name cells carry their own padding
# tabs so the numeric columns line up.
print("Model\t\tNDCG@3\tMAP@3\tPrecision@3")
for name_cell, (ndcg, mean_ap, prec) in (
    ("LightGBM\t", lgb_metrics),
    ("XGBoost\t\t", xgb_metrics),
    ("RandomForest\t", rf_metrics),
):
    print(f"{name_cell}{ndcg:.4f}\t{mean_ap:.4f}\t{prec:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1037
[LightGBM] [Info] Number of data points in the train set: 136837, number of used features: 13
[0]	test-ndcg@3:0.97297
[1]	test-ndcg@3:0.98736
[2]	test-ndcg@3:0.98831
[3]	test-ndcg@3:0.98925
[4]	test-ndcg@3:0.98942
[5]	test-ndcg@3:0.98958
[6]	test-ndcg@3:0.98962
[7]	test-ndcg@3:0.98982
[8]	test-ndcg@3:0.99022
[9]	test-ndcg@3:0.99072
[10]	test-ndcg@3:0.99064
[11]	test-ndcg@3:0.99044
[12]	test-ndcg@3:0.99047
[13]	test-ndcg@3:0.99052
[14]	test-ndcg@3:0.99083
[15]	test-ndcg@3:0.99093
[16]	test-ndcg@3:0.99096
[17]	test-ndcg@3:0.99111
[18]	test-ndcg@3:0.99110
[19]	test-ndcg@3:0.99109
[20]	test-ndcg@3:0.99106
[21]	test-ndcg@3:0.99114
[22]	test-ndcg@3:0.99116
[23]	test-ndcg@3:0.99113
[24]	test-ndcg@3:0.99120
[25]	test-ndcg@3:0.99123
[26]	t