In [22]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [23]:
CSV_PATH = "A:/DLSA/Project_work/Datasets/forecasting_dataset3.csv"

metadata_cols = [
    "publication_year",
    "is_eng",
    "abstract_len",
    "title_len",
    "core_research",
    "sec_research",
    "low_novelty",
    "num_authors",
    "avg_author_citations",
    "avg_author_productivity",
    "is_top_institution",
]

In [24]:
TRAIN_YEARS = (1990, 2010)
VAL_YEARS   = (2011, 2013)
TEST_YEARS  = (2014, 2015)

In [25]:
RANDOM_STATE = 42


In [26]:
#GLOBAL_POS_N = 130000
#GLOBAL_NEG_N = 260000

In [27]:
df = pd.read_csv(CSV_PATH)

df["publication_year"] = pd.to_numeric(df["publication_year"], errors="coerce")
df = df[(df["publication_year"] >= 1990) & (df["publication_year"] <= 2015)].copy()
df["publication_year"] = df["publication_year"].astype(int)
for c in metadata_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0)

In [28]:
# df_pos_all = df[df["label"] == 1]
# df_neg_all = df[df["label"] == 0]

# if len(df_pos_all) == 0 or len(df_neg_all) == 0:
#     raise ValueError("Dataset has only one class; cannot sample pos/neg.")

# n_pos = min(GLOBAL_POS_N, len(df_pos_all))
# n_neg = min(GLOBAL_NEG_N, len(df_neg_all))

# df_pos = df_pos_all.sample(n=n_pos, random_state=RANDOM_STATE)
# df_neg = df_neg_all.sample(n=n_neg, random_state=RANDOM_STATE)

# df = (
#     pd.concat([df_pos, df_neg], axis=0)
#       .sample(frac=1, random_state=RANDOM_STATE)
#       .reset_index(drop=True)
# )

# print(
#     f"After global sampling: {len(df)} rows | "
#     f"pos: {(df['label']==1).sum()} | neg: {(df['label']==0).sum()}")

In [29]:
train_df = df[(df["publication_year"] >= TRAIN_YEARS[0]) & (df["publication_year"] <= TRAIN_YEARS[1])].copy()
val_df   = df[(df["publication_year"] >= VAL_YEARS[0])   & (df["publication_year"] <= VAL_YEARS[1])].copy()
test_df  = df[(df["publication_year"] >= TEST_YEARS[0])  & (df["publication_year"] <= TEST_YEARS[1])].copy()

In [30]:
print("Split sizes:", "Train", len(train_df), "Val", len(val_df), "Test", len(test_df))
print("Train label counts:\n", train_df["label"].value_counts())


Split sizes: Train 323965 Val 82168 Test 58539
Train label counts:
 label
0    219088
1    104877
Name: count, dtype: int64


In [31]:
X_train = train_df[metadata_cols].values
y_train = train_df["label"].astype(int).values

X_val = val_df[metadata_cols].values
y_val = val_df["label"].astype(int).values

X_test = test_df[metadata_cols].values
y_test = test_df["label"].astype(int).values

In [32]:
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
scale_pos_weight = n_neg / max(n_pos, 1)
print(f"n_pos={n_pos}, n_neg={n_neg}, scale_pos_weight={scale_pos_weight:.3f}")

n_pos=104877, n_neg=219088, scale_pos_weight=2.089


In [33]:
xgb = XGBClassifier(
    n_estimators=3000,
    max_depth=4,
    min_child_weight=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)

In [34]:
xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=200
)

[0]	validation_0-logloss:0.69226
[200]	validation_0-logloss:0.62196
[400]	validation_0-logloss:0.61646
[600]	validation_0-logloss:0.61583
[800]	validation_0-logloss:0.61527
[1000]	validation_0-logloss:0.61551
[1200]	validation_0-logloss:0.61542
[1400]	validation_0-logloss:0.61540
[1600]	validation_0-logloss:0.61555
[1800]	validation_0-logloss:0.61534
[2000]	validation_0-logloss:0.61563
[2200]	validation_0-logloss:0.61587
[2400]	validation_0-logloss:0.61564
[2600]	validation_0-logloss:0.61607
[2800]	validation_0-logloss:0.61599
[2999]	validation_0-logloss:0.61592


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [35]:
def report(y_true, p, thr=0.5, name=""):
    pred = (p >= thr).astype(int)
    print(f"\n{name} @ thr={thr:.2f}")
    print("accuracy :", round(accuracy_score(y_true, pred), 4))
    print("precision:", round(precision_score(y_true, pred, zero_division=0), 4))
    print("recall   :", round(recall_score(y_true, pred, zero_division=0), 4))
    print("f1       :", round(f1_score(y_true, pred, zero_division=0), 4))

p_val  = xgb.predict_proba(X_val)[:, 1]
p_test = xgb.predict_proba(X_test)[:, 1]

report(y_val,  p_val,  thr=0.5, name="VAL")
report(y_test, p_test, thr=0.5, name="TEST")


VAL @ thr=0.50
accuracy : 0.6639
precision: 0.351
recall   : 0.3597
f1       : 0.3553

TEST @ thr=0.50
accuracy : 0.6747
precision: 0.3306
recall   : 0.3473
f1       : 0.3388


In [36]:
def find_best_threshold(y_true, probs, thr_min=0.05, thr_max=0.95, step=0.02):
    best = {"f1": -1, "thr": None, "p": None, "r": None}
    for thr in np.arange(thr_min, thr_max + 1e-9, step):
        pred = (probs >= thr).astype(int)
        f1 = f1_score(y_true, pred, zero_division=0)
        if f1 > best["f1"]:
            best = {
                "f1": float(f1),
                "thr": float(thr),
                "p": float(precision_score(y_true, pred, zero_division=0)),
                "r": float(recall_score(y_true, pred, zero_division=0)),
            }
    return best

best_thr = find_best_threshold(y_val, p_val)
print("\nBest threshold on VAL:", best_thr)


Best threshold on VAL: {'f1': 0.4397753276471812, 'thr': 0.39000000000000007, 'p': 0.30330529600602596, 'r': 0.7995083439701225}


In [37]:
report(y_test, p_test, thr=best_thr["thr"], name="TEST (tuned on VAL)")


TEST (tuned on VAL) @ thr=0.39
accuracy : 0.4685
precision: 0.2837
recall   : 0.7969
f1       : 0.4184


In [38]:
gain = xgb.get_booster().get_score(importance_type="gain")
gain_full = {f: gain.get(f"f{idx}", 0.0) for idx, f in enumerate(metadata_cols)}

imp_gain = (
    pd.DataFrame({"feature": list(gain_full.keys()), "gain": list(gain_full.values())})
      .sort_values("gain", ascending=False)
      .reset_index(drop=True)
)

In [39]:
print("\nTop metadata features by GAIN importance:")
print(imp_gain.head(20))



Top metadata features by GAIN importance:
                    feature       gain
0                    is_eng  72.466560
1             core_research  29.223148
2              sec_research  15.594431
3              abstract_len  13.501014
4                 title_len   7.725040
5          publication_year   7.599453
6        is_top_institution   6.580956
7               low_novelty   3.142884
8               num_authors   0.000000
9      avg_author_citations   0.000000
10  avg_author_productivity   0.000000


In [40]:
weight = xgb.get_booster().get_score(importance_type="weight")
weight_full = {f: weight.get(f"f{idx}", 0.0) for idx, f in enumerate(metadata_cols)}

imp_weight = (
    pd.DataFrame({"feature": list(weight_full.keys()), "weight": list(weight_full.values())})
      .sort_values("weight", ascending=False)
      .reset_index(drop=True))

In [41]:
print("\nTop metadata features by WEIGHT (split count):")
print(imp_weight.head(20))


Top metadata features by WEIGHT (split count):
                    feature   weight
0              abstract_len  14140.0
1                 title_len  11316.0
2          publication_year   9709.0
3        is_top_institution   2050.0
4             core_research   1319.0
5              sec_research   1171.0
6               low_novelty    781.0
7                    is_eng    201.0
8               num_authors      0.0
9      avg_author_citations      0.0
10  avg_author_productivity      0.0
