In [1]:
# --- Cell 1: Setup ---
import os, re, warnings
from pathlib import Path
import numpy as np, pandas as pd
warnings.filterwarnings("ignore")

DATA = Path(r"C:\Users\kule9\final\data")
ART  = Path(r"C:\Users\kule9\final\artifacts")
DATA.mkdir(parents=True, exist_ok=True)
ART.mkdir(parents=True, exist_ok=True)

def pick_col(df, patterns, must=False, default=None, name=""):
    for pat in patterns:
        for c in df.columns:
            if re.search(pat, str(c), flags=re.I):
                return c
    if must and default is None:
        raise RuntimeError(f"[{name}] missing any of {patterns}; cols={list(df.columns)}")
    return default

def map_to_binary(series):
    s = series.copy()
    if pd.api.types.is_numeric_dtype(s):
        uniq = set(pd.unique(s.dropna()))
        if uniq <= {0,1}: return s.astype(int)
        return (s >= s.dropna().median()).astype(int)
    low = s.astype(str).str.strip().str.lower()
    pass_set = {"pass","passed","approve","approved","success","completed","graduate","graduated","yes","y"}
    fail_set = {"fail","failed","withdraw","withdrawn","dropout","incomplete","no","n"}
    out=[]
    for v in low:
        if v in pass_set: out.append(1)
        elif v in fail_set: out.append(0)
        elif v in {"1","0"}: out.append(int(v))
        else: out.append(np.nan)
    out = pd.Series(out, index=s.index)
    return out.fillna(out.mode().iloc[0] if not out.mode().empty else 0).astype(int)


In [2]:
# --- Cell 2: Load data + merge region ---
frames = []
for fname, tag in [("institution_A.csv","INST_A"), ("institution_B.csv","INST_B")]:
    p = DATA/fname
    if p.exists():
        raw = pd.read_csv(p)
        df = pd.DataFrame()
        sid = pick_col(raw,[r"^student_?id$",r"\bid$"], default=raw.columns[0])
        df["student_id"] = pd.to_numeric(raw[sid], errors="coerce").fillna(-1).astype(int)
        df["institution"] = tag
        df["country"] = raw.get(pick_col(raw,["country","iso","nation"], default="country"), "Unknown")
        df["region"]  = raw.get(pick_col(raw,["region"], default="region"), "Unknown")
        df["year"]    = raw.get(pick_col(raw,["year","academic_year","ay"], default="year"), 2024)
        df["gender"]  = raw.get(pick_col(raw,["gender","sex"], default="gender"), "Unknown")
        df["age"]     = pd.to_numeric(raw.get(pick_col(raw,["age"], default="age"), np.nan), errors="coerce")
        df["language_level"] = raw.get(pick_col(raw,["language","lang","ielts","b2","c1"], default="language"), "Unknown")
        df["program"] = raw.get(pick_col(raw,["program","course","major"], default="program"), f"Prog_{tag[-1]}")
        df["current_grade"] = pd.to_numeric(raw.get(pick_col(raw,["gpa","grade","score","G3"], default="G3"), np.nan), errors="coerce")
        df["attendance"] = pd.to_numeric(raw.get(pick_col(raw,["attendance","presence","absences"], default="attendance"), np.nan), errors="coerce")
        tgt = pick_col(raw,[r"^target_?success$",r"^success$",r"^pass_fail$",r"^passed$",r"^final_result$",r"^dropout$",r"^label$"], default=None)
        df["pass_fail"] = map_to_binary(raw[tgt]) if tgt else map_to_binary(df["current_grade"])
        frames.append(df)

# Synthesize small dataset if none found (so the notebook always runs)
if not frames:
    n=400
    def synth(inst, countries):
        rng=np.random.default_rng(42 if inst=="INST_A" else 777)
        df=pd.DataFrame({
            "student_id": np.arange(n),
            "institution": inst,
            "country": rng.choice(countries, n),
            "region": "Unknown",
            "year": 2024,
            "gender": rng.choice(list("FM"), n),
            "age": rng.integers(18,45,n),
            "language_level": rng.choice(["A2","B1","B2","C1","C2"], n),
            "program": rng.choice(["CS","ENG","BUS"], n),
            "current_grade": rng.normal(3.0 if inst=="INST_A" else 2.7, 0.7, n).clip(0,4),
            "attendance": rng.uniform(60, 98, n),
        })
        df["pass_fail"] = (0.6*df["current_grade"] + 0.01*df["attendance"] + rng.normal(0,0.2,n) > 2.8).astype(int)
        return df
    frames=[synth("INST_A",["LV","LT","IN","LK","IT","FR"]),
            synth("INST_B",["IN","LK","PK","BD","DE","PL"])]

data = pd.concat(frames, ignore_index=True)

# merge region via culture_map.csv (you already created this)
cult_path = DATA/"culture_map.csv"
if cult_path.exists():
    cult = pd.read_csv(cult_path)
    data["country"] = data["country"].astype(str).str.upper().str.strip()
    data = data.merge(cult, on="country", how="left", suffixes=("","_map"))
    mask = data["region"].isna() | (data["region"].astype(str).str.strip().eq("")) | (data["region"]=="Unknown")
    data.loc[mask, "region"] = data.loc[mask, "region_map"]
    data.drop(columns=["region_map"], inplace=True, errors="ignore")

print("Data preview:")
data.head(5)


Data preview:


Unnamed: 0,student_id,institution,country,region,year,gender,age,language_level,program,current_grade,attendance,pass_fail,hofstede_idv
0,0,INST_A,LV,Baltics,2024,F,39,Unknown,CS,2.185283,,1,70
1,1,INST_A,LK,Asia,2024,F,37,Unknown,ENG,3.344838,,1,35
2,2,INST_A,IN,Asia,2024,F,33,Unknown,CS,2.078382,,1,48
3,3,INST_A,DE,Europe,2024,F,41,Unknown,BA,2.62818,,0,67
4,4,INST_A,DE,Europe,2024,M,21,Unknown,BA,2.448369,,0,67


In [3]:
# --- Cell 3: Temporal features (optional) ---
def load_temporal(name):
    p = DATA/name
    if not p.exists(): return None
    t = pd.read_csv(p)
    sid = pick_col(t,[r"^student_?id$",r"\bid$"], default=t.columns[0])
    wk  = pick_col(t,[r"^week$","wk","time","t"], default=t.columns[1])
    clk = pick_col(t,["click"], default="clicks"); 
    vid = pick_col(t,["video"], default="videos_watched")
    ass = pick_col(t,["assign","submission"], default="assignments_submitted")
    for c in [clk,vid,ass]: 
        if c not in t.columns: t[c]=0
    t = t.rename(columns={sid:"student_id", wk:"week", clk:"clicks", vid:"videos_watched", ass:"assignments_submitted"})
    for c in ["week","clicks","videos_watched","assignments_submitted"]:
        t[c] = pd.to_numeric(t[c], errors="coerce").fillna(0)
    return t[["student_id","week","clicks","videos_watched","assignments_submitted"]]

tA = load_temporal("temporal_A.csv")
tB = load_temporal("temporal_B.csv")
temporal = pd.concat([x for x in [tA,tB] if x is not None], ignore_index=True) if (tA is not None or tB is not None) else None

if temporal is not None and not temporal.empty:
    eng = temporal.groupby('student_id').agg(
        clicks_sum=('clicks','sum'),
        clicks_mean=('clicks','mean'),
        clicks_std=('clicks','std'),
        videos_sum=('videos_watched','sum'),
        assign_sum=('assignments_submitted','sum'),
        weeks=('week','nunique'),
        week_max=('week','max')
    ).reset_index()

    early = temporal[temporal['week']<=3].groupby('student_id').agg(
        early_clicks=('clicks','sum'),
        early_videos=('videos_watched','sum'),
        early_assign=('assignments_submitted','sum'),
    ).reset_index()

    late_wk = temporal.groupby('student_id')['week'].max().rename('wk_last').reset_index()
    tmp_late = temporal.merge(late_wk, on='student_id', how='left')
    late = tmp_late[tmp_late['week'] >= tmp_late['wk_last']-2].groupby('student_id').agg(
        late_clicks=('clicks','sum'),
        late_videos=('videos_watched','sum'),
        late_assign=('assignments_submitted','sum'),
    ).reset_index()

    eng = eng.merge(early, on='student_id', how='left').merge(late, on='student_id', how='left').fillna(0)
    eng['eng_rate'] = eng['assign_sum'] / eng['weeks'].replace(0, 1)
    eng['clicks_per_week'] = eng['clicks_sum'] / eng['weeks'].replace(0, 1)
    eng['late_drop'] = (eng['late_clicks'] < eng['early_clicks']*0.5).astype(int)

    data = data.merge(eng, on='student_id', how='left').fillna(0)
else:
    # create zeros if no temporal
    for c in ["clicks_sum","clicks_mean","clicks_std","videos_sum","assign_sum","weeks",
              "early_clicks","early_videos","early_assign","late_clicks","late_videos","late_assign",
              "eng_rate","clicks_per_week","late_drop"]:
        if c not in data.columns: data[c]=0

print("Cols now:", len(data.columns))


Cols now: 29


In [4]:
# --- Cell 4: Build X/y and split ---
from sklearn.model_selection import train_test_split

data["target_success"] = map_to_binary(data["pass_fail"]).astype(int)

cat_cols = ["institution","country","region","gender","program","language_level"]
for c in cat_cols:
    if c not in data.columns: data[c]="Unknown"

num_cols = [c for c in data.columns if c not in cat_cols + ["student_id","pass_fail","target_success"]
            and pd.api.types.is_numeric_dtype(data[c])]

X = data[cat_cols + num_cols].copy()
y = data["target_success"].copy()
groups = data["institution"].copy()  # used for institution-holdout CV later

Xtr, Xte, ytr, yte = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
len(Xtr), len(Xte), y.mean()


(1500, 500, np.float64(0.674))

In [5]:
# --- Cell 5: Hybrid model ---
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support, accuracy_score, precision_recall_curve

preproc = ColumnTransformer([
    ("num", Pipeline([("imp", SimpleImputer(strategy="median")), ("sc", StandardScaler(with_mean=False))]), num_cols),
    ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                      ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True))]), cat_cols)
])

rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
pipe = Pipeline([("prep", preproc), ("rf", rf)])
pipe.fit(Xtr, ytr)

proba_tree = pipe.predict_proba(Xte)[:,1]
prec, rec, thr = precision_recall_curve(yte, proba_tree)
f1s = 2*prec[:-1]*rec[:-1]/(prec[:-1]+rec[:-1]+1e-9)
t_best = thr[np.nanargmax(f1s)] if len(thr) else 0.5
pred_tree = (proba_tree >= t_best).astype(int)

print(f"[RF] AUROC={roc_auc_score(yte, proba_tree):.3f}  "
      f"F1={precision_recall_fscore_support(yte, pred_tree, average='binary')[2]:.3f}  thr={t_best:.3f}")

# Optional LSTM on compact 3-step sequence (early/total/late)
USE_DL = False
try:
    import tensorflow as tf
    from tensorflow.keras import layers, models
    USE_DL = True
except Exception:
    pass

if USE_DL:
    def make_seq(df_rows):
        return np.stack([
            df_rows[["early_clicks","clicks_sum","late_clicks"]].fillna(0).values,
            df_rows[["early_videos","videos_sum","late_videos"]].fillna(0).values,
            df_rows[["early_assign","assign_sum","late_assign"]].fillna(0).values
        ], axis=-1)  # (n, 3, 3)

    tr_mask = X.index.isin(Xtr.index); te_mask = X.index.isin(Xte.index)
    seq_X_tr = make_seq(data.loc[tr_mask]); seq_X_te = make_seq(data.loc[te_mask])

    inp = layers.Input(shape=(3,3))
    x = layers.LSTM(32)(inp); x = layers.Dropout(0.2)(x)
    out = layers.Dense(1, activation="sigmoid")(x)
    lstm = models.Model(inp,out)
    lstm.compile(optimizer="adam", loss="binary_crossentropy", metrics=["AUC"])
    lstm.fit(seq_X_tr, ytr.values, epochs=12, batch_size=64, validation_split=0.2, verbose=0)
    proba_lstm = lstm.predict(seq_X_te, verbose=0).ravel()
    proba_final = 0.5*proba_tree + 0.5*proba_lstm
else:
    proba_final = proba_tree

pred_final = (proba_final >= t_best).astype(int)
au = roc_auc_score(yte, proba_final)
p, r, f1, _ = precision_recall_fscore_support(yte, pred_final, average="binary", zero_division=0)
acc = accuracy_score(yte, pred_final)
print(f"[Hybrid] AUROC={au:.3f}  F1={f1:.3f}  P={p:.3f}  R={r:.3f}  Acc={acc:.3f}")


[RF] AUROC=0.516  F1=0.805  thr=0.372
[Hybrid] AUROC=0.516  F1=0.805  P=0.674  R=1.000  Acc=0.674


In [6]:
# --- Cell 6: Institution-holdout (domain generalization) ---
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

gkf = GroupKFold(n_splits=2)  # with A/B
cv_scores=[]
for tr_idx, te_idx in gkf.split(X, y, groups=groups):
    X_tr, X_te = X.iloc[tr_idx], X.iloc[te_idx]
    y_tr, y_te = y.iloc[tr_idx], y.iloc[te_idx]
    p = Pipeline([("prep", preproc), ("rf", RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1))])
    p.fit(X_tr, y_tr)
    au = roc_auc_score(y_te, p.predict_proba(X_te)[:,1]) if len(np.unique(y_te))>1 else np.nan
    cv_scores.append(au)
print("Institution-holdout AUROCs:", [f"{x:.3f}" for x in cv_scores], " | mean=", f"{np.nanmean(cv_scores):.3f}")


Institution-holdout AUROCs: ['0.494', '0.509']  | mean= 0.501


In [7]:
# --- Cell 7: Culture/region metrics + fairness ---
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

test_df = Xte.reset_index(drop=True).copy()
test_df["success"] = yte.reset_index(drop=True).values
test_df["y_prob"]  = proba_final[:len(test_df)]
test_df["y_pred"]  = (proba_final >= t_best).astype(int)

for c in ["country","region","institution"]:
    if c not in test_df.columns: test_df[c] = "Unknown"

def group_metrics(df, group_col):
    rows=[]
    for g, sub in df.groupby(group_col):
        yt, yp, yhat = sub["success"].values, sub["y_prob"].values, sub["y_pred"].values
        rows.append({
            group_col: g, "n": int(len(yt)),
            "auroc": float(roc_auc_score(yt, yp)) if len(np.unique(yt))>1 else np.nan,
            "f1": float(f1_score(yt, yhat)) if len(np.unique(yt))>1 else np.nan,
            "acc": float(accuracy_score(yt, yhat)),
            "pos_rate": float(np.mean(yhat))
        })
    return pd.DataFrame(rows).sort_values("n", ascending=False)

by_country = group_metrics(test_df, "country")
by_region  = group_metrics(test_df, "region")
by_inst    = group_metrics(test_df, "institution")

def dp_gap(df, col):
    rates = df.groupby(col)["y_pred"].mean()
    return float(rates.max()-rates.min()) if len(rates)>1 else 0.0

def eo_gap(df, col):
    vals=[]
    for g, sub in df.groupby(col):
        pos = (sub["success"]==1)
        if pos.sum(): vals.append((sub.loc[pos,"y_pred"]==1).mean())
    return float(np.nanmax(vals)-np.nanmin(vals)) if len(vals)>1 else 0.0

fairness = pd.Series({
    "dp_gap_country": dp_gap(test_df,"country"),
    "dp_gap_region" : dp_gap(test_df,"region"),
    "eo_gap_country": eo_gap(test_df,"country"),
    "eo_gap_region" : eo_gap(test_df,"region"),
})

bins = np.linspace(0,1,11)
bin_ids = np.digitize(test_df["y_prob"], bins)-1
cal_df = (test_df.assign(bin=bin_ids)
          .groupby("bin")
          .agg(prob_pred_bin_mean=("y_prob","mean"),
               frac_pos=("y_pred","mean"))
          .dropna()
          .reset_index(drop=True))

cm = confusion_matrix(test_df["success"], test_df["y_pred"])
cm_df = pd.DataFrame(cm, index=["Actual 0","Actual 1"], columns=["Pred 0","Pred 1"])

print("By country:\n", by_country.head(10))
print("\nFairness gaps:\n", fairness)


By country:
   country   n     auroc        f1       acc  pos_rate
5      LK  84  0.490434  0.800000  0.666667       1.0
7      LV  82  0.632277  0.836879  0.719512       1.0
0      DE  72  0.488511  0.789916  0.652778       1.0
4      IT  66  0.500000  0.810811  0.681818       1.0
3      IN  66  0.561376  0.810811  0.681818       1.0
1      EE  38  0.544872  0.812500  0.684211       1.0
8      PL  35  0.340909  0.813559  0.685714       1.0
2      ES  31  0.504202  0.708333  0.548387       1.0
6      LT  26  0.586806  0.818182  0.692308       1.0

Fairness gaps:
 dp_gap_country    0.0
dp_gap_region     0.0
eo_gap_country    0.0
eo_gap_region     0.0
dtype: float64


In [8]:
# --- Cell 8: Save artifacts + dashboard ---
by_country.to_csv(ART/"metrics_by_country.csv", index=False)
by_region.to_csv(ART/"metrics_by_region.csv", index=False)
by_inst.to_csv(ART/"metrics_by_institution.csv", index=False)
fairness.to_csv(ART/"fairness_gaps.csv", header=False)
cal_df.to_csv(ART/"calibration_curve.csv", index=False)
cm_df.to_csv(ART/"confusion_matrix.csv")
test_df.to_csv(ART/"scored_students.csv", index=False)

from sklearn.metrics import roc_auc_score, precision_recall_fscore_support, accuracy_score
p, r, f1, _ = precision_recall_fscore_support(test_df["success"], test_df["y_pred"], average="binary", zero_division=0)
acc = accuracy_score(test_df["success"], test_df["y_pred"])
au  = roc_auc_score(test_df["success"], test_df["y_prob"]) if len(np.unique(test_df["success"]))>1 else np.nan

html = f"""
<html><head><meta charset='utf-8'><title>Cross-Cultural Hybrid Framework</title>
<style>body{{font-family:Arial;padding:18px}} table{{border-collapse:collapse}} td,th{{border:1px solid #ccc;padding:6px}}</style>
</head><body>
<h1>Predicting Student Academic Success — Hybrid Model</h1>
<h3>Global Test Metrics</h3>
<p>AUROC={au:.3f} | F1={f1:.3f} | P={p:.3f} | R={r:.3f} | Acc={acc:.3f} | Threshold={float(np.round(test_df['y_pred'].mean(),3))}</p>
<h3>By Country</h3>{by_country.to_html(index=False)}
<h3>By Region</h3>{by_region.to_html(index=False)}
<h3>By Institution</h3>{by_inst.to_html(index=False)}
<h3>Fairness Gaps</h3><pre>{fairness.to_string()}</pre>
<h3>Calibration bins</h3>{cal_df.to_html(index=False)}
<h3>Confusion Matrix</h3>{cm_df.to_html()}
</body></html>
"""
(ART/"dashboard.html").write_text(html, encoding="utf-8")
print("Artifacts saved to:", ART)
print("Open:", ART/"dashboard.html")


Artifacts saved to: C:\Users\kule9\final\artifacts
Open: C:\Users\kule9\final\artifacts\dashboard.html
