In [187]:
# imports
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, f1_score, accuracy_score, roc_auc_score, precision_recall_curve, precision_score, recall_score
from sklearn.base import clone
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [240]:
import warnings

# Silence one specific scikit-learn FutureWarning and nothing else:
# - `message` is a regular expression matched against the start of the warning text
# - `category` limits the filter to FutureWarning
# - `module` is a regular expression for the module the warning is raised from
warnings.filterwarnings(
    "ignore",
    message="Currently, when `keep_empty_feature=False` and `strategy=\"constant\"`",
    category=FutureWarning,
    module=r"sklearn\.impute\._base",
)

In [188]:
# Remove pandas' display limits (None = unlimited) so that every column and
# every row is shown whenever a DataFrame is displayed or printed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [189]:
# baseline
# Load the working dataset that every later cell derives its data from.
full_csv = pd.read_csv('../data/s9/s9_working.csv')

# Majority class = the most frequent label in the target column.
# np.argmax on the raw column would return the *position* of the first maximal
# value, which is unrelated to class frequency. value_counts() tallies each
# label and idxmax() returns the label with the highest count.
majority_class = full_csv['outcome'].value_counts().idxmax()
print(f"majority class: {int(majority_class)}")

majority class: 0


In [190]:
# preprocessing

# Columns that are removed by name.
manual_drop = [
    'rank', 'affinity', 'type',
    'ds2_def_type', 'ds2_invul',
    'as1_heals_one_or_many',
    'as2_def_type', 'as2_invul', 'as2_heals_one_or_many',
]
# Every column whose name ends with the '_PARTIAL' suffix is removed as well.
partial_cols = [name for name in full_csv.columns if name.endswith('_PARTIAL')]
to_drop = manual_drop + partial_cols

# Drop the columns, then turn the '?' placeholder into a real missing value.
# Note: this reassigns full_csv, so re-running the cell without reloading the
# CSV raises a KeyError because the columns are already gone.
full_csv = full_csv.drop(columns=to_drop).replace(to_replace='?', value=np.nan)

In [191]:
# the target var
# 'outcome' is the label; every remaining column is a candidate feature.
X = full_csv.drop(columns=['outcome'])
y = full_csv['outcome']

# One shape per line: features first, then target.
print(X.shape, y.shape, sep='\n')

(201, 571)
(201,)


In [192]:
# collect features

# splitting by numeric and cat features. note: all features with numbers are num in this dataset
def split_num_cat_by_value(X: pd.DataFrame):
    """Partition the columns of X into numeric and categorical feature names.

    A column counts as numeric when every non-missing value in it can be parsed
    as a number (so numeric strings qualify, and so does a column that is
    entirely missing). All other columns are categorical.

    Returns:
        (num_ftrs, cat_ftrs): two lists of column labels, each in the original
        column order.
    """
    num_ftrs, cat_ftrs = [], []

    for label, column in X.items():
        parsed = pd.to_numeric(column, errors="coerce")
        # Coercion turns unparseable values into NaN, so the column is numeric
        # exactly when parsing did not lose any non-missing entry.
        if parsed.notna().sum() == column.notna().sum():
            num_ftrs.append(label)
        else:
            cat_ftrs.append(label)

    return num_ftrs, cat_ftrs

num_ftrs, cat_ftrs = split_num_cat_by_value(X)

# This dataset has no ordinal features, so both ordinal lists stay empty.
ordinal_ftrs, ordinal_cats = [], []

# Sanity check: no column may be classified as both numeric and categorical.
assert set(num_ftrs).isdisjoint(cat_ftrs)

In [193]:
# Look only at the numeric features and find where values are still missing.
test = X.drop(columns=cat_ftrs)

col_has_na = test.isna().any()
cols_with_missing = test.columns[col_has_na].tolist()
print(cols_with_missing)

# Rows that have a missing value in at least one of those columns.
row_has_na = test[cols_with_missing].isna().any(axis=1)
bad_rows = test.index[row_has_na]

print("num na rows:", len(bad_rows))
# Only the first 50 indices are shown to keep the output short.
print("na row indices:", bad_rows.tolist()[:50])

['ttk_ds1_to_atk_str']
num na rows: 3
na row indices: [38, 39, 40]


In [194]:
# Fill the missing ttk_ds1_to_atk_str values with a large sentinel (1e6):
# a non-attacking unit is treated as having a very high ttk.
# NOTE(review): this column is numeric and is later passed through
# StandardScaler in make_preprocessor, so the sentinel will dominate its
# mean/std — confirm this is the intended encoding.
X["ttk_ds1_to_atk_str"] = X["ttk_ds1_to_atk_str"].fillna(1e6)

# Display the number of feature columns after the fill.
len(X.columns)

571

In [195]:
# Print the shapes of the feature matrix and the target again, after the fill above.
print(X.shape)
print(y.shape)

(201, 571)
(201,)


In [235]:
# Seeds shared by the cells below: each one drives a train/test split and,
# inside one_model, the CV splitter and the model's own random_state.
random_states = [0,1,2,3,5]
# Further seeds, currently disabled:
#,8,9,42,69,100

In [197]:
def eval_from_proba(y_true, y_proba, thr=0.5):
    """Compute classification metrics from positive-class probabilities.

    Args:
        y_true: true binary labels.
        y_proba: array-like of positive-class scores; must support element-wise
            ``>=`` and ``.astype`` (e.g. a NumPy array).
        thr: decision threshold. The comparison is inclusive, so a score equal
            to ``thr`` is predicted as the positive class.

    Returns:
        dict with keys "acc", "roc_auc", "precision", "recall", "f1".
        ROC AUC is computed from the raw scores; the other metrics use the
        thresholded labels, with ``zero_division=0``.
    """
    y_hat = (y_proba >= thr).astype(int)

    metrics = {"acc": accuracy_score(y_true, y_hat)}
    metrics["roc_auc"] = roc_auc_score(y_true, y_proba)

    # These three metrics share the same call signature.
    label_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in label_scorers:
        metrics[key] = scorer(y_true, y_hat, zero_division=0)

    return metrics

def baseline_over_seeds(X, y, random_states, test_size=0.1):
    """Evaluate a constant-probability baseline on one stratified split per seed.

    For every seed the data is split with ``train_test_split`` (stratified on
    ``y``). The baseline predicts the mean of the training labels as the
    positive-class probability for every test row and is scored with
    ``eval_from_proba`` at its default threshold. Because that threshold is
    inclusive, a training prior exactly equal to it predicts every test row as
    positive.

    Returns:
        (out, summary): ``out`` has one row per seed with columns ``seed``,
        ``p_train`` and ``test_<metric>``; ``summary`` holds the mean and std
        of every column except ``seed``.
    """
    records = []

    for seed in random_states:
        # Only the label halves of the split are needed for this baseline.
        split = train_test_split(X, y, test_size=test_size, random_state=seed, stratify=y)
        y_other, y_test = split[2], split[3]

        prior = float(np.mean(y_other))
        constant_proba = np.full(shape=len(y_test), fill_value=prior, dtype=float)

        record = {"seed": seed, "p_train": prior}
        for metric_name, value in eval_from_proba(y_test, constant_proba).items():
            record[f"test_{metric_name}"] = value
        records.append(record)

    out = pd.DataFrame(records)
    summary = out.drop(columns=["seed"]).agg(["mean", "std"])
    return out, summary

# Run the constant-probability baseline over all seeds, show mean/std, and
# persist the per-seed rows.
baseline_df, baseline_summary = baseline_over_seeds(X, y, random_states)
print(baseline_summary)
# NOTE(review): this CSV is written with the default index column, whereas the
# per-model CSVs pass index=False — confirm which layout readers expect.
baseline_df.to_csv('../results/baseline.csv')

      p_train  test_acc  test_roc_auc  test_precision  test_recall   test_f1
mean      0.5   0.47619           0.5         0.47619          1.0  0.645161
std       0.0   0.00000           0.0         0.00000          0.0  0.000000


for each random state:
    initialize the parameter grids for these six models
    - logistic regression (L2), logistic regression (elastic net)
    - random forest, svc, xgboost, knn
    train all six models over this random state

In [219]:
# make the preprocessors for data
def make_preprocessor(num_ftrs, cat_ftrs) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer for the given feature lists.

    Numeric columns (``num_ftrs``) are standardized. Categorical columns
    (``cat_ftrs``) first have missing values replaced by the literal category
    'missing' and are then one-hot encoded to a dense array; categories not
    seen during fit are ignored at transform time. No imputation is applied to
    the numeric columns.
    """
    scale_numeric = Pipeline(steps=[('scalar', StandardScaler())])

    # NaNs become their own 'missing' category before encoding.
    fill_missing = SimpleImputer(strategy='constant', fill_value='missing')
    one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encode_categorical = Pipeline(steps=[
        ('imputer', fill_missing),
        ('onehot', one_hot),
    ])

    column_steps = [
        ('num', scale_numeric, num_ftrs),
        ('cat', encode_categorical, cat_ftrs),
    ]
    return ColumnTransformer(transformers=column_steps)

In [241]:
# Sanity check: for every seed, fit the preprocessor on the training part only
# and confirm that neither transformed matrix contains NaNs. The number of
# output columns can differ between seeds because the one-hot categories are
# learned from each seed's training rows.
for r in random_states:
    X_other, X_test, y_other, y_test = train_test_split(
        X, y, test_size=0.1, random_state=r, stratify=y
    )

    prep = make_preprocessor(num_ftrs, cat_ftrs)
    prep.fit(X_other)

    Xt_other, Xt_test = prep.transform(X_other), prep.transform(X_test)

    print(Xt_other.shape)
    print(Xt_test.shape)

    assert not np.isnan(Xt_other).any()
    assert not np.isnan(Xt_test).any()

    print('\n')

(180, 811)
(21, 811)


(180, 820)
(21, 820)


(180, 818)
(21, 818)


(180, 817)
(21, 817)


(180, 820)
(21, 820)




In [237]:
# function to make models n grids
def make_models_and_grids(random_state: int) -> tuple:
    """Create the candidate estimators and their hyper-parameter grids.

    Args:
        random_state: seed passed to every estimator that accepts one
            (KNN takes none).

    Returns:
        (models, grids): two dicts sharing the same keys ('logreg_l2',
        'logreg_elastic', 'rfc', 'svc', 'knn', 'xgb'). ``models`` maps each key
        to an unfitted estimator; ``grids`` maps it to a parameter grid whose
        names carry the ``clf__`` prefix, i.e. they address a pipeline step
        named 'clf'.
    """
    estimators = {}
    estimators['logreg_l2'] = LogisticRegression(
        penalty='l2', solver='lbfgs', max_iter=10000, random_state=random_state)
    # The elastic-net penalty requires the saga solver; l1_ratio comes from the grid.
    estimators['logreg_elastic'] = LogisticRegression(
        penalty='elasticnet', solver='saga', max_iter=10000, random_state=random_state, n_jobs=3)
    estimators['rfc'] = RandomForestClassifier(
        n_estimators=500, random_state=random_state, n_jobs=3)
    estimators['svc'] = SVC(kernel='rbf', probability=True, random_state=random_state)
    estimators['knn'] = KNeighborsClassifier(n_jobs=3)
    estimators['xgb'] = xgb.XGBClassifier(
        random_state=random_state, eval_metric='logloss', tree_method='hist', n_jobs=3)

    param_grids = {
        "logreg_l2": dict(
            clf__C=[0.01, 0.1, 1, 10, 100],
            clf__class_weight=[None, "balanced"],
        ),
        "logreg_elastic": dict(
            clf__C=[0.01, 0.1, 1, 10, 100],
            clf__l1_ratio=[0.1, 0.5, 0.9],
            clf__class_weight=[None, "balanced"],
        ),
        # n_estimators in the grid overrides the constructor value above.
        "rfc": dict(
            clf__n_estimators=[300, 800],
            clf__max_depth=[None, 5, 10],
            clf__min_samples_leaf=[1, 3, 5, 8],
            clf__max_features=["sqrt", 0.25, 0.5],
            clf__class_weight=[None, "balanced"],
        ),
        "svc": dict(
            clf__C=[0.1, 1, 10],
            clf__gamma=["scale", 0.01, 0.1],
            clf__class_weight=[None, "balanced"],
        ),
        "knn": dict(
            clf__n_neighbors=[3, 5, 9, 15],
            clf__weights=["uniform", "distance"],
            clf__p=[1, 2],
        ),
        "xgb": dict(
            clf__n_estimators=[200, 600],
            clf__max_depth=[2, 3, 5],
            clf__learning_rate=[0.03, 0.1],
            clf__subsample=[0.6, 1.0],
            clf__colsample_bytree=[0.6, 1.0],
            clf__reg_lambda=[1.0, 10.0],
        ),
    }

    return estimators, param_grids

In [222]:
# return 1d array of pos class
def get_cont_scores(fitted_pipeline, X):
    """Return a 1-D array of continuous positive-class scores for X.

    ``decision_function`` is preferred when the estimator exposes it; otherwise
    the second column of ``predict_proba`` is used.

    Args:
        fitted_pipeline: a fitted estimator or pipeline.
        X: the samples to score.

    Returns:
        A flattened NumPy array with one score per sample.

    Raises:
        AttributeError: if the estimator offers neither ``decision_function``
            nor ``predict_proba``. Previously this case fell through and
            returned None, which only failed later inside the metric call.
    """
    if hasattr(fitted_pipeline, "decision_function"):
        scores = fitted_pipeline.decision_function(X)
        return np.asarray(scores).ravel()

    if hasattr(fitted_pipeline, "predict_proba"):
        proba = fitted_pipeline.predict_proba(X)
        # Column 1 holds the probability of the positive class.
        return np.asarray(proba)[:, 1].ravel()

    raise AttributeError(
        f"{type(fitted_pipeline).__name__} provides neither decision_function "
        "nor predict_proba; cannot compute continuous scores"
    )

In [229]:
# run a model over all randoms
def one_model(X, y, model_n: str, num_ftrs, cat_ftrs, random_states, test_size=0.1, scoring='accuracy'):
    """Tune and evaluate one model type over several random seeds.

    For every seed: make a stratified train/test split, grid-search the
    preprocessing + classifier pipeline with 5-fold stratified CV on the
    training part, then score the refitted best pipeline on the held-out test
    part.

    Args:
        X, y: full feature frame and binary target.
        model_n: key into the dicts returned by ``make_models_and_grids``.
        num_ftrs, cat_ftrs: column lists handed to ``make_preprocessor``.
        random_states: iterable of seeds; each seed drives the split, the CV
            splitter and the model's own random_state.
        test_size: fraction of rows held out for testing.
        scoring: GridSearchCV scoring used to pick the best parameters.

    Returns:
        (out, summary): ``out`` has one row per seed (seed, model, best CV
        score, test metrics, best params); ``summary`` is the mean/std of the
        numeric metrics, one metric per row. The ``best_cv_acc`` column holds
        the best CV score under ``scoring``, whatever metric that is.
    """
    rows = []

    for r in random_states:
        models, grids = make_models_and_grids(r)
        base_model = models[model_n]
        param_grid = grids[model_n]

        X_other, X_test, y_other, y_test = train_test_split(
            X, y, test_size=test_size, random_state=r, stratify=y)

        # The preprocessor lives inside the pipeline so that it is re-fitted on
        # the training folds only during cross-validation.
        pipe = Pipeline(steps=[
            ('prep', make_preprocessor(num_ftrs, cat_ftrs)),
            ('clf', clone(base_model)),
        ])

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=r)

        gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=scoring, cv=cv, n_jobs=-1, refit=True)
        gs.fit(X_other, y_other)

        # evaluate on held out test; refit=True means best_estimator_ was
        # retrained on all of X_other with the winning parameters
        best_pipe = gs.best_estimator_
        y_pred = best_pipe.predict(X_test)
        scores = get_cont_scores(best_pipe, X_test)

        # ROC AUC is computed once and stored under a single key; the original
        # dict literal listed 'test_roc_auc' twice. Key order is kept so the
        # CSV column layout does not change.
        rows.append({
            "seed": r,
            'model': model_n,
            'best_cv_acc': gs.best_score_,
            'test_acc': accuracy_score(y_test, y_pred),
            'test_roc_auc': roc_auc_score(y_test, scores),
            'best_params': gs.best_params_,
            "test_precision": precision_score(y_test, y_pred, zero_division=0),
            "test_recall": recall_score(y_test, y_pred, zero_division=0),
            "test_f1": f1_score(y_test, y_pred, zero_division=0),
        })

        # f-prefix added: the message previously printed the literal "{r}"
        print(f'done with random state {r}')

    out = pd.DataFrame(rows)
    metric_cols = ["best_cv_acc", "test_acc", "test_precision", "test_recall", "test_f1","test_roc_auc"]
    summary = out[metric_cols].agg(['mean','std']).T

    return out, summary

In [249]:
# Model keys to run in this pass; only 'xgb' is enabled here.
models = ['xgb']

# All available keys, for reference:
# 'logreg_l2', 'logreg_elastic', 'rfc', 'svc', 'knn', 'xgb'

# For each selected model: tune and evaluate it over every seed, print the
# mean/std summary, and save the per-seed rows to ../results/<model>.csv.
for m in models:
    out, summary = one_model(
        X=X,
        y=y,
        model_n=m,
        num_ftrs=num_ftrs,
        cat_ftrs=cat_ftrs,
        random_states=random_states,
        test_size=0.1,
        scoring="accuracy",
    )

    print(summary)
    out.to_csv(f'../results/{m}.csv', index=False)



                    mean       std
best_cv_acc     0.563333  0.021373
test_acc        0.552381  0.079682
test_precision  0.567215  0.141184
test_recall     0.460000  0.054772
test_f1         0.496524  0.030586
test_roc_auc    0.611818  0.114839
