In [20]:
import numpy as np
import pandas as pd

# Load the training table and drop the 'ID' column when it is present.
train_df = pd.read_csv('train.csv')
train_df = train_df.drop(columns=['ID']) if 'ID' in train_df.columns else train_df

# Separate the target column 'Y' from the predictor columns.
y_full = train_df['Y'].values
X_full = train_df.drop(columns=['Y'])

# Impute missing values with each column's median, then switch to a raw array.
X_filled = X_full.fillna(X_full.median(numeric_only=True))
X_values = X_filled.values

# Column-wise standardisation statistics; the epsilon keeps a zero-variance
# column from producing a division by zero.
mu = X_values.mean(axis=0)
sigma = X_values.std(axis=0) + 1e-8


def std(a):
    """Standardise `a` column-wise with the training statistics `mu` and `sigma`."""
    return (a - mu) / sigma


X_base = std(X_values)

In [26]:
class Model:
    """Random forest for binary (0/1) labels, implemented with NumPy only.

    Every tree is a nested ``dict`` grown on a bootstrap sample. Candidate
    splits are the 10/25/50/75/90th percentiles of a random feature subset and
    are ranked by Gini impurity decrease. While training, out-of-bag (OOB)
    accuracy is evaluated every 10 trees and training stops early once it has
    failed to improve for ``patience`` consecutive evaluations.

    Hyperparameters are plain attributes; override them after construction
    (for example with ``setattr``) and before calling ``fit``.
    """

    def __init__(self):
        self.n_estimators = 400        # upper bound on the number of trees
        self.max_depth = 20
        self.min_samples_split = 2
        self.min_samples_leaf = 1
        self.max_features = 'sqrt'     # 'sqrt' -> int(sqrt(n_features)); anything else -> all features
        self.bootstrap = True          # False trains each tree on every row (no OOB estimate)
        self.trees = []
        self.oob_indices = []          # per tree: row indices left out of its bootstrap sample
        self.feature_importance_ = None
        self.patience = 10             # non-improving OOB evaluations tolerated before stopping
        self.best_oob = -1
        self.no_improve = 0
        # None keeps drawing from the global ``np.random`` stream; an int
        # makes ``fit`` reproducible through a private RandomState.
        self.random_state = None
        self._rng = None

    def _random_source(self):
        """Return the private generator set by ``fit``, or the global ``np.random``."""
        rng = getattr(self, '_rng', None)
        return np.random if rng is None else rng

    def _gini_impurity(self, y):
        """Gini impurity ``2p(1-p)`` where ``p`` is the share of labels equal to 1."""
        if len(y) == 0:
            return 0
        p = np.sum(y == 1) / len(y)
        return 2 * p * (1 - p)

    def _information_gain(self, y, left_y, right_y):
        """Impurity decrease of splitting ``y`` into ``left_y`` and ``right_y``.

        Despite the name, the criterion is Gini impurity, not entropy.
        """
        n = len(y)
        n_left, n_right = len(left_y), len(right_y)
        if n == 0:
            return 0
        parent_gini = self._gini_impurity(y)
        left_gini = self._gini_impurity(left_y)
        right_gini = self._gini_impurity(right_y)
        weighted_gini = (n_left / n) * left_gini + (n_right / n) * right_gini
        return parent_gini - weighted_gini

    def _build_tree(self, X, y, depth=0, importance_tracker=None):
        """Recursively grow one tree and return it as a nested dict.

        Args:
            X: 2-D feature array for the rows reaching this node.
            y: labels aligned with ``X``.
            depth: depth of this node (the root is 0).
            importance_tracker: optional per-feature array; the gain of every
                split made is added in place at the index of the split feature.

        Returns:
            ``{'leaf': True, 'prediction': ...}`` or
            ``{'leaf': False, 'feature', 'threshold', 'left', 'right'}``.
        """
        n_samples, n_features = X.shape
        if (depth >= self.max_depth or
                n_samples < self.min_samples_split or
                len(np.unique(y)) == 1):
            return {'leaf': True, 'prediction': np.round(np.mean(y))}

        max_features = int(np.sqrt(n_features)) if self.max_features == 'sqrt' else n_features
        feature_indices = self._random_source().choice(n_features, max_features, replace=False)
        best_gain = -1
        best_feature = None
        best_threshold = None

        for feature_idx in feature_indices:
            column = X[:, feature_idx]
            # Only a handful of percentile cut points are tried per feature.
            thresholds = np.percentile(column, [10, 25, 50, 75, 90])
            for threshold in thresholds:
                left_mask = column <= threshold
                right_mask = ~left_mask
                if (np.sum(left_mask) < self.min_samples_leaf or
                        np.sum(right_mask) < self.min_samples_leaf):
                    continue
                gain = self._information_gain(y, y[left_mask], y[right_mask])
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold

        # No candidate produced two sufficiently large children.
        if best_feature is None:
            return {'leaf': True, 'prediction': np.round(np.mean(y))}

        if importance_tracker is not None:
            importance_tracker[best_feature] += best_gain

        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask
        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1, importance_tracker)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1, importance_tracker)
        return {
            'leaf': False,
            'feature': best_feature,
            'threshold': best_threshold,
            'left': left_tree,
            'right': right_tree
        }

    def _predict_tree(self, tree, X):
        """Return the leaf prediction of ``tree`` for every row of ``X``."""
        if tree['leaf']:
            return np.full(len(X), tree['prediction'])
        predictions = np.zeros(len(X))
        left_mask = X[:, tree['feature']] <= tree['threshold']
        right_mask = ~left_mask
        if np.any(left_mask):
            predictions[left_mask] = self._predict_tree(tree['left'], X[left_mask])
        if np.any(right_mask):
            predictions[right_mask] = self._predict_tree(tree['right'], X[right_mask])
        return predictions

    def fit(self, X, y):
        """Train the forest with OOB-based early stopping.

        Populates ``trees``, ``oob_indices``, ``best_oob`` and
        ``feature_importance_`` (split gains summed over all trees and
        normalised to sum to 1, or all zeros when no split was made).

        Args:
            X: 2-D array of shape (n_samples, n_features).
            y: 1-D array of 0/1 labels aligned with ``X``.
        """
        print("Training with OOB-based Early Stopping")
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        self.oob_indices = []
        self.trees = []
        self.best_oob = -1
        self.no_improve = 0

        seed = getattr(self, 'random_state', None)
        self._rng = None if seed is None else np.random.RandomState(seed)
        rng = self._random_source()

        total_importance = np.zeros(n_features)
        # Running OOB tallies: each tree votes once, right after it is built,
        # instead of re-predicting every earlier tree at every checkpoint.
        oob_votes = np.zeros(n_samples)
        oob_counts = np.zeros(n_samples)
        all_rows = np.arange(n_samples)

        for i in range(self.n_estimators):
            if self.bootstrap:
                indices = rng.choice(n_samples, n_samples, replace=True)
                oob_idx = np.setdiff1d(all_rows, indices)
            else:
                indices = all_rows
                oob_idx = np.array([], dtype=int)
            self.oob_indices.append(oob_idx)

            X_bootstrap, y_bootstrap = X[indices], y[indices]
            importance_tracker = np.zeros(n_features)
            tree = self._build_tree(X_bootstrap, y_bootstrap, importance_tracker=importance_tracker)
            self.trees.append(tree)
            total_importance += importance_tracker

            if len(oob_idx) > 0:
                # oob_idx holds unique indices, so in-place fancy += is safe.
                oob_votes[oob_idx] += self._predict_tree(tree, X[oob_idx])
                oob_counts[oob_idx] += 1

            if (i + 1) % 10 == 0:
                # Score only rows that received at least one OOB vote; rows
                # never left out have no estimate and must not count as class 0.
                scored = oob_counts > 0
                if not np.any(scored):
                    continue
                oob_preds = (oob_votes[scored] / oob_counts[scored] > 0.5).astype(int)
                oob_acc = np.mean(oob_preds == y[scored])
                print(f"[{i + 1}] OOB Accuracy: {oob_acc*100:.2f}%")

                if oob_acc > self.best_oob + 1e-6:
                    self.best_oob = oob_acc
                    self.no_improve = 0
                else:
                    self.no_improve += 1
                    if self.no_improve >= self.patience:
                        print(f"Early stopping at tree {i + 1} (OOB Accuracy stagnated)")
                        break

        total = np.sum(total_importance)
        self.feature_importance_ = total_importance / total if total > 0 else np.zeros(n_features)
        print("Model training completed.\n")

    def _get_oob_predictions(self, X, n_trees):
        """Majority OOB vote of the first ``n_trees`` trees for every row of ``X``.

        Rows that were never out-of-bag for any of those trees are returned
        as 0. ``fit`` keeps its own running tallies and does not call this.
        """
        n_samples = X.shape[0]
        votes = np.zeros(n_samples)
        counts = np.zeros(n_samples)
        for i in range(n_trees):
            oob_idx = self.oob_indices[i]
            if len(oob_idx) == 0:
                continue
            preds = self._predict_tree(self.trees[i], X[oob_idx])
            votes[oob_idx] += preds
            counts[oob_idx] += 1
        final = np.zeros(n_samples)
        mask = counts > 0
        final[mask] = (votes[mask] / counts[mask] > 0.5).astype(int)
        return final

    def predict_proba(self, X):
        """Fraction of trees voting for class 1, for every row of ``X``.

        Raises:
            ValueError: if the model holds no trees (``fit`` was not called).
        """
        if len(self.trees) == 0:
            raise ValueError("Model has no trees; call fit() before predicting.")
        predictions = np.zeros(X.shape[0])
        for tree in self.trees:
            predictions += self._predict_tree(tree, X)
        return predictions / len(self.trees)

    def predict(self, X):
        """Predict 1 where more than half of the trees vote 1, else 0."""
        return (self.predict_proba(X) > 0.5).astype(int)


def k_fold_indices(n_samples, k, seed=42):
    """Shuffle ``range(n_samples)`` and cut it into ``k`` consecutive folds.

    The first ``n_samples % k`` folds hold one more element than the rest, so
    fold sizes differ by at most one.

    Args:
        n_samples: number of rows to partition.
        k: number of folds.
        seed: seed for the NumPy generator that shuffles the indices.

    Returns:
        A list of ``k`` index arrays that together cover every row once.
    """
    shuffled = np.random.default_rng(seed).permutation(n_samples)
    base, extra = divmod(n_samples, k)
    sizes = [base + 1 if pos < extra else base for pos in range(k)]
    # Cumulative fold boundaries: fold `pos` spans edges[pos]:edges[pos + 1].
    edges = np.cumsum([0] + sizes)
    return [shuffled[edges[pos]: edges[pos + 1]] for pos in range(k)]

def cross_val_score(X, y, params, k=3):
    """Estimate accuracy of a ``Model`` configured with ``params`` via k-fold CV.

    Args:
        X: 2-D feature array.
        y: label array aligned with ``X``.
        params: mapping of ``Model`` attribute names to the values to set
            on each freshly constructed model.
        k: number of folds.

    Returns:
        Mean validation accuracy over the folds. Per-fold accuracies and the
        mean with its standard deviation are printed as a side effect.
    """
    folds = k_fold_indices(len(X), k)
    print(f"Cross-validation (k={k}) started...")

    scores = []
    for i, val_idx in enumerate(folds):
        print(f"Fold {i + 1}/{k}")
        # Training rows are every fold except the one held out.
        train_idx = np.hstack(folds[:i] + folds[i + 1:])

        model = Model()
        for key in params:
            setattr(model, key, params[key])
        model.fit(X[train_idx], y[train_idx])

        acc = np.mean(model.predict(X[val_idx]) == y[val_idx])
        scores.append(acc)
        print(f"Fold {i + 1} Accuracy: {acc*100:.2f}%")

    mean_score, std_score = np.mean(scores), np.std(scores)
    print(f"\nCV Mean Accuracy: {mean_score*100:.2f}% ± {std_score*100:.2f}%\n")
    return mean_score

In [27]:
# Feature Engineering
# Hard-coded (i, j) column-index pairs into the standardized base feature
# matrix; add_interactions_combined appends one product column
# X[:, i] * X[:, j] per pair.
# NOTE(review): presumably chosen from a correlation analysis that is not
# shown in this notebook — confirm.
correlated_pairs = [(4, 9), (3, 9), (10, 16), (11, 16), (4, 10), (6, 8)]

def add_interactions_combined(X, important_indices, correlated_pairs):
    """Append pairwise interaction columns to ``X`` and return a new array.

    Columns are appended in this order:
      1. products ``X[:, a] * X[:, b]`` for every unordered pair of
         ``important_indices`` (first index earlier in the list);
      2. ratios ``X[:, a] / (X[:, b] + 1e-8)`` for every ordered pair of
         distinct positions in ``important_indices``;
      3. products ``X[:, i] * X[:, j]`` for every ``(i, j)`` in
         ``correlated_pairs``.

    Args:
        X: 2-D feature array; it is not modified.
        important_indices: sequence of column indices into ``X``.
        correlated_pairs: iterable of ``(i, j)`` column-index pairs.

    Returns:
        A new array holding the original columns followed by the new ones.
    """
    n_important = len(important_indices)
    # Collect the new columns and stack once at the end; stacking inside the
    # loops would re-copy the whole growing matrix for every added column.
    extra_columns = []

    for i in range(n_important):
        for j in range(i + 1, n_important):
            fi, fj = important_indices[i], important_indices[j]
            extra_columns.append(X[:, fi] * X[:, fj])

    for i in range(n_important):
        for j in range(n_important):
            if i == j:
                continue
            fi, fj = important_indices[i], important_indices[j]
            # The epsilon avoids an exact division by zero.
            extra_columns.append(X[:, fi] / (X[:, fj] + 1e-8))

    for i, j in correlated_pairs:
        extra_columns.append(X[:, i] * X[:, j])

    if not extra_columns:
        return X.copy()
    return np.column_stack([X] + extra_columns)

# Fit a preliminary forest (max depth 15) on the standardized base features;
# it is used only to rank those features by importance.
init_model = Model()
init_model.max_depth = 15
init_model.fit(X_base, y_full)

# Keep the five most important base features as the interaction candidates.
init_importance = init_model.feature_importance_
ascending_order = np.argsort(init_importance)
top_feats = ascending_order[::-1][:5]

X_eng = add_interactions_combined(X_base, top_feats, correlated_pairs)

Training with OOB-based Early Stopping
[10] OOB Accuracy: 64.08%
[20] OOB Accuracy: 68.42%
[30] OOB Accuracy: 70.55%
[40] OOB Accuracy: 71.66%
[50] OOB Accuracy: 72.17%
[60] OOB Accuracy: 72.79%
[70] OOB Accuracy: 73.06%
[80] OOB Accuracy: 73.34%
[90] OOB Accuracy: 73.79%
[100] OOB Accuracy: 73.96%
[110] OOB Accuracy: 74.24%
[120] OOB Accuracy: 74.46%
[130] OOB Accuracy: 74.78%
[140] OOB Accuracy: 75.02%
[150] OOB Accuracy: 75.06%
[160] OOB Accuracy: 75.17%
[170] OOB Accuracy: 75.17%
[180] OOB Accuracy: 75.31%
[190] OOB Accuracy: 75.52%
[200] OOB Accuracy: 75.48%
[210] OOB Accuracy: 75.42%
[220] OOB Accuracy: 75.52%
[230] OOB Accuracy: 75.49%
[240] OOB Accuracy: 75.90%
[250] OOB Accuracy: 76.01%
[260] OOB Accuracy: 76.04%
[270] OOB Accuracy: 76.29%
[280] OOB Accuracy: 76.11%
[290] OOB Accuracy: 76.14%
[300] OOB Accuracy: 76.24%
[310] OOB Accuracy: 76.17%
[320] OOB Accuracy: 76.06%
[330] OOB Accuracy: 76.05%
[340] OOB Accuracy: 76.12%
[350] OOB Accuracy: 76.20%
[360] OOB Accuracy: 76.15

In [28]:
# Hyperparameter Search
# Every combination in param_grid is scored with 10-fold cross-validation for
# each candidate number of selected features in k_list.
param_grid = {
    'max_depth': [20],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
}
k_list = [36, 38, 40, 42, 44]

best_score = -1
best_params = None
best_k = None
best_idx = None

# Rank all engineered features once with a forest trained on the full matrix.
# NOTE(review): this ranking is fit on every labelled row, and the same rows
# are cross-validated below, so the reported CV accuracy is optimistically
# biased by the feature-selection step.
full_model = Model()
full_model.fit(X_eng, y_full)
final_importance = full_model.feature_importance_
# Most important first; computed once instead of once per k_feat.
importance_order = np.argsort(final_importance)[::-1]

for k_feat in k_list:
    sel_idx = importance_order[:k_feat]
    X_sel = X_eng[:, sel_idx]

    for md in param_grid['max_depth']:
        for mss in param_grid['min_samples_split']:
            for msl in param_grid['min_samples_leaf']:
                params = {'max_depth': md, 'min_samples_split': mss, 'min_samples_leaf': msl}
                score = cross_val_score(X_sel, y_full, params, k=10)
                print(f'k={k_feat} depth={md} split={mss} leaf={msl} -> CV Acc {score*100:.2f}%')

                if score > best_score:
                    best_score = score
                    best_params = params
                    best_k = k_feat
                    best_idx = sel_idx

# Fail with a clear message instead of an AttributeError on None below.
if best_params is None:
    raise RuntimeError("Hyperparameter search produced no valid configuration.")

# Refit the winning configuration on all rows, restricted to the selected columns.
final_model = Model()
for attr_name, attr_value in best_params.items():
    setattr(final_model, attr_name, attr_value)
X_final = X_eng[:, best_idx]
final_model.fit(X_final, y_full)

print(f'\nBest k = {best_k}')
print(f'Best params = {best_params}')
print(f'Best CV Accuracy = {best_score*100:.2f}%')

# Column labels follow the layout built by add_interactions_combined: base
# columns (X*), then interaction columns (I*), then correlated-pair products
# (C*). Counts are derived from the matrices rather than hard-coded.
n_base_feats = X_base.shape[1]
n_corr_feats = len(correlated_pairs)
n_inter_feats = X_eng.shape[1] - n_base_feats - n_corr_feats

feat_names = (
    [f'X{i}' for i in range(n_base_feats)]
    + [f'I{i}' for i in range(n_inter_feats)]
    + [f'C{i}' for i in range(n_corr_feats)]
)

print(f"\n Top {best_k} features selected by importance:\n")
for rank, idx in enumerate(best_idx):
    name = feat_names[idx] if idx < len(feat_names) else f'F{idx}'
    importance = final_importance[idx]
    print(f"{rank+1:2d}. {name:>4}  | importance = {importance:.4f}")


Training with OOB-based Early Stopping
[10] OOB Accuracy: 68.84%
[20] OOB Accuracy: 72.00%
[30] OOB Accuracy: 72.91%
[40] OOB Accuracy: 74.00%
[50] OOB Accuracy: 74.41%
[60] OOB Accuracy: 75.02%
[70] OOB Accuracy: 75.56%
[80] OOB Accuracy: 75.61%
[90] OOB Accuracy: 75.48%
[100] OOB Accuracy: 75.66%
[110] OOB Accuracy: 75.88%
[120] OOB Accuracy: 76.10%
[130] OOB Accuracy: 76.16%
[140] OOB Accuracy: 76.17%
[150] OOB Accuracy: 76.15%
[160] OOB Accuracy: 76.38%
[170] OOB Accuracy: 76.49%
[180] OOB Accuracy: 76.59%
[190] OOB Accuracy: 76.74%
[200] OOB Accuracy: 76.69%
[210] OOB Accuracy: 76.62%
[220] OOB Accuracy: 76.65%
[230] OOB Accuracy: 76.72%
[240] OOB Accuracy: 76.72%
[250] OOB Accuracy: 76.65%
[260] OOB Accuracy: 76.84%
[270] OOB Accuracy: 77.01%
[280] OOB Accuracy: 76.95%
[290] OOB Accuracy: 76.94%
[300] OOB Accuracy: 76.91%
[310] OOB Accuracy: 76.98%
[320] OOB Accuracy: 77.04%
[330] OOB Accuracy: 76.98%
[340] OOB Accuracy: 76.85%
[350] OOB Accuracy: 76.90%
[360] OOB Accuracy: 76.78

: 

: 

In [None]:
# Load Test Data
test_df = pd.read_csv('test.csv')

# Use the 'ID' column for the submission when it exists, else the row index.
if 'ID' in test_df.columns:
    test_ids = test_df['ID'].values
else:
    test_ids = test_df.index
X_test_raw = test_df.drop(columns=['ID'], errors='ignore')

# Apply the training-set preprocessing: impute with the training medians,
# then standardise with the training mean/std through `std`.
X_test_filled = X_test_raw.fillna(X_full.median(numeric_only=True))
X_test_std = std(X_test_filled.values)

# Rebuild the engineered columns and keep those chosen by the search.
X_test_eng = add_interactions_combined(X_test_std, top_feats, correlated_pairs)
X_test_final = X_test_eng[:, best_idx]

preds = final_model.predict(X_test_final)

submission_df = pd.DataFrame({'ID': test_ids, 'Potability': preds})
submission_df.to_csv('submission.csv', index=False)
print("Saved predictions to submission.csv")