In [129]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('/kaggle/input/mldl-2025/train.csv')

if 'ID' in df.columns:
    df = df.drop(columns=['ID'])

features = df.drop(columns=['Y'])
X = features.fillna(features.median(numeric_only=True)).values
y = df['Y'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=32, stratify=y)

mu, sigma = X_train.mean(axis=0), X_train.std(axis=0) + 1e-8
std = lambda a: (a - mu) / sigma
X_train_std, X_test_std = std(X_train), std(X_test)

def add_interactions(X):
    """Add interaction features based on correlation analysis"""
    X_new = X.copy()
    
    # Top correlated pairs from the analysis
    interactions = [
        (4, 9),   # 0.787 correlation
        (3, 9),   # 0.755 correlation
        (10, 16), # 0.695 correlation
        (11, 16), # 0.677 correlation
        (4, 10),  # 0.667 correlation
        (6, 8),   # 0.588 correlation
    ]
    
    # Add multiplication interactions
    for i, j in interactions:
        X_new = np.column_stack([X_new, X[:, i] * X[:, j]])
    
    # Add ratio features for features with high individual importance
    important_features = [11, 13, 15, 10, 6, 17]
    for i in range(len(important_features)-1):
        for j in range(i+1, len(important_features)):
            fi, fj = important_features[i], important_features[j]
            # Avoid division by zero
            ratio = X[:, fi] / (X[:, fj] + 1e-8)
            X_new = np.column_stack([X_new, ratio])
    
    return X_new

# Apply feature engineering
X_train_eng = add_interactions(X_train_std)
X_test_eng = add_interactions(X_test_std)

In [127]:
class Model:
    def __init__(self):
        # Random Forest parameters
        self.n_estimators = 400
        self.max_depth = 20
        self.min_samples_split = 2
        self.min_samples_leaf = 1
        self.max_features = 'sqrt'  # sqrt of total features
        self.bootstrap = True
        
        self.trees = []
        self.feature_indices = []
        
        self.oob_indices = []
        self.patience = 10
        self.best_oob = -1
        self.no_improve = 0
        
    def _gini_impurity(self, y):
        """Calculate Gini impurity"""
        if len(y) == 0:
            return 0
        p = np.sum(y == 1) / len(y)
        return 2 * p * (1 - p)
    
    def _information_gain(self, y, left_y, right_y):
        """Calculate information gain"""
        n = len(y)
        if n == 0:
            return 0
        
        n_left = len(left_y)
        n_right = len(right_y)
        
        parent_gini = self._gini_impurity(y)
        left_gini = self._gini_impurity(left_y)
        right_gini = self._gini_impurity(right_y)
        
        weighted_gini = (n_left / n) * left_gini + (n_right / n) * right_gini
        return parent_gini - weighted_gini
    
    def _build_tree(self, X, y, depth=0):
        """Build a decision tree recursively"""
        n_samples, n_features = X.shape
        
        # Stopping criteria
        if (depth >= self.max_depth or 
            n_samples < self.min_samples_split or
            len(np.unique(y)) == 1):
            # Return leaf node with majority class
            return {'leaf': True, 'prediction': np.round(np.mean(y))}
        
        # Feature subsampling
        if self.max_features == 'sqrt':
            max_features = int(np.sqrt(n_features))
        else:
            max_features = n_features
        
        feature_indices = np.random.choice(n_features, max_features, replace=False)
        
        # Find best split
        best_gain = -1
        best_feature = None
        best_threshold = None
        
        for feature_idx in feature_indices:
            # Try multiple threshold candidates
            thresholds = np.percentile(X[:, feature_idx], [10, 25, 50, 75, 90])
            
            for threshold in thresholds:
                left_mask = X[:, feature_idx] <= threshold
                right_mask = ~left_mask
                
                if np.sum(left_mask) < self.min_samples_leaf or np.sum(right_mask) < self.min_samples_leaf:
                    continue
                
                gain = self._information_gain(y, y[left_mask], y[right_mask])
                
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold
        
        # If no good split found
        if best_feature is None:
            return {'leaf': True, 'prediction': np.round(np.mean(y))}
        
        # Split data
        left_mask = X[:, best_feature] <= best_threshold
        right_mask = ~left_mask
        
        # Build subtrees
        left_tree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_tree = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        
        return {
            'leaf': False,
            'feature': best_feature,
            'threshold': best_threshold,
            'left': left_tree,
            'right': right_tree
        }
    
    def _predict_tree(self, tree, X):
        """Make predictions with a single tree"""
        if tree['leaf']:
            return np.full(len(X), tree['prediction'])
        
        predictions = np.zeros(len(X))
        left_mask = X[:, tree['feature']] <= tree['threshold']
        right_mask = ~left_mask
        
        if np.sum(left_mask) > 0:
            predictions[left_mask] = self._predict_tree(tree['left'], X[left_mask])
        if np.sum(right_mask) > 0:
            predictions[right_mask] = self._predict_tree(tree['right'], X[right_mask])
        
        return predictions
    
    def fit(self, X, y):
        """Fit Random Forest model"""
        n_samples = X.shape[0]
        
        for i in range(self.n_estimators):
            # Bootstrap sampling
            if self.bootstrap:
                indices = np.random.choice(n_samples, n_samples, replace=True)
                X_bootstrap = X[indices]
                y_bootstrap = y[indices]
                oob_idx = np.setdiff1d(np.arange(n_samples), indices)
                self.oob_indices.append(oob_idx)
            else:
                X_bootstrap = X
                y_bootstrap = y
                self.oob_indices.append(np.arange(n_samples))
            
            # Build tree
            tree = self._build_tree(X_bootstrap, y_bootstrap)
            self.trees.append(tree)
            
            # Progress update
            if (i + 1) % 10 == 0:                     
                oob_pred = self._get_oob_predictions(X, i + 1)
                oob_acc = np.mean(oob_pred == y)
                print(f"[{i + 1:3d}] OOB Accuracy = {oob_acc*100:.2f}%")

                ### MOD 1-4: 조기 중단
                if oob_acc > self.best_oob + 1e-6:
                    self.best_oob = oob_acc
                    self.no_improve = 0
                else:
                    self.no_improve += 1
                if self.no_improve >= self.patience:
                    print("Early-stop triggered")
                    break
    
    def _get_oob_predictions(self, X, n_trees):
        n_samples = X.shape[0]
        oob_votes = np.zeros(n_samples)
        oob_counts = np.zeros(n_samples)

        for t in range(n_trees):
            idx = self.oob_indices[t]                # ### MOD 1-5
            if idx.size == 0:
                continue
            preds = self._predict_tree(self.trees[t], X[idx])
            oob_votes[idx] += preds
            oob_counts[idx] += 1

        mask = oob_counts > 0
        oob_final = np.zeros(n_samples, dtype=int)
        oob_final[mask] = (oob_votes[mask] / oob_counts[mask] > 0.5).astype(int)
        return oob_final
    
    def predict_proba(self, X):
        """Get probability predictions"""
        n_samples = X.shape[0]
        predictions = np.zeros(n_samples)
        
        for tree in self.trees:
            predictions += self._predict_tree(tree, X)
        
        return predictions / len(self.trees)
    
    def predict(self, X):
        """Make final predictions"""
        probas = self.predict_proba(X)
        return (probas > 0.5).astype(int)
        
"""

# ================================
# Parameter Grid Search 자동화
# ================================

max_depth_list = [5, 10, 15, 20]
min_samples_split_list = [2, 5, 10]
min_samples_leaf_list = [1, 2, 4, 8]


for md in max_depth_list:
    for mss in min_samples_split_list:
        for msl in min_samples_leaf_list:
            model = Model()
            model.n_estimators = 300
            model.max_depth = md
            model.min_samples_split = mss
            model.min_samples_leaf = msl
            model.max_features = 'sqrt'
            model.bootstrap = True
            
            print(f"\n--- Parameters: max_depth={md}, min_samples_split={mss}, min_samples_leaf={msl} ---")
            
            model.fit(X_train, y_train)

            val_preds = model.predict(X_test)
            val_accuracy = np.mean(val_preds == y_test)
            print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
"""



In [128]:
# Instantiate and train
m = Model()
m.fit(X_train_eng, y_train)

# Predict and evaluate
predictions = m.predict(X_test_eng)

# Accuracy
accuracy = np.mean(predictions == y_test)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

[ 10] OOB Accuracy = 67.12%
[ 20] OOB Accuracy = 69.50%
[ 30] OOB Accuracy = 71.11%
[ 40] OOB Accuracy = 71.84%
[ 50] OOB Accuracy = 72.52%
[ 60] OOB Accuracy = 72.89%
[ 70] OOB Accuracy = 73.56%
[ 80] OOB Accuracy = 73.89%
[ 90] OOB Accuracy = 74.12%
[100] OOB Accuracy = 74.42%
[110] OOB Accuracy = 74.39%
[120] OOB Accuracy = 74.30%
[130] OOB Accuracy = 74.22%
[140] OOB Accuracy = 74.41%
[150] OOB Accuracy = 74.62%
[160] OOB Accuracy = 74.73%
[170] OOB Accuracy = 74.88%
[180] OOB Accuracy = 74.88%
[190] OOB Accuracy = 74.97%
[200] OOB Accuracy = 74.91%
[210] OOB Accuracy = 74.88%
[220] OOB Accuracy = 74.92%
[230] OOB Accuracy = 75.14%
[240] OOB Accuracy = 75.08%
[250] OOB Accuracy = 75.11%
[260] OOB Accuracy = 75.34%
[270] OOB Accuracy = 75.38%
[280] OOB Accuracy = 75.25%
[290] OOB Accuracy = 75.39%
[300] OOB Accuracy = 75.44%
[310] OOB Accuracy = 75.38%
[320] OOB Accuracy = 75.44%
[330] OOB Accuracy = 75.31%
[340] OOB Accuracy = 75.53%
[350] OOB Accuracy = 75.53%
[360] OOB Accuracy =

In [None]:
test_df = pd.read_csv('/kaggle/input/mldl-2025/test.csv')
test_ids = test_df.index  # use row index as ID

X_test = test_df.values

# Predict
preds = m.predict(X_test)
# preds = svm.predict(test_df.values)

# ===== Create and Save Submission =====
submission_df = pd.DataFrame({
    'ID': test_ids,
    'Potability': preds
})

submission_df.to_csv('submission.csv', index=False)
print("Saved predictions to submission.csv")