In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dummy-competition-pro-battle/train_set_dummy.csv
/kaggle/input/dummy-competition-pro-battle/test_set_dummy.csv
/kaggle/input/dummy-competition-pro-battle/sample_solution_dummy.csv


In [16]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, precision_recall_curve, roc_auc_score
from sklearn.impute import SimpleImputer


# Specific models
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score

In [27]:
def load_and_prepare_data(filepath, sample_size=None):
    """
    Load data with optional sampling for quick iterations
    
    Parameters:
    filepath: str - Path to the dataset
    sample_size: int or None - Number of samples to load (None for full dataset)
    
    Returns:
    X, y - Features and target variables
    """
    # Read the data
    df = pd.read_csv(filepath)
    
    # Optional sampling for quick iterations
    if sample_size is not None:
        df = df.sample(n=sample_size, random_state=42)
    
    # Separate features and target
    X = df.drop('target', axis=1)
    y = df['target']
    
    return X, y

# Usage example
X, y = load_and_prepare_data('/kaggle/input/dummy-competition-pro-battle/train_set_dummy.csv')
X = X.drop('row_id', axis=1)

# For quick iterations with sample
sX, sy = load_and_prepare_data('/kaggle/input/dummy-competition-pro-battle/train_set_dummy.csv', sample_size=1000)
sX = sX.drop('row_id', axis=1)



In [5]:
def initialize_models():
    """
    Initialize all models with their hyperparameters
    Returns dictionary of model instances
    """
    models = {
        'xgboost': xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        ),
        'random_forest': RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        ),
        'logistic': LogisticRegression(
            max_iter=1000,
            random_state=42
        )
    }
    return models

In [12]:
# Custom transformer for feature selection based on percentage NOTE: THIS IS JUST A EXAMPLE CLASS, YOU CAN MAKE YOUR OWN TAYLOARED TO YOUR OWN NEEDS. WE RECOMMEND TO USE AI ASSISTANT TO CREATE THIS
class PercentageFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Custom transformer that selects a percentage of features based on importance
    """
    def __init__(self, feature_percentage=0.8):
        self.feature_percentage = feature_percentage
        self.selected_features = None
    
    def fit(self, X, y=None):
        # Calculate feature importance using correlation with target
        if isinstance(X, pd.DataFrame):
            correlations = abs(X.corrwith(pd.Series(y)))
            n_features = int(len(correlations) * self.feature_percentage)
            self.selected_features = correlations.nlargest(n_features).index
        return self
    
    def transform(self, X):
        return X[self.selected_features] if isinstance(X, pd.DataFrame) else X

# Create the main feature processing pipeline
def create_feature_pipeline():
    """
    Creates a complete feature processing pipeline
    """
    feature_pipeline = Pipeline([
        ('feature_selector', PercentageFeatureSelector(feature_percentage=0.8)),
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
        # Add more transformers as needed
    ])
    return feature_pipeline

In [24]:
#Training, this is using k-fold cross validation as it is best for providing the model performance. If the data is large you can use simple training too, change it as you need.
def train_and_evaluate(X, y, models, k=5, model_name='xgboost'):
    """
    Train and evaluate multiple models using K-Fold cross-validation and the feature pipeline.
    
    Parameters:
    X: Features DataFrame
    y: Target Series
    feature_pipeline: Sklearn Pipeline object
    models: Dictionary of model instances
    k: Number of folds for cross-validation (default=5)
    
    Returns:
    Dictionary of results
    """
    # Converting to DataFrame to use iloc
    X_df = pd.DataFrame(X)
    y_df = pd.Series(y)
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    
    results = {name: {'accuracy': [], 'auc_roc': []} for name in models.keys()}
    
    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X_df.iloc[train_idx], X_df.iloc[test_idx]
        y_train, y_test = y_df.iloc[train_idx], y_df.iloc[test_idx]
        
        
        for name, model in models.items():
            # Train model
            model.fit(X_train, y_train)
            
            # Predict
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)[:, 1]
            
            # Store metrics
            results[name]['accuracy'].append(accuracy_score(y_test, y_pred))
            results[name]['auc_roc'].append(roc_auc_score(y_test, y_proba))
    
    # Aggregate results
    final_results = {
        name: {
            'accuracy_mean': np.mean(metrics['accuracy']),
            'accuracy_std': np.std(metrics['accuracy']),
            'auc_roc_mean': np.mean(metrics['auc_roc']),
            'auc_roc_std': np.std(metrics['auc_roc']),
        }
        for name, metrics in results.items()
    }
    
    return final_results, results[model_name]['auc_roc']

In [28]:
# Initialize models
models = initialize_models()

# Create feature pipeline
feature_pipeline = create_feature_pipeline()
X_transformed = feature_pipeline.fit_transform(X)

# Train and evaluate
results, _ = train_and_evaluate(X_transformed, y, models)

# Display results
for model_name, metrics in results.items():
    print(f"\nResults for {model_name}:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")


Results for xgboost:
accuracy_mean: 0.5745
accuracy_std: 0.0162
auc_roc_mean: 0.6027
auc_roc_std: 0.0181

Results for random_forest:
accuracy_mean: 0.5605
accuracy_std: 0.0259
auc_roc_mean: 0.5883
auc_roc_std: 0.0253

Results for logistic:
accuracy_mean: 0.5030
accuracy_std: 0.0344
auc_roc_mean: 0.5107
auc_roc_std: 0.0356


In [30]:
#this will help you create solution files for submissions
def predict_and_save(test_file, feature_pipeline, model, output_file="solution.csv"):
    """
    Load the test file, preprocess features, predict probabilities, and save results.

    Parameters:
    test_file (str): Path to the test file (CSV format).
    feature_pipeline (Pipeline): Pretrained feature processing pipeline.
    model (object): Trained model with predict_proba method.
    output_file (str): Name of the output CSV file (default: 'solution.csv').

    Returns:
    None
    """
    # Load test data
    test_data = pd.read_csv(test_file)

    # Extract row IDs
    row_ids = test_data.pop("row_id")

    # Transform features
    X_test_processed = feature_pipeline.transform(test_data)

    # Predict probabilities
    y_pred_proba = model.predict_proba(X_test_processed)[:, 1]  # Probability for positive class

    # Create and save solution file
    submission = pd.DataFrame({"row_id": row_ids, "target": y_pred_proba})
    submission.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")


In [32]:
predict_and_save('/kaggle/input/dummy-competition-pro-battle/test_set_dummy.csv', feature_pipeline, models['xgboost'])

Predictions saved to solution.csv
