 # Task 1 Pipeline

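 This modified notebook integrates **Optuna** for automated hyperparameter tuning of an `XGBClassifier` inside a `scikit-learn` pipeline. Correlation-based feature selection is planned but not currently implemented: the model is trained on the fixed `features` list defined in Section 3.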

 **Key Additions:**
 1.  **Optuna:** Used to efficiently search for the best `XGBClassifier` hyperparameters.
 2.  **Feature Selection:** The model currently uses a fixed, hand-picked `features` list (Section 3). Letting the Optuna objective try different feature subsets based on their correlation with the target is not implemented yet.

 ## 1. Imports

In [60]:
import numpy as np
import optuna
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- must be imported before IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    make_scorer,
    roc_auc_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

 ## 2. Load Data and Feature Engineering

In [52]:
# Read the training and test CSV files from the current working directory.
df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

def feature_engineer(data):
    """Return a copy of ``data`` with nine engineered ratio/interaction columns added.

    The input frame is left untouched, so calling this function (or re-running the
    cell that calls it) never changes a DataFrame that another cell still holds.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the raw columns referenced below (``reports_received``,
        ``account_age_days``, ``kill_death_ratio``, ``headshot_percentage``, ...).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus: ``reports_per_day``, ``kdr_x_hs``,
        ``cheating_skill_metric``, ``skill_discrepancy``, ``normalized_reaction_time``,
        ``stealth_score``, ``technical_red_flags``, ``performance_vs_longevity`` and
        ``social_isolation_index``.

    Raises
    ------
    KeyError
        If one of the required raw columns is missing.
    """
    epsilon = 1e-6  # Small constant added to every denominator to prevent division by zero

    # Work on a copy: assigning new columns directly would mutate the caller's frame.
    data = data.copy()

    data['reports_per_day'] = data['reports_received'] / (data['account_age_days'] + epsilon)
    data['kdr_x_hs'] = data['kill_death_ratio'] * data['headshot_percentage']
    data['cheating_skill_metric'] = data['accuracy_score'] * data['headshot_percentage'] * data['spray_control_score']

    # Aim-related scores relative to the "soft" game-sense / team-play scores.
    soft_skill_sum = data['game_sense_score'] + data['team_play_score']
    data['skill_discrepancy'] = (data['headshot_percentage'] + data['accuracy_score']) / (soft_skill_sum + epsilon)
    # NOTE: despite the name, this is a product (reaction time x first-blood rate), not a ratio.
    data['normalized_reaction_time'] = data['reaction_time_ms'] * data['first_blood_rate']
    data['stealth_score'] = data['utility_usage_rate'] / (data['movement_pattern_score'] + epsilon)
    data['technical_red_flags'] = data['device_changes_count'] / (data['input_consistency_score'] + epsilon)

    perf_growth_product = data['level_progression_speed'] * data['kill_consistency']
    data['performance_vs_longevity'] = perf_growth_product / (data['account_age_days'] + epsilon)
    data['social_isolation_index'] = data['reports_received'] / (data['friend_network_size'] + epsilon)

    return data

# Run the same feature engineering on the training and the test frame so both share one schema.
df, test_df = (feature_engineer(frame) for frame in (df, test_df))

# Keep only training rows whose target is present (rows with a missing label are discarded).
df = df[df['is_cheater'].notna()]


 ## 3. Define Features and Split Data

In [53]:
# Every column except the target and the identifier columns is treated as a candidate feature.
# X keeps all candidates; the pipelines below only ever see the `features` subset.
initial_features = [col for col in df.columns if col not in ['is_cheater', 'player_id', 'id']]
X = df[initial_features]
y = df['is_cheater']

# Hand-picked subset of raw and engineered columns that the model is trained on.
features = [
    'crosshair_placement',
    'reports_received',
    'account_age_days',
    'friend_network_size',
    'level',
    'kdr_x_hs',
    'cheating_skill_metric',
    'reports_per_day'
]

# Fail early, with a readable message, if the selected columns are not in the data.
missing_features = [col for col in features if col not in X.columns]
assert not missing_features, f'Selected features missing from the data: {missing_features}'

# Split Data (critical for preventing leakage); stratify keeps the class ratio in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'Training set shape: {X_train.shape}')
print(f'Validation set shape: {X_val.shape}')

# Calculate the scale_pos_weight for XGBoost to handle class imbalance: negatives / positives.
# Counting with boolean masks avoids relying on how an integer key is resolved against the
# label index returned by value_counts().
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    raise ValueError('y_train contains no positive (is_cheater == 1) rows; cannot compute scale_pos_weight.')
scale_pos_weight = n_negative / n_positive


Training set shape: (78198, 40)
Validation set shape: (19550, 40)


 ---

 ## 4. Optuna Objective Function

In [54]:
def objective(trial):
    """Optuna objective: fit an XGBoost pipeline with trial-suggested hyperparameters.

    The pipeline (iterative imputation -> robust scaling -> ``XGBClassifier``) is fit on
    ``X_train[features]`` and scored on ``X_val[features]``. The feature set is the fixed
    module-level ``features`` list; no per-trial feature selection is performed.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the validation split.
    """
    # --- 4.1 Restrict both splits to the selected feature columns ---
    X_train_sub = X_train[features]
    X_val_sub = X_val[features]

    # --- 4.2 Hyperparameter Tuning for XGBoost ---
    xgb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }

    # --- 4.3 Define Preprocessing Pipeline for Selected Features ---
    # Imputer and scaler are fit inside the pipeline, i.e. on the training split only.
    numeric_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(random_state=42)),
        ('scaler', RobustScaler())
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, features)
        ],
        remainder='passthrough'
    )

    # --- 4.4 Create and Train the Full Pipeline ---
    # `use_label_encoder` is intentionally not passed: the installed xgboost reports it as
    # an unused parameter.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(
            random_state=42,
            eval_metric='logloss',
            scale_pos_weight=scale_pos_weight,
            **xgb_params
        ))
    ])

    # Train the pipeline
    model_pipeline.fit(X_train_sub, y_train)

    # Evaluate on the validation set. ROC AUC is a robust metric for imbalanced classification.
    y_pred_proba = model_pipeline.predict_proba(X_val_sub)[:, 1]
    return roc_auc_score(y_val, y_pred_proba)


 ---

 ## 5. Run Optuna Study



 We run the Optuna study to find the XGBoost hyperparameters that maximize validation ROC AUC for the fixed feature set.

In [None]:
# The study lives in memory only (no `storage` is given), so every run starts from scratch.
# Seeding the sampler makes the sequence of proposed hyperparameters repeatable across runs.
study = optuna.create_study(
    direction="maximize",
    study_name="xgb_pipeline_tuning",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

best_trial = study.best_trial

print("\n--- Optuna Study Results ---")
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best ROC AUC Score: {best_trial.value:.4f}")
print("Best Hyperparameters:")
for key, value in best_trial.params.items():
    print(f"  {key}: {value}")


 ---

 ## 6. Build and Train the Final Best Model



 We now use the best settings found by Optuna to train the final model on the full training set (X_train, y_train).

In [55]:
# Bypass Tuning: hard-coded XGBoost hyperparameters, available without running the study.
# NOTE(review): presumably copied from an earlier Optuna run -- confirm before relying on them.
best_params = dict(
    n_estimators=1000,
    max_depth=8,
    learning_rate=0.007816711717369125,
    subsample=0.7415444084261885,
    colsample_bytree=0.6755835023784938,
    gamma=0.00013918059131830187,
    min_child_weight=9,
    reg_lambda=4.893454406264408e-06,
)

In [56]:
# Choose the hyperparameters for the final model.
# If the Optuna study was run in this session, use its best trial; otherwise keep the
# hard-coded `best_params` from the "Bypass Tuning" cell so the notebook still runs when
# the (slow) study cell is skipped.
if 'best_trial' in globals():
    # The objective only suggests XGBoost parameters, so all of them are passed through.
    best_params = dict(best_trial.params)
elif 'best_params' not in globals():
    raise NameError(
        "Neither 'best_trial' nor 'best_params' is defined: run the Optuna study cell "
        "or the 'Bypass Tuning' cell first."
    )

# --- Final Preprocessing Pipeline for the Selected Feature Set ---
# Same preprocessing as in the tuning objective: iterative imputation, then robust scaling.
numeric_transformer_final = Pipeline(steps=[
    ('imputer', IterativeImputer(random_state=42)),
    ('scaler', RobustScaler())
])

preprocessor_final = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer_final, features)
    ],
    remainder='passthrough'
)

# --- Final Model Pipeline ---
# `use_label_encoder` is intentionally not passed: the installed xgboost reports it as an
# unused parameter.
final_model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_final),
    ('classifier', XGBClassifier(
        random_state=42,
        eval_metric='logloss',
        scale_pos_weight=scale_pos_weight,
        **best_params
    ))
])

# Train the final pipeline on the selected features
print("\nTraining the final best model pipeline...")
final_model_pipeline.fit(X_train[features], y_train)
print("Training complete.")



Training the final best model pipeline...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Training complete.


 ## 7. Evaluate the Final Model

In [61]:
print("\nEvaluating the final model on the validation set...")
# Score only the columns the pipeline was trained on.
X_val_sub = X_val[features]

# Hard labels feed F1 / accuracy / the report; positive-class probabilities feed ROC AUC.
y_pred = final_model_pipeline.predict(X_val_sub)
y_pred_proba = final_model_pipeline.predict_proba(X_val_sub)[:, 1]

# NOTE: when the hyperparameters come from the Optuna study, they were selected on this same
# validation split, so the scores below are optimistic estimates of generalization.
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_proba))
print("F1 Score:", f1_score(y_val, y_pred))
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))



Evaluating the final model on the validation set...
ROC AUC Score: 0.883809966018385
F1 Score: 0.7095189600431319
Accuracy: 0.7519693094629156
Classification Report:
               precision    recall  f1-score   support

         0.0       0.91      0.69      0.78     12724
         1.0       0.60      0.87      0.71      6826

    accuracy                           0.75     19550
   macro avg       0.75      0.78      0.75     19550
weighted avg       0.80      0.75      0.76     19550



 ## 8. Make Predictions on the Test Set

In [58]:
# Restrict the test frame to the columns the pipeline was fitted on.
test_features_sub = test_df.loc[:, features]

# The fitted pipeline applies its own imputation and scaling before predicting class labels.
test_predictions = final_model_pipeline.predict(test_features_sub)

# Assemble the submission (id + predicted label) and write it without the index column.
submission_df = pd.DataFrame({'id': test_df['id']})
submission_df['is_cheater'] = test_predictions
submission_df.to_csv('submission_optuna.csv', index=False)