In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
import lightgbm as lgb
import optuna
from typing import List, Dict
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated raw strings only worked on Windows.
train = pd.read_csv('../data/train.csv')
blind = pd.read_csv('../data/test.csv')

In [10]:
# Preview the loaded training frame; the notebook renders the cell's last expression.
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [9]:
class LightGBMClassifier:
    """
    K-fold LightGBM binary classifier with Optuna hyperparameter search.

    Typical use: ``optimize_hyperparameters`` -> ``train`` -> ``predict``.
    ``train`` may also be called on its own, in which case LightGBM's default
    hyperparameters are used together with ``BASE_PARAMS``.
    """

    # Parameters shared by every booster this class trains.
    BASE_PARAMS = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
    }

    # Keys owned by the device configuration; they are stripped from tuned
    # parameters before the currently resolved device settings are re-applied.
    _GPU_KEYS = ('device', 'gpu_platform_id', 'gpu_device_id', 'gpu_use_dp')

    EARLY_STOPPING_ROUNDS = 50

    def __init__(self, categorical_features: List[str], use_gpu: bool = True,
                 random_state: int = 42):
        """
        Initialize the classifier with categorical feature names

        Args:
            categorical_features: List of categorical column names
            use_gpu: Request GPU training. The device is probed on first use and
                the classifier falls back to CPU if the probe fails.
            random_state: Seed for the CV splits and the Optuna sampler
        """
        self.categorical_features = list(categorical_features)
        self.random_state = random_state
        self.models = []
        self.best_params = None
        self.feature_importance = None
        self._device_checked = False

        if use_gpu:
            # Note: 'tree_learner' only accepts serial/feature/data/voting and
            # 'force_col_wise' is a CPU-only option, so neither belongs here.
            self.gpu_params = {
                'device': 'gpu',
                'gpu_platform_id': 0,
                'gpu_device_id': 0,
                'gpu_use_dp': True,  # Set to False for single precision
            }
        else:
            self.gpu_params = {'device': 'cpu'}

    def _ensure_device(self) -> None:
        """Probe the requested device once and fall back to CPU if it is unusable."""
        if self._device_checked:
            return
        self._device_checked = True
        if self.gpu_params.get('device') != 'gpu':
            return

        # Tiny synthetic problem: training one round actually exercises the GPU
        # learner, unlike merely constructing a Dataset. A private RandomState
        # keeps the global NumPy RNG untouched.
        rng = np.random.RandomState(0)
        probe_features = rng.rand(64, 2)
        probe_labels = np.arange(64) % 2
        try:
            lgb.train(
                {**self.BASE_PARAMS, **self.gpu_params, 'min_data_in_leaf': 1},
                lgb.Dataset(probe_features, label=probe_labels),
                num_boost_round=1
            )
            print("GPU acceleration is enabled and working.")
        except lgb.basic.LightGBMError as e:
            print(f"Warning: GPU acceleration might not be properly configured. Error: {e}")
            print("Falling back to CPU. To enable GPU, ensure LightGBM is built with GPU support.")
            self.gpu_params = {'device': 'cpu'}

    def _with_device(self, params: Dict) -> Dict:
        """Return a copy of ``params`` with the currently resolved device settings applied."""
        merged = {key: value for key, value in params.items() if key not in self._GPU_KEYS}
        merged.update(self.gpu_params)
        return merged

    def _fit_fold(self, params: Dict, X: pd.DataFrame, y: pd.Series,
                  train_idx, val_idx, log_period: int = 0):
        """
        Fit one booster on a train/validation split.

        Args:
            params: LightGBM parameters
            X: Feature DataFrame
            y: Target series
            train_idx: Positional indices of the training rows
            val_idx: Positional indices of the validation rows
            log_period: Log the validation metric every ``log_period`` rounds;
                0 keeps training silent

        Returns:
            Tuple of (fitted booster, validation ROC AUC)
        """
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        train_data = lgb.Dataset(
            X_train,
            label=y_train,
            categorical_feature=self.categorical_features,
            free_raw_data=False
        )
        val_data = lgb.Dataset(
            X_val,
            label=y_val,
            categorical_feature=self.categorical_features,
            reference=train_data,
            free_raw_data=False
        )

        # Callbacks replace the `early_stopping_rounds` / `verbose_eval`
        # keyword arguments, which lgb.train no longer accepts in LightGBM 4.
        callbacks = [
            lgb.early_stopping(
                stopping_rounds=self.EARLY_STOPPING_ROUNDS,
                verbose=log_period > 0
            )
        ]
        if log_period > 0:
            callbacks.append(lgb.log_evaluation(period=log_period))

        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            callbacks=callbacks
        )

        score = roc_auc_score(y_val, model.predict(X_val))
        return model, score

    def objective(self, trial: "optuna.Trial", X: pd.DataFrame, y: pd.Series,
                  n_folds: int = 5) -> float:
        """
        Objective function for Optuna hyperparameter optimization

        Args:
            trial: Optuna trial used to sample hyperparameters
            X: Feature DataFrame
            y: Target series
            n_folds: Number of cross-validation folds

        Returns:
            Mean validation ROC AUC across the folds
        """
        self._ensure_device()

        param = {
            **self.BASE_PARAMS,

            # Device parameters
            **self.gpu_params,

            # Hyperparameters to optimize
            'num_leaves': trial.suggest_int('num_leaves', 20, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
            # bagging_fraction is ignored by LightGBM unless bagging_freq > 0
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            # min_child_samples is an alias of min_data_in_leaf, so only one is tuned
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),

            'max_bin': trial.suggest_int('max_bin', 63, 255),
            'num_iterations': trial.suggest_int('num_iterations', 100, 500)
        }

        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.random_state)
        scores = []
        for train_idx, val_idx in cv.split(X, y):
            _, score = self._fit_fold(param, X, y, train_idx, val_idx)
            scores.append(score)

        return np.mean(scores)

    def optimize_hyperparameters(self, X: pd.DataFrame, y: pd.Series, n_trials: int = 50) -> Dict:
        """
        Optimize hyperparameters using Optuna

        Args:
            X: Feature DataFrame
            y: Target series
            n_trials: Number of optimization trials

        Returns:
            Dictionary of best parameters
        """
        # Resolve the device before the study so every trial uses the same one.
        self._ensure_device()

        study = optuna.create_study(
            direction='maximize',
            # Seeded sampler makes the search reproducible.
            sampler=optuna.samplers.TPESampler(seed=self.random_state)
        )
        study.optimize(
            lambda trial: self.objective(trial, X, y),
            n_trials=n_trials
        )

        self.best_params = {
            **study.best_params,
            **self.BASE_PARAMS,
            **self.gpu_params  # Add device parameters to best params
        }

        return self.best_params

    def train(self, X: pd.DataFrame, y: pd.Series, n_folds: int = 5) -> List[float]:
        """
        Train the model using k-fold cross-validation

        Any models from a previous call are replaced, so ``predict`` always
        averages exactly ``n_folds`` boosters.

        Args:
            X: Feature DataFrame
            y: Target series
            n_folds: Number of cross-validation folds

        Returns:
            List of validation scores
        """
        self._ensure_device()

        # Fall back to the base parameters when no search has been run, and
        # re-apply the resolved device so a CPU fallback actually takes effect
        # even if best_params was recorded with GPU settings.
        tuned = self.best_params if self.best_params is not None else self.BASE_PARAMS
        params = self._with_device(tuned)

        cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=self.random_state)
        models = []
        scores = []
        feature_importance_dfs = []

        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
            print(f"Training fold {fold}")
            model, score = self._fit_fold(params, X, y, train_idx, val_idx, log_period=100)
            models.append(model)
            scores.append(score)

            importance_df = pd.DataFrame({
                'feature': X.columns,
                'importance': model.feature_importance(importance_type='gain')
            })
            importance_df['fold'] = fold
            feature_importance_dfs.append(importance_df)

        # Publish results only after every fold succeeded.
        self.models = models
        self.feature_importance = (
            pd.concat(feature_importance_dfs)
            .groupby('feature')['importance']
            .mean()
            .reset_index()
            .sort_values('importance', ascending=False)
        )

        return scores

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Make predictions using the average of all models

        Args:
            X: Feature DataFrame

        Returns:
            Array of predictions

        Raises:
            RuntimeError: If no model has been trained yet
        """
        if not self.models:
            raise RuntimeError("No trained models available; call train() before predict().")

        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        return predictions / len(self.models)

    def get_feature_importance(self) -> pd.DataFrame:
        """
        Get feature importance DataFrame

        Returns:
            DataFrame with the mean gain importance per feature, sorted in
            descending order, or None if train() has not been called
        """
        return self.feature_importance

In [None]:
# Column layout of the training frame shown earlier in the notebook.
TARGET_COLUMN = 'Transported'
# Free-text identifiers: unique per row, so they carry no signal as-is.
IDENTIFIER_COLUMNS = ('PassengerId', 'Name')
# Columns handed to LightGBM as categoricals (CabinDeck/CabinSide are derived from Cabin).
CATEGORICAL_COLUMNS = ('HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'CabinSide')


def prepare_features(df, target=TARGET_COLUMN):
    """
    Split a raw frame into LightGBM-ready features and an integer target.

    The input frame is not modified.

    Args:
        df: Raw DataFrame containing the target column
        target: Name of the target column

    Returns:
        Tuple ``(X, y, categorical_features)`` where ``X`` holds only numeric
        and ``category`` columns, ``y`` is the target cast to int, and
        ``categorical_features`` lists the categorical column names in ``X``

    Raises:
        KeyError: If ``target`` is not a column of ``df``
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in DataFrame")

    y = df[target].astype(int)
    X = df.drop(columns=[target, *IDENTIFIER_COLUMNS], errors='ignore')

    if 'Cabin' in X.columns:
        # Cabin looks like "deck/num/side"; the raw string is nearly unique per
        # row, so split it into two low-cardinality categoricals and a number.
        cabin_parts = X['Cabin'].astype(object).str.extract(r'^([^/]+)/([^/]+)/([^/]+)$')
        X = X.drop(columns='Cabin').assign(
            CabinDeck=cabin_parts[0],
            CabinNum=pd.to_numeric(cabin_parts[1], errors='coerce').astype(float),
            CabinSide=cabin_parts[2],
        )

    categorical_features = [col for col in CATEGORICAL_COLUMNS if col in X.columns]
    # LightGBM rejects object columns; the pandas 'category' dtype is accepted.
    X = X.astype({col: 'category' for col in categorical_features})

    return X, y, categorical_features


def main(df=None):
    """
    Run the full demo: tune, cross-validate, and report on a training frame.

    Args:
        df: Training DataFrame; defaults to the notebook-level ``train`` frame
    """
    if df is None:
        df = train

    # Prepare data
    X, y, categorical_features = prepare_features(df)

    # Initialize classifier
    classifier = LightGBMClassifier(categorical_features=categorical_features)

    # Optimize hyperparameters
    print("Optimizing hyperparameters...")
    best_params = classifier.optimize_hyperparameters(X, y, n_trials=20)
    print("Best parameters:", best_params)

    # Train model
    print("\nTraining model with cross-validation...")
    scores = classifier.train(X, y)
    print(f"\nCross-validation scores: {scores}")
    print(f"Mean CV score: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

    # Get feature importance
    importance_df = classifier.get_feature_importance()
    print("\nFeature Importance:")
    print(importance_df)

    # Make predictions (in-sample: these rows were used for training)
    predictions = classifier.predict(X)
    print("\nSample predictions:", predictions[:5])

if __name__ == "__main__":
    main()