In [1]:
import pandas as pd
from datetime import datetime
import os
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt
import feat_eng
import importlib
import model
import optuna
from functools import partial
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Re-import feat_eng so that edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(feat_eng)

In [2]:
# Build the CSV paths with os.path.join: the previous backslash-separated
# literals only resolve on Windows.
DATA_DIR = os.path.join('..', 'data')
data = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Run the project's feature-engineering step on the labelled data (train=True
# returns the five objects unpacked here) and on the unlabelled test set.
X_train, X_test, y_train, y_test, X_transformed = feat_eng.feat_eng(data, train=True)
blind = feat_eng.feat_eng(test)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
data.head()

In [7]:
# Separate the target column from the remaining columns.
y = data.loc[:, 'Transported']
X = data.drop(labels=['Transported'], axis='columns')

In [None]:
def calculate_svm_accuracy(X_transformed, y, feature_names):
    """
    Rank every pair of features by the hold-out accuracy of a linear SVM.

    A single 70/30 train/test split (``random_state=42``) is drawn once and
    reused for all pairs. For each unordered pair of columns a scikit-learn
    ``SVC(kernel='linear')`` is fitted on those two columns only and scored on
    the test part. The SVMs run on the CPU; PyTorch is used solely to detect
    and report the available device.

    :param X_transformed: 2-D feature matrix (DataFrame or array-like) whose
        values can be converted to float32.
    :param y: Target values, one per row of ``X_transformed``, matched to the
        rows by position (any pandas index on ``y`` is ignored).
    :param feature_names: Names for the columns of ``X_transformed``; only used
        when ``X_transformed`` has no ``columns`` attribute.
    :return: DataFrame with columns 'Feature 1', 'Feature 2' and 'Accuracy',
        sorted by accuracy in descending order (empty when there are fewer
        than two features).
    :raises ValueError: If ``X_transformed`` is not 2-D or ``y`` does not have
        one entry per row of ``X_transformed``.
    """
    from itertools import combinations

    import numpy as np
    import pandas as pd
    import torch
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    # Informational only: SVC has no GPU backend, so nothing below runs on this device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    if hasattr(X_transformed, 'columns'):
        feature_names = list(X_transformed.columns)
    else:
        feature_names = list(feature_names)

    # Cast once to float32 (the precision the previous tensor round-trip produced);
    # this also handles the object-dtype array that mixed bool/float frames yield.
    raw_values = X_transformed.values if hasattr(X_transformed, 'values') else X_transformed
    X_array = np.asarray(raw_values, dtype=np.float32)
    if X_array.ndim != 2:
        raise ValueError(f"X_transformed must be 2-D, got {X_array.ndim} dimension(s)")

    # Plain ndarray => positional indexing. Indexing a pandas Series with an
    # integer array is label-based and picks wrong rows (or raises KeyError)
    # when the Series index is not 0..n-1, e.g. after a shuffled split.
    y_array = np.asarray(y)
    if y_array.shape[:1] != (X_array.shape[0],):
        raise ValueError(
            f"y must have one entry per row of X_transformed "
            f"(expected {X_array.shape[0]}, got shape {y_array.shape})"
        )

    # One split shared by every pair so the accuracies are comparable.
    train_indices, test_indices = train_test_split(
        np.arange(X_array.shape[0]), test_size=0.3, random_state=42
    )
    X_train, X_test = X_array[train_indices], X_array[test_indices]
    y_train, y_test = y_array[train_indices], y_array[test_indices]

    # All unordered column pairs (i < j), in lexicographic order.
    feature_pairs = list(combinations(range(X_array.shape[1]), 2))

    batch_size = 200  # Only controls how often progress is reported
    accuracy_results = []
    for start in range(0, len(feature_pairs), batch_size):
        for i, j in feature_pairs[start:start + batch_size]:
            svm = SVC(kernel='linear', C=1.0, cache_size=1000, max_iter=1000)
            svm.fit(X_train[:, [i, j]], y_train)
            accuracy = accuracy_score(y_test, svm.predict(X_test[:, [i, j]]))
            accuracy_results.append({
                'Feature 1': feature_names[i],
                'Feature 2': feature_names[j],
                'Accuracy': accuracy
            })

        print(f"Processed {min(start+batch_size, len(feature_pairs))}/{len(feature_pairs)} feature pairs")

    # Explicit columns keep the sort valid even when there were no pairs to score.
    accuracy_df = pd.DataFrame(accuracy_results, columns=['Feature 1', 'Feature 2', 'Accuracy'])
    return accuracy_df.sort_values('Accuracy', ascending=False)

# Usage: score every feature pair of the transformed matrix.
# NOTE(review): inside the function y is indexed with row positions derived
# from X_transformed — confirm that y_train has the same number of rows as
# X_transformed and is in the same row order.
accuracy_df = calculate_svm_accuracy(X_transformed, y_train, X_transformed.columns)
# print(accuracy_df.head(10))  # Show top 10 feature pairs by accuracy

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import time

# Pick CUDA when PyTorch can see a GPU, otherwise fall back to the CPU,
# and report which one was chosen.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
if device.type != "cuda":
    print("No GPU found, using CPU")
else:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    # TensorFlow (if it is used later) reads this variable to grow its GPU
    # allocation on demand instead of reserving all memory up front.
    import os
    os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


No GPU found, using CPU


In [2]:
# Display the installed PyTorch version.
torch.__version__

'1.10.2'

In [None]:
def calculate_svm_accuracy(X_transformed, y, feature_names):
    """
    Rank every pair of features by the hold-out accuracy of a linear, SVM-like Keras model.

    A single 70/30 train/test split (``random_state=42``) is drawn once and
    reused for all pairs. For each unordered pair of columns a one-layer
    linear model with L2 weight regularisation is trained with hinge loss
    (categorical hinge for more than two classes) and scored on the test part.
    Accuracy is computed directly from the model's decision values: the sign
    of the single output for binary targets, the arg-max output otherwise.

    :param X_transformed: 2-D feature matrix (DataFrame or array-like) whose
        values can be converted to float32.
    :param y: Target values, one per row of ``X_transformed``, matched to the
        rows by position (any pandas index on ``y`` is ignored). With at most
        two distinct values, entries equal to 0/False form the negative class.
    :param feature_names: Names for the columns of ``X_transformed``; only used
        when ``X_transformed`` has no ``columns`` attribute.
    :return: DataFrame with columns 'Feature 1', 'Feature 2' and 'Accuracy',
        sorted by accuracy in descending order (empty when there are fewer
        than two features).
    :raises ValueError: If ``X_transformed`` is not 2-D or ``y`` does not have
        one entry per row of ``X_transformed``.
    """
    import numpy as np
    import pandas as pd
    import tensorflow as tf
    from joblib import Parallel, delayed
    from sklearn.model_selection import train_test_split

    # Report which GPUs TensorFlow can see.
    gpus = tf.config.list_physical_devices('GPU')
    print(f"TensorFlow detected {len(gpus)} GPU(s)")
    for gpu in gpus:
        print(f" - {gpu.name}")

    if hasattr(X_transformed, 'columns'):
        feature_names = list(X_transformed.columns)
    else:
        feature_names = list(feature_names)

    raw_values = X_transformed.values if hasattr(X_transformed, 'values') else X_transformed
    X_array = np.asarray(raw_values, dtype=np.float32)
    if X_array.ndim != 2:
        raise ValueError(f"X_transformed must be 2-D, got {X_array.ndim} dimension(s)")

    # Plain ndarray => positional indexing. Indexing a pandas Series with an
    # integer array is label-based and picks wrong rows (or raises KeyError)
    # when the Series index is not 0..n-1, e.g. after a shuffled split.
    y_array = np.asarray(y)
    if y_array.shape[:1] != (X_array.shape[0],):
        raise ValueError(
            f"y must have one entry per row of X_transformed "
            f"(expected {X_array.shape[0]}, got shape {y_array.shape})"
        )

    # One split shared by every pair so the accuracies are comparable.
    train_indices, test_indices = train_test_split(
        np.arange(X_array.shape[0]), test_size=0.3, random_state=42
    )
    X_train, X_test = X_array[train_indices], X_array[test_indices]
    y_train, y_test = y_array[train_indices], y_array[test_indices]

    # Work out the problem type once instead of calling np.unique for every pair.
    classes = np.unique(y_array)
    is_multiclass = len(classes) > 2

    if is_multiclass:
        # One output unit per class, trained against one-hot targets.
        num_outputs = len(classes)
        true_test = np.searchsorted(classes, y_test)
        train_targets = tf.keras.utils.to_categorical(
            np.searchsorted(classes, y_train), num_outputs
        )
    else:
        # Hinge loss expects labels in {-1, +1}.
        num_outputs = 1
        true_test = np.where(y_test == 0, -1, 1)
        train_targets = np.where(y_train == 0, -1, 1).astype(np.float32).reshape(-1, 1)

    def process_feature_pair(i, j):
        """Train the linear model on columns i and j and return its test accuracy record."""
        X_pair_train = X_train[:, [i, j]]
        X_pair_test = X_test[:, [i, j]]

        # Linear layer + L2 penalty + hinge loss approximates a linear SVM.
        svm_model = tf.keras.Sequential([
            tf.keras.layers.Dense(num_outputs, activation='linear', input_shape=(2,),
                                  kernel_regularizer=tf.keras.regularizers.l2(0.01))
        ])
        loss = tf.keras.losses.CategoricalHinge() if is_multiclass else tf.keras.losses.Hinge()
        svm_model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
            loss=loss
        )

        # Stop on validation loss: Keras' built-in 'accuracy' metric thresholds a
        # single output at 0.5 and compares against the raw labels, so it can
        # never count a -1 label as correct.
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        svm_model.fit(
            X_pair_train, train_targets,
            epochs=50,
            batch_size=32,
            verbose=0,
            validation_split=0.2,
            callbacks=[early_stop]
        )

        # Turn decision values into class predictions ourselves.
        scores = svm_model.predict(X_pair_test, verbose=0)
        if is_multiclass:
            predicted = scores.argmax(axis=1)
        else:
            predicted = np.where(scores[:, 0] > 0, 1, -1)
        accuracy = float(np.mean(predicted == true_test))

        # Clear TensorFlow session to free memory
        tf.keras.backend.clear_session()

        return {
            'Feature 1': feature_names[i],
            'Feature 2': feature_names[j],
            'Accuracy': accuracy
        }

    # All unordered column pairs (i < j).
    num_features = X_array.shape[1]
    feature_pairs = [(i, j) for i in range(num_features) for j in range(i + 1, num_features)]

    batch_size = 50  # Adjust based on your available memory
    accuracy_results = []

    for start in range(0, len(feature_pairs), batch_size):
        batch_pairs = feature_pairs[start:start + batch_size]
        batch_results = Parallel(n_jobs=4)(  # Limited parallelism to avoid GPU contention
            delayed(process_feature_pair)(i, j) for i, j in batch_pairs
        )
        accuracy_results.extend(batch_results)

        print(f"Processed {min(start+batch_size, len(feature_pairs))}/{len(feature_pairs)} feature pairs")

    # Explicit columns keep the sort valid even when there were no pairs to score.
    accuracy_df = pd.DataFrame(accuracy_results, columns=['Feature 1', 'Feature 2', 'Accuracy'])
    return accuracy_df.sort_values('Accuracy', ascending=False)

# Usage: score every feature pair of the transformed matrix.
# NOTE(review): inside the function y is indexed with row positions derived
# from X_transformed — confirm that y_train has the same number of rows as
# X_transformed and is in the same row order.
accuracy_df = calculate_svm_accuracy(X_transformed, y_train, X_transformed.columns)
# print(accuracy_df.head(10))  # Show top 10 feature pairs by accuracy

TensorFlow detected 1 GPU(s)
 - /physical_device:GPU:0


In [None]:
# Hyper-parameter search. The sampler is seeded so repeated runs propose the
# same trials (TPE is Optuna's default sampler, so the algorithm is unchanged).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(
    partial(model.objective, X_train=X_train, y_train=y_train),
    n_trials=15,
    show_progress_bar=True,
)

# Best trial's parameters plus the fixed training settings, built as a fresh
# dict. The fixed entries come last so they win on any key clash.
best_params = {
    **study.best_params,
    'objective': 'binary:logistic',
    'eval_metric': ['error', 'auc'],
    # With the `device` parameter, GPU training is requested as hist + cuda;
    # 'gpu_hist' is the deprecated spelling.
    'tree_method': 'hist',
    'device': 'cuda',
    'seed': 42,
}

# Print results
print("\nBest parameters:", best_params)
print(f"Best CV score: {study.best_value:.4f}")

In [None]:
# Fit the final booster with the tuned parameters, using the hold-out split
# as the early-stopping watchlist.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_test, label=y_test)

final_model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=10000,
    evals=[(dval, 'validation')],
    early_stopping_rounds=50,
    verbose_eval=100,
)

In [None]:
# Number of rows whose VIP value equals True.
data['VIP'].eq(True).sum()

In [None]:
# NOTE(review): `cols` is not used in this cell, and the lookup below reads a
# column literally named 'sp_cols' — presumably a check over the columns in
# `cols` was intended. Confirm that 'sp_cols' exists in `data`; otherwise this
# line raises KeyError.
cols = ['RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
(data['sp_cols']==True).sum()

In [11]:
# Score the blind set with the trained booster.
dblind = xgb.DMatrix(blind)
# xgb.train returns the booster from the last round, so when early stopping
# recorded a best iteration, restrict prediction to the trees up to it.
best_iteration = getattr(final_model, 'best_iteration', None)
if best_iteration is not None:
    y_pred_prob = final_model.predict(dblind, iteration_range=(0, best_iteration + 1))
else:
    y_pred_prob = final_model.predict(dblind)
# Threshold the predicted probabilities at 0.5 to get hard class labels.
predicted_y = (y_pred_prob > 0.5).astype(int)

# Pair each passenger id with its prediction by position. Building the frame
# directly raises if the row counts differ, where the previous index merge
# silently dropped the unmatched rows.
sub = pd.DataFrame({
    'PassengerId': test['PassengerId'].to_numpy(),
    'Transported': predicted_y.astype(bool),
})

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"submissionxgboost_{timestamp}.csv"
submissions_dir = os.path.join("..", "submissions")
# Create the output folder on first use so to_csv cannot fail on a missing directory.
os.makedirs(submissions_dir, exist_ok=True)
filepath = os.path.join(submissions_dir, filename)

# Save DataFrame to CSV
sub.to_csv(filepath, index=False)