In [None]:
!pip install catboost

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Seed NumPy's legacy global random generator for reproducibility
np.random.seed(42)

# Display settings: print all columns, and at most 100 rows, of a DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)


In [None]:
# Create the ~/.kaggle directory that will hold the Kaggle API token
# (-p: no error if it already exists, so the cell can be re-run)
!mkdir -p ~/.kaggle

# Copy the token from the current working directory into ~/.kaggle
# (kaggle.json must already be present there, e.g. uploaded beforehand)
!cp kaggle.json ~/.kaggle/

# Restrict the token file to owner read/write only, since it holds credentials
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c spaceship-titanic

In [None]:
!unzip /content/spaceship-titanic.zip

In [None]:
# Load the competition train/test CSVs into DataFrames
# NOTE(review): paths are hard-coded to /content — adjust when running elsewhere
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

# Report (rows, columns) of each frame
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print("\nFirst few rows of training data:")
# Last expression of the cell, so it is rendered as the cell output
train_df.head()


In [None]:
# Check missing values: per-column count of null entries in the training frame
print("Missing values in training data:")
print(train_df.isnull().sum())
# Column dtypes as inferred by read_csv
print("\nData types:")
print(train_df.dtypes)


In [None]:
# Feature Engineering Functions

def extract_group_features(df):
    """Derive travel-group features from the ``PassengerId`` column.

    ``PassengerId`` is split on ``_``; the first piece becomes the integer
    ``Group`` and the second the integer ``GroupPosition``. From these the
    function adds ``GroupSize`` (rows sharing a ``Group``) and the 0/1 flags
    ``IsSolo``, ``IsFamily``, ``IsLargeGroup``, ``IsFirstInGroup`` and
    ``IsLastInGroup``.

    Args:
        df: DataFrame with a string ``PassengerId`` column.

    Returns:
        A new DataFrame (input untouched) with the rows in their original
        order, a fresh 0..n-1 index, and the columns above appended.
    """
    id_parts = df['PassengerId'].str.split('_')

    out = df.copy()
    out['Group'] = id_parts.str[0].astype(int)
    out['GroupPosition'] = id_parts.str[1].astype(int)

    # Rows per group, looked up for every passenger; the index is then
    # renumbered from zero while the row order is kept
    out['GroupSize'] = out['Group'].map(out['Group'].value_counts())
    out = out.reset_index(drop=True)

    size = out['GroupSize']
    position = out['GroupPosition']

    # Size buckets: alone, 2-4 members (assumed to be families), more than 4
    out['IsSolo'] = size.eq(1).astype(int)
    out['IsFamily'] = size.between(2, 4).astype(int)
    out['IsLargeGroup'] = size.gt(4).astype(int)

    # Whether the passenger holds the first / last position number of the group
    out['IsFirstInGroup'] = position.eq(1).astype(int)
    out['IsLastInGroup'] = position.eq(size).astype(int)

    return out

def extract_cabin_features(df):
    """Break the ``Cabin`` string (``deck/num/side``) into separate features.

    Adds ``Deck``, numeric ``CabinNum`` and ``Side``, plus:
    ``DeckLevel`` (deck letter mapped to 1-8), ``IsPort`` (1 when side is
    ``'P'``), ``CabinRegion`` (cabin number binned into Front/Middle/Back)
    and the 0/1 flags ``IsTopDeck`` / ``IsBottomDeck``.

    Args:
        df: DataFrame with a string ``Cabin`` column.

    Returns:
        A copy of ``df`` with the columns above appended. Rows whose
        ``Cabin`` is missing get NaN pieces and 0 for the integer flags.
    """
    pieces = df['Cabin'].str.split('/', expand=True)
    deck_map = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'T': 8}

    out = df.copy()
    out['Deck'] = pieces[0]
    # Non-numeric cabin numbers become NaN instead of raising
    out['CabinNum'] = pd.to_numeric(pieces[1], errors='coerce')
    out['Side'] = pieces[2]

    # Deck letter as an ordinal level (A=1 ... G=7, T=8)
    out['DeckLevel'] = out['Deck'].map(deck_map)

    # 1 for port side; starboard and missing both give 0
    out['IsPort'] = out['Side'].eq('P').astype(int)

    # Cabin number bucketed into three regions; 0 belongs to the first bucket
    out['CabinRegion'] = pd.cut(
        out['CabinNum'],
        bins=[0, 300, 600, 2000],
        labels=['Front', 'Middle', 'Back'],
        include_lowest=True,
    )

    # Membership flags for the decks treated as top / bottom
    for flag, decks in (('IsTopDeck', ['A', 'B', 'T']), ('IsBottomDeck', ['F', 'G'])):
        out[flag] = out['Deck'].isin(decks).astype(int)

    return out

def extract_name_features(df):
    """Extract surname-based features from the ``Name`` column.

    Adds:
        LastName: the final whitespace-separated token of ``Name``.
        LastNameCount: number of rows sharing that ``LastName``
            (NaN when ``Name`` is missing).
        HasFamilyMember: 1 when ``LastNameCount`` > 1, else 0.
        NameLength: character length of ``Name`` (0 when missing).

    Args:
        df: DataFrame with a string ``Name`` column.

    Returns:
        A new DataFrame with the columns above appended; the input is not
        modified. Row order is kept and the index is renumbered from zero
        by the merge.
    """
    df = df.copy()

    # The surname is the LAST token; this assumes "Given Surname" ordering, as in
    # the Spaceship Titanic Name column. Token [0] is the given name and would make
    # the counts below group unrelated passengers who merely share a first name.
    # split() without a separator also tolerates repeated or trailing whitespace.
    df['LastName'] = df['Name'].str.split().str[-1]

    # Count of people with the same last name (potential family members);
    # groupby skips missing surnames, so those rows get NaN from the left merge
    lastname_count = df.groupby('LastName').size().reset_index(name='LastNameCount')
    df = df.merge(lastname_count, on='LastName', how='left')

    # Has family member indicator (NaN counts compare False and so give 0)
    df['HasFamilyMember'] = (df['LastNameCount'] > 1).astype(int)

    # Name length; missing names count as length 0
    df['NameLength'] = df['Name'].fillna('').str.len()

    return df

def create_spending_features(df):
    """Create aggregate, ratio and category features from the five spending columns.

    Missing amounts are treated as zero for every derived sum and ratio, matching
    how ``TotalSpending`` is computed. Previously the pairwise sums and the ratios
    became NaN whenever one component was missing, even if the other component
    was known. The raw spending columns themselves are left untouched.

    Adds ``TotalSpending``, ``AmenitiesUsed``, ``IsHighSpender``,
    ``IsZeroSpender``, one ``<col>_Ratio`` per spending column,
    ``LuxurySpending``/``LuxuryRatio``, ``BasicSpending``/``BasicRatio``,
    ``EntertainmentSpending`` and ``SpendingVariance``.

    Args:
        df: DataFrame containing RoomService, FoodCourt, ShoppingMall, Spa
            and VRDeck as numeric columns.

    Returns:
        A copy of ``df`` with the columns above appended.
    """
    df = df.copy()
    spending_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # Zero-filled view used for all derived sums/ratios; df's raw columns keep their NaNs
    spend = df[spending_cols].fillna(0)

    # Total spending
    df['TotalSpending'] = spend.sum(axis=1)

    # Number of amenities used (amenities with a positive recorded amount)
    df['AmenitiesUsed'] = (spend > 0).sum(axis=1)

    # Spending categories; "high" means above the 75th percentile of the frame passed in
    df['IsHighSpender'] = (df['TotalSpending'] > df['TotalSpending'].quantile(0.75)).astype(int)
    df['IsZeroSpender'] = (df['TotalSpending'] == 0).astype(int)

    # +1 keeps the ratios finite for passengers who spent nothing
    denominator = df['TotalSpending'] + 1

    # Spending ratios
    for col in spending_cols:
        df[f'{col}_Ratio'] = spend[col] / denominator

    # Luxury spending (Spa + VRDeck)
    df['LuxurySpending'] = spend['Spa'] + spend['VRDeck']
    df['LuxuryRatio'] = df['LuxurySpending'] / denominator

    # Basic spending (RoomService + FoodCourt)
    df['BasicSpending'] = spend['RoomService'] + spend['FoodCourt']
    df['BasicRatio'] = df['BasicSpending'] / denominator

    # Entertainment spending
    df['EntertainmentSpending'] = spend['ShoppingMall'] + spend['VRDeck']

    # Spending variance across the recorded (non-missing) amounts of each row
    df['SpendingVariance'] = df[spending_cols].var(axis=1)

    return df

def create_interaction_features(df):
    """Create pairwise interaction features between existing columns.

    ``CryoSleep`` and ``VIP`` are coerced to float (True -> 1.0, False -> 0.0,
    missing -> NaN) before any arithmetic. When those columns are object dtype
    (booleans mixed with NaN), multiplying them directly produces object-dtype
    result columns; with the coercion every product is a float column.
    The ``CryoSleep`` and ``VIP`` columns themselves are not changed.

    Args:
        df: DataFrame that already contains CryoSleep, VIP, Age, TotalSpending,
            GroupSize, IsSolo, HomePlanet, Destination, DeckLevel and
            LuxurySpending.

    Returns:
        A copy of ``df`` with the interaction columns appended. Products are
        NaN wherever either operand is missing.
    """
    df = df.copy()

    # Numeric views of the two flags, used for every product below
    cryo = df['CryoSleep'].astype(float)
    vip = df['VIP'].astype(float)

    # CryoSleep interactions (key insight: cryosleep passengers shouldn't spend)
    df['Cryo_Age'] = cryo * df['Age']
    df['Cryo_VIP'] = cryo * vip
    df['Cryo_TotalSpending'] = cryo * df['TotalSpending']
    # 1 when a passenger flagged as in cryosleep nevertheless has positive spending
    df['Cryo_Anomaly'] = ((cryo == 1) & (df['TotalSpending'] > 0)).astype(int)

    # Age interactions
    df['Age_VIP'] = df['Age'] * vip
    df['Age_TotalSpending'] = df['Age'] * df['TotalSpending']
    df['Age_GroupSize'] = df['Age'] * df['GroupSize']

    # Group interactions
    df['GroupSize_TotalSpending'] = df['GroupSize'] * df['TotalSpending']
    df['IsSolo_Age'] = df['IsSolo'] * df['Age']
    df['IsSolo_Spending'] = df['IsSolo'] * df['TotalSpending']

    # Planet-Destination interaction
    # NOTE(review): if HomePlanet and Destination never share a value in this dataset,
    # this column is constant 0 and carries no signal — confirm against the data.
    df['SamePlanetDest'] = (df['HomePlanet'] == df['Destination']).astype(int)

    # VIP interactions
    df['VIP_DeckLevel'] = vip * df['DeckLevel']
    df['VIP_LuxurySpending'] = vip * df['LuxurySpending']

    return df

def create_age_features(df):
    """Create age-related features, including per-group age statistics.

    Adds ``AgeGroup`` (ordered bins Child..Elder), ``AgeMissing``, the group
    statistics ``GroupAgeMean``/``GroupAgeMin``/``GroupAgeMax``/``GroupAgeStd``
    and the 0/1 flags ``IsYoungestInGroup`` / ``IsOldestInGroup``.

    Two details differ from a naive implementation:
      * Age 0 falls into ``'Child'`` (``include_lowest=True``); without it
        ``pd.cut`` leaves the left edge of the first bin out and age-0
        passengers get a missing ``AgeGroup``.
      * ``GroupAgeStd`` is 0.0 for groups with exactly one known age. The
        sample standard deviation of a single value is NaN, which would
        otherwise look like missing data to later imputation. Groups with no
        known age keep NaN.

    Args:
        df: DataFrame with numeric ``Age`` and a ``Group`` column.

    Returns:
        A new DataFrame with the columns above appended; row order is kept and
        the index is renumbered from zero by the merge.
    """
    df = df.copy()

    # Age groups; include_lowest so that Age == 0 is binned as 'Child'
    df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 50, 65, 100],
                           labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior', 'Elder'],
                           include_lowest=True)

    # Missing age indicator
    df['AgeMissing'] = df['Age'].isna().astype(int)

    # Age statistics within group ('count' is the number of non-missing ages)
    age_group_stats = df.groupby('Group')['Age'].agg(['mean', 'min', 'max', 'std', 'count']).reset_index()
    # One known age has no spread: report 0.0 rather than the NaN sample std
    age_group_stats.loc[age_group_stats['count'] == 1, 'std'] = 0.0
    age_group_stats = age_group_stats.drop(columns='count')
    age_group_stats.columns = ['Group', 'GroupAgeMean', 'GroupAgeMin', 'GroupAgeMax', 'GroupAgeStd']
    df = df.merge(age_group_stats, on='Group', how='left')

    # Is youngest/oldest in group (a missing age compares False, giving 0)
    df['IsYoungestInGroup'] = (df['Age'] == df['GroupAgeMin']).astype(int)
    df['IsOldestInGroup'] = (df['Age'] == df['GroupAgeMax']).astype(int)

    return df

def create_advanced_features(df):
    """Create group-spending, cabin-sharing and anomaly features.

    Adds ``GroupSpendingMean``/``GroupSpendingSum``/``GroupSpendingStd``,
    ``SpendingVsGroupMean``, ``CabinMates``/``HasCabinMate``,
    ``VIP_NoSpending`` and ``NonVIP_HighSpending``.

    ``GroupSpendingStd`` is 0.0 for groups with a single spending value. The
    sample standard deviation of one value is NaN, which would otherwise look
    like missing data to later imputation.

    Args:
        df: DataFrame that already contains Group, TotalSpending, Cabin, VIP
            and IsHighSpender.

    Returns:
        A new DataFrame with the columns above appended; row order is kept and
        the index is renumbered from zero by the merges.
    """
    df = df.copy()

    # Spending patterns by group ('count' is the number of non-missing totals)
    group_spending = df.groupby('Group')['TotalSpending'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    # A single value has no spread: report 0.0 rather than the NaN sample std
    group_spending.loc[group_spending['count'] == 1, 'std'] = 0.0
    group_spending = group_spending.drop(columns='count')
    group_spending.columns = ['Group', 'GroupSpendingMean', 'GroupSpendingSum', 'GroupSpendingStd']
    df = df.merge(group_spending, on='Group', how='left')

    # Individual vs group spending ratio (+1 avoids division by zero)
    df['SpendingVsGroupMean'] = df['TotalSpending'] / (df['GroupSpendingMean'] + 1)

    # Cabin mate features (people in same cabin); rows with a missing Cabin are
    # skipped by groupby, so they get NaN CabinMates and HasCabinMate == 0
    cabin_counts = df.groupby('Cabin').size().reset_index(name='CabinMates')
    df = df.merge(cabin_counts, on='Cabin', how='left')
    df['HasCabinMate'] = (df['CabinMates'] > 1).astype(int)

    # Anomaly detection features (a missing VIP compares False in both tests)
    df['VIP_NoSpending'] = ((df['VIP'] == 1) & (df['TotalSpending'] == 0)).astype(int)
    df['NonVIP_HighSpending'] = ((df['VIP'] == 0) & (df['IsHighSpender'] == 1)).astype(int)

    return df


In [None]:
def preprocess_data(train_df, test_df):
    """Run feature engineering, imputation and encoding on train and test together.

    The two frames are concatenated so that every statistic used below (group
    sizes, the spending quantile, imputation neighbours, category modes, dummy
    columns) is computed over the combined data and both outputs end up with
    identical columns. Columns present only in ``train_df`` (such as the target)
    are NaN on the test rows after concatenation.

    Args:
        train_df: Raw training frame containing the columns the feature helpers
            read (PassengerId, Cabin, Name, Age, the spending columns, ...).
        test_df: Raw test frame with the same feature columns.

    Returns:
        tuple: ``(train_processed, test_processed)`` — the processed rows split
        back by origin, with the temporary ``is_train`` marker dropped. The
        inputs are not modified.

    Raises:
        ValueError: if feature engineering changes the number of rows.
    """
    # Combine train and test for consistent preprocessing
    train_df = train_df.copy()
    test_df = test_df.copy()
    train_df['is_train'] = 1
    test_df['is_train'] = 0
    df = pd.concat([train_df, test_df], ignore_index=True)
    n_rows = len(df)

    # Extract all features.
    # NOTE(review): derived features are computed from the raw columns *before*
    # imputation, so they reflect missing inputs rather than the imputed values.
    print("Extracting features...")
    df = extract_group_features(df)
    df = extract_cabin_features(df)
    df = extract_name_features(df)
    df = create_spending_features(df)
    df = create_age_features(df)
    df = create_interaction_features(df)
    df = create_advanced_features(df)

    # The helpers merge lookup tables back in; a duplicated key would silently
    # multiply rows and misalign the train/test split below.
    if len(df) != n_rows:
        raise ValueError(
            f"Feature engineering changed the row count from {n_rows} to {len(df)}"
        )

    # Handle missing values
    print("Handling missing values...")

    # Numerical features - use KNN imputer
    numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
                         'CabinNum', 'DeckLevel', 'GroupAgeMean', 'GroupAgeMin', 'GroupAgeMax',
                         'GroupAgeStd', 'GroupSpendingMean', 'GroupSpendingSum', 'GroupSpendingStd']

    # Remove features that might not exist
    numerical_features = [f for f in numerical_features if f in df.columns]

    # KNN distances are scale-sensitive: standardize first so large-magnitude
    # columns (spending, cabin number) do not drown out Age or DeckLevel, then map
    # the imputed values back to the original units. StandardScaler ignores NaN
    # when fitting and keeps it when transforming.
    knn_scaler = StandardScaler()
    scaled = knn_scaler.fit_transform(df[numerical_features])
    imputed = KNNImputer(n_neighbors=5).fit_transform(scaled)
    restored = pd.DataFrame(
        knn_scaler.inverse_transform(imputed),
        columns=numerical_features,
        index=df.index,
    )
    # Only fill the gaps, so observed values are not perturbed by the scale round-trip
    df[numerical_features] = df[numerical_features].astype(float).fillna(restored)

    # Fill categorical missing values
    categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side',
                           'AgeGroup', 'CabinRegion']

    for col in categorical_features:
        if col in df.columns:
            if col in ['CryoSleep', 'VIP']:
                # Flags default to False when unknown
                df[col] = df[col].fillna(False)
            else:
                # Everything else takes the most frequent value
                mode_val = df[col].mode()
                if len(mode_val) > 0:
                    df[col] = df[col].fillna(mode_val[0])
                else:
                    df[col] = df[col].fillna('Unknown')

    # Encode categorical variables
    print("Encoding categorical variables...")

    # One-hot encode some features
    one_hot_features = ['HomePlanet', 'Destination', 'Deck', 'CabinRegion']
    for col in one_hot_features:
        if col in df.columns:
            dummies = pd.get_dummies(df[col], prefix=col)
            df = pd.concat([df, dummies], axis=1)
            df = df.drop(col, axis=1)

    # Integer-encode the remaining categorical features
    label_encode_features = ['AgeGroup', 'Side']
    for col in label_encode_features:
        if col not in df.columns:
            continue
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            # Binned columns keep their category order (0 = first bin); alphabetical
            # label encoding would scramble it.
            df[col] = df[col].cat.codes.astype(int)
        else:
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Convert boolean columns to int
    bool_cols = ['CryoSleep', 'VIP']
    for col in bool_cols:
        if col in df.columns:
            df[col] = df[col].astype(int)

    # Split back to train and test
    train_df = df[df['is_train'] == 1].drop('is_train', axis=1)
    test_df = df[df['is_train'] == 0].drop('is_train', axis=1)

    return train_df, test_df


In [None]:
# Apply preprocessing: feature engineering, imputation and encoding of both frames
print("Preprocessing data...")
train_processed, test_processed = preprocess_data(train_df, test_df)

# Report the processed shapes
print(f"Processed training data shape: {train_processed.shape}")
print(f"Processed test data shape: {test_processed.shape}")

# Save test PassengerIds for submission (taken from the raw test frame)
test_passenger_ids = test_df['PassengerId']


In [None]:
def fix_nan_values(df):
    """Replace infinities and fill every remaining NaN in ``df``.

    Fill rules, applied column by column:
      * NumPy integer/float columns: the column median, or 0 when the column has
        no valid values at all (the median of an all-NaN column is itself NaN,
        so filling with it would leave the column unchanged).
      * Any other column: the most frequent value, or 0 when there is none.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Replace inf values with NaN first (replace returns a new frame)
    df = df.replace([np.inf, -np.inf], np.nan)

    for col in df.columns:
        column = df[col]
        if not column.isnull().any():
            continue

        # Plain NumPy numeric dtypes of any width; extension dtypes take the mode branch
        is_numeric = isinstance(column.dtype, np.dtype) and column.dtype.kind in 'iuf'

        if is_numeric:
            fill_value = column.median()
            if pd.isna(fill_value):
                # Column is entirely NaN: there is no median to fall back on
                fill_value = 0
        else:
            mode_val = column.mode()
            fill_value = mode_val[0] if len(mode_val) > 0 else 0

        df[col] = column.fillna(fill_value)

    return df

In [None]:
# Apply the NaN fixing to our processed data
# NOTE(review): the two frames are cleaned independently, so fill values are
# derived from each frame's own columns rather than shared between train and test.
print("Applying NaN fixes to processed data...")
train_processed = fix_nan_values(train_processed)
test_processed = fix_nan_values(test_processed)

# Verify no NaN values remain (total null count over all columns)
train_nan_count = train_processed.isnull().sum().sum()
test_nan_count = test_processed.isnull().sum().sum()
print(f"NaN values in train after fixing: {train_nan_count}")
print(f"NaN values in test after fixing: {test_nan_count}")


In [None]:
# Prepare features for modeling: drop identifiers, free-text columns and the target
features_to_drop = ['PassengerId', 'Name', 'LastName', 'Cabin', 'Transported', 'Group']
X = train_processed.drop(features_to_drop, axis=1, errors='ignore')
y = train_processed['Transported'].astype(int)
X_test = test_processed.drop(features_to_drop, axis=1, errors='ignore')

# Ensure same columns in train and test, in X's own column order.
# Going through list(set(...)) makes the order depend on string hashing, which
# changes between kernel sessions; column order feeds the models' feature
# sub-sampling, so results would not be reproducible despite the fixed seeds.
test_columns = set(X_test.columns)
common_cols = [col for col in X.columns if col in test_columns]
X = X[common_cols]
X_test = X_test[common_cols]

print(f"Number of features: {len(common_cols)}")
print(f"Training shape: {X.shape}")
print(f"Test shape: {X_test.shape}")

# Scale features using RobustScaler (handles outliers better);
# fitted on the training matrix only, then applied to the test matrix
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Create base models with hand-set hyperparameters.
# Every estimator uses seed 42 so repeated runs build the same models.
print("Creating ensemble models...")

# Random Forest
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=12,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)

# Extra Trees (similar to RF but more random)
et = ExtraTreesClassifier(
    n_estimators=400,
    max_depth=12,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Gradient Boosting
gb = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.03,
    max_depth=4,
    min_samples_split=4,
    subsample=0.8,
    random_state=42
)

# XGBoost
# `use_label_encoder` is intentionally not passed: recent XGBoost releases no longer
# use it and only report it as an unused parameter. The target is already integer
# 0/1, so no label encoding is needed on older releases either.
xgb_model = xgb.XGBClassifier(
    n_estimators=400,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    random_state=42,
    eval_metric='logloss'
)

# LightGBM
lgb_model = lgb.LGBMClassifier(
    n_estimators=400,
    learning_rate=0.03,
    max_depth=5,
    num_leaves=25,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    verbose=-1
)

# CatBoost
cb_model = CatBoostClassifier(
    iterations=400,
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=3,
    bootstrap_type='Bernoulli',
    subsample=0.8,
    random_seed=42,
    verbose=False
)

# Logistic Regression
lr = LogisticRegression(
    C=0.1,
    max_iter=2000,
    solver='liblinear',
    random_state=42
)

# SVM (probability=True enables predict_proba)
svm = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    probability=True,
    random_state=42
)


In [None]:
# Cross-validation to evaluate models: 5 stratified, shuffled folds with a fixed seed
print("Performing cross-validation...")
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Display name -> estimator, in the order they are evaluated
models = {
    'Random Forest': rf,
    'Extra Trees': et,
    'Gradient Boosting': gb,
    'XGBoost': xgb_model,
    'LightGBM': lgb_model,
    'CatBoost': cb_model,
    'Logistic Regression': lr,
    'SVM': svm
}

# Per-fold accuracy for every model; the printed interval is mean +/- two standard deviations.
# NOTE(review): X_scaled comes from a scaler fitted on the full training matrix,
# so each validation fold has already influenced the scaling.
cv_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_scaled, y, cv=cv, scoring='accuracy', n_jobs=-1)
    cv_scores[name] = scores
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Create voting classifier from the six tree-based models
# (Logistic Regression and SVM are scored above but not part of the ensemble).
# Soft voting averages predicted probabilities using the weights below, which are
# set by hand rather than derived from cv_scores.
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('et', et),
        ('gb', gb),
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('cb', cb_model)
    ],
    voting='soft',
    weights=[1.2, 1.0, 1.1, 1.3, 1.3, 1.2]  # Weight better models more
)

# Evaluate ensemble with the same folds as the individual models
ensemble_scores = cross_val_score(voting_clf, X_scaled, y, cv=cv, scoring='accuracy', n_jobs=-1)
print(f"\nEnsemble: {ensemble_scores.mean():.4f} (+/- {ensemble_scores.std() * 2:.4f})")


In [None]:
# Train on full dataset (all training rows, no hold-out)
print("Training final ensemble on full dataset...")
voting_clf.fit(X_scaled, y)

# Make predictions on the scaled test matrix
predictions = voting_clf.predict(X_test_scaled)

# Create submission: one row per test passenger, with predictions cast back to booleans.
# Relies on X_test_scaled rows being in the same order as test_passenger_ids.
submission = pd.DataFrame({
    'PassengerId': test_passenger_ids,
    'Transported': predictions.astype(bool)
})

# Save submission to the current working directory, without the index column
submission.to_csv('submission_enhanced.csv', index=False)
print(f"Submission saved! Shape: {submission.shape}")
print("\nFirst few predictions:")
print(submission.head(10))
