In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# Exploring dataset


In [2]:
# Load the train and test datasets into Pandas dataframes
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Explore the first few rows of the train dataset
print("Training DataFrame:")
print(train_df.head())

# Explore the first few rows of the test dataset
print("\nTest DataFrame:")
print(test_df.head())

# Get a summary of the train dataset
print("\nTraining DataFrame Info:")
print(train_df.info())

# Get basic statistics for the numerical features in the train dataset
print("\nTraining DataFrame Description:")
print(train_df.describe())

# Check for missing values in the training data
print("\nMissing Values in Training DataFrame:")
print(train_df.isnull().sum())


Training DataFrame:
  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  

In [3]:
import pandas as pd
from sklearn.impute import KNNImputer

# Step 1: Load the train and test datasets
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Remove the 'Transported' column from the train dataset before combining
train_df_no_target = train_df.drop(columns=['Transported'])

# Combine the train and test datasets
combined_df = pd.concat([train_df_no_target, test_df], ignore_index=True)

# Step 2: KNN Imputation
# Initialize the KNN Imputer
imputer = KNNImputer(n_neighbors=5)

# Apply KNN Imputation only on numeric columns
numeric_cols = combined_df.select_dtypes(include=[np.number]).columns
imputed_data = imputer.fit_transform(combined_df[numeric_cols])

# Convert the imputed data back into the combined DataFrame
combined_df[numeric_cols] = pd.DataFrame(imputed_data, columns=numeric_cols)

# Step 3: Feature Engineering - Example of splitting the 'Cabin' column
# Splitting the 'Cabin' column into 'Deck', 'Cabin_Number', and 'Side'
combined_df[['Deck', 'Cabin_Number', 'Side']] = combined_df['Cabin'].str.split('/', expand=True)

# Drop the original 'Cabin' column as it's now split
combined_df.drop(columns=['Cabin'], inplace=True)

# Continue with other feature engineering steps here...
# For example: One-hot encoding for categorical columns, creating new features, etc.

# NOTE: Do NOT split the dataset yet! Continue with any other feature engineering as needed.


In [4]:
# Check for missing values in the combined dataset after KNN imputation and feature engineering
print("Missing values in the combined dataset after KNN Imputation and feature engineering:")
print(combined_df.isnull().sum())


Missing values in the combined dataset after KNN Imputation and feature engineering:
PassengerId       0
HomePlanet      288
CryoSleep       310
Destination     274
Age               0
VIP             296
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Name            294
Deck            299
Cabin_Number    299
Side            299
dtype: int64


In [5]:
# Filling missing values in Deck, Cabin_Number, Side with "U" and -1 respectively
combined_df['Deck'].fillna('U', inplace=True)
combined_df['Cabin_Number'].fillna(-1, inplace=True)
combined_df['Side'].fillna('U', inplace=True)

# Filling missing values in HomePlanet, Destination, VIP, and CryoSleep with "U"
combined_df['HomePlanet'].fillna('U', inplace=True)
combined_df['Destination'].fillna('U', inplace=True)
combined_df['VIP'].fillna('U', inplace=True)
combined_df['CryoSleep'].fillna('U', inplace=True)

# Handling the Name column - optional, depending on its utility
combined_df['Name'].fillna('Unknown', inplace=True)

# Verify that there are no more missing values
print("Missing values after filling categorical columns:")
print(combined_df.isnull().sum())


Missing values after filling categorical columns:
PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Deck            0
Cabin_Number    0
Side            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Deck'].fillna('U', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['Cabin_Number'].fillna(-1, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [6]:
# Apply One-Hot Encoding to categorical columns
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

# Use pandas get_dummies to create one-hot encoded columns
combined_df_encoded = pd.get_dummies(combined_df, columns=categorical_columns, drop_first=True)

# Optional: Drop the 'Name' and 'PassengerId' columns if they are not needed for the model
combined_df_encoded.drop(columns=['Name', 'PassengerId'], inplace=True, errors='ignore')

# Check the new dataframe with encoded categorical variables
print("DataFrame after One-Hot Encoding:")
print(combined_df_encoded.head())


DataFrame after One-Hot Encoding:
    Age  RoomService  FoodCourt  ShoppingMall     Spa  VRDeck Cabin_Number  \
0  39.0          0.0        0.0           0.0     0.0     0.0            0   
1  24.0        109.0        9.0          25.0   549.0    44.0            0   
2  58.0         43.0     3576.0           0.0  6715.0    49.0            0   
3  33.0          0.0     1283.0         371.0  3329.0   193.0            0   
4  16.0        303.0       70.0         151.0   565.0     2.0            1   

   HomePlanet_Europa  HomePlanet_Mars  HomePlanet_U  ...  Deck_B  Deck_C  \
0               True            False         False  ...    True   False   
1              False            False         False  ...   False   False   
2               True            False         False  ...   False   False   
3               True            False         False  ...   False   False   
4              False            False         False  ...   False   False   

   Deck_D  Deck_E  Deck_F  Deck_G  Deck_

In [7]:
# Convert boolean columns to integers (0 and 1)
combined_df_encoded = combined_df_encoded.astype(int)

# Verify the changes
print("DataFrame after converting boolean columns to integers:")
print(combined_df_encoded.head())


DataFrame after converting boolean columns to integers:
   Age  RoomService  FoodCourt  ShoppingMall   Spa  VRDeck  Cabin_Number  \
0   39            0          0             0     0       0             0   
1   24          109          9            25   549      44             0   
2   58           43       3576             0  6715      49             0   
3   33            0       1283           371  3329     193             0   
4   16          303         70           151   565       2             1   

   HomePlanet_Europa  HomePlanet_Mars  HomePlanet_U  ...  Deck_B  Deck_C  \
0                  1                0             0  ...       1       0   
1                  0                0             0  ...       0       0   
2                  1                0             0  ...       0       0   
3                  1                0             0  ...       0       0   
4                  0                0             0  ...       0       0   

   Deck_D  Deck_E  Deck_F  Dec

In [8]:
# Split the combined data back into train and test datasets
train_df_final = combined_df_encoded.iloc[:len(train_df), :].copy()
test_df_final = combined_df_encoded.iloc[len(train_df):, :].copy()

# Reattach the 'Transported' column back to the train dataset
train_df_final['Transported'] = train_df['Transported'].values

# Verify the final train and test datasets
print("Final training dataset:")
print(train_df_final.head())

print("\nFinal test dataset:")
print(test_df_final.head())



Final training dataset:
   Age  RoomService  FoodCourt  ShoppingMall   Spa  VRDeck  Cabin_Number  \
0   39            0          0             0     0       0             0   
1   24          109          9            25   549      44             0   
2   58           43       3576             0  6715      49             0   
3   33            0       1283           371  3329     193             0   
4   16          303         70           151   565       2             1   

   HomePlanet_Europa  HomePlanet_Mars  HomePlanet_U  ...  Deck_C  Deck_D  \
0                  1                0             0  ...       0       0   
1                  0                0             0  ...       0       0   
2                  1                0             0  ...       0       0   
3                  1                0             0  ...       0       0   
4                  0                0             0  ...       0       0   

   Deck_E  Deck_F  Deck_G  Deck_T  Deck_U  Side_S  Side_U  Tra

In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Prepare the data
X = train_df_final.drop(columns=['Transported'])
y = train_df_final['Transported'].astype(int)  # Ensure that 'Transported' is binary (0 or 1)

# Step 2: Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Step 4: Evaluate the model
y_pred = xgb_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Step 5: Make predictions on the test set
test_predictions = xgb_model.predict(test_df_final)

# Step 6: Prepare the submission
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Transported': test_predictions.astype(bool)  # Convert to boolean as expected by Kaggle
})

# Export submission to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


Validation Accuracy: 0.7872
Submission file created successfully!
