In [1]:
#Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor



# Building initial model

In [2]:
#Loading data
train_data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv", index_col = 'PassengerId') #data used to build the model
display(train_data.head())
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv", index_col = 'PassengerId') #data used to create submission
display(test_data.head())

#Setting the target
y = train_data.Transported

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [3]:
#Getting overall information about data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 891.4+ KB


In [4]:
#Choosing initial features and setting initial X
features_in = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP']
X_in = train_data[features_in]
X_in.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0001_01,Europa,False,TRAPPIST-1e,39.0,False
0002_01,Earth,False,TRAPPIST-1e,24.0,False
0003_01,Europa,False,TRAPPIST-1e,58.0,True
0003_02,Europa,False,TRAPPIST-1e,33.0,False
0004_01,Earth,False,TRAPPIST-1e,16.0,False


In [5]:
X_in.HomePlanet.unique() #how many unique home planets there are

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [6]:
X_in.Destination.unique() #how many unique destinations there are

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [7]:
#Splitting data into training and test set
train_X_in, val_X_in, train_y, val_y = train_test_split(X_in, y, random_state = 0)

In [8]:
#Looking for missing values
print(f'Shape of training data is {train_X_in.shape}')

missing_val_count_by_col = train_X_in.isnull().sum()
print(f'Columns with sum of missing values:\n {missing_val_count_by_col}')


Shape of training data is (6519, 5)
Columns with sum of missing values:
 HomePlanet     150
CryoSleep      160
Destination    137
Age            135
VIP            167
dtype: int64


In [9]:
#preprocessing for numerical data
numerical_col = ["Age"]
numerical_transformer = SimpleImputer(strategy = "median")

#preporcessing for categorical data
categorical_col = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
categorical_transformer = OneHotEncoder(handle_unknown = "ignore")

#Bundle preprocesisng
preprocessor = ColumnTransformer(
    transformers =[
        ('num', numerical_transformer, numerical_col),
        ('cat', categorical_transformer, categorical_col)
    ]
)

#Buildign the model
initial_model = XGBRegressor()

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', initial_model)
                          ])

#Preprocessing of training data and fitting the model
pipeline.fit(train_X_in, train_y)

#Preprocessing of validation data and getting predictions
preds = pipeline.predict(val_X_in)

print(f'MAE: {mean_absolute_error(preds, val_y)}')

MAE: 0.3606950640678406


In [10]:
#Creating test_X containing columns with features from test dataset
test_X = test_data[features_in]
#Making predicitions
test_pred = pipeline.predict(test_X)

In [11]:
threshold = 0.5
binary_labels = ["True" if p >= threshold else "False" for p in test_pred]
print(binary_labels)

['True', 'False', 'True', 'False', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'True', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'True', 'False', 'True', 'False', 'False', 'False', 'False', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'True', 'True', 'False', 'True', 'False', 'False', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'True', 'True', 'True', 'True', 'True', 'False', 'False', 'False', 'False', 'True', 'False', 'True', 'True', 'True', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'False', 'False', 'True', 'False', 'True', 'False', 'True', 'False', 'False', 'True', 'False', 'True', 'True', 'False', 'False', 'False', 'False', 'True', 'True

In [12]:
#Generating a submission
output = pd.DataFrame({'Transported': binary_labels}, index = test_data.index)
output.to_csv('my_submission.csv')
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [13]:
output

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,False
9269_01,False
9271_01,True
9273_01,True
