# Spaceship Titanic

In [85]:
import pandas as pd
import numpy as np

## Loading the data

In [86]:
train_data_csv = pd.read_csv("data/train.csv")
test_data_csv = pd.read_csv("data/test.csv")

## Looking at the data

In [87]:
train_data_csv.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [88]:
train_data_csv.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [89]:
missing_values = train_data_csv.isna().sum()
print(missing_values)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [90]:
print(train_data_csv.dtypes)

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object


In [91]:
convert_dict = {'PassengerId': 'string', 'HomePlanet': 'string', 'CryoSleep': bool, 'Cabin': 'string', 'Destination': 'string', 'VIP': bool, 'Name': 'string'}
# Convert columns using the dictionary
# train_data_csv = train_data_csv.convert_dtypes()
train_data_csv = train_data_csv.astype(dtype=convert_dict)
print(train_data_csv.dtypes)

PassengerId     string[python]
HomePlanet      string[python]
CryoSleep                 bool
Cabin           string[python]
Destination     string[python]
Age                    float64
VIP                       bool
RoomService            float64
FoodCourt              float64
ShoppingMall           float64
Spa                    float64
VRDeck                 float64
Name            string[python]
Transported               bool
dtype: object


In [92]:
train_data_csv.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [93]:
# Check if the number of unique indexes equal the length of the train_data_csv dataframe
print(train_data_csv['PassengerId'].nunique()==len(train_data_csv.index))

True


In [101]:
missing_values = train_data_csv.isna().sum()
print(missing_values)

PassengerId       0
HomePlanet      201
CryoSleep         0
Cabin           199
Destination     182
Age               0
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


We can "fill in" the missing values for age using the mean value of the column or similar, we remove the rest.

In [102]:
mean_age = train_data_csv['Age'].mean()

train_data_csv['Age'] = train_data_csv['Age'].fillna(value=mean_age)
#train_data_csv['Age'].fillna(value=mean_age, inplace=True)

In [103]:
missing_values = train_data_csv.isna().sum()
print(missing_values)

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
dtype: int64


In [97]:
train_data_csv_copy.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,7100.0,7100.0,7100.0,7100.0,7100.0,7100.0
mean,28.849718,222.166901,476.503662,173.510845,313.330141,300.869014
std,14.389745,648.409495,1671.64887,562.140463,1140.373663,1118.693546
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,20.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,47.25,81.0,26.0,62.0,46.0
max,79.0,9920.0,29813.0,12253.0,22408.0,20336.0
