# Spaceship Titanic

In [107]:
import pandas as pd
import numpy as np

## Loading the data

In [126]:
train_data_csv = pd.read_csv("data/train.csv")
test_data_csv = pd.read_csv("data/test.csv")

## Looking at the data

In [127]:
train_data_csv.head()
print(len(train_data_csv.index))

8693


In [110]:
train_data_csv.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [111]:
missing_values = train_data_csv.isna().sum()
print(missing_values)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [112]:
print(train_data_csv.dtypes)

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object


In [113]:
convert_dict = {'PassengerId': 'string', 'HomePlanet': 'string', 'CryoSleep': bool, 'Cabin': 'string', 'Destination': 'string', 'VIP': bool, 'Name': 'string'}
# Convert columns using the dictionary
# train_data_csv = train_data_csv.convert_dtypes()
train_data_csv = train_data_csv.astype(dtype=convert_dict)
print(train_data_csv.dtypes)

PassengerId     string[python]
HomePlanet      string[python]
CryoSleep                 bool
Cabin           string[python]
Destination     string[python]
Age                    float64
VIP                       bool
RoomService            float64
FoodCourt              float64
ShoppingMall           float64
Spa                    float64
VRDeck                 float64
Name            string[python]
Transported               bool
dtype: object


We should split "Cabin" into 3 variables: deck num side.

In [114]:
train_data_csv[['Deck', 'Num', 'Side']] = train_data_csv['Cabin'].str.split('/', expand=True) 
train_data_csv.drop(columns=['Cabin'], inplace=True)

In [115]:
train_data_csv.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


Check how many missing values there are for the numeric values when cryosleep is True

In [116]:
columns_of_interest = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
missing_overall = train_data_csv[columns_of_interest].isna().sum()
print("Missing values overall:")
print(missing_overall)
cryosleep_true_df = train_data_csv[train_data_csv['CryoSleep'] == True]

missing_cryosleep_true = cryosleep_true_df[columns_of_interest].isna().sum()
print("\nMissing values when CryoSleep is True:")
print(missing_cryosleep_true)

Missing values overall:
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
dtype: int64

Missing values when CryoSleep is True:
RoomService      70
FoodCourt        73
ShoppingMall    104
Spa              70
VRDeck           69
dtype: int64


In [117]:
numeric_cols = cryosleep_true_df.select_dtypes(include=['number'])
numeric_cols = numeric_cols.drop(columns=['Age'])
numeric_sums = numeric_cols.sum()
print(numeric_sums)
numeric_cols.describe()

RoomService     46959.0
FoodCourt       98637.0
ShoppingMall    30513.0
Spa             58860.0
VRDeck          61980.0
dtype: float64


Unnamed: 0,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,3184.0,3181.0,3150.0,3184.0,3185.0
mean,14.74843,31.008174,9.686667,18.486181,19.459969
std,196.94881,389.147151,101.819818,236.36884,286.142081
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,8243.0,16263.0,2179.0,6075.0,9811.0


We can see that the median value is 0 for all numeric values when Cryosleep is True. We can conclude that we can fill all missing values with 0.

In [118]:
mask = train_data_csv['CryoSleep'] == True
train_data_csv.loc[mask, columns_of_interest] = train_data_csv.loc[mask, columns_of_interest].fillna(0)

In [119]:
cryosleep_true_df = train_data_csv[train_data_csv['CryoSleep'] == True]
missing_cryosleep_true = cryosleep_true_df[columns_of_interest].isna().sum()
print("\nMissing values when CryoSleep is True:")
print(missing_cryosleep_true)


Missing values when CryoSleep is True:
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64


In [120]:
# Check if the number of unique indexes equal the length of the train_data_csv dataframe
print(train_data_csv['PassengerId'].nunique()==len(train_data_csv.index))

True


In [121]:
missing_values = train_data_csv.isna().sum()
print(missing_values)

PassengerId       0
HomePlanet      201
CryoSleep         0
Destination     182
Age             179
VIP               0
RoomService     111
FoodCourt       110
ShoppingMall    104
Spa             113
VRDeck          119
Name            200
Transported       0
Deck            199
Num             199
Side            199
dtype: int64


We can "fill in" the missing values for age using the mean value of the column or similar, we remove the rest.

In [122]:
mean_age = train_data_csv['Age'].mean()

train_data_csv['Age'] = train_data_csv['Age'].fillna(value=mean_age)
#train_data_csv['Age'].fillna(value=mean_age, inplace=True)

## Dropping columns
train_data_csv = train_data_csv.drop(['Name'], axis=1)

In [123]:
missing_values = train_data_csv.isna().sum()
print(missing_values)

PassengerId       0
HomePlanet      201
CryoSleep         0
Destination     182
Age               0
VIP               0
RoomService     111
FoodCourt       110
ShoppingMall    104
Spa             113
VRDeck          119
Transported       0
Deck            199
Num             199
Side            199
dtype: int64


In [124]:
train_data_csv.dropna(inplace=True)
print(len(train_data_csv.index))

7615
