In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re

from sklearn.svm import LinearSVC
from sklearn import metrics as metrics
from sklearn.model_selection import train_test_split

from sklearn.dummy import DummyClassifier

# Import dataframes

In [2]:
df_train = pd.read_csv('data/train.csv', header=0) # -> treino
df_test = pd.read_csv('data/test.csv', header=0) # -> teste

# Feature engineering and data preparation
    - Not considering NaN still

In [3]:
# Define the WIP DataFrame for backup
df_full = pd.concat([df_train.drop(columns=['Transported']), df_test]).reset_index(drop=True)


wip_df = df_full.copy()

In [4]:
display(wip_df.head())
display(wip_df.info())
wip_df.describe()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Cabin         12671 non-null  object 
 4   Destination   12696 non-null  object 
 5   Age           12700 non-null  float64
 6   VIP           12674 non-null  object 
 7   RoomService   12707 non-null  float64
 8   FoodCourt     12681 non-null  float64
 9   ShoppingMall  12664 non-null  float64
 10  Spa           12686 non-null  float64
 11  VRDeck        12702 non-null  float64
 12  Name          12676 non-null  object 
dtypes: float64(6), object(7)
memory usage: 1.3+ MB


None

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,12700.0,12707.0,12681.0,12664.0,12686.0,12702.0
mean,28.771969,222.897852,451.961675,174.906033,308.476904,306.789482
std,14.387261,647.596664,1584.370747,590.55869,1130.279641,1180.097223
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,49.0,77.0,29.0,57.0,42.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


- Working with PassengerId
    - Separating Passenger groups within passengers

In [5]:
def get_passenger_group(passengerid: str) -> str:
    if re.search('_', passengerid):
        return passengerid[0:4]
    
    else:
        return 'check'
    
def set_passenger_groupsize(passengergroup: int, grouped_df: pd.DataFrame) -> str:
    if grouped_df['PassengerId'][passengergroup] == 1:
        return 'Alone'
    
    elif grouped_df['PassengerId'][passengergroup] == 2:
        return 'Pair'
    
    elif grouped_df['PassengerId'][passengergroup] == 3:
        return 'Trio'
    
    elif grouped_df['PassengerId'][passengergroup] >= 4:
        return 'Family'

In [6]:
wip_df['PassengerGroup'] = wip_df.apply(lambda x: get_passenger_group(x['PassengerId']), axis=1)
# wip_df.loc[wip_df['PassengerGroup'] == 'check'] <- no check found

In [7]:
grouped_df = wip_df[['PassengerGroup','PassengerId']].groupby('PassengerGroup').count()
wip_df['Riding'] = wip_df.apply(lambda x: set_passenger_groupsize(x['PassengerGroup'], grouped_df), axis=1)

In [8]:
wip_df.sample(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,PassengerGroup,Riding
5986,6335_01,Earth,False,F/1312/P,TRAPPIST-1e,32.0,False,18.0,0.0,0.0,1382.0,0.0,Joandy Potterry,6335,Alone
8818,0273_02,Mars,True,F/60/P,TRAPPIST-1e,20.0,False,0.0,0.0,0.0,0.0,0.0,Triesh Ste,273,Trio
6899,7307_01,Earth,False,F/1405/S,TRAPPIST-1e,39.0,False,19.0,102.0,0.0,678.0,0.0,Velyne Hubbarton,7307,Alone
11998,7243_03,Earth,True,G/1180/S,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,0.0,0.0,Marlly Vales,7243,Family
279,0309_01,Earth,False,F/60/S,TRAPPIST-1e,18.0,False,1194.0,0.0,157.0,6.0,0.0,Billya Bowerson,309,Alone
1908,2044_01,Earth,False,F/396/S,TRAPPIST-1e,27.0,,0.0,0.0,634.0,1.0,23.0,Breney Conleydenan,2044,Alone
2675,2866_01,Europa,True,C/110/S,TRAPPIST-1e,36.0,True,0.0,0.0,0.0,0.0,0.0,Hadirk Wheededly,2866,Alone
12302,7874_01,Earth,False,F/1635/P,TRAPPIST-1e,21.0,False,1004.0,0.0,22.0,3.0,0.0,Karena Oneidson,7874,Pair
4554,4846_01,Earth,False,E/314/S,TRAPPIST-1e,38.0,False,161.0,3.0,0.0,624.0,0.0,Sonnie Canields,4846,Pair
6717,7087_01,Earth,True,G/1141/P,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,0.0,0.0,Dennie Moreman,7087,Alone


- Working with RoomService, FoodCourt, ShoppingMall, Spa and VRDeck bills
    - Resolving the problem with NaNs
    - Filling all other NaN with 0

In [9]:
columns = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck'
]

wip_df['TotalBill'] = wip_df[columns].sum(axis=1)
wip_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,PassengerGroup,Riding,TotalBill
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,1,Alone,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,2,Alone,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,3,Pair,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,3,Pair,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,4,Alone,1091.0


In [10]:
wip_df[columns] = wip_df[columns].fillna(0)

- Working with Destination Objects
    - TRAPPIST-1e
    - 55 Cancri e
    - PSO J318.5-22

In [11]:
wip_df['Destination'] = pd.Categorical(wip_df['Destination'])

In [12]:
wip_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,PassengerGroup,Riding,TotalBill
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,1,Alone,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,2,Alone,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,3,Pair,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,3,Pair,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,4,Alone,1091.0


- Working with Cabins
    - Splitting into Deck/Side

In [13]:
wip_df[['Cabin_Deck','Cabin_Number','Cabin_Side']] = wip_df['Cabin'].str.split('/', expand=True)

In [14]:
wip_df['Cabin_Deck'] = pd.Categorical(wip_df['Cabin_Deck'])
wip_df['Cabin_Side'] = pd.Categorical(wip_df['Cabin_Side'])

- Working with CryoSleep
    - Transforming column to bool ignoring NaN

In [15]:
wip_df['CryoSleep'] = wip_df['CryoSleep'].loc[wip_df['CryoSleep'].notna()].astype(bool)

- Working with VIP
    - Transforming column to bool ignoring NaN

In [16]:
wip_df['VIP'] = wip_df['VIP'].loc[wip_df['VIP'].notna()].astype(bool)

- Working with HomePlanet
    - Europa, Mars, Earth and Empty HomePlanets

In [17]:
wip_df['HomePlanet'] = pd.Categorical(wip_df['HomePlanet'])

- Working with Age
    - Filling NaN with median

In [18]:
wip_df['Age'] = wip_df.Age.fillna(wip_df.Age.median())

In [19]:
wip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   PassengerId     12970 non-null  object  
 1   HomePlanet      12682 non-null  category
 2   CryoSleep       12660 non-null  object  
 3   Cabin           12671 non-null  object  
 4   Destination     12696 non-null  category
 5   Age             12970 non-null  float64 
 6   VIP             12674 non-null  object  
 7   RoomService     12970 non-null  float64 
 8   FoodCourt       12970 non-null  float64 
 9   ShoppingMall    12970 non-null  float64 
 10  Spa             12970 non-null  float64 
 11  VRDeck          12970 non-null  float64 
 12  Name            12676 non-null  object  
 13  PassengerGroup  12970 non-null  object  
 14  Riding          12970 non-null  object  
 15  TotalBill       12970 non-null  float64 
 16  Cabin_Deck      12671 non-null  category
 17  Cabin_Number

- Getting the dummies from cols

In [20]:
dummy_cols = ['HomePlanet','Destination','Riding','Cabin_Deck','Cabin_Side']

dummies = pd.get_dummies(wip_df[['PassengerId','HomePlanet','Destination','Riding','Cabin_Deck','Cabin_Side']], dummy_na=True, columns=dummy_cols)

In [21]:
# Adding dummies and saving final DataFrame
df_full = wip_df.drop(columns=
    dummy_cols + 
    [
        'Cabin',
        'Name',
        'PassengerGroup',
        'Cabin_Number'
    ]
)

In [22]:
df_full = df_full.merge(dummies, on='PassengerId', how='inner')

# Model selection

In [23]:
# Returning the preprocessed DataFrame to its original values
df_train = df_train[['PassengerId','Transported']].merge(df_full, on='PassengerId', how='inner').copy()
df_test = df_test[['PassengerId']].merge(df_full, on='PassengerId', how='inner').copy()

df_train = df_train.drop(columns='PassengerId')
df_test = df_test.drop(columns='PassengerId')

In [24]:
x = df_train.drop(columns='Transported')
y = df_train['Transported']

In [25]:
dm = DummyClassifier(random_state=28)
dm.fit(x, y)

dm_predictions = dm.predict(x)
metrics.accuracy_score(y, dm_predictions)

0.5036236051995858