In this notebook I show the algorithmic approach I used to fill (nearly) every missing Cabin. This is not a guessing or probabilistic approach: cabins are filled in a structured order based on each passenger's HomePlanet and their group (taken from their PassengerId).

Cabins are filled in order of their number within each deck and side, i.e. if a passenger is in cabin A/05/P, a passenger in a later group cannot be in A/04/P, but they could be in A/01/S or B/01/P.

We are defining the components of the cabin by 
A/01/P
A = cabin deck, can take values 'A','B','C','D','E','F','G','T'
01 = cabin number, can take values 0,1,2...
P  = cabin side, can take values 'P', 'S' (presumably 'Port' and 'Starboard' )

Some assumptions
* If two passengers are in the same group then they are on the same side, Appendix A.1
* If two passengers are in the same group then they are from the same home planet, Appendix A.2
* If two passengers share a last name then they are from the same home planet, Appendix A.3
* Home planets restrict which decks a passenger is on, Appendix A.4
** Passengers with Mars as their home planet are in decks 'D','E' or 'F'
** Passengers with Earth as their home planet are in decks 'E','F' or 'G'
** Passengers with Europa as their home planet are in decks 'A','B','C','D','E','T'
** If a passenger has no bills (RoomService + ShoppingMall + Spa + VRDeck + FoodCourt) and has members in its group in different decks then they are restricted to these decks 
*** Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'
* Children <= 12 in age have no bills, Appendix A.5



# ?? df.iloc[1055]: why has its HomePlanet been changed??

In [1]:
test_data[test_data.PassengerId == '0742_02']

NameError: name 'test_data' is not defined

# Feature engineering

In [209]:
import pandas as pd 
from collections import defaultdict # Slightly modified from a regular dictionary


# Load both competition splits and tag every row with the split it came from.
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
training_data['Set'] = 'Train'
test_data['Set'] = 'Test'

# The combined dataframe we will be using for the rest of this project.
# ignore_index=True gives every row a unique label; without it the train and
# test rows each keep their own 0..n labels, so a label-based lookup (.loc)
# would match two rows until the index is reset.
df = pd.concat([training_data, test_data], ignore_index=True)



In [210]:
def column_splits(data_frame):
    """Split the compound ``PassengerId``, ``Cabin`` and ``Name`` columns into their parts.

    The following columns are added to ``data_frame`` in place:

    * ``Group`` / ``GroupNumber`` -- the two halves of ``PassengerId`` around ``_``.
    * ``CabinDeck`` / ``CabinNumber`` / ``CabinSide`` -- the three ``/``-separated
      parts of ``Cabin``; ``CabinNumber`` is cast to the nullable ``Int64`` dtype.
    * ``FirstName`` / ``LastName`` -- ``Name`` split at its first space.

    Args:
        data_frame: DataFrame with ``PassengerId``, ``Cabin`` and ``Name`` columns.

    Returns:
        The same ``data_frame`` object, with the new columns added.
    """
    data_frame[['Group', 'GroupNumber']] = data_frame['PassengerId'].str.split('_', expand=True)

    data_frame[['CabinDeck', 'CabinNumber', 'CabinSide']] = data_frame['Cabin'].str.split('/', expand=True)
    # Nullable integer dtype so passengers without a cabin keep a missing number
    data_frame['CabinNumber'] = data_frame['CabinNumber'].astype('Int64')

    # n=1 splits at the first space only: a name with more than two words would
    # otherwise expand to more than two columns and make this assignment fail.
    data_frame[['FirstName', 'LastName']] = data_frame['Name'].str.split(' ', n=1, expand=True)

    return data_frame

# Derive the id/cabin/name parts, then put the passengers in group order
# (Group first, then GroupNumber) under a fresh 0..n-1 index.
df = (
    column_splits(df)
    .sort_values(by=['Group', 'GroupNumber'])
    .reset_index(drop=True)
)

In [248]:
# Total spend over the five billed amenities, added left to right; the total
# is missing whenever any single amount is missing.
billed_amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spend = df[billed_amenities[0]]
for amenity in billed_amenities[1:]:
    total_spend = total_spend + df[amenity]
df['Bills'] = total_spend

# Passengers younger than 13 and passengers in cryosleep are given zero bills
no_spend = (df['Age'] < 13) | (df['CryoSleep'] == True)
df.loc[no_spend, 'Bills'] = 0
    

In [212]:
def add_group_size_column(dataframe):
    """Add a ``GroupSize`` column holding the number of rows that share each row's ``Group``.

    Args:
        dataframe: DataFrame with a ``Group`` column.

    Returns:
        The same ``dataframe`` object, with ``GroupSize`` added.
    """
    members_per_group = dataframe.groupby('Group')['Group']
    # transform broadcasts each group's count back onto every row of that group
    dataframe['GroupSize'] = members_per_group.transform('count')
    return dataframe


# Record, on every row, how many passengers share that row's Group
df = add_group_size_column(df)


In [213]:
df[df.HomePlanet.isna()].

SyntaxError: invalid syntax (643836520.py, line 1)

In [214]:
df[df.PassengerId == '1645_01']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Set,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize
2357,1645_01,,True,,55 Cancri e,18.0,False,0.0,0.0,0.0,...,Train,1645,1,,,,Andace,Thonyderson,0.0,1


In [249]:
def impute_attribute_by_shared_features(dataframe, attribute, shared_feature):
    """Fill missing ``attribute`` values from rows that share the same ``shared_feature``.

    For every row whose ``attribute`` is missing, the first row (in frame order)
    that has the same ``shared_feature`` value and a known ``attribute`` supplies
    the value. Rows with no such match are left missing.

    Args:
        dataframe: DataFrame to fill; it is modified in place.
        attribute: name of the column to impute.
        shared_feature: name of the column used to match rows.

    Returns:
        The same ``dataframe`` object.
    """
    rows_missing_attribute = dataframe[dataframe[attribute].isna()]

    for row_label, passenger in rows_missing_attribute.iterrows():
        has_same_feature = dataframe[shared_feature] == passenger[shared_feature]
        donors = dataframe[has_same_feature].dropna(subset=[attribute])

        # Nobody with this feature value has a known attribute: leave it missing
        if donors.empty:
            continue

        dataframe.loc[row_label, attribute] = donors[attribute].iloc[0]

    return dataframe

# Fill missing home planets from a group-mate first, then from a passenger
# with the same last name
for shared_feature in ('Group', 'LastName'):
    df = impute_attribute_by_shared_features(df, 'HomePlanet', shared_feature)

In [250]:

def add_potential_decks_column(dataframe):
    """Add a ``PotentialDecks`` column: the decks each cabin-less passenger could be on.

    Rows whose ``Cabin`` is known get ``None``. For every other row the
    candidate decks are chosen as follows:

    1. If the passenger has zero ``Bills``, a known ``HomePlanet`` and
       ``GroupSize > 1``, the decks of the other group members are inspected:

       * they are spread over more than one known deck -> the planet's
         "no bills" decks;
       * none of their decks is missing -> the "no bills" decks plus the
         members' deck, sorted;
       * some decks are missing and the single known one is itself a
         "no bills" deck -> the "no bills" decks.

    2. Otherwise, if the home planet is known -> the standard decks for it.
    3. If the home planet is unknown -> every deck present in ``CabinDeck``.

    Args:
        dataframe: DataFrame with ``Cabin``, ``CabinDeck``, ``HomePlanet``,
            ``Bills``, ``Group``, ``GroupSize`` and ``PassengerId`` columns.

    Returns:
        The same ``dataframe`` object, with ``PotentialDecks`` added.
    """
    potential_decks_by_homeplanet = {
        'Earth': ['E', 'F', 'G'],
        'Europa': ['A', 'B', 'C', 'D', 'E', 'T'],
        'Mars': ['D', 'E', 'F'],
    }

    potential_decks_by_homeplanet_no_bills = {
        'Earth': ['G'],
        'Europa': ['B'],
        'Mars': ['E', 'F'],
    }

    def func_potential_decks_apply(row):
        # Only passengers without a cabin need candidate decks
        if not pd.isna(row.Cabin):
            return None

        home_planet_known = not pd.isna(row.HomePlanet)

        if row.Bills == 0 and home_planet_known and row.GroupSize > 1:
            no_bills_decks = potential_decks_by_homeplanet_no_bills[row.HomePlanet]

            group_members = dataframe[(dataframe.Group == row.Group) & (dataframe.PassengerId != row.PassengerId)].CabinDeck
            known_decks = group_members.dropna()

            # Other members of the group are in multiple different cabin decks
            if known_decks.nunique() > 1:
                return list(no_bills_decks)

            # Every other member's deck is known: sorted so the result does
            # not depend on set iteration order
            if not group_members.isna().any():
                return sorted(set(no_bills_decks) | set(known_decks))

            # Look at the known deck rather than the first member's value,
            # which is NaN when that member has no cabin; otherwise the
            # outcome would depend on the order of the group's rows.
            if known_decks.nunique() == 1 and known_decks.iloc[0] in no_bills_decks:
                return list(no_bills_decks)

        # If not then it goes to the standard decks for their homeplanet.
        # Copies are returned so rows never share one mutable list.
        if home_planet_known:
            return list(potential_decks_by_homeplanet[row.HomePlanet])

        # If their homeplanet isn't known then they could be in any cabin deck
        return list(dataframe.CabinDeck.dropna().unique())

    dataframe['PotentialDecks'] = dataframe.apply(func_potential_decks_apply, axis=1)
    return dataframe
    
            

def add_potential_sides_column(dataframe):
    """Add a ``PotentialSides`` column: the cabin sides each cabin-less passenger could be on.

    Rows whose ``Cabin`` is known get ``None``. Otherwise the result is a
    one-element list holding the first known ``CabinSide`` found in the
    passenger's group, or ``['P', 'S']`` when the group has no known side.

    Args:
        dataframe: DataFrame with ``Cabin``, ``CabinSide`` and ``Group`` columns.

    Returns:
        The same ``dataframe`` object, with ``PotentialSides`` added.
    """

    def func_potential_sides_apply(row):
        if not pd.isna(row.Cabin):
            return None

        # Known sides within the passenger's group
        in_same_group = dataframe.Group == row.Group
        known_sides = dataframe.loc[in_same_group, 'CabinSide'].dropna()

        # Nobody in the group has a known side, so either side is possible
        if known_sides.empty:
            return ['P', 'S']

        return [known_sides.iloc[0]]

    potential_sides = dataframe.apply(func_potential_sides_apply, axis=1)
    dataframe['PotentialSides'] = potential_sides
    return dataframe

    
# Candidate decks first, then candidate sides, for every passenger without a cabin
df = add_potential_sides_column(add_potential_decks_column(df))



Earth 7442_02
Earth


In [253]:
df[df.PassengerId == '7442_02']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[G, E]",[S]


In [254]:
df[df.Group == '7442']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10410,7442_01,Earth,False,E/495/S,TRAPPIST-1e,43.0,False,20.0,0.0,726.0,...,1,E,495.0,S,Antony,Morrencis,,2,,
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[G, E]",[S]


In [255]:
# Re-establish group order under a fresh 0..n-1 index
df = df.sort_values(by=['Group', 'GroupNumber']).reset_index(drop=True)


In [256]:
df[df.Group == '0293']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
404,0293_01,Europa,True,,TRAPPIST-1e,47.0,False,0.0,0.0,0.0,...,1,,,,Tauxon,Suptibler,0.0,1,"[A, B, C, D, E, T]","[P, S]"


# Imputing

In [257]:
def impute_from_cabin_and_index(dataframe, cabin, index):
    """Write a ``deck/number/side`` cabin string, and its three parts, into one row.

    Args:
        dataframe: DataFrame holding the ``Cabin``, ``CabinDeck``,
            ``CabinNumber`` and ``CabinSide`` columns; modified in place.
        cabin: cabin identifier such as ``'A/5/P'``.
        index: index label of the row to fill.

    Returns:
        The same ``dataframe`` object.
    """
    cabin_parts = cabin.split("/")
    target_columns = ['Cabin', 'CabinDeck', 'CabinNumber', 'CabinSide']
    # The number is stored as an int, the other parts as strings
    new_values = [cabin, cabin_parts[0], int(cabin_parts[1]), cabin_parts[2]]

    dataframe.loc[index, target_columns] = new_values
    return dataframe

In [258]:
def passengers_empty_cabin_options(dataframe):
    """List, for every passenger without a cabin, the unoccupied cabins they could fill.

    For each candidate deck/side of a passenger, the highest cabin number seen
    on rows before the passenger and the lowest cabin number seen on rows after
    them are compared; every number strictly between the two is offered as a
    free cabin. A deck/side is skipped when it has no cabin on either side of
    the passenger's row. "Before" and "after" compare index labels, so the
    frame must be ordered by its index.

    Args:
        dataframe: DataFrame with ``Cabin``, ``CabinDeck``, ``CabinNumber``,
            ``CabinSide``, ``PotentialDecks`` and ``PotentialSides`` columns.

    Returns:
        Dict mapping the index label of each cabin-less passenger to a list of
        ``"deck/number/side"`` strings (empty when nothing fits).
    """
    all_passenger_cabin_options = {}
    cabinless_passengers = dataframe[dataframe['Cabin'].isna()]

    for passenger_index, passenger in cabinless_passengers.iterrows():
        free_cabins = []
        all_passenger_cabin_options[passenger_index] = free_cabins

        for deck in passenger.PotentialDecks:
            for side in passenger.PotentialSides:

                # Cabin numbers already taken on this deck and side
                on_deck_and_side = (dataframe['CabinDeck'] == deck) & (dataframe['CabinSide'] == side)
                taken_numbers = dataframe.loc[on_deck_and_side, 'CabinNumber']

                numbers_before = taken_numbers[taken_numbers.index < passenger_index].dropna().unique()
                numbers_after = taken_numbers[taken_numbers.index > passenger_index].dropna().unique()

                # -1 marks "no cabin of this deck and side found"
                highest_before = max(numbers_before, default=-1)
                lowest_after = min(numbers_after, default=-1)
                if highest_before == -1 or lowest_after == -1:
                    continue

                # A gap between the neighbouring numbers means unoccupied
                # cabins this passenger could potentially fill
                if highest_before + 1 < lowest_after:
                    free_cabins.extend(
                        f"{deck}/{i}/{side}" for i in range(highest_before + 1, lowest_after)
                    )

    return all_passenger_cabin_options





# solo group and only one room that fits

In [259]:
def solo_group_one_cabin_option(dataframe):
    """Assign a cabin to solo passengers who have exactly one free cabin option.

    A passenger alone in their group (``GroupSize == 1``) cannot share a cabin,
    so when exactly one free cabin fits them, that cabin is imputed.

    Args:
        dataframe: DataFrame with the columns required by
            ``passengers_empty_cabin_options`` plus ``GroupSize``; modified in place.

    Returns:
        The DataFrame returned by the last ``impute_from_cabin_and_index`` call,
        or ``dataframe`` itself when nothing was imputed.
    """
    # NOTE(review): options are computed once, before any imputation in this
    # pass, so two passengers whose single option is the same cabin would both
    # receive it.
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)

    # Select from the frame that was passed in (not the notebook-level `df`),
    # so the function works on whichever DataFrame the caller supplies.
    is_solo_without_cabin = dataframe.Cabin.isna() & (dataframe.GroupSize == 1)

    # Iterates through all the passengers that haven't got a Cabin yet and are
    # alone in their group (ie can't share)
    for passenger_index in list(dataframe[is_solo_without_cabin].index):
        passenger_options = all_passenger_cabin_options[passenger_index]

        # If they have only one free cabin that they could fill
        if len(passenger_options) == 1:
            dataframe = impute_from_cabin_and_index(dataframe, passenger_options[0], passenger_index)

    return dataframe


# no free rooms so has to share

In [260]:
def no_suitable_cabin_so_shares(dataframe):
    """Place passengers with no free cabin option into their group's cabin.

    For each cabin-less passenger that has no free cabin to fill, the cabins
    of their group on one of the passenger's potential decks are collected;
    when exactly one distinct cabin remains, the passenger is assigned to it.

    Args:
        dataframe: DataFrame with the columns required by
            ``passengers_empty_cabin_options`` plus ``Group``; modified in place.

    Returns:
        The DataFrame returned by the last ``impute_from_cabin_and_index`` call,
        or ``dataframe`` itself when nothing was imputed.
    """
    for passenger_index, free_cabins in passengers_empty_cabin_options(dataframe).items():

        # Passengers that still have a free cabin to fill are not handled here
        if free_cabins:
            continue

        passenger = dataframe.loc[passenger_index]

        # Cabins of the passenger's group that lie on a deck the passenger may be on
        in_same_group = dataframe['Group'] == passenger['Group']
        on_allowed_deck = dataframe['CabinDeck'].isin(passenger['PotentialDecks'])
        shareable_cabins = dataframe.loc[in_same_group & on_allowed_deck, 'Cabin'].dropna()

        # Only unambiguous when there is a single distinct cabin to share
        if shareable_cabins.nunique() == 1:
            dataframe = impute_from_cabin_and_index(dataframe, shareable_cabins.iloc[0], passenger_index)

    return dataframe
    


# only passenger that can take that cabin

In [261]:
def only_matching_passenger_for_cabin(dataframe):
    """Assign each free cabin that exactly one cabin-less passenger could fill.

    The passenger -> free-cabins mapping is inverted into cabin -> candidate
    passengers; every cabin with a single candidate is imputed to that passenger.

    Args:
        dataframe: DataFrame with the columns required by
            ``passengers_empty_cabin_options``; modified in place.

    Returns:
        The DataFrame returned by the last ``impute_from_cabin_and_index`` call,
        or ``dataframe`` itself when nothing was imputed.
    """
    # Invert the mapping: which passengers could fit each free cabin
    candidates_by_cabin = {}
    for passenger_index, free_cabins in passengers_empty_cabin_options(dataframe).items():
        for free_cabin in free_cabins:
            candidates_by_cabin.setdefault(free_cabin, []).append(passenger_index)

    # Impute only where a cabin has a single possible occupant
    for free_cabin, candidates in candidates_by_cabin.items():
        if len(candidates) != 1:
            continue
        dataframe = impute_from_cabin_and_index(dataframe, free_cabin, candidates[0])

    return dataframe


# all imputes

In [262]:
def all_imputes(dataframe):
    """Run the three cabin-imputation strategies, in order, two times over.

    Each round applies ``solo_group_one_cabin_option``,
    ``no_suitable_cabin_so_shares`` and ``only_matching_passenger_for_cabin``.

    Args:
        dataframe: DataFrame to impute cabins on.

    Returns:
        The DataFrame produced by the final strategy call.
    """
    imputation_steps = (
        solo_group_one_cabin_option,
        no_suitable_cabin_so_shares,
        only_matching_passenger_for_cabin,
    )

    # Two full rounds of the three steps
    for _ in range(2):
        for impute_step in imputation_steps:
            dataframe = impute_step(dataframe)

    return dataframe
    
# Run the automatic imputation, then count the missing values left in each column
df = all_imputes(df)
df.isna().sum()

PassengerId           0
HomePlanet           13
CryoSleep           310
Cabin                29
Destination         274
Age                 270
VIP                 296
RoomService         263
FoodCourt           289
ShoppingMall        306
Spa                 284
VRDeck              268
Name                294
Transported        4277
Set                   0
Group                 0
GroupNumber           0
CabinDeck            29
CabinNumber          29
CabinSide            29
FirstName           294
LastName            294
Bills               785
GroupSize             0
PotentialDecks    12941
PotentialSides    12941
dtype: int64

# Manual workings


In [264]:
def all_cabin_options_for_each_row(dataframe):
    """Print the free cabin options of every passenger still without a cabin.

    For each such passenger this prints the index label, the ``GroupSize``,
    the list of free cabins that match, and the passenger's ``Bills``.

    Args:
        dataframe: DataFrame with the columns required by
            ``passengers_empty_cabin_options`` plus ``GroupSize`` and ``Bills``.

    Returns:
        None; the report is written to standard output.
    """
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)
    for passenger_index, passenger_options in all_passenger_cabin_options.items():
        # The keys are index labels, so look the row up by label (.loc) rather
        # than by position (.iloc); the two only agree on a 0..n-1 index.
        passenger = dataframe.loc[passenger_index]

        print()
        print("Index:", passenger_index, "GroupSize:", passenger.GroupSize)
        print("Free cabins that match:")
        print(passenger_options)
        print(passenger.Bills)

                
             


In [65]:
all_cabin_options_for_each_row(df)


Index: 404 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 421 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 479 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 505 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 517 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 1429 GroupSize: 2
Free cabins that match:
['E/58/P']

Index: 1466 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S', 'E/58/P']

Index: 1543 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']

Index: 2442 GroupSize: 7
Free cabins that match:
[]

Index: 2970 GroupSize: 5
Free cabins that match:
[]

Index: 3529 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 3530 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 4233 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4254 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4569 GroupSize: 3
Free cabins that 

In [265]:
all_cabin_options_for_each_row(df)


Index: 404 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']
0.0

Index: 421 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']
nan

Index: 479 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']
2385.0

Index: 505 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']
1298.0

Index: 517 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']
789.0

Index: 1466 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']
0.0

Index: 1543 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']
0.0

Index: 2442 GroupSize: 7
Free cabins that match:
[]
1338.0

Index: 2970 GroupSize: 5
Free cabins that match:
[]
9597.0

Index: 3529 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']
711.0

Index: 3530 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']
791.0

Index: 4569 GroupSize: 3
Free cabins that match:
[]
770.0

Index: 4751 GroupSize: 7
Free cabins that match:
[]
3674.0

Index: 5016 GroupSize: 1
Free cabins that match:
['G/590/P', 'G/579/S']
67

In [266]:
df[df.Group == df.iloc[10411].Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10410,7442_01,Earth,False,E/495/S,TRAPPIST-1e,43.0,False,20.0,0.0,726.0,...,1,E,495.0,S,Antony,Morrencis,,2,,
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[G, E]",[S]


In [246]:
df[(df.HomePlanet == 'Earth') & (df.CryoSleep == False) & (df.Destination == 'TRAPPIST-1e')].CabinDeck.value_counts()

CabinDeck
F    1703
G    1262
E     451
Name: count, dtype: int64

In [136]:
df[df.Group == df.iloc[10411].Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10410,7442_01,Earth,False,E/495/S,TRAPPIST-1e,43.0,False,20.0,0.0,726.0,...,1,E,495.0,S,Antony,Morrencis,,2,,
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[E, F, G]",[S]


In [117]:
df[(df.Cabin.isna()) & (df.Bills == 0)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
404,0293_01,Europa,True,,TRAPPIST-1e,47.0,False,0.0,0.0,0.0,...,1,,,,Tauxon,Suptibler,0.0,1,"[A, B, C, D, E, T]","[P, S]"
1466,1041_01,Europa,True,,TRAPPIST-1e,46.0,False,0.0,0.0,0.0,...,1,,,,Algrafi,Heedry,0.0,1,"[A, B, C, D, E, T]","[P, S]"
1543,1095_01,Europa,True,,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,...,1,,,,Alhail,Drelcate,0.0,1,"[A, B, C, D, E, T]","[P, S]"
8450,6048_01,Mars,True,,,25.0,False,0.0,0.0,0.0,...,1,,,,Mele,Clake,0.0,1,"[D, E, F]","[P, S]"
10082,7183_01,Earth,True,,TRAPPIST-1e,19.0,,0.0,0.0,0.0,...,1,,,,Janne,Brookes,0.0,1,"[E, F, G]","[P, S]"
10313,7368_01,Europa,True,,TRAPPIST-1e,31.0,False,0.0,0.0,0.0,...,1,,,,Pharmus,Frolestty,0.0,1,"[A, B, C, D, E, T]","[P, S]"
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[E, F, G]",[S]
11129,7983_01,Europa,True,,TRAPPIST-1e,53.0,False,0.0,0.0,0.0,...,1,,,,Alderak,Wassird,0.0,1,"[A, B, C, D, E, T]","[P, S]"
11148,7995_01,Europa,True,,TRAPPIST-1e,36.0,False,0.0,0.0,0.0,...,1,,,,Grum,Stiviorad,0.0,1,"[A, B, C, D, E, T]","[P, S]"


Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'

In [116]:
df[df.Group == df.iloc[4751].Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
4750,3411_01,Mars,False,E/232/S,TRAPPIST-1e,22.0,False,446.0,0.0,1630.0,...,1,E,232.0,S,Herk,Homin,2076.0,7,,
4751,3411_02,Mars,False,,TRAPPIST-1e,22.0,False,1127.0,0.0,2451.0,...,2,,,,Naish,Harta,3674.0,7,"[D, E, F]",[S]
4752,3411_03,Mars,False,F/645/S,TRAPPIST-1e,20.0,False,1481.0,0.0,2183.0,...,3,F,645.0,S,Eggies,Homin,3693.0,7,,
4753,3411_04,Mars,False,F/645/S,TRAPPIST-1e,47.0,False,1696.0,10.0,48.0,...,4,F,645.0,S,Wal,Sté,1754.0,7,,
4754,3411_05,Mars,True,F/645/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,5,F,645.0,S,Douse,Homin,0.0,7,,
4755,3411_06,Mars,True,F/645/S,TRAPPIST-1e,9.0,False,0.0,0.0,0.0,...,6,F,645.0,S,Kadown,Harta,0.0,7,,
4756,3411_07,Mars,False,F/645/S,TRAPPIST-1e,19.0,False,1954.0,0.0,508.0,...,7,F,645.0,S,Blues,Harta,2462.0,7,,


In [77]:
df[df.Group == df.iloc[12651].Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
12651,9057_01,Europa,False,A/94/P,55 Cancri e,36.0,True,132.0,3479.0,0.0,...,1,A,94,P,Coxan,Statch,7397.0,2,"[A, B, C, D, E, T]",[P]
12652,9057_02,Europa,True,B/296/P,TRAPPIST-1e,55.0,False,0.0,0.0,0.0,...,2,B,296,P,Sitron,Statch,0.0,2,,


In [228]:




# Cabins worked out by hand, as (row index, cabin) pairs
cabin_list = [
    (1429, 'E/58/P'),
    (4233, 'B/98/P'),
    (4254, 'B/99/P'),
    (6493, 'E/300/S'),
    (6514, 'E/301/S'),
    (8413, 'A/57/P'),
    (12892, 'F/1785/S'),
    (12893, 'F/1785/S'),
]

# impute_from_cabin_and_index is called for its effect on df; the return value is unused
for index, cabin in cabin_list:
    impute_from_cabin_and_index(df, cabin, index)




# End


In [202]:
# Separate the combined frame back into its two splits using the 'Set' tag
traindata = df.loc[df['Set'] == 'Train']
testdata = df.loc[df['Set'] == 'Test']

In [203]:
# Load the comparison frame, aligning its cabin-number column name and dtype
# with the ones used in df
df_to_comp = (
    pd.read_csv('data/31remaining.csv')
    .rename(columns={'Number': 'CabinNumber'})
    .astype({'CabinNumber': 'Int64'})
)


In [204]:
df_to_comp[df_to_comp.Group == 6612]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,Deck,CabinNumber,Side,FirstName,LastName,GroupSize,Bills,potential_decks,potential_sides
9263,6612_01,Earth,False,G/1077/S,TRAPPIST-1e,,False,0.0,0.0,0.0,...,1,G,1077,S,Elanie,Ewiseston,6,0.0,,
9264,6612_02,Earth,False,G/1077/S,TRAPPIST-1e,27.0,False,864.0,1.0,0.0,...,2,G,1077,S,Kaye,Barks,6,888.0,,
9265,6612_03,Earth,False,F/1267/S,TRAPPIST-1e,29.0,False,121.0,0.0,55.0,...,3,F,1267,S,Daley,Dickley,6,826.0,,
9266,6612_04,Earth,False,G/1077/S,55 Cancri e,6.0,False,0.0,0.0,0.0,...,4,G,1077,S,Deanny,Barks,6,0.0,,
9267,6612_05,Earth,False,F/1267/S,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,...,5,F,1267,S,Maurie,Dickley,6,0.0,,
9268,6612_06,Earth,True,G/1077/S,PSO J318.5-22,6.0,False,0.0,0.0,0.0,...,6,G,1077,S,Heryle,Dickley,6,0.0,,


In [236]:
# Print every row whose cabin differs between df and the comparison frame;
# rows where both cabins are missing count as equal.
for index, row in df.iterrows():
    expected_cabin = df_to_comp.iloc[index].Cabin

    if pd.isna(row.Cabin) and pd.isna(expected_cabin):
        continue

    if row.Cabin != expected_cabin:
        print(index, row.Cabin, expected_cabin)

9267 G/1077/S F/1267/S
12651 A/94/P nan
12668 B/297/P nan


In [226]:
df.iloc[404]

PassengerId                  0293_01
HomePlanet                    Europa
CryoSleep                       True
Cabin                            NaN
Destination              TRAPPIST-1e
Age                             47.0
VIP                            False
RoomService                      0.0
FoodCourt                        0.0
ShoppingMall                     0.0
Spa                              0.0
VRDeck                           0.0
Name                Tauxon Suptibler
Transported                      NaN
Set                             Test
Group                           0293
GroupNumber                       01
CabinDeck                        NaN
CabinNumber                     <NA>
CabinSide                        NaN
FirstName                     Tauxon
LastName                   Suptibler
Bills                            0.0
GroupSize                          1
PotentialDecks    [A, B, C, D, E, T]
PotentialSides                [P, S]
Name: 404, dtype: object

In [93]:
df[df.Group == df.iloc[12892].Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
12892,9223_01,Mars,True,,TRAPPIST-1e,24.0,False,0.0,0.0,,...,1,,,,Weessh,Sun,0.0,2,"[D, E, F]","[P, S]"
12893,9223_02,Mars,True,,TRAPPIST-1e,17.0,False,0.0,0.0,0.0,...,2,,,,Perit,Sun,0.0,2,"[D, E, F]","[P, S]"


# Appendix

Passengers with Mars as their home planet are in decks 'D','E' or 'F'
** Passengers with Earth as their home planet are in decks 'E','F' or 'G'
** Passengers with Europa as their home planet are in decks 'A','B','C','D','E','T'
** If a passenger has no bills (RoomService + ShoppingMall + Spa + VRDeck + FoodCourt) and has members in its group in different decks then they are restricted to these decks 
*** Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'

In [76]:
df[df.Group == '6612']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
9263,6612_01,Earth,False,G/1077/S,TRAPPIST-1e,,False,0.0,0.0,0.0,...,1,G,1077.0,S,Elanie,Ewiseston,0.0,6,,
9264,6612_02,Earth,False,G/1077/S,TRAPPIST-1e,27.0,False,864.0,1.0,0.0,...,2,G,1077.0,S,Kaye,Barks,888.0,6,,
9265,6612_03,Earth,False,,TRAPPIST-1e,29.0,False,121.0,0.0,55.0,...,3,,,,Daley,Dickley,826.0,6,"[E, F, G]",[S]
9266,6612_04,Earth,False,G/1077/S,55 Cancri e,6.0,False,0.0,0.0,0.0,...,4,G,1077.0,S,Deanny,Barks,0.0,6,,
9267,6612_05,Earth,False,,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,...,5,,,,Maurie,Dickley,0.0,6,"[E, F, G]",[S]
9268,6612_06,Earth,True,G/1077/S,PSO J318.5-22,6.0,False,0.0,0.0,0.0,...,6,G,1077.0,S,Heryle,Dickley,0.0,6,,


In [163]:
# Print the index of every solo Earth passenger on deck F with zero bills
# whose group occupies a single deck
for index, row in df_to_comp.iterrows():
    is_solo_earth_on_f_without_bills = (
        row.HomePlanet == 'Earth'
        and row.Deck == 'F'
        and row.GroupSize == 1
        and row.Bills == 0
    )
    if is_solo_earth_on_f_without_bills and df_to_comp[df_to_comp.Group == row.Group].Deck.nunique() == 1:
        print(index)


144
1890
4364
9148
9882


In [19]:
training_data[training_data.PassengerId == '0742_01']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Set


In [23]:
df[df.Group == '0742']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,1,E,54,S,Sugark,Min,1049.0,2,,
1055,0742_02,Earth,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,2,F,138,S,Frunk,Min,0.0,2,,


In [27]:
df.iloc[1055]

PassengerId           0742_02
HomePlanet              Earth
CryoSleep                True
Cabin                 F/138/S
Destination       TRAPPIST-1e
Age                      25.0
VIP                     False
RoomService               0.0
FoodCourt                 0.0
ShoppingMall              0.0
Spa                       0.0
VRDeck                    0.0
Name                Frunk Min
Transported               NaN
Set                      Test
Group                    0742
GroupNumber                02
CabinDeck                   F
CabinNumber               138
CabinSide                   S
FirstName               Frunk
LastName                  Min
Bills                     0.0
GroupSize                   2
PotentialDecks           None
PotentialSides           None
Name: 1055, dtype: object

In [35]:
df.iloc[1055]

PassengerId           0742_02
HomePlanet               Mars
CryoSleep                True
Cabin                 F/138/S
Destination       TRAPPIST-1e
Age                      25.0
VIP                     False
RoomService               0.0
FoodCourt                 0.0
ShoppingMall              0.0
Spa                       0.0
VRDeck                    0.0
Name                Frunk Min
Transported               NaN
Set                      Test
Group                    0742
GroupNumber                02
CabinDeck                   F
CabinNumber               138
CabinSide                   S
FirstName               Frunk
LastName                  Min
Bills                     0.0
GroupSize                   2
PotentialDecks           None
PotentialSides           None
Name: 1055, dtype: object

In [27]:
test_data[test_data.PassengerId == '0742_01']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Set
346,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,0.0,0.0,Sugark Min,Test


In [26]:
test_data[test_data.PassengerId == '0742_02']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Set
347,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,Frunk Min,Test


In [23]:
training_data[training_data.PassengerId == '0742_01']
test_data[test_data.PassengerId == '0742_02']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Set


In [20]:
df[df.iloc[1055].Group == df.Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,1,E,54,S,Sugark,Min,1049.0,2,,
1055,0742_02,Earth,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,2,F,138,S,Frunk,Min,0.0,2,,
