In the following notebook I will show you the algorithmic approach I used to fill (nearly) every Cabin. This is not a guessing/probabilistic approach, cabins are filled in a structured order based on the passengers Homeplanet and its group (from its passengerID).

Cabins are filled in order based on their number, ie if a passenger is in cabin A/05/P, a passenger in a later group cannot be in A/04/P but they could be in A/01/S, or B/01/P

We are defining the components of the cabin by 
A/01/P
A = cabin deck, can take values 'A','B','C','D','E','F','G','T'
01 = cabin number, can take values 0,1,2...
P  = cabin side, can take values 'P', 'S' (presumably 'Port' and 'Starboard' )

Some assumptions
* If two passengers are in the same group then they are on the same side, Appendix A.1
* If two passengers are in the same group then they are from the same home planet, Appendix A.2
* If two passengers share a last name then they are from the same home planet, Appendix A.3
* Home planets restrict which decks a passenger is on, Appendix A.4
** Passengers with Mars as their home planet are in decks 'D','E' or 'F'
** Passengers with Earth as their home planet are in decks 'E','F' or 'G'
** Passengers with Europa as their home planet are in decks 'A','B','C','D','E','T'
** If a passenger has no bills (RoomService + ShoppingMall + Spa + VRDeck + FoodCourt) and has members in its group in different decks then they are restricted to these decks 
*** Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'
* Children <= 12 in age have no bills, Appendix A.5



# ?? df.iloc[1055] why is this had its homeplanet changed??

In [1]:
test_data[test_data.PassengerId == '0742_02']

NameError: name 'test_data' is not defined

# Feature engineering

In [39]:
import pandas as pd 
from collections import defaultdict # Slightly modified from a regular dictionary


training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
training_data['Set'] = 'Train'
test_data['Set'] = 'Test'

# The combined dataframe we will be using for the rest of this project
df = pd.concat([training_data,test_data]) 



In [40]:
def column_splits(data_frame):
    data_frame[['Group', 'GroupNumber']] = data_frame['PassengerId'].str.split('_', expand=True)

    data_frame[['CabinDeck', 'CabinNumber', 'CabinSide']]= data_frame['Cabin'].str.split("/", expand = True)
    data_frame.CabinNumber = data_frame.CabinNumber.astype('Int64')
    
    data_frame[['FirstName','LastName']] = data_frame['Name'].str.split(" ",expand = True)

    return data_frame

df = column_splits(df)


df = df.sort_values(by = ['Group','GroupNumber'])
df = df.reset_index(drop = True)

In [41]:
df[df.Group == '0742']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,Transported,Set,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,Sugark Min,,Test,742,1,E,54,S,Sugark,Min
1055,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,Frunk Min,,Test,742,2,F,138,S,Frunk,Min


In [42]:
df['Bills'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']
df.loc[(df['Age'] < 13), 'Bills'] = 0
df.loc[(df['CryoSleep'] == True),'Bills'] = 0 
    

In [43]:
def add_group_size_column(dataframe):
    dataframe['GroupSize'] = dataframe.groupby('Group')['Group'].transform('count')
    return dataframe


df = add_group_size_column(df)


In [44]:
df[df.Group == '0742']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Set,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,Test,742,1,E,54,S,Sugark,Min,1049.0,2
1055,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,Test,742,2,F,138,S,Frunk,Min,0.0,2


In [45]:

def add_potential_decks_column(dataframe):
    
    potential_decks_by_homeplanet = {
    'Earth':['E','F','G'],
    'Europa': ['A','B','C','D','E','T'],
    'Mars': ['D','E','F']
    }

    potential_decks_by_homeplanet_no_bills = {
        'Earth':['G'],
        'Europa':['B'],
        'Mars': ['E','F']
    }
    
    def func_potential_decks_apply(row):
        if pd.isna(row.Cabin):
            if row.Bills == 0 and not pd.isna(row.HomePlanet):
                
                # Checking if other members of group are in multiple different cabin decks
                if dataframe[dataframe.Group == row.Group].CabinDeck.dropna().nunique() > 1:
                    return potential_decks_by_homeplanet_no_bills[row.HomePlanet]
            
            # If not then it goes to the standard decks for their homeplanet
            if not pd.isna(row.HomePlanet):
                return potential_decks_by_homeplanet[row.HomePlanet]
            
            else:
                
                # If their homeplanet isn't known then they could be in any cabin deck
                return list(dataframe.CabinDeck.dropna().unique())
            
    dataframe['PotentialDecks'] = dataframe.apply(func_potential_decks_apply,axis = 1)
    return dataframe
    
            

def add_potential_sides_column(dataframe):
    
    def func_potential_sides_apply(row):
        if pd.isna(row.Cabin):
            
            # Checks to see if anyone else in their group has a known cabin side
            group = dataframe[dataframe.Group == row.Group].CabinSide.dropna()
            if group.nunique() > 0:
                return [group.iloc[0]]
            
            # If no one else is in their group or they haven't got a known cabin side then the passenger could be on either side
            return ['P','S']
        
    dataframe['PotentialSides'] = dataframe.apply(func_potential_sides_apply,axis = 1)
    return dataframe

    
df = add_potential_decks_column(df)
df = add_potential_sides_column(df)



In [46]:
df[df.Group == '0742']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,1,E,54,S,Sugark,Min,1049.0,2,,
1055,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,2,F,138,S,Frunk,Min,0.0,2,,


In [47]:
dfc = df.copy()

In [48]:
def impute_attribute_by_shared_features(dataframe,attribute,shared_feature):
    
    # Iterates through all the rows that have nan for this attribute
    for index, row in dataframe[dataframe[attribute].isna()].iterrows():
        print(index)
        rows_with_shared_features = dataframe[dataframe[shared_feature] == row[shared_feature]].dropna(subset=[attribute])
        
        if not rows_with_shared_features.empty:
            dataframe.loc[index, attribute] = rows_with_shared_features[attribute].iloc[0]

    return dataframe



In [49]:
df = impute_attribute_by_shared_features(df,'HomePlanet','Group')


89
162
280
334
346
414
429
438
532
556
611
613
653
710
725
755
759
785
837
870
944
1039
1104
1212
1292
1324
1362
1393
1440
1445
1477
1483
1492
1502
1535
1589
1690
1925
1989
2096
2106
2112
2161
2227
2233
2357
2427
2437
2578
2592
2649
2658
2738
2805
2865
2907
2916
2920
2923
2979
3160
3258
3265
3293
3362
3371
3389
3425
3439
3450
3460
3483
3516
3518
3650
3678
3708
3766
3787
3825
3923
3941
3946
3953
3957
3998
4068
4125
4146
4169
4192
4201
4246
4349
4358
4407
4465
4567
4632
4646
4652
4680
4694
4764
4792
4893
4913
4918
4977
5003
5063
5177
5272
5273
5313
5351
5410
5457
5479
5654
5687
5702
5723
5773
5787
5832
5893
5900
5905
5986
5995
6026
6043
6112
6120
6180
6187
6213
6234
6240
6308
6405
6437
6475
6479
6499
6510
6518
6552
6683
6765
6786
6836
6899
6903
6970
7002
7110
7182
7184
7244
7382
7478
7487
7499
7538
7541
7569
7572
7595
7602
7617
7746
7794
7815
7918
8070
8085
8116
8133
8279
8343
8354
8423
8425
8435
8471
8491
8528
8546
8711
8741
8743
8825
8873
8896
8907
8927
8937
8969
8972
9097
9110
9134
91

In [50]:
df[df.Group == '0742']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,1,E,54,S,Sugark,Min,1049.0,2,,
1055,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,2,F,138,S,Frunk,Min,0.0,2,,


In [51]:
df = impute_attribute_by_shared_features(df,'HomePlanet','LastName')

280
334
346
414
429
438
532
556
725
870
1039
1212
1324
1362
1440
1477
1502
1925
1989
2096
2106
2227
2357
2427
2437
2578
2592
2738
2805
2923
2979
3258
3265
3371
3439
3450
3460
3678
3708
3766
3787
3941
3946
3998
4125
4146
4169
4201
4246
4407
4465
4567
4632
4764
4918
4977
5063
5272
5273
5313
5351
5410
5457
5687
5702
5723
5773
5787
5900
5995
6026
6043
6112
6180
6213
6234
6308
6405
6437
6479
6510
6683
6786
6836
7182
7184
7244
7382
7478
7499
7538
7541
7595
7602
7746
7815
8070
8116
8279
8354
8423
8471
8528
8711
8873
8896
8927
8937
9180
9182
9282
9516
9728
9795
9804
9812
9846
9916
9975
10182
10262
10268
10269
10440
10480
10555
10612
10672
10706
10765
10784
10867
10877
10884
10899
10951
11099
11115
11214
11240
11543
11561
11734
11757
11961
11974
12011
12023
12244
12351
12402
12410
12633
12703
12930
12940
12960


In [52]:
dfc = impute_attribute_by_shared_features(dfc,'HomePlanet','LastName')

89
162
280
334
346
414
429
438
532
556
611
613
653
710
725
755
759
785
837
870
944
1039
1104
1212
1292
1324
1362
1393
1440
1445
1477
1483
1492
1502
1535
1589
1690
1925
1989
2096
2106
2112
2161
2227
2233
2357
2427
2437
2578
2592
2649
2658
2738
2805
2865
2907
2916
2920
2923
2979
3160
3258
3265
3293
3362
3371
3389
3425
3439
3450
3460
3483
3516
3518
3650
3678
3708
3766
3787
3825
3923
3941
3946
3953
3957
3998
4068
4125
4146
4169
4192
4201
4246
4349
4358
4407
4465
4567
4632
4646
4652
4680
4694
4764
4792
4893
4913
4918
4977
5003
5063
5177
5272
5273
5313
5351
5410
5457
5479
5654
5687
5702
5723
5773
5787
5832
5893
5900
5905
5986
5995
6026
6043
6112
6120
6180
6187
6213
6234
6240
6308
6405
6437
6475
6479
6499
6510
6518
6552
6683
6765
6786
6836
6899
6903
6970
7002
7110
7182
7184
7244
7382
7478
7487
7499
7538
7541
7569
7572
7595
7602
7617
7746
7794
7815
7918
8070
8085
8116
8133
8279
8343
8354
8423
8425
8435
8471
8491
8528
8546
8711
8741
8743
8825
8873
8896
8907
8927
8937
8969
8972
9097
9110
9134
91

In [53]:
df.loc[347]

PassengerId             0252_01
HomePlanet                Earth
CryoSleep                 False
Cabin                    F/53/P
Destination         TRAPPIST-1e
Age                        30.0
VIP                       False
RoomService               335.0
FoodCourt                 695.0
ShoppingMall              541.0
Spa                         0.0
VRDeck                      0.0
Name              Leenny Byerry
Transported               False
Set                       Train
Group                      0252
GroupNumber                  01
CabinDeck                     F
CabinNumber                  53
CabinSide                     P
FirstName                Leenny
LastName                 Byerry
Bills                    1571.0
GroupSize                     1
PotentialDecks             None
PotentialSides             None
Name: 347, dtype: object

In [54]:
dfc[dfc.Group == '0742']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,1,E,54,S,Sugark,Min,1049.0,2,,
1055,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,2,F,138,S,Frunk,Min,0.0,2,,


In [55]:
df = df.sort_values(by = ['Group','GroupNumber'])
df = df.reset_index(drop = True)


In [9]:
df.iloc[1055]

PassengerId           0742_02
HomePlanet              Earth
CryoSleep                True
Cabin                 F/138/S
Destination       TRAPPIST-1e
Age                      25.0
VIP                     False
RoomService               0.0
FoodCourt                 0.0
ShoppingMall              0.0
Spa                       0.0
VRDeck                    0.0
Name                Frunk Min
Transported               NaN
Set                      Test
Group                    0742
GroupNumber                02
CabinDeck                   F
CabinNumber               138
CabinSide                   S
FirstName               Frunk
LastName                  Min
Bills                     0.0
GroupSize                   2
PotentialDecks           None
PotentialSides           None
Name: 1055, dtype: object

# Imputing

In [10]:
def impute_from_cabin_and_index(dataframe,cabin,index):
    dataframe.loc[index,['Cabin','CabinDeck','CabinNumber','CabinSide']] = [cabin,cabin.split("/")[0],int(cabin.split("/")[1]),cabin.split("/")[2]]
    return dataframe

In [11]:
def passengers_empty_cabin_options(dataframe):
    
    df_passengers_without_cabin = dataframe[dataframe['Cabin'].isna()]
    all_passenger_cabin_options = {}

    for passenger_index, passenger in df_passengers_without_cabin.iterrows():
        all_passenger_cabin_options[passenger_index] = []

        for deck in passenger.PotentialDecks:
            for side in passenger.PotentialSides:
                
                # Filter dataframe for the current deck and side
                df_filtered = dataframe[(dataframe['CabinDeck'] == deck) & (dataframe['CabinSide'] == side)]

                # Split into cabins before and after the current passenger index
                max_cabin_no_before = max(df_filtered.loc[df_filtered.index < passenger_index, 'CabinNumber'].dropna().unique(), default = -1 )
                min_cabin_no_after = min(df_filtered.loc[df_filtered.index > passenger_index, 'CabinNumber'].dropna().unique(), default = -1)

                # If no cabins were found of that deck and side before or after the row
                if max_cabin_no_before == -1 or min_cabin_no_after == -1:
                    continue
                
                # If a cabin number is seen before the row and the next cabin number is more than 1 higher after the row
                # then there is an empty cabin it can potentially fill
                if max_cabin_no_before + 1 < min_cabin_no_after:
                    all_passenger_cabin_options[passenger_index] += [f"{deck}/{i}/{side}" for i in range(max_cabin_no_before + 1, min_cabin_no_after)]

    return all_passenger_cabin_options





# solo group and only one room that fits

In [12]:
def solo_group_one_cabin_option(dataframe):
    
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)

    # Iterates through all the passengers that haven't got a Cabin yet and are alone in their group (ie can't share)
    for passenger_index in list(df[(df.Cabin.isna()) & (df.GroupSize == 1)].index):

        # If they have only one free cabin that they could fill
        if len(all_passenger_cabin_options[passenger_index]) == 1:
            matching_cabin = all_passenger_cabin_options[passenger_index][0]
            dataframe = impute_from_cabin_and_index(dataframe,matching_cabin,passenger_index)

    return dataframe


# no free rooms so has to share

In [13]:
def no_suitable_cabin_so_shares(dataframe):
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)
    
    for passenger_index,passenger_cabin_options in all_passenger_cabin_options.items():
        
        # If there are no free cabins that the passenger can fill
        if not passenger_cabin_options:
            
            passenger_row = dataframe.loc[passenger_index]
            
            # Finding all other group members cabins and filtering them by whether they are in the same deck that the passenger must be in
            passengers_group_cabins = dataframe[(dataframe['Group'] == passenger_row['Group']) &
                                  (dataframe['CabinDeck'].isin(passenger_row['PotentialDecks']))].Cabin.dropna()
            
            # If there is only one Cabin from their group they could share with
            if passengers_group_cabins.nunique() == 1:
                matching_cabin = passengers_group_cabins.iloc[0]
                dataframe = impute_from_cabin_and_index(dataframe,matching_cabin,passenger_index)
                
    return dataframe
    


# only passenger that can take that cabin

In [14]:
def only_matching_passenger_for_cabin(dataframe):
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)
    
    cabins_to_fill = defaultdict(list)
    
    # Iterate over cabins to see which passengers can fit that cabin
    for passenger_index, cabin_options in all_passenger_cabin_options.items():
        for cabin in cabin_options:
            cabins_to_fill[cabin].append(passenger_index)
    
    # Iterate over cabin and impute passengers where only one fits
    for cabin, passengers_indices in cabins_to_fill.items():
        if len(passengers_indices) == 1:
            dataframe = impute_from_cabin_and_index(dataframe, cabin, passengers_indices[0])
    
    return dataframe


# all imputes

In [15]:
def all_imputes(dataframe):
    dataframe = solo_group_one_cabin_option(dataframe)
    dataframe = no_suitable_cabin_so_shares(dataframe)
    dataframe = only_matching_passenger_for_cabin(dataframe)

    dataframe = solo_group_one_cabin_option(dataframe)
    dataframe = no_suitable_cabin_so_shares(dataframe)
    dataframe = only_matching_passenger_for_cabin(dataframe)
    
    return dataframe
    
df = all_imputes(df)
df.isna().sum()

PassengerId           0
HomePlanet           12
CryoSleep           310
Cabin                41
Destination         274
Age                 270
VIP                 296
RoomService         263
FoodCourt           289
ShoppingMall        306
Spa                 284
VRDeck              268
Name                294
Transported        4277
Set                   0
Group                 0
GroupNumber           0
CabinDeck            41
CabinNumber          41
CabinSide            41
FirstName           294
LastName            294
Bills               785
GroupSize             0
PotentialDecks    12671
PotentialSides    12671
dtype: int64

In [17]:
df.iloc[1055]

PassengerId           0742_02
HomePlanet              Earth
CryoSleep                True
Cabin                 F/138/S
Destination       TRAPPIST-1e
Age                      25.0
VIP                     False
RoomService               0.0
FoodCourt                 0.0
ShoppingMall              0.0
Spa                       0.0
VRDeck                    0.0
Name                Frunk Min
Transported               NaN
Set                      Test
Group                    0742
GroupNumber                02
CabinDeck                   F
CabinNumber               138
CabinSide                   S
FirstName               Frunk
LastName                  Min
Bills                     0.0
GroupSize                   2
PotentialDecks           None
PotentialSides           None
Name: 1055, dtype: object

In [17]:
df[df.Group == '7442']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10410,7442_01,Earth,False,E/495/S,TRAPPIST-1e,43.0,False,20.0,0.0,726.0,...,1,E,495.0,S,Antony,Morrencis,,2,,
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[E, F, G]",[S]


Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'

In [18]:
df[(df.Cabin.isna()) & (df.GroupSize > 1) & (df.Bills == 0)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
9267,6612_05,Earth,False,,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,...,5,,,,Maurie,Dickley,0.0,6,"[E, F, G]",[S]
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[E, F, G]",[S]
12668,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,...,3,,,,Bath,Brakeng,0.0,5,"[A, B, C, D, E, T]",[P]
12892,9223_01,Mars,True,,TRAPPIST-1e,24.0,False,0.0,0.0,,...,1,,,,Weessh,Sun,0.0,2,"[D, E, F]","[P, S]"
12893,9223_02,Mars,True,,TRAPPIST-1e,17.0,False,0.0,0.0,0.0,...,2,,,,Perit,Sun,0.0,2,"[D, E, F]","[P, S]"


In [19]:
df[df.Cabin.isna()]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
404,0293_01,Europa,True,,TRAPPIST-1e,47.0,False,0.0,0.0,0.0,...,1,,,,Tauxon,Suptibler,0.0,1,"[A, B, C, D, E, T]","[P, S]"
421,0310_01,Europa,False,,TRAPPIST-1e,67.0,False,,230.0,0.0,...,1,,,,Naviton,Coudered,,1,"[A, B, C, D, E, T]","[P, S]"
479,0348_02,Mars,,,TRAPPIST-1e,36.0,False,520.0,0.0,1865.0,...,2,,,,Weet,Mane,2385.0,2,"[D, E, F]",[P]
505,0364_02,Mars,False,,TRAPPIST-1e,37.0,False,731.0,0.0,517.0,...,2,,,,Anakes,Chité,1298.0,2,"[D, E, F]",[P]
517,0374_02,Earth,False,,TRAPPIST-1e,36.0,False,6.0,0.0,0.0,...,2,,,,Tamie,Sterreray,789.0,2,"[E, F, G]",[P]
1429,1011_01,Earth,False,,TRAPPIST-1e,31.0,False,1633.0,4.0,49.0,...,1,,,,Camie,Prinson,1692.0,2,"[E, F, G]",[P]
1466,1041_01,Europa,True,,TRAPPIST-1e,46.0,False,0.0,0.0,0.0,...,1,,,,Algrafi,Heedry,0.0,1,"[A, B, C, D, E, T]","[P, S]"
1543,1095_01,Europa,True,,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,...,1,,,,Alhail,Drelcate,0.0,1,"[A, B, C, D, E, T]","[P, S]"
2442,1709_03,Mars,False,,TRAPPIST-1e,35.0,False,1313.0,0.0,24.0,...,3,,,,Bleark,Minen,1338.0,7,"[D, E, F]",[S]
2970,2092_03,Mars,False,,TRAPPIST-1e,13.0,,6726.0,0.0,1605.0,...,3,,,,Cray,Stpie,9597.0,5,"[D, E, F]",[S]


# Manual workings


In [20]:
def all_cabin_options_for_each_row(dataframe):
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)
    for passenger_index, passenger_options in all_passenger_cabin_options.items():
        print()
        print("Index:",passenger_index, "GroupSize:", dataframe.iloc[passenger_index].GroupSize)
        print("Free cabins that match:")
        print(passenger_options)
                
             


In [22]:
all_cabin_options_for_each_row(df)


Index: 404 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 421 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

Index: 479 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 505 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 517 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']

Index: 1429 GroupSize: 2
Free cabins that match:
['E/58/P']

Index: 1466 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S', 'E/58/P']

Index: 1543 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']

Index: 2442 GroupSize: 7
Free cabins that match:
[]

Index: 2970 GroupSize: 5
Free cabins that match:
[]

Index: 3529 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 3530 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

Index: 4233 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4254 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']

Index: 4569 GroupSize: 3
Free cabins that 

In [27]:




cabin_list = [(1429,'E/58/P'),(8413,'A/57/P'),(9265,'F/1267/S'),(9267,'F/1267/S'),(4233,'B/98/P'),(4254,'B/99/P'),(6493,'E/300/S'),(6514,'E/301/S')]

for index,cabin in cabin_list:
    impute_from_cabin_and_index(df,cabin,index)

"""
df.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room
df.loc[12893,'Cabin'] = 'F/1785/S'
df.loc[9267,'Cabin] =  G/1077/S # has no bills so would have to be in G if the other is in F, but maybe could be in F if the other is in G
df.loc[9265,'Cabin'] = F/1267/S  

"""


"\ndf.loc[12892,'Cabin'] = 'F/1785/S' # maybe only one is from this room and the other is joined in the other room\ndf.loc[12893,'Cabin'] = 'F/1785/S'\ndf.loc[9267,'Cabin] =  G/1077/S # has no bills so would have to be in G if the other is in F, but maybe could be in F if the other is in G\ndf.loc[9265,'Cabin'] = F/1267/S  \n\n"

# End


In [28]:
traindata = df[df.Set == 'Train']
testdata = df[df.Set == 'Test']

In [24]:
df_to_comp = pd.read_csv('data/31remaining.csv')
df_to_comp = df_to_comp.rename(columns = {'Number':'CabinNumber'})
df_to_comp['CabinNumber'] = df_to_comp['CabinNumber'].astype('Int64')


In [25]:
for index,row in df.iterrows():
    if not (pd.isna(row.Cabin) and pd.isna(df_to_comp.iloc[index].Cabin)):
        if row.Cabin != df_to_comp.iloc[index].Cabin:
            print(index,row.Cabin, df_to_comp.iloc[index].Cabin)

1429 nan E/58/P
4233 nan B/98/P
4254 nan B/99/P
6493 nan E/300/S
6514 nan E/301/S
8413 nan A/57/P
9265 nan F/1267/S
9267 nan F/1267/S
12892 nan F/1785/S
12893 nan F/1785/S


# Appendix

Passengers with Mars as their home planet are in decks 'D','E' or 'F'
** Passengers with Earth as their home planet are in decks 'E','F' or 'G'
** Passengers with Europa as their home planet are in decks 'A','B','C','D','E','T'
** If a passenger has no bills (RoomService + ShoppingMall + Spa + VRDeck + FoodCourt) and has members in its group in different decks then they are restricted to these decks 
*** Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'

In [31]:
df[(df.Cabin.isna()) & (df.Bills == 0) & (df.GroupSize > 1)]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[E, F, G]",[S]
12668,9069_03,Europa,True,,55 Cancri e,25.0,False,0.0,0.0,0.0,...,3,,,,Bath,Brakeng,0.0,5,"[A, B, C, D, E, T]",[P]
12892,9223_01,Mars,True,,TRAPPIST-1e,24.0,False,0.0,0.0,,...,1,,,,Weessh,Sun,0.0,2,"[D, E, F]","[P, S]"
12893,9223_02,Mars,True,,TRAPPIST-1e,17.0,False,0.0,0.0,0.0,...,2,,,,Perit,Sun,0.0,2,"[D, E, F]","[P, S]"


In [32]:
df[df.Group == '6612']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
9263,6612_01,Earth,False,G/1077/S,TRAPPIST-1e,,False,0.0,0.0,0.0,...,1,G,1077,S,Elanie,Ewiseston,0.0,6,,
9264,6612_02,Earth,False,G/1077/S,TRAPPIST-1e,27.0,False,864.0,1.0,0.0,...,2,G,1077,S,Kaye,Barks,888.0,6,,
9265,6612_03,Earth,False,F/1267/S,TRAPPIST-1e,29.0,False,121.0,0.0,55.0,...,3,F,1267,S,Daley,Dickley,826.0,6,"[E, F, G]",[S]
9266,6612_04,Earth,False,G/1077/S,55 Cancri e,6.0,False,0.0,0.0,0.0,...,4,G,1077,S,Deanny,Barks,0.0,6,,
9267,6612_05,Earth,False,F/1267/S,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,...,5,F,1267,S,Maurie,Dickley,0.0,6,"[E, F, G]",[S]
9268,6612_06,Earth,True,G/1077/S,PSO J318.5-22,6.0,False,0.0,0.0,0.0,...,6,G,1077,S,Heryle,Dickley,0.0,6,,


In [33]:
df[df.Group == a.iloc[4].Group]

NameError: name 'a' is not defined

In [26]:
for index,row in df.iterrows():
    if row.HomePlanet == 'Earth':
        if row.CabinDeck == 'F':
            if row.GroupSize > 1:
                if row.Bills == 0:
                    if df[df.Group == row.Group].CabinDeck.nunique() > 1:
                        print(index)


1055
1759
4966


In [19]:
training_data[training_data.PassengerId == '0742_01']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Set


In [23]:
df[df.Group == '0742']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,1,E,54,S,Sugark,Min,1049.0,2,,
1055,0742_02,Earth,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,2,F,138,S,Frunk,Min,0.0,2,,


In [27]:
df.iloc[1055]

PassengerId           0742_02
HomePlanet              Earth
CryoSleep                True
Cabin                 F/138/S
Destination       TRAPPIST-1e
Age                      25.0
VIP                     False
RoomService               0.0
FoodCourt                 0.0
ShoppingMall              0.0
Spa                       0.0
VRDeck                    0.0
Name                Frunk Min
Transported               NaN
Set                      Test
Group                    0742
GroupNumber                02
CabinDeck                   F
CabinNumber               138
CabinSide                   S
FirstName               Frunk
LastName                  Min
Bills                     0.0
GroupSize                   2
PotentialDecks           None
PotentialSides           None
Name: 1055, dtype: object

In [35]:
df.iloc[1055]

PassengerId           0742_02
HomePlanet               Mars
CryoSleep                True
Cabin                 F/138/S
Destination       TRAPPIST-1e
Age                      25.0
VIP                     False
RoomService               0.0
FoodCourt                 0.0
ShoppingMall              0.0
Spa                       0.0
VRDeck                    0.0
Name                Frunk Min
Transported               NaN
Set                      Test
Group                    0742
GroupNumber                02
CabinDeck                   F
CabinNumber               138
CabinSide                   S
FirstName               Frunk
LastName                  Min
Bills                     0.0
GroupSize                   2
PotentialDecks           None
PotentialSides           None
Name: 1055, dtype: object

In [27]:
test_data[test_data.PassengerId == '0742_01']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Set
346,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,0.0,0.0,Sugark Min,Test


In [26]:
test_data[test_data.PassengerId == '0742_02']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Set
347,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,Frunk Min,Test


In [23]:
training_data[training_data.PassengerId == '0742_01']
test_data[test_data.PassengerId == '0742_02']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Set


In [20]:
df[df.iloc[1055].Group == df.Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
1054,0742_01,Mars,False,E/54/S,TRAPPIST-1e,33.0,False,4.0,153.0,892.0,...,1,E,54,S,Sugark,Min,1049.0,2,,
1055,0742_02,Earth,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,...,2,F,138,S,Frunk,Min,0.0,2,,
