In the following notebook I will show you the algorithmic approach I used to fill (nearly) every Cabin. This is not a guessing/probabilistic approach: cabins are filled in a structured order based on each passenger's HomePlanet and their group (taken from their PassengerId).

Cabins are filled in order of their number, i.e. if a passenger is in cabin A/05/P, a passenger in a later group cannot be in A/04/P, but they could be in A/01/S or B/01/P.

We are defining the components of the cabin by 
A/01/P
A = cabin deck, can take values 'A','B','C','D','E','F','G','T'
01 = cabin number, can take values 0,1,2...
P  = cabin side, can take values 'P', 'S' (presumably 'Port' and 'Starboard' )

Some assumptions
* If two passengers are in the same group then they are on the same side, Appendix A.1
* If two passengers are in the same group then they are from the same home planet, Appendix A.2
* If two passengers share a last name then they are from the same home planet, Appendix A.3
* Children <= 12 in age have no bills, Appendix A.4
* Home planets restrict which decks a passenger is on, Appendix A.5
** Passengers with Mars as their home planet are in decks 'D','E' or 'F'
** Passengers with Earth as their home planet are in decks 'E','F' or 'G'
** Passengers with Europa as their home planet are in decks 'A','B','C','D','E','T'
** If a passenger has no bills (RoomService + ShoppingMall + Spa + VRDeck + FoodCourt) and has members in its group in different decks then they are restricted to these decks 
*** Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'



# ?? df.iloc[1055]: why has this row had its HomePlanet changed?

In [88]:
# Display the test_data row whose PassengerId is '0742_02'.
test_data[test_data.PassengerId == '0742_02']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Set
347,0742_02,Mars,True,F/138/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,Frunk Min,Test


# Feature engineering

In [117]:
import pandas as pd 
from collections import defaultdict # Slightly modified from a regular dictionary


# Load both files and tag every row with the set it came from.
training_data = pd.read_csv('data/train.csv')
training_data['Set'] = 'Train'

test_data = pd.read_csv('data/test.csv')
test_data['Set'] = 'Test'

# Single combined frame (train rows followed by test rows) used from here on.
df = pd.concat((training_data, test_data))



In [118]:
def column_splits(data_frame):
    """Split the composite PassengerId, Cabin and Name columns into parts.

    Adds the following columns to ``data_frame`` in place:

    * ``Group`` / ``GroupNumber`` from ``PassengerId`` (split on ``_``)
    * ``CabinDeck`` / ``CabinNumber`` / ``CabinSide`` from ``Cabin``
      (split on ``/``); ``CabinNumber`` is cast to the nullable ``Int64`` dtype
    * ``FirstName`` / ``LastName`` from ``Name`` (split on the first space)

    Returns the same ``data_frame`` object that was passed in.
    """
    data_frame[['Group', 'GroupNumber']] = data_frame['PassengerId'].str.split('_', expand=True)

    data_frame[['CabinDeck', 'CabinNumber', 'CabinSide']] = data_frame['Cabin'].str.split("/", expand=True)
    data_frame.CabinNumber = data_frame.CabinNumber.astype('Int64')

    # Split on the first space only: a name with more than one space would
    # otherwise expand to three or more columns and the two-column assignment
    # would raise. Everything after the first space is kept as the last name.
    data_frame[['FirstName', 'LastName']] = data_frame['Name'].str.split(" ", n=1, expand=True)

    return data_frame

# Derive the split columns, then order passengers by group and by their
# number within the group, replacing the index with a fresh 0..n-1 range.
df = (
    column_splits(df)
    .sort_values(by=['Group', 'GroupNumber'])
    .reset_index(drop=True)
)

In [119]:
# Total spend across the five amenities (NaN when any single amount is missing).
df['Bills'] = (
    df['RoomService']
    + df['FoodCourt']
    + df['ShoppingMall']
    + df['Spa']
    + df['VRDeck']
)

# Passengers under 13 and passengers in cryosleep are given a bill of 0,
# overriding any missing total.
df.loc[(df['Age'] < 13) | (df['CryoSleep'] == True), 'Bills'] = 0
    

In [120]:
def add_group_size_column(dataframe):
    """Add a ``GroupSize`` column holding the number of rows in each row's ``Group``.

    The column is written to ``dataframe`` in place; the same object is returned.
    """
    members_per_group = dataframe.groupby('Group')['Group'].transform('count')
    dataframe['GroupSize'] = members_per_group
    return dataframe


# Record, for every passenger, how many rows share their Group.
df = add_group_size_column(df)


In [121]:
# Display the row for passenger 1645_01.
df[df.PassengerId == '1645_01']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Set,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize
2357,1645_01,,True,,55 Cancri e,18.0,False,0.0,0.0,0.0,...,Train,1645,1,,,,Andace,Thonyderson,0.0,1


In [122]:
def impute_attribute_by_shared_features(dataframe,attribute,shared_feature):
    """Fill missing ``attribute`` values from rows sharing ``shared_feature``.

    For every row whose ``attribute`` is missing, the value is copied from the
    first row (in dataframe order) that has the same ``shared_feature`` value
    and a known ``attribute``. Rows with no such donor, or whose
    ``shared_feature`` is itself missing, are left untouched.

    ``dataframe`` is modified in place and also returned.
    """
    # Build the donor lookup once rather than rescanning the whole frame for
    # every missing row. setdefault keeps only the first known value per key.
    known = dataframe.dropna(subset=[attribute, shared_feature])
    first_known_value = {}
    for feature_value, attribute_value in zip(known[shared_feature], known[attribute]):
        first_known_value.setdefault(feature_value, attribute_value)

    # Shared-feature value of every row that still needs the attribute.
    missing = dataframe.loc[dataframe[attribute].isna(), shared_feature]

    for index, feature_value in missing.items():
        # A missing shared feature can never match another row.
        if pd.isna(feature_value):
            continue
        if feature_value in first_known_value:
            dataframe.loc[index, attribute] = first_known_value[feature_value]

    return dataframe

# Fill missing home planets from other members of the same group first,
# then from passengers sharing the same last name.
df = impute_attribute_by_shared_features(df,'HomePlanet','Group')
df = impute_attribute_by_shared_features(df,'HomePlanet','LastName')

In [123]:

def add_potential_decks_column(dataframe):
    """Add a ``PotentialDecks`` column for passengers whose ``Cabin`` is missing.

    For each row without a cabin the column holds a list of candidate decks:

    * If the passenger has ``Bills == 0``, a known ``HomePlanet`` and
      ``GroupSize > 1``, the decks of the *other* group members are consulted:
        - other members are on more than one known deck: the planet's
          "no bills" decks;
        - otherwise, if every other member's deck is known: the "no bills"
          decks plus the group's deck;
        - otherwise, if exactly one deck is known among the others and it is
          one of the "no bills" decks: the "no bills" decks.
    * Failing that, the standard decks for the passenger's home planet.
    * If the home planet is unknown, every deck that occurs in the dataframe.

    Rows that already have a cabin get no list (the helper returns ``None``).
    The column is written in place and ``dataframe`` is returned.
    """

    potential_decks_by_homeplanet = {
    'Earth':['E','F','G'],
    'Europa': ['A','B','C','D','E','T'],
    'Mars': ['D','E','F']
    }

    potential_decks_by_homeplanet_no_bills = {
        'Earth':['G'],
        'Europa':['B'],
        'Mars': ['E','F']
    }

    def func_potential_decks_apply(row):
        if pd.isna(row.Cabin):
            if row.Bills == 0 and not pd.isna(row.HomePlanet) and row.GroupSize > 1:

                group_members = dataframe[(dataframe.Group == row.Group) & (dataframe.PassengerId != row.PassengerId)].CabinDeck
                known_decks = group_members.dropna()
                no_bills_decks = potential_decks_by_homeplanet_no_bills[row.HomePlanet]

                # Checking if other members of group are in multiple different cabin decks
                if known_decks.nunique() > 1:
                    return no_bills_decks
                elif not group_members.isna().any():
                    # dict.fromkeys de-duplicates while keeping a stable order;
                    # a set would make the order depend on string hashing.
                    return list(dict.fromkeys(no_bills_decks + list(known_decks.unique())))

                if known_decks.nunique() == 1:
                    # Look at the known deck itself: the first other member may
                    # be the one whose deck is missing, in which case
                    # group_members.iloc[0] would be NaN and never match.
                    if known_decks.iloc[0] in no_bills_decks:
                        return no_bills_decks

            # If not then it goes to the standard decks for their homeplanet
            if not pd.isna(row.HomePlanet):
                return potential_decks_by_homeplanet[row.HomePlanet]

            else:

                # If their homeplanet isn't known then they could be in any cabin deck
                return list(dataframe.CabinDeck.dropna().unique())

    dataframe['PotentialDecks'] = dataframe.apply(func_potential_decks_apply,axis = 1)
    return dataframe
    
            

def add_potential_sides_column(dataframe):
    """Add a ``PotentialSides`` column for passengers whose ``Cabin`` is missing.

    A cabin-less passenger gets a one-element list with the first known
    ``CabinSide`` found in their group, or ``['P', 'S']`` when no side is known
    for the group. Rows that already have a cabin get no list.

    The column is written in place and ``dataframe`` is returned.
    """

    def func_potential_sides_apply(row):
        # Passengers with a cabin need no candidates.
        if not pd.isna(row.Cabin):
            return None

        # Known sides of everyone in this passenger's group.
        known_sides = dataframe.loc[dataframe.Group == row.Group, 'CabinSide'].dropna()
        if known_sides.nunique() > 0:
            return [known_sides.iloc[0]]

        # Nothing known for the group, so either side is possible.
        return ['P','S']

    dataframe['PotentialSides'] = dataframe.apply(func_potential_sides_apply, axis=1)
    return dataframe

    
# Attach the candidate decks and candidate sides for every passenger
# that has no cabin yet.
df = add_potential_decks_column(df)
df = add_potential_sides_column(df)



In [124]:
# Display the row for passenger 7442_02.
df[df.PassengerId == '7442_02']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[G, E]",[S]


In [125]:
# Display every member of group 7442.
df[df.Group == '7442']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
10410,7442_01,Earth,False,E/495/S,TRAPPIST-1e,43.0,False,20.0,0.0,726.0,...,1,E,495.0,S,Antony,Morrencis,,2,,
10411,7442_02,Earth,True,,PSO J318.5-22,17.0,False,0.0,0.0,0.0,...,2,,,,Franda,Morrencis,0.0,2,"[G, E]",[S]


In [126]:
# Re-establish group / within-group order with a fresh 0..n-1 index.
df = df.sort_values(by=['Group', 'GroupNumber']).reset_index(drop=True)


In [127]:
# Display every member of group 0293.
df[df.Group == '0293']

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
404,0293_01,Europa,True,,TRAPPIST-1e,47.0,False,0.0,0.0,0.0,...,1,,,,Tauxon,Suptibler,0.0,1,"[A, B, C, D, E, T]","[P, S]"


# Imputing

In [128]:
def impute_from_cabin_and_index(dataframe,cabin,index):
    """Write ``cabin`` and its components into the row labelled ``index``.

    ``cabin`` is a ``deck/number/side`` string. The full string goes to
    ``Cabin``, and its three parts go to ``CabinDeck``, ``CabinNumber`` (as an
    int) and ``CabinSide``. ``dataframe`` is modified in place and returned.
    """
    parts = cabin.split("/")
    cabin_columns = ['Cabin','CabinDeck','CabinNumber','CabinSide']
    cabin_values = [cabin, parts[0], int(parts[1]), parts[2]]
    dataframe.loc[index, cabin_columns] = cabin_values
    return dataframe

In [129]:
def passengers_empty_cabin_options(dataframe):
    """List the unoccupied cabins each cabin-less passenger could take.

    For every row with a missing ``Cabin``, each (deck, side) pair from the
    row's ``PotentialDecks`` and ``PotentialSides`` is examined. Among rows on
    that deck and side, the highest ``CabinNumber`` at a smaller index label and
    the lowest ``CabinNumber`` at a larger index label bound a gap; every
    number strictly inside the gap is offered as ``"deck/number/side"``.
    A pair with no known cabin before or no known cabin after contributes
    nothing.

    Returns a dict mapping the row's index label to its list of cabin strings
    (possibly empty).
    """
    options_by_passenger = {}
    without_cabin = dataframe[dataframe['Cabin'].isna()]

    for passenger_index, passenger in without_cabin.iterrows():
        candidates = []
        options_by_passenger[passenger_index] = candidates

        for deck in passenger.PotentialDecks:
            for side in passenger.PotentialSides:

                # Rows already placed on this deck and side
                on_deck_side = dataframe[(dataframe['CabinDeck'] == deck) & (dataframe['CabinSide'] == side)]

                # Known cabin numbers on either side of the passenger's position
                numbers_before = on_deck_side.loc[on_deck_side.index < passenger_index, 'CabinNumber'].dropna().unique()
                numbers_after = on_deck_side.loc[on_deck_side.index > passenger_index, 'CabinNumber'].dropna().unique()

                # -1 marks "nothing found" on that side
                highest_before = max(numbers_before, default = -1)
                lowest_after = min(numbers_after, default = -1)
                if highest_before == -1 or lowest_after == -1:
                    continue

                # Every number strictly between the two bounds is a free cabin;
                # the range is empty when the bounds are adjacent or equal.
                candidates.extend(
                    f"{deck}/{i}/{side}" for i in range(highest_before + 1, lowest_after)
                )

    return options_by_passenger





# solo group and only one room that fits

In [130]:
def solo_group_one_cabin_option(dataframe):
    """Assign a cabin to solo passengers that have exactly one free option.

    Considers every row of ``dataframe`` that has no ``Cabin`` and a
    ``GroupSize`` of 1. When ``passengers_empty_cabin_options`` offers exactly
    one cabin for such a row, that cabin is written to it.

    The options are computed once, before any assignment in this call.
    Returns the updated dataframe.
    """
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)

    # Select from the dataframe argument, not the notebook-level `df`, so the
    # function works on whichever frame it is given. The index is copied to a
    # list because rows are modified inside the loop.
    solo_without_cabin = dataframe[(dataframe.Cabin.isna()) & (dataframe.GroupSize == 1)].index

    # Iterates through all the passengers that haven't got a Cabin yet and are alone in their group (ie can't share)
    for passenger_index in list(solo_without_cabin):

        # If they have only one free cabin that they could fill
        if len(all_passenger_cabin_options[passenger_index]) == 1:
            matching_cabin = all_passenger_cabin_options[passenger_index][0]
            dataframe = impute_from_cabin_and_index(dataframe,matching_cabin,passenger_index)

    return dataframe


# no free rooms so has to share

In [131]:
def no_suitable_cabin_so_shares(dataframe):
    """Place passengers with no free cabin into their group's cabin.

    For each cabin-less passenger whose list of free cabin options is empty,
    the known cabins of their group are collected, restricted to decks in the
    passenger's ``PotentialDecks``. If exactly one distinct cabin remains, the
    passenger is assigned to it.

    Returns the updated dataframe.
    """
    options_by_passenger = passengers_empty_cabin_options(dataframe)

    for passenger_index, free_cabins in options_by_passenger.items():

        # Passengers that still have a free cabin to choose from are skipped
        if free_cabins:
            continue

        passenger_row = dataframe.loc[passenger_index]

        # Cabins held by the passenger's group on a deck the passenger may be on
        in_same_group = dataframe['Group'] == passenger_row['Group']
        on_allowed_deck = dataframe['CabinDeck'].isin(passenger_row['PotentialDecks'])
        shareable_cabins = dataframe.loc[in_same_group & on_allowed_deck, 'Cabin'].dropna()

        # Only act when the choice of cabin to share is unambiguous
        if shareable_cabins.nunique() != 1:
            continue

        dataframe = impute_from_cabin_and_index(dataframe, shareable_cabins.iloc[0], passenger_index)

    return dataframe
    


# only passenger that can take that cabin

In [132]:
def only_matching_passenger_for_cabin(dataframe):
    """Assign each free cabin that exactly one passenger could occupy.

    The per-passenger options from ``passengers_empty_cabin_options`` are
    inverted into a cabin -> candidate-passengers mapping. Every cabin with a
    single candidate is written to that passenger's row.

    Returns the updated dataframe.
    """
    # cabin string -> index labels of the passengers that list it as an option
    candidates_by_cabin = {}
    for passenger_index, cabin_options in passengers_empty_cabin_options(dataframe).items():
        for cabin in cabin_options:
            candidates_by_cabin.setdefault(cabin, []).append(passenger_index)

    for cabin, candidates in candidates_by_cabin.items():
        # Cabins with several possible occupants stay unresolved
        if len(candidates) != 1:
            continue
        dataframe = impute_from_cabin_and_index(dataframe, cabin, candidates[0])

    return dataframe


# all imputes

In [133]:
def all_imputes(dataframe):
    """Run the three automatic cabin-imputation strategies, twice over.

    Each pass applies, in order, ``solo_group_one_cabin_option``,
    ``no_suitable_cabin_so_shares`` and ``only_matching_passenger_for_cabin``.
    Returns the dataframe after the second pass.
    """
    for _ in range(2):
        dataframe = solo_group_one_cabin_option(dataframe)
        dataframe = no_suitable_cabin_so_shares(dataframe)
        dataframe = only_matching_passenger_for_cabin(dataframe)

    return dataframe
    
# Run the automatic imputation passes, then count the values still missing
# in each column.
df = all_imputes(df)
df.isna().sum()

PassengerId           0
HomePlanet           13
CryoSleep           310
Cabin                37
Destination         274
Age                 270
VIP                 296
RoomService         263
FoodCourt           289
ShoppingMall        306
Spa                 284
VRDeck              268
Name                294
Transported        4277
Set                   0
Group                 0
GroupNumber           0
CabinDeck            37
CabinNumber          37
CabinSide            37
FirstName           294
LastName            294
Bills               785
GroupSize             0
PotentialDecks    12671
PotentialSides    12671
dtype: int64

# Manual workings


In [134]:
def all_cabin_options_for_each_row(dataframe):
    """Print the free-cabin options for every passenger still without a cabin.

    For each such passenger this prints the index label, the ``GroupSize``,
    the list of matching free cabins and the passenger's ``Bills``.
    Nothing is returned.
    """
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)
    for passenger_index, passenger_options in all_passenger_cabin_options.items():
        # The keys are index labels (they come from iterrows), so the row is
        # looked up by label with .loc rather than by position with .iloc.
        passenger_row = dataframe.loc[passenger_index]
        print()
        print("Index:",passenger_index, "GroupSize:", passenger_row.GroupSize)
        print("Free cabins that match:")
        print(passenger_options)
        print(passenger_row.Bills)

                
             


In [135]:
# Print the remaining free-cabin options for every passenger still without a cabin.
all_cabin_options_for_each_row(df)


Index: 404 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']
0.0

Index: 421 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']
nan

Index: 479 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']
2385.0

Index: 505 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']
1298.0

Index: 517 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P']
789.0

Index: 1429 GroupSize: 2
Free cabins that match:
['E/58/P']
1692.0

Index: 1466 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S', 'E/58/P']
0.0

Index: 1543 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']
0.0

Index: 2442 GroupSize: 7
Free cabins that match:
[]
1338.0

Index: 2970 GroupSize: 5
Free cabins that match:
[]
9597.0

Index: 3529 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']
711.0

Index: 3530 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']
791.0

Index: 4233 GroupSize: 1
Free cabins that match:
['B/98/P', 'B/99/P']
0.0

Index: 4254 GroupSize: 1
Free cabins tha

In [136]:
# Display every member of the group that the row at position 8413 belongs to.
df[df.Group == df.iloc[8413].Group]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize,PotentialDecks,PotentialSides
8410,6028_01,Europa,False,A/57/P,55 Cancri e,27.0,False,0.0,1801.0,0.0,...,1,A,57.0,P,Wasatz,Preeldy,3498.0,5,,
8411,6028_02,Europa,False,A/57/P,TRAPPIST-1e,32.0,True,0.0,792.0,0.0,...,2,A,57.0,P,Cherkab,Preeldy,2128.0,5,,
8412,6028_03,Europa,False,A/57/P,55 Cancri e,11.0,False,0.0,0.0,0.0,...,3,A,57.0,P,Nusakar,Preeldy,0.0,5,,
8413,6028_04,Europa,False,,TRAPPIST-1e,26.0,True,0.0,3638.0,0.0,...,4,,,,Kumark,Preeldy,4681.0,5,"[A, B, C, D, E, T]",[P]
8414,6028_05,Europa,False,A/57/P,TRAPPIST-1e,28.0,False,0.0,6921.0,0.0,...,5,A,57.0,P,Shelik,Preeldy,6995.0,5,,


### Manual imputation reasoning

* Index = 1429, Cabin = E/58/P
** Since the cabin can only be filled by 1429 and 1466, but one of 1466 and 1543 has to fill C/40/S and the other has to fill D/36/S as the only two that can fill those two, it leaves index 1429 to fill E/58/P
* Index = 4233,4254 , Cabin = B/98/P, B/99/P
** These indices weren't filled automatically because the consecutive free cabins gave each of these passengers multiple options. As no one else can fill them and index 4233 comes before 4254, they are filled in that order
* Index = 6493,6514 , Cabin = E/300/S, E/301/S
** As with the previous example, they are the only two passengers that can fill these cabins, and they didn't get imputed automatically because the free cabins are consecutive
* Index = 8413, Cabin = A/57/P
** As index 8465 and 8450 are each alone in their groups and with only two cabins to fill, one of them must fill D/191/P and one must fill E/387/P leaving index 8413 no other option but to join the only cabin that the rest of its group is in.
* Index 12892,12893 Cabin = F/1785/S, F/1785/S
** These two indices are the only members of a group together, they have one option for a cabin so they must both share F/1785/S

In [108]:




# Manually reasoned (index label, cabin) assignments.
cabin_list = [
    (1429, 'E/58/P'),
    (4233, 'B/98/P'),
    (4254, 'B/99/P'),
    (6493, 'E/300/S'),
    (6514, 'E/301/S'),
    (8413, 'A/57/P'),
    (12892, 'F/1785/S'),
    (12893, 'F/1785/S'),
]

# impute_from_cabin_and_index updates df in place, so the return value is not needed.
for index, cabin in cabin_list:
    impute_from_cabin_and_index(df, cabin, index)




# End


In [109]:
# Separate the combined frame back into its two source sets.
traindata = df.loc[df['Set'] == 'Train']
testdata = df.loc[df['Set'] == 'Test']

In [110]:
# Load the comparison file, renaming its 'Number' column to 'CabinNumber'
# and casting it to the nullable Int64 dtype.
df_to_comp = (
    pd.read_csv('data/31remaining.csv')
    .rename(columns={'Number': 'CabinNumber'})
    .astype({'CabinNumber': 'Int64'})
)


In [111]:
# Print every row whose cabin differs from the comparison frame's cabin at
# the same position; rows where both cabins are missing count as equal.
for index, row in df.iterrows():
    reference_cabin = df_to_comp.iloc[index].Cabin
    if pd.isna(row.Cabin) and pd.isna(reference_cabin):
        continue
    if row.Cabin != reference_cabin:
        print(index, row.Cabin, reference_cabin)

9267 G/1077/S F/1267/S
12651 A/94/P nan
12668 B/297/P nan


Some assumptions
* If passengers are in the same group then the cabin they are in is on the same side, Appendix A.1
* If two passengers are in the same group then they are from the same home planet, Appendix A.2
* If two passengers share a last name then they are from the same home planet, Appendix A.3
* Home planets restrict which decks a passenger is on, Appendix A.5
** Passengers with Mars as their home planet are in decks 'D','E' or 'F'
** Passengers with Earth as their home planet are in decks 'E','F' or 'G'
** Passengers with Europa as their home planet are in decks 'A','B','C','D','E','T'
** If a passenger has no bills (RoomService + ShoppingMall + Spa + VRDeck + FoodCourt) and has members in its group in different decks then they are restricted to these decks 
*** Earth :'G'
*** Europa: 'B'
*** Mars: 'E','F'
* Children <= 12 in age have no bills, Appendix A.4


# Appendix

For the evidence in the Appendix I will reuse the combined original dataframes without any imputations, so as not to misrepresent the underlying distributions

In [138]:
# Reload the raw files so the appendix checks run on un-imputed data.
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Combine, derive the split columns, and order by group / number within group.
df = (
    column_splits(pd.concat([training_data, test_data]))
    .sort_values(by=['Group', 'GroupNumber'])
    .reset_index(drop=True)
)

## A.1

Evidence of passengers sharing a group implying that their cabin is on the same side

In [144]:
# Group by 'Group' and check if all non-NaN 'CabinSide' values within each group are the same
# Row counts for multi-passenger groups, split by whether the group's known
# cabin sides all agree.
consistent_count = 0
inconsistent_count = 0

for group, group_df in df.groupby('Group'):
    # Groups of one are not counted
    if len(group_df) <= 1:
        continue

    # Distinct known sides within the group (missing values ignored)
    unique_sides = group_df['CabinSide'].dropna().unique()

    if len(unique_sides) > 1:
        # More than one side appears in this group
        inconsistent_count += len(group_df)
    else:
        # Zero or one distinct side: nothing contradicts the assumption
        consistent_count += len(group_df)

print(f"Number of rows with consistent cabin sides: {consistent_count}")
print(f"Number of rows with inconsistent cabin sides: {inconsistent_count}")

Number of rows with consistent cabin sides: 5825
Number of rows with inconsistent cabin sides: 0


## A.2

Evidence of passengers sharing a group implying that they have the same home planet

In [143]:
# Initialize counters
# Row counts for multi-passenger groups, split by whether the group's known
# home planets all agree.
planet_consistent_count = 0
planet_inconsistent_count = 0

for group, group_df in df.groupby('Group'):
    # Groups of one are not counted
    if len(group_df) <= 1:
        continue

    # Distinct known home planets within the group (missing values ignored)
    unique_home_planets = group_df['HomePlanet'].dropna().unique()

    if len(unique_home_planets) > 1:
        # More than one home planet appears in this group
        planet_inconsistent_count += len(group_df)
    else:
        # Zero or one distinct home planet
        planet_consistent_count += len(group_df)

print(f"Number of rows with consistent home planets: {planet_consistent_count}")
print(f"Number of rows with inconsistent home planets: {planet_inconsistent_count}")

Number of rows with consistent home planets: 5825
Number of rows with inconsistent home planets: 0


## A.3

Evidence of passengers sharing a last name implying that they have the same home planet

In [145]:
# Initialize counters
# Row counts for last names shared by more than one passenger, split by
# whether the known home planets under that name all agree.
planet_consistent_count = 0
planet_inconsistent_count = 0

for last_name, group_df in df.groupby('LastName'):
    # Last names held by a single passenger are not counted
    if len(group_df) <= 1:
        continue

    # Distinct known home planets for this last name (missing values ignored)
    unique_home_planets = group_df['HomePlanet'].dropna().unique()

    if len(unique_home_planets) > 1:
        # More than one home planet appears under this last name
        planet_inconsistent_count += len(group_df)
    else:
        # Zero or one distinct home planet
        planet_consistent_count += len(group_df)

print(f"Number of rows with consistent home planets by last name: {planet_consistent_count}")
print(f"Number of rows with inconsistent home planets by last name: {planet_inconsistent_count}")

Number of rows with consistent home planets by last name: 12468
Number of rows with inconsistent home planets by last name: 0


## A.4

Evidence of children under the age of 13 having no bills

In [147]:
# Total spend across the five amenities; no age or cryosleep override is
# applied here.
df['Bills'] = df['RoomService'] + df['FoodCourt'] + df['ShoppingMall'] + df['Spa'] + df['VRDeck']


In [163]:
# Filter out rows with NaN values in Bills
# Filter out rows with NaN values in Bills
df_filtered = df[df['Bills'].notna()]

# Check if passengers under the age of 13 have bills = 0
under_13 = df_filtered[df_filtered['Age'] < 13]
under_13_bills_zero = under_13['Bills'] == 0

# Calculate summary statistics
total_under_13 = len(under_13)
bills_zero_under_13 = under_13_bills_zero.sum()
bills_not_zero_under_13 = total_under_13 - bills_zero_under_13

# Create a summary DataFrame.
# float('nan') is used for the empty case because numpy is not imported in
# this notebook, so np.nan would raise a NameError if that branch were reached.
summary = pd.DataFrame({
    'Total Under 13': [total_under_13],
    'Bills = 0': [bills_zero_under_13],
    'Bills != 0': [bills_not_zero_under_13],
    'Consistency Ratio': [bills_zero_under_13 / total_under_13 if total_under_13 > 0 else float('nan')]
})

# Print summary statistics
print("Summary statistics for passengers under the age of 13 (excluding NaN Bills):")
print(summary)

Summary statistics for passengers under the age of 13 (excluding NaN Bills):
   Total Under 13  Bills = 0  Bills != 0  Consistency Ratio
0            1030       1030           0                1.0


In [162]:
# Display every passenger younger than 13.
df[df.Age<13]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,Transported,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills
20,0017_01,Earth,False,G/0/P,TRAPPIST-1e,0.0,False,0.0,0.0,0.0,...,Lyde Brighttt,True,0017,01,G,0,P,Lyde,Brighttt,0.0
24,0020_01,Earth,True,E/0/S,TRAPPIST-1e,1.0,False,0.0,0.0,0.0,...,Almary Brantuarez,False,0020,01,E,0,S,Almary,Brantuarez,0.0
27,0020_04,Earth,False,E/0/S,TRAPPIST-1e,10.0,False,0.0,0.0,0.0,...,Breney Jacostanley,True,0020,04,E,0,S,Breney,Jacostanley,0.0
28,0020_05,Earth,True,E/0/S,PSO J318.5-22,1.0,False,,0.0,0.0,...,Mael Brantuarez,False,0020,05,E,0,S,Mael,Brantuarez,
29,0020_06,Earth,False,E/0/S,TRAPPIST-1e,7.0,False,0.0,0.0,0.0,...,Terta Mcfaddennon,False,0020,06,E,0,S,Terta,Mcfaddennon,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12914,9238_01,Earth,False,G/1501/P,TRAPPIST-1e,10.0,False,0.0,0.0,0.0,...,Joana Garnettiz,,9238,01,G,1501,P,Joana,Garnettiz,0.0
12915,9238_02,Earth,True,G/1501/P,55 Cancri e,3.0,False,,0.0,0.0,...,Ricke Emenez,,9238,02,G,1501,P,Ricke,Emenez,
12919,9238_06,Earth,False,G/1501/P,55 Cancri e,0.0,False,0.0,0.0,0.0,...,Blance Garnettiz,,9238,06,G,1501,P,Blance,Garnettiz,0.0
12943,9260_01,Earth,True,G/1503/P,55 Cancri e,3.0,,0.0,0.0,0.0,...,Luisy Portananney,,9260,01,G,1503,P,Luisy,Portananney,0.0


## A.5

Evidence of home planets restricting which deck a passenger's cabin is on 

In [146]:
# Group by 'HomePlanet' and 'CabinDeck' and count occurrences
# Number of passengers for every (HomePlanet, CabinDeck) pair that occurs
deck_counts = (
    df.groupby(['HomePlanet', 'CabinDeck'])
    .size()
    .reset_index(name='Count')
)

# One row per deck, one column per home planet; pairs that never occur become 0
pivot_table = deck_counts.pivot(index='CabinDeck', columns='HomePlanet', values='Count')
pivot_table = pivot_table.fillna(0).astype(int)

print(pivot_table)

HomePlanet  Earth  Europa  Mars
CabinDeck                      
A               0     346     0
B               0    1124     0
C               0    1081     0
D               0     296   406
E             583     197   508
F            2426       0  1713
G            3700       0     0
T               0      10     0


In [156]:
# Initialize a dictionary to store results
# home planet -> decks of zero-bill passengers travelling in multi-deck groups
results = {}

for group, group_df in df.groupby('Group'):
    # Only groups whose known cabins span more than one deck are of interest
    unique_decks = group_df['CabinDeck'].dropna().unique()
    if len(unique_decks) <= 1:
        continue

    # Members with no bills and a known home planet
    zero_bill_passengers = group_df[(group_df['Bills'] == 0)  & (group_df['HomePlanet'].notna())]

    for idx, passenger in zero_bill_passengers.iterrows():
        results.setdefault(passenger['HomePlanet'], []).append(passenger['CabinDeck'])

# Print the results
print("CabinDecks for passengers with bills = 0 in groups with multiple CabinDecks:")
for home_planet, cabin_decks in results.items():
    cabin_deck_counts = pd.Series(cabin_decks).value_counts().to_dict()
    print(f"HomePlanet: {home_planet}")
    for deck, count in cabin_deck_counts.items():
        print(f"  CabinDeck: {deck}, Count: {count}")


CabinDecks for passengers with bills = 0 in groups with multiple CabinDecks:
HomePlanet: Earth
  CabinDeck: G, Count: 526
HomePlanet: Mars
  CabinDeck: F, Count: 160
  CabinDeck: E, Count: 47
HomePlanet: Europa
  CabinDeck: B, Count: 11


# End

In [112]:
# Print the index of every zero-bill Earth passenger on deck E who is in a
# group of two whose members all share a single deck.
for index, row in df_to_comp.iterrows():
    if (
        row.HomePlanet == 'Earth'
        and row.Deck == 'E'
        and row.GroupSize == 2
        and row.Bills == 0
        and df_to_comp[df_to_comp.Group == row.Group].Deck.nunique() == 1
    ):
        print(index)


1512
1513
2778
7216
