In [74]:

# Import necessary libraries
import pandas as pd 
from collections import defaultdict # Slightly modified from a regular dictionary

# Load the training and test data
training_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Add a column to distinguish between the training and test sets
training_data['Set'] = 'Train'
test_data['Set'] = 'Test'

# Combine the training and test datasets.
# ignore_index=True gives the combined frame unique row labels; without it
# both files contribute labels starting at 0 and the labels are duplicated.
df = pd.concat([training_data, test_data], ignore_index=True)

# Keep an independent copy of the combined frame
df2 = df.copy()

# Display the first few rows of the combined dataframe.
# Only the last expression of a cell is rendered, so the preview goes last.
df.head()

In [75]:
# Define a function to split columns into multiple components
def column_splits(data_frame):
    """Split the composite PassengerId, Cabin and Name columns into parts.

    The columns are added to ``data_frame`` in place:

    * ``Group`` / ``GroupNumber``: PassengerId split on ``_``.
    * ``CabinDeck`` / ``CabinNumber`` / ``CabinSide``: Cabin split on ``/``,
      with ``CabinNumber`` cast to the nullable ``Int64`` dtype.
    * ``FirstName`` / ``LastName``: Name split on the first space.

    Args:
        data_frame: DataFrame with ``PassengerId``, ``Cabin`` and ``Name``
            string columns.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    # Split PassengerId into Group and GroupNumber
    data_frame[['Group', 'GroupNumber']] = data_frame['PassengerId'].str.split('_', expand=True)

    # Split Cabin into CabinDeck, CabinNumber, and CabinSide
    data_frame[['CabinDeck', 'CabinNumber', 'CabinSide']] = data_frame['Cabin'].str.split("/", expand=True)
    # Int64 (capital I) is the nullable integer dtype, so missing cabins stay missing
    data_frame['CabinNumber'] = data_frame['CabinNumber'].astype('Int64')

    # Split Name into FirstName and LastName.
    # n=1 caps the split at two parts: a name with more than two words would
    # otherwise expand to three or more columns and the assignment would fail.
    data_frame[['FirstName', 'LastName']] = data_frame['Name'].str.split(" ", n=1, expand=True)

    return data_frame

# Derive the split columns, then order the rows by Group and GroupNumber
# and renumber them from zero
df = (
    column_splits(df)
    .sort_values(by=['Group', 'GroupNumber'])
    .reset_index(drop=True)
)

# Preview the reshaped dataframe
df.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,Transported,Set,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Maham Ofracculy,False,Train,1,1,B,0,P,Maham,Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Juanna Vines,True,Train,2,1,F,0,S,Juanna,Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Altark Susent,False,Train,3,1,A,0,S,Altark,Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Solam Susent,False,Train,3,2,A,0,S,Solam,Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Willy Santantines,True,Train,4,1,F,1,S,Willy,Santantines


In [76]:
# Total spend per passenger; skipna=False keeps the total missing whenever
# any single spending column is missing
df['Bills'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1, skipna=False)

# Passengers under 13 or in cryosleep are given a bill of zero
df.loc[(df['Age'] < 13) | (df['CryoSleep'] == True), 'Bills'] = 0


In [77]:
# Define a function to add a GroupSize column
def add_group_size_column(dataframe):
    """Add a ``GroupSize`` column holding the number of rows in each Group.

    The column is written to ``dataframe`` in place and the same object is
    returned.
    """
    members_per_group = dataframe.groupby('Group')['Group']
    dataframe['GroupSize'] = members_per_group.transform('count')
    return dataframe

# Attach the GroupSize column to the combined dataframe
df = df.pipe(add_group_size_column)

# Preview the dataframe with the new GroupSize column
df.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Set,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Train,1,1,B,0,P,Maham,Ofracculy,0.0,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Train,2,1,F,0,S,Juanna,Vines,736.0,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Train,3,1,A,0,S,Altark,Susent,10383.0,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Train,3,2,A,0,S,Solam,Susent,5176.0,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Train,4,1,F,1,S,Willy,Santantines,1091.0,1


In [78]:
# Define a function to impute attributes based on shared features
def impute_attribute_by_shared_features(dataframe, attribute, shared_feature):
    """Fill missing ``attribute`` values from rows sharing ``shared_feature``.

    For every row where ``attribute`` is missing, the first row (in frame
    order) that has the same ``shared_feature`` value and a known
    ``attribute`` supplies the value. Rows with no such donor are left
    untouched. ``dataframe`` is modified in place and returned.
    """
    rows_missing_attribute = dataframe[dataframe[attribute].isna()]

    for index, row in rows_missing_attribute.iterrows():
        # Known attribute values among rows that share the feature with this row
        shares_feature = dataframe[shared_feature] == row[shared_feature]
        donor_values = dataframe.loc[shares_feature, attribute].dropna()

        if donor_values.empty:
            continue

        dataframe.loc[index, attribute] = donor_values.iloc[0]

    return dataframe

# Fill missing HomePlanet values, first from passengers in the same group,
# then from passengers with the same last name
for shared_feature in ('Group', 'LastName'):
    df = impute_attribute_by_shared_features(df, 'HomePlanet', shared_feature)

# Preview the dataframe with imputed HomePlanet values
df.head()


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Set,Group,GroupNumber,CabinDeck,CabinNumber,CabinSide,FirstName,LastName,Bills,GroupSize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Train,1,1,B,0,P,Maham,Ofracculy,0.0,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Train,2,1,F,0,S,Juanna,Vines,736.0,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Train,3,1,A,0,S,Altark,Susent,10383.0,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Train,3,2,A,0,S,Solam,Susent,5176.0,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Train,4,1,F,1,S,Willy,Santantines,1091.0,1


In [79]:
# Define a function to add a column for potential decks based on home planet and other conditions
def add_potential_decks_column(dataframe):
    """Add a ``PotentialDecks`` column listing candidate decks per passenger.

    Rows with a known ``Cabin`` get ``None``. Rows with a missing ``Cabin``
    get a list of deck letters chosen from the passenger's ``HomePlanet``,
    ``Bills``, ``GroupSize`` and the decks of the other members of the same
    ``Group``.

    Args:
        dataframe: DataFrame with ``Cabin``, ``CabinDeck``, ``HomePlanet``,
            ``Bills``, ``Group``, ``GroupSize`` and ``PassengerId`` columns.

    Returns:
        The same DataFrame object, with ``PotentialDecks`` added in place.

    Raises:
        KeyError: if a non-missing ``HomePlanet`` is not one of the planets
            in the lookup tables below.
    """
    # Define potential decks for each home planet
    potential_decks_by_homeplanet = {
        'Earth': ['E', 'F', 'G'],
        'Europa': ['A', 'B', 'C', 'D', 'E', 'T'],
        'Mars': ['D', 'E', 'F']
    }

    # Define restricted decks for passengers with no bills
    potential_decks_by_homeplanet_no_bills = {
        'Earth': ['G'],
        'Europa': ['B'],
        'Mars': ['E', 'F']
    }

    # Inner function to determine potential decks for each passenger
    def func_potential_decks_apply(row):
        # Passengers whose cabin is already known need no candidates
        if not pd.isna(row.Cabin):
            return None

        # If the passenger has no bills, a known HomePlanet, and is part of a group
        if row.Bills == 0 and not pd.isna(row.HomePlanet) and row.GroupSize > 1:
            # Get the decks of other group members
            group_members = dataframe[(dataframe.Group == row.Group) & (dataframe.PassengerId != row.PassengerId)].CabinDeck
            known_decks = group_members.dropna()
            no_bills_decks = potential_decks_by_homeplanet_no_bills[row.HomePlanet]

            # If group members are in multiple different decks, restrict to specific decks for no bills
            if known_decks.nunique() > 1:
                return no_bills_decks

            # If every other group member has a known deck (necessarily the same one at
            # this point), return the no-bills decks combined with that deck
            elif not group_members.isna().any():
                return list(set(no_bills_decks + list(known_decks.unique())))

            # If the group members with a known deck all share one deck, check whether
            # it is one of the restricted decks. The deck is read from the NaN-dropped
            # series: the first group member may itself have a missing deck, and
            # testing NaN for membership would always fail.
            if known_decks.nunique() == 1:
                if known_decks.iloc[0] in no_bills_decks:
                    return no_bills_decks

        # Otherwise fall back to the standard decks for the passenger's HomePlanet
        if not pd.isna(row.HomePlanet):
            return potential_decks_by_homeplanet[row.HomePlanet]

        # If the HomePlanet is unknown, return all unique decks in the dataframe
        else:
            return list(dataframe.CabinDeck.dropna().unique())

    # Apply the inner function to each row in the dataframe
    dataframe['PotentialDecks'] = dataframe.apply(func_potential_decks_apply, axis=1)
    return dataframe

# Define a function to add a column for potential sides based on group consistency
def add_potential_sides_column(dataframe):
    """Add a ``PotentialSides`` column listing candidate cabin sides.

    Rows with a known ``Cabin`` get ``None``. For a row with a missing
    ``Cabin`` the value is a one-element list holding the first known
    ``CabinSide`` in the passenger's ``Group``, or ``['P', 'S']`` when the
    group has no known side. The column is added in place and the same
    DataFrame is returned.
    """
    def func_potential_sides_apply(row):
        # Nothing to propose when the cabin is already known
        if not pd.isna(row.Cabin):
            return None

        # Known sides among the rows of this passenger's group
        in_same_group = dataframe.Group == row.Group
        known_sides = dataframe.loc[in_same_group, 'CabinSide'].dropna()

        # Use the group's side when one is known, otherwise allow both sides
        return [known_sides.iloc[0]] if known_sides.nunique() > 0 else ['P', 'S']

    dataframe['PotentialSides'] = dataframe.apply(func_potential_sides_apply, axis=1)
    return dataframe

# Add the potential decks column first, then the potential sides column
df = add_potential_sides_column(add_potential_decks_column(df))

In [80]:
# Order the rows by Group and GroupNumber and renumber the index from zero
df = df.sort_values(by=['Group', 'GroupNumber']).reset_index(drop=True)


In [81]:
# Define a function to impute cabin details for a given passenger index
def impute_from_cabin_and_index(dataframe, cabin, index):
    """Write a cabin and its components into the row labelled ``index``.

    ``cabin`` is a ``"Deck/Number/Side"`` string; the number part is
    converted to ``int``. The ``Cabin``, ``CabinDeck``, ``CabinNumber`` and
    ``CabinSide`` cells of the row are overwritten in place and the same
    DataFrame is returned.
    """
    parts = cabin.split("/")
    deck, number, side = parts[0], int(parts[1]), parts[2]

    target_columns = ['Cabin', 'CabinDeck', 'CabinNumber', 'CabinSide']
    dataframe.loc[index, target_columns] = [cabin, deck, number, side]

    return dataframe


In [82]:
# Define a function to find potential cabin options for passengers missing a cabin
def passengers_empty_cabin_options(dataframe):
    """Collect candidate cabins for every passenger whose ``Cabin`` is missing.

    For each such passenger, and for every deck/side pair taken from the
    row's ``PotentialDecks`` and ``PotentialSides``, the function looks up
    the largest known cabin number in rows before the passenger and the
    smallest known cabin number in rows after it. Every number strictly
    between the two becomes a candidate cabin. Cabins already held by
    members of the passenger's own group are added as candidates when they
    match the passenger's potential decks and sides.

    NOTE(review): the gap search presumably relies on rows being ordered so
    that cabin numbers on a deck/side grow with the index -- confirm.

    Args:
        dataframe: DataFrame with ``Cabin``, ``CabinDeck``, ``CabinNumber``,
            ``CabinSide``, ``Group``, ``GroupSize``, ``PotentialDecks`` and
            ``PotentialSides`` columns.

    Returns:
        dict mapping the index label of each passenger without a cabin to a
        list of candidate cabin strings (possibly empty).
    """
    # Filter dataframe to find passengers without a cabin
    df_passengers_without_cabin = dataframe[dataframe['Cabin'].isna()]

    # Dictionary to store potential cabin options for each passenger
    all_passenger_cabin_options = {}

    # Iterate through each passenger without a cabin
    for passenger_index, passenger in df_passengers_without_cabin.iterrows():
        all_passenger_cabin_options[passenger_index] = []

        # Iterate through each potential deck for the passenger
        for deck in passenger.PotentialDecks:
            # Iterate through each potential side for the passenger
            for side in passenger.PotentialSides:
                # Filter dataframe for the current deck and side
                df_filtered = dataframe[(dataframe['CabinDeck'] == deck) & (dataframe['CabinSide'] == side)]

                # Find the maximum cabin number before the current passenger index
                # (-1 is the "nothing found" sentinel)
                max_cabin_no_before = max(df_filtered.loc[df_filtered.index < passenger_index, 'CabinNumber'].dropna().unique(), default=-1)

                # Find the minimum cabin number after the current passenger index
                min_cabin_no_after = min(df_filtered.loc[df_filtered.index > passenger_index, 'CabinNumber'].dropna().unique(), default=-1)

                # If no cabins were found of that deck and side before or after the row
                if max_cabin_no_before == -1 or min_cabin_no_after == -1:
                    continue

                # If there is a gap between the maximum cabin number before and the minimum cabin number after
                # Then there are potential cabins the passenger can fill
                if max_cabin_no_before + 1 < min_cabin_no_after:
                    potential_cabins = [f"{deck}/{i}/{side}" for i in range(max_cabin_no_before + 1, min_cabin_no_after)]
                    all_passenger_cabin_options[passenger_index].extend(potential_cabins)

        # If the passenger can share with someone in its group then put that as a cabin option.
        # The lookup uses the `dataframe` argument, not a notebook-level global, so the
        # function works on whichever frame it is given.
        if passenger.GroupSize > 1:
            for cabin in dataframe[dataframe['Group'] == passenger.Group]['Cabin'].dropna().unique():
                # check that cabin is compatible with passenger
                if cabin.split("/")[0] in passenger.PotentialDecks and cabin.split("/")[2] in passenger.PotentialSides:
                    all_passenger_cabin_options[passenger_index].append(cabin)

    return all_passenger_cabin_options



In [83]:
# if only one cabin is available for a passenger then impute it
def impute_single_cabin_option(dataframe):
    """Assign a cabin to every passenger that has exactly one candidate.

    The candidates are computed once, up front, with
    ``passengers_empty_cabin_options``; passengers with zero or several
    candidates are left unchanged. Returns the updated DataFrame.
    """
    options_by_passenger = passengers_empty_cabin_options(dataframe)

    # Keep only the passengers whose choice is unambiguous
    unambiguous = {
        passenger_index: cabin_options[0]
        for passenger_index, cabin_options in options_by_passenger.items()
        if len(cabin_options) == 1
    }

    for passenger_index, cabin in unambiguous.items():
        dataframe = impute_from_cabin_and_index(dataframe, cabin, passenger_index)

    return dataframe

# imputing

In [84]:
# Checkpoint: keep an independent copy of the frame as it is before the
# cabin imputation cells below modify it
df2 = df.copy()

In [107]:
# Restore the working frame from the df2 checkpoint so the imputation cells
# below start from the saved state
df = df2.copy()

In [141]:
# Fill in the cabin of every passenger that has exactly one candidate cabin
df = impute_single_cabin_option(df)

In [142]:
# Define a function to impute cabins for the only matching passenger for certain cabins
def only_matching_passenger_for_cabin(dataframe):
    """Assign each unoccupied candidate cabin that fits exactly one passenger.

    Candidate cabins come from ``passengers_empty_cabin_options``. Cabins
    that already appear in the ``Cabin`` column are ignored. A cabin left
    with a single candidate passenger is written to that passenger's row.

    NOTE(review): if one passenger is the only candidate for more than one
    cabin, each of those cabins is written in turn and the last one wins --
    confirm that this is intended.

    Returns the updated DataFrame.
    """
    options_by_passenger = passengers_empty_cabin_options(dataframe)

    # cabin -> index labels of the passengers that could take it
    candidates_by_cabin = defaultdict(list)

    for passenger_index, cabin_options in options_by_passenger.items():
        for cabin in cabin_options:
            # Only cabins that nobody currently occupies are up for grabs
            cabin_is_occupied = (dataframe['Cabin'] == cabin).any()
            if not cabin_is_occupied:
                candidates_by_cabin[cabin].append(passenger_index)

    for cabin, candidates in candidates_by_cabin.items():
        if len(candidates) != 1:
            continue
        dataframe = impute_from_cabin_and_index(dataframe, cabin, candidates[0])

    return dataframe




In [143]:
# Assign every free candidate cabin that only one passenger can take
df = only_matching_passenger_for_cabin(df)

In [144]:
# Count the missing values left in each column
df.isna().sum()


PassengerId           0
HomePlanet           13
CryoSleep           310
Cabin                35
Destination         274
Age                 270
VIP                 296
RoomService         263
FoodCourt           289
ShoppingMall        306
Spa                 284
VRDeck              268
Name                294
Transported        4277
Set                   0
Group                 0
GroupNumber           0
CabinDeck            35
CabinNumber          35
CabinSide            35
FirstName           294
LastName            294
Bills               785
GroupSize             0
PotentialDecks    12671
PotentialSides    12671
dtype: int64

In [150]:
# Show the candidate cabins of the passengers that still have no cabin,
# keyed by row index
passengers_empty_cabin_options(df)

{404: ['B/13/P', 'C/13/S'],
 421: ['B/13/P', 'C/13/S'],
 479: ['E/20/P', 'E/21/P', 'D/12/P'],
 505: ['E/20/P', 'E/21/P', 'F/81/P'],
 517: ['E/20/P', 'E/21/P', 'F/86/P'],
 1429: ['E/58/P', 'G/148/P'],
 1466: ['C/40/S', 'D/36/S', 'E/58/P'],
 1543: ['C/40/S', 'D/36/S'],
 2442: ['F/326/S', 'D/61/S', 'E/127/S'],
 2970: ['D/70/S', 'E/153/S', 'F/410/S'],
 3529: ['E/150/P', 'F/519/P'],
 3530: ['E/150/P', 'F/519/P'],
 4233: ['B/98/P', 'B/99/P'],
 4254: ['B/98/P', 'B/99/P'],
 4569: ['G/522/S', 'F/621/S'],
 4751: ['E/232/S', 'F/645/S'],
 5016: ['G/590/P', 'G/579/S'],
 5017: ['G/590/P', 'G/579/S'],
 6493: ['E/300/S', 'E/301/S'],
 6514: ['E/300/S', 'E/301/S'],
 8413: ['D/191/P', 'A/57/P'],
 8450: ['D/191/P', 'E/387/P'],
 8465: ['D/191/P', 'E/387/P'],
 10081: ['F/1489/P', 'G/1157/P'],
 10082: ['F/1489/P', 'G/1157/P'],
 10290: ['C/270/S', 'C/269/S'],
 10313: ['C/270/S', 'D/235/P'],
 10394: ['D/235/P', 'F/1424/S'],
 10408: ['F/1424/S', 'G/1206/S'],
 10411: ['G/1206/S', 'E/495/S'],
 10434: ['F/1544/P',

In [146]:
# Define a function to print potential cabin options for each passenger with missing cabin information
def all_cabin_options_for_each_row(dataframe):
    """Print the candidate cabins of every passenger with a missing cabin.

    For each passenger returned by ``passengers_empty_cabin_options`` the
    function prints a blank line, the PassengerId and GroupSize, and the
    list of candidate cabins. Nothing is returned.
    """
    # Get potential cabin options for passengers missing a cabin
    all_passenger_cabin_options = passengers_empty_cabin_options(dataframe)

    # Iterate through each passenger and their potential cabin options
    for passenger_index, passenger_options in all_passenger_cabin_options.items():
        # The keys are index labels (they come from iterrows), so the row is
        # looked up by label with .loc; positional .iloc only matches when
        # the index happens to be 0..n-1.
        passenger = dataframe.loc[passenger_index]
        print()
        print("PassengerId:", passenger.PassengerId, "GroupSize:", passenger.GroupSize)
        print("Free cabins that match:")
        print(passenger_options)




In [147]:
# Print the candidate cabins for each passenger that still has no cabin
all_cabin_options_for_each_row(df)


PassengerId: 0293_01 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

PassengerId: 0310_01 GroupSize: 1
Free cabins that match:
['B/13/P', 'C/13/S']

PassengerId: 0348_02 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P', 'D/12/P']

PassengerId: 0364_02 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P', 'F/81/P']

PassengerId: 0374_02 GroupSize: 2
Free cabins that match:
['E/20/P', 'E/21/P', 'F/86/P']

PassengerId: 1011_01 GroupSize: 2
Free cabins that match:
['E/58/P', 'G/148/P']

PassengerId: 1041_01 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S', 'E/58/P']

PassengerId: 1095_01 GroupSize: 1
Free cabins that match:
['C/40/S', 'D/36/S']

PassengerId: 1709_03 GroupSize: 7
Free cabins that match:
['F/326/S', 'D/61/S', 'E/127/S']

PassengerId: 2092_03 GroupSize: 5
Free cabins that match:
['D/70/S', 'E/153/S', 'F/410/S']

PassengerId: 2513_01 GroupSize: 1
Free cabins that match:
['E/150/P', 'F/519/P']

PassengerId: 2514_01 GroupSize: 1
Free cabins that m

In [148]:
# Load the comparison frame, rename its Number column to CabinNumber and
# cast that column to the nullable Int64 dtype
df_to_comp = (
    pd.read_csv('29remaining.csv')
    .rename(columns={'Number': 'CabinNumber'})
    .assign(CabinNumber=lambda frame: frame['CabinNumber'].astype('Int64'))
)


In [149]:
# Report every row whose Cabin differs from the comparison frame
for index, row in df.iterrows():
    expected_cabin = df_to_comp.iloc[index].Cabin

    # Rows where both cabins are missing, or both are equal, agree
    both_missing = pd.isna(row.Cabin) and pd.isna(expected_cabin)
    if both_missing or row.Cabin == expected_cabin:
        continue

    print(index, row.Cabin, expected_cabin)

1429 nan E/58/P
4233 nan B/98/P
4254 nan B/99/P
6493 nan E/300/S
6514 nan E/301/S
8413 nan A/57/P
